Example #1
def get_clean_data():
    ds = TabularDatasetFactory.from_delimited_files(path=_DATA_URL)
    return clean_data(ds)
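The fragment above assumes a module-level `_DATA_URL` constant and a `clean_data` helper defined elsewhere in the same script. A minimal sketch of those assumed pieces (the URL and the target column name are placeholders, not from the original) could look like:

from azureml.data.dataset_factory import TabularDatasetFactory

# Placeholder URL; the original constant is not shown in the fragment.
_DATA_URL = "https://example.com/data.csv"

def clean_data(dataset):
    # Convert the TabularDataset to pandas, drop incomplete rows,
    # and separate the target column from the features.
    df = dataset.to_pandas_dataframe().dropna()
    y = df.pop("y")
    return df, y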
from azureml.data.dataset_factory import TabularDatasetFactory
from azureml.core.run import Run

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_absolute_error

import argparse
import os
import numpy as np
import joblib
import pandas as pd

# Imports dataset
mbdataset = TabularDatasetFactory.from_delimited_files("https://raw.githubusercontent.com/czofficial/nd00333-capstone/4a6a4924bdd4c6a188aeb24e0c282bae11c8933b/mercedes.csv")

df = mbdataset.to_pandas_dataframe()

# Cleans dataset
def clean_data(df):
   
    x_df = pd.get_dummies(df, columns=['model', 'transmission', 'fuelType'])
    y_df = x_df.pop("price")

    return x_df,y_df

x, y = clean_data(df)

# Splits dataset into train and test
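The Mercedes snippet is cut off after this comment; a plausible split using the imports above (a sketch, not the original code) would be:

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)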
import numpy as np
from sklearn.metrics import mean_squared_error
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
from azureml.core.run import Run
from azureml.data.dataset_factory import TabularDatasetFactory

# TODO: Create TabularDataset using TabularDatasetFactory
# Data is located at:
# "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"

url = "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"

ds = TabularDatasetFactory.from_delimited_files(url) 


# TODO: Split data into train and test sets.


### YOUR CODE HERE ###

run = Run.get_context()

def clean_data(data):
    # Dict for cleaning data
    months = {"jan":1, "feb":2, "mar":3, "apr":4, "may":5, "jun":6, "jul":7, "aug":8, "sep":9, "oct":10, "nov":11, "dec":12}
    weekdays = {"mon":1, "tue":2, "wed":3, "thu":4, "fri":5, "sat":6, "sun":7}

    # Clean and one hot encode data
Example #4
def retrieve_cleaned_data():
    ds = TabularDatasetFactory.from_delimited_files(path="https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv")
    x, y = clean_data(ds)
    return train_test_split(x, y, test_size = 0.3, random_state = 0)
Example #5
def main():
    # Add arguments to script
    parser = argparse.ArgumentParser()

    parser.add_argument('--learning_rate',
                        type=float,
                        default=0.03,
                        help="Learning rate param for lgbm")
    parser.add_argument('--max_depth',
                        type=int,
                        default=10,
                        help="Limit the tree depth explicitly")
    parser.add_argument('--num_leaves',
                        type=int,
                        default=255,
                        help="Control the complexity of the tree model")
    parser.add_argument('--min_data_in_leaf',
                        type=int,
                        default=3,
                        help="Large value can avoid growing too deep a tree")
    parser.add_argument('--num_iterations',
                        type=int,
                        default=500,
                        help="Number of boosting iterations")

    args = parser.parse_args()
    run.log("learning-rate:", np.float(args.learning_rate))
    run.log("max_depth:", np.int(args.max_depth))
    run.log("num_leaves", np.int(args.num_leaves))
    run.log("min_data_in_leaf", np.int(args.min_data_in_leaf))
    run.log("num_iterations", np.int(args.num_iterations))

    factory = TabularDatasetFactory()
    train_data_path = "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"
    valid_data_path = "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_validate.csv"

    train_ds = factory.from_delimited_files(train_data_path)
    valid_ds = factory.from_delimited_files(valid_data_path)

    X_train, y_train = clean_data(train_ds)
    X_valid, y_valid = clean_data(valid_ds)

    d_train = lgbm.Dataset(X_train, label=y_train)

    lgbm_params = {}

    lgbm_params['learning_rate'] = args.learning_rate
    lgbm_params['boosting_type'] = 'gbdt'
    lgbm_params['objective'] = 'binary'
    lgbm_params['metric'] = 'binary_logloss'
    lgbm_params['max_depth'] = args.max_depth
    lgbm_params['num_leaves'] = args.num_leaves
    lgbm_params['min_data_in_leaf'] = args.min_data_in_leaf
    lgbm_params['colsample_bytree'] = 1.0  # no trailing comma, which would make this a tuple

    model = lgbm.train(lgbm_params, d_train, args.num_iterations)

    accuracy = accuracy_score(
        model.predict(X_valid).round(0).astype(int), y_valid)
    run.log("Accuracy", np.float(accuracy))
    os.makedirs('outputs', exist_ok=True)

    joblib.dump(model, 'outputs/bankmarketing-lgbm-model.joblib')
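The snippet above omits its imports and entry point (they sit outside the excerpt). Under the names it uses, the missing pieces would roughly be as follows; the lightgbm alias and the separately defined clean_data are assumptions:

import argparse
import os

import joblib
import lightgbm as lgbm
from sklearn.metrics import accuracy_score
from azureml.core.run import Run
from azureml.data.dataset_factory import TabularDatasetFactory

run = Run.get_context()

# clean_data(dataset) is assumed to be defined elsewhere in the script.

if __name__ == '__main__':
    main()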
import os
import numpy as np
from sklearn.metrics import mean_squared_error
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
from azureml.core.run import Run
from azureml.data.dataset_factory import TabularDatasetFactory

# TODO: Create TabularDataset using TabularDatasetFactory
# Data is located at:
# "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"

path = "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"
ds = TabularDatasetFactory.from_delimited_files(path)  ### YOUR CODE HERE ###

# NOTE: in the full script, clean_data must be defined before this call.
x, y = clean_data(ds)

# TODO: Split data into train and test sets.

### YOUR CODE HERE ###
x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=0.3,
                                                    random_state=42)
run = Run.get_context()


def clean_data(data):
    # Dict for cleaning data
Example #7
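# The excerpt starts mid-way through main(); the argparse preamble below is a
# sketch of what the cut-off lines would plausibly contain (default values are
# assumptions; only the --C and --max_iter names come from the code further down).
def main():
    # Add arguments to script
    parser = argparse.ArgumentParser()

    parser.add_argument('--C',
                        type=float,
                        default=1.0,
                        help="Inverse of regularization strength")
    parser.add_argument('--max_iter',
                        type=int,
                        default=100,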
                        help="Maximum number of iterations to converge")

    args = parser.parse_args()

    run.log("Regularization Strength:", np.float(args.C))
    run.log("Max iterations:", np.int(args.max_iter))

    model = LogisticRegression(C=args.C,
                               max_iter=args.max_iter).fit(x_train, y_train)

    # Note: model.score returns mean accuracy; a weighted AUC would require
    # roc_auc_score on predicted probabilities.
    AUC_weighted = model.score(x_test, y_test)
    run.log("AUC_weighted", float(AUC_weighted))


if __name__ == '__main__':

    # importing the dataset for use
    dataset = TabularDatasetFactory.from_delimited_files(
        "https://raw.githubusercontent.com/ujjwalbb30/nd00333-capstone/ujjwalbb30-patch-1/heart_failure_clinical_records_dataset.csv"
    )

    # converting the dataset imported to pandas dataframe
    df = dataset.to_pandas_dataframe()
    x, y = df.drop(columns=['DEATH_EVENT']), df['DEATH_EVENT']
    # TODO: Split data into train and test sets.
    x_train, x_test, y_train, y_test = train_test_split(x,
                                                        y,
                                                        test_size=0.33,
                                                        random_state=42)
    run = Run.get_context()
    main()
import argparse
import os
import numpy as np
from sklearn.metrics import mean_squared_error
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
from azureml.core.run import Run
from azureml.data.dataset_factory import TabularDatasetFactory

# TODO: Create TabularDataset using TabularDatasetFactory
# Data is located at:
# "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"
location = "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"
ds = TabularDatasetFactory.from_delimited_files(path=location)

run = Run.get_context()


def clean_data(data):
    # Dict for cleaning data
    months = {
        "jan": 1,
        "feb": 2,
        "mar": 3,
        "apr": 4,
        "may": 5,
        "jun": 6,
        "jul": 7,
        "aug": 8,
import argparse
import os
import joblib
import numpy as np
from azureml.data.dataset_factory import TabularDatasetFactory
from azureml.core.run import Run
from sklearn.model_selection import train_test_split

# Select the data URL
data_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/glass/glass.data'

# create an output folder for the model file
if "outputs" not in os.listdir():
    os.mkdir("./outputs")

ds = TabularDatasetFactory.from_delimited_files(path=data_url, header=False)


def clean_data(data):
    # Transform the TabularDataset into a pandas dataframe, removing NA's
    x_df = data.to_pandas_dataframe().dropna()

    # Rename the columns to their correct labels
    x_df.columns = [
        'id', 'RI', 'Na', 'Mg', 'Al', 'Si', 'K', 'Ca', 'Ba', 'Fe', 'Type'
    ]

    # Make the target variable the right type
    x_df.Type = x_df.Type.astype('category')

    # Remove the id column
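    # (sketch of the truncated remainder: drop the id column, split off the
    #  target, and return features and labels; not part of the original excerpt)
    x_df = x_df.drop(columns=['id'])
    y_df = x_df.pop('Type')
    return x_df, y_df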
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
from azureml.core.run import Run
from azureml.data.dataset_factory import TabularDatasetFactory

# TODO: Create TabularDataset using TabularDatasetFactory
# Data is located at:
# "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"

### YOUR CODE HERE ###
web_path = [
    'https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv'
]

ds = TabularDatasetFactory.from_delimited_files(path=web_path, separator=',')

run = Run.get_context()


def clean_data(data):
    # Dict for cleaning data
    months = {
        "jan": 1,
        "feb": 2,
        "mar": 3,
        "apr": 4,
        "may": 5,
        "jun": 6,
        "jul": 7,
        "aug": 8,
Example #11
from sklearn.linear_model import LogisticRegression
import argparse
import os
import numpy as np
from sklearn.metrics import mean_squared_error
import joblib
from sklearn.model_selection import train_test_split
import pandas as pd
from azureml.core.run import Run
from azureml.data.dataset_factory import TabularDatasetFactory

# Load data
ds = TabularDatasetFactory.from_delimited_files(
    "https://raw.githubusercontent.com/GowthamiWudaru/heart-Disease-Prediction-With-Azure/main/heartDisease.csv"
)

df = ds.to_pandas_dataframe()
y = df['num']
x = df.drop(['num'], axis=1)
x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=0.3,
                                                    random_state=42)

run = Run.get_context()


def main():
    # Add arguments to script
    parser = argparse.ArgumentParser()
Example #12
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

import pandas as pd
from azureml.core.run import Run
from azureml.data.dataset_factory import TabularDatasetFactory

from interpret.ext.blackbox import TabularExplainer
from azureml.interpret import ExplanationClient

OUTPUT_DIR = './outputs/'
os.makedirs(OUTPUT_DIR, exist_ok=True)
run = Run.get_context()
client = ExplanationClient.from_run(run)

ds = TabularDatasetFactory.from_delimited_files("https://raw.githubusercontent.com/ketcx/ml-ops-exercise/master/data/hotel_bookings_clean.csv", validate=True, include_path=False, infer_column_types=True, set_column_types=None, separator=',', header=True, partition_format=None, support_multi_line=False, empty_as_string=False) 

def transform(dataframe):
    
    le = LabelEncoder()  # label encoder from sklearn

    # Select all categorical features
    categorical_features = list(dataframe.columns[dataframe.dtypes == object])
    
    # Apply Label Encoding on all categorical features
    return dataframe[categorical_features].apply(lambda x: le.fit_transform(x))
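Used on the hotel-bookings dataset loaded above, the helper would be exercised roughly like this (a sketch; the original call site is not part of the excerpt):

df = ds.to_pandas_dataframe()
encoded_categoricals = transform(df)
print(encoded_categoricals.head())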

def clean_data(data):
    """
    Clean the dataset: drop unwanted columns and one-hot encode the rest.
    """
    # Drop unnecessary columns
    unwantedcolumnlist = ["gender", "MultipleLines", "PaymentMethod", "tenure"]

    # Convert the TabularDataset to a pandas dataframe before cleaning
    df = data.to_pandas_dataframe()
    df = df.drop(unwantedcolumnlist, axis=1)

    #Let's convert all the categorical variables into dummy variables
    df_dummies = pd.get_dummies(df)
    y = df_dummies[['Churn']]
    x = df_dummies.drop(columns=['Churn'])

    return x, y


ds = TabularDatasetFactory.from_delimited_files(
    path=
    "https://raw.githubusercontent.com/arfa-t/ML-Nanodegree/main/Capstone%20Project/churn.csv"
)

x, y = clean_data(ds)
#train test split
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=0.30,
                                                    shuffle=True)
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.svm import SVC
import argparse
import joblib
Example #14
import os
import numpy as np
from sklearn.metrics import auc, roc_curve
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
from azureml.core.run import Run
from azureml.data.dataset_factory import TabularDatasetFactory

path = "https://www.kaggle.com/andrewmvd/heart-failure-clinical-data/download"
#this
ds = TabularDatasetFactory.from_delimited_files(path,
                                                validate=True,
                                                include_path=False,
                                                infer_column_types=True,
                                                set_column_types=None,
                                                separator=',',
                                                header=True,
                                                partition_format=None)

df = ds.to_pandas_dataframe()
X, y = df.loc[:, ~df.columns.isin(["DEATH_EVENT"])], df.loc[:, "DEATH_EVENT"]
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.3,
                                                    random_state=123)
run = Run.get_context()


def main():
    parser = argparse.ArgumentParser()
Example #15
    x_df = x_df.join(education)
    x_df["month"] = x_df.month.map(months)
    x_df["day_of_week"] = x_df.day_of_week.map(weekdays)
    x_df["poutcome"] = x_df.poutcome.apply(lambda s: 1
                                           if s == "success" else 0)

    y_df = x_df.pop("y").apply(lambda s: 1 if s == "yes" else 0)

    return x_df, y_df


path = "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"
ds = TabularDatasetFactory.from_delimited_files(path,
                                                validate=True,
                                                include_path=False,
                                                infer_column_types=True,
                                                set_column_types=None,
                                                separator=',',
                                                header=True,
                                                partition_format=None,
                                                support_multi_line=False,
                                                empty_as_string=False,
                                                encoding='utf8')

x, y = clean_data(ds)

# TODO: Split data into train and test sets.

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33)

run = Run.get_context()

Example #16
import argparse
import os
import numpy as np
from sklearn.metrics import mean_squared_error
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
from azureml.core.run import Run
from azureml.data.dataset_factory import TabularDatasetFactory

# TODO: Create TabularDataset using TabularDatasetFactory
# Data is located at:
# "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"

ds = TabularDatasetFactory.from_delimited_files("https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv",validate=True, include_path=False, infer_column_types=True, set_column_types=None, separator=',', header=True, partition_format=None, support_multi_line=False, empty_as_string=False)

def clean_data(data):
    # Dict for cleaning data
    months = {"jan":1, "feb":2, "mar":3, "apr":4, "may":5, "jun":6, "jul":7, "aug":8, "sep":9, "oct":10, "nov":11, "dec":12}
    weekdays = {"mon":1, "tue":2, "wed":3, "thu":4, "fri":5, "sat":6, "sun":7}

    # Clean and one hot encode data
    x_df = data.to_pandas_dataframe().dropna()
    jobs = pd.get_dummies(x_df.job, prefix="job")
    x_df.drop("job", inplace=True, axis=1)
    x_df = x_df.join(jobs)
    x_df["marital"] = x_df.marital.apply(lambda s: 1 if s == "married" else 0)
    x_df["default"] = x_df.default.apply(lambda s: 1 if s == "yes" else 0)
    x_df["housing"] = x_df.housing.apply(lambda s: 1 if s == "yes" else 0)
    x_df["loan"] = x_df.loan.apply(lambda s: 1 if s == "yes" else 0)
import os
import numpy as np
from sklearn.metrics import mean_squared_error
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
from azureml.core.run import Run
from azureml.data.dataset_factory import TabularDatasetFactory

# TODO: Create TabularDataset using TabularDatasetFactory
# Data is located at:
# "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"

path_to_data = "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"
ds = TabularDatasetFactory.from_delimited_files(path=path_to_data)

run = Run.get_context()

def clean_data(data):
    # Dict for cleaning data
    months = {"jan":1, "feb":2, "mar":3, "apr":4, "may":5, "jun":6, "jul":7, "aug":8, "sep":9, "oct":10, "nov":11, "dec":12}
    weekdays = {"mon":1, "tue":2, "wed":3, "thu":4, "fri":5, "sat":6, "sun":7}

    # Clean and one hot encode data
    x_df = data.to_pandas_dataframe().dropna()
    jobs = pd.get_dummies(x_df.job, prefix="job")
    x_df.drop("job", inplace=True, axis=1)
    x_df = x_df.join(jobs)
    x_df["marital"] = x_df.marital.apply(lambda s: 1 if s == "married" else 0)
    x_df["default"] = x_df.default.apply(lambda s: 1 if s == "yes" else 0)
Example #18
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

# Azure imports e.g., Workspace,...
from azureml.core.run import Run
from azureml.data.dataset_factory import TabularDatasetFactory

# VRK: Create TabularDataset using TabularDatasetFactory

# create nsl-kdd network train data
nsl_kdd_webpath = [
                      'https://raw.githubusercontent.com/venkataravikumaralladi/AzureMLCapstoneProject/main/input/KDDTrain.csv'
                  ]

#create network analysis data set in tabular format using TabularDatasetFactory
nsl_kdd_dataset = TabularDatasetFactory.from_delimited_files(path=nsl_kdd_webpath)

class NSLKDDFeatureAnalysis:
    # class variables
    network_data_column_names = [
        'duration', 'protocol_type', 'service',
        'flag', 'src_bytes', 'dst_bytes',
        'land', 'wrong_fragment', 'urgent',
        'hot', 'num_failed_logins', 'logged_in',
        'num_compromised', 'root_shell', 'su_attempted',
        'num_root', 'num_file_creations', 'num_shells',
        'num_access_files', 'num_outbound_cmds', 'is_hot_login',
        'is_guest_login',
Example #19
from sklearn.linear_model import LogisticRegression
import argparse
import os
import numpy as np
from sklearn.metrics import mean_squared_error
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
from azureml.core.run import Run
from azureml.data.dataset_factory import TabularDatasetFactory

data_loc = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00537/sobar-72.csv'
ds = TabularDatasetFactory.from_delimited_files(data_loc)

#Save model for current iteration

run = Run.get_context()

x_df = ds.to_pandas_dataframe().dropna()

y_df = x_df.pop('ca_cervix')

# TODO: Split data into train and test sets.
x_train, x_test, y_train, y_test = train_test_split(x_df,
                                                    y_df,
                                                    test_size=0.2,
                                                    random_state=0)


def main():
Example #20
from sklearn.linear_model import LogisticRegression
import argparse
import os
import numpy as np
from sklearn.metrics import mean_squared_error
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
from azureml.core.run import Run
from azureml.data.dataset_factory import TabularDatasetFactory

web_path = "https://raw.githubusercontent.com/ciccioska/Udacity-AzureML-Capstone-Project/master/healthcare-dataset-stroke-data.csv"
ds = TabularDatasetFactory.from_delimited_files(web_path)

x_df = ds.to_pandas_dataframe().dropna()

x_df["work_type"] = x_df.work_type.apply(lambda s: 1 if s == "Private" else 0
                                         if s == "Self-employed" else -1)
x_df["bmi"] = x_df.bmi.apply(lambda s: 0 if s == "N/A" else s)
x_df["gender"] = x_df.gender.apply(lambda s: 1 if s == "Male" else 0)
x_df["Residence_type"] = x_df.Residence_type.apply(lambda s: 1
                                                   if s == "Urban" else 0)
x_df["smoking_status"] = x_df.smoking_status.apply(
    lambda s: 1 if s == "smokes" else -1 if s == "never smoked" else 2
    if s == "formerly smoked" else 0)

print(x_df)

y_df = x_df.pop("stroke")
Example #21
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
from azureml.core.run import Run
from azureml.data.dataset_factory import TabularDatasetFactory


run = Run.get_context()

# TODO: Create TabularDataset using TabularDatasetFactory
# Data is located at:
# "https://archive.ics.uci.edu/ml/machine-learning-databases/00519/heart_failure_clinical_records_dataset.csv"

ds = TabularDatasetFactory.from_delimited_files("https://archive.ics.uci.edu/ml/machine-learning-databases/00519/heart_failure_clinical_records_dataset.csv")


def clean_data(data):

    x_df = data.to_pandas_dataframe()

    y_df = x_df.pop("DEATH_EVENT")

    return (x_df, y_df)


x, y = clean_data(ds)

# TODO: Split data into train and test sets.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=0)
Example #22
    def _create_tabular(self, parameters, validate):
        source_type = self._json_utility.try_get_value(
            parameters, self._prop_source_type, None,
            lambda v: v in self._valid_source_types,
            'Property "{}" must be one of {}.'.format(
                self._prop_source_type, self._valid_source_types))

        if source_type == 'sql_query':
            query = self._get_query(parameters)
            if is_dataprep_installed():
                return TabularDatasetFactory.from_sql_query(query, validate)
            return self._create_dataset_from_blocks(
                [_Block.craft_read_sql_block(query)], TabularDataset)

        path = self._get_path(parameters)
        include_path = self._json_utility.try_get_value(
            parameters, self._prop_include_path, self._default_include_path)
        partition_format = self._json_utility.try_get_value(
            parameters, self._prop_partition_format, None)

        if source_type == 'parquet_files':
            if is_dataprep_installed():
                return TabularDatasetFactory.from_parquet_files(
                    path,
                    validate,
                    include_path,
                    partition_format=partition_format)
            return self._create_dataset_from_blocks([
                _Block.craft_get_file_block(path),
                _Block.craft_read_parquet_block(),
                _Block.craft_partition_format_block(partition_format)
                if partition_format else None,
                _Block.craft_drop_path_column_block()
                if not include_path else None
            ], TabularDataset)

        if source_type == 'json_lines_files':
            if is_dataprep_installed():
                return TabularDatasetFactory.from_json_lines_files(
                    path,
                    validate,
                    include_path,
                    partition_format=partition_format)
            return self._create_dataset_from_blocks([
                _Block.craft_get_file_block(path),
                _Block.craft_read_json_lines_block(),
                _Block.craft_partition_format_block(partition_format)
                if partition_format else None,
                _Block.craft_drop_path_column_block()
                if not include_path else None
            ], TabularDataset)

        if source_type == 'delimited_files':
            infer_column_types = self._json_utility.try_get_value(
                parameters, self._prop_infer_column_types,
                self._default_infer_column_types)
            separator = self._json_utility.try_get_value(
                parameters, self._prop_separator, self._default_separator)
            header = self._json_utility.try_get_value(parameters,
                                                      self._prop_header,
                                                      self._default_header)
            if is_dataprep_installed():
                return TabularDatasetFactory.from_delimited_files(
                    path,
                    validate,
                    include_path=include_path,
                    partition_format=partition_format,
                    infer_column_types=infer_column_types,
                    separator=separator,
                    header=header)
            if infer_column_types:
                _raise_dataprep_missing_error(
                    'Cannot infer column types',
                    self._error_utility.get_error_message(
                        'setting {} to false'.format(
                            self._prop_infer_column_types)))
            return self._create_dataset_from_blocks([
                _Block.craft_get_file_block(path),
                _Block.craft_read_delimited_block(separator, header),
                _Block.craft_partition_format_block(partition_format)
                if partition_format else None,
                _Block.craft_drop_path_column_block()
                if not include_path else None
            ], TabularDataset)

        raise RuntimeError('Unexpected code path for source_type: ' +
                           source_type)
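The dispatch above ends in the public factory methods; called directly, the equivalent one-liners look like this (the URLs are placeholders):

from azureml.data.dataset_factory import TabularDatasetFactory

# Each call returns a TabularDataset; the paths below are placeholders.
csv_ds = TabularDatasetFactory.from_delimited_files(
    "https://example.com/data.csv", separator=',', header=True)
parquet_ds = TabularDatasetFactory.from_parquet_files("https://example.com/data.parquet")
jsonl_ds = TabularDatasetFactory.from_json_lines_files("https://example.com/data.jsonl")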
Example #23
    # Remove any incomplete entries from the dataset.
    df.dropna(inplace=True)
    indices_to_keep = ~df.isin([np.nan, np.inf, -np.inf]).any(axis=1)
    x_df = df[indices_to_keep]

    y_df = x_df.pop("result")

    x_df = x_df.reset_index(drop=True)

    return x_df, y_df


# Import player data CSV
ds = TabularDatasetFactory.from_delimited_files(
    path=
    "https://oracleselixir-downloadable-match-data.s3-us-west-2.amazonaws.com/2020_LoL_esports_match_data_from_OraclesElixir_20210126.csv"
)

player_data = ds.to_pandas_dataframe()

# Process dataframe
x, y = clean_data(player_data)

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

run = Run.get_context()


def main():
    # Add arguments
    parser = argparse.ArgumentParser()
Example #24
    education = pd.get_dummies(x_df.education, prefix="education")
    x_df.drop("education", inplace=True, axis=1)
    x_df = x_df.join(education)
    x_df["month"] = x_df.month.map(months)
    x_df["day_of_week"] = x_df.day_of_week.map(weekdays)
    x_df["poutcome"] = x_df.poutcome.apply(lambda s: 1 if s == "success" else 0)

    y_df = x_df.pop("y").apply(lambda s: 1 if s == "yes" else 0)
    return x_df, y_df

# TODO: Create TabularDataset using TabularDatasetFactory
# Data is located at:
# "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"
data_path= "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"

ds = TabularDatasetFactory.from_delimited_files(path=data_path, separator=",")

X, y = clean_data(ds)

# TODO: Split data into train and test sets.

### YOUR CODE HERE ###

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

run = Run.get_context()


    

def main():
Example #25
from sklearn.linear_model import LogisticRegression
import argparse
import os
import numpy as np
from sklearn.metrics import mean_squared_error
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
from azureml.core.run import Run
from azureml.data.dataset_factory import TabularDatasetFactory

dataset_path = "https://raw.githubusercontent.com/fati-ma/nd00333-capstone/master/heart_failure_clinical_records_dataset%5B1%5D.csv"

ds = TabularDatasetFactory.from_delimited_files(path=dataset_path)

x_df = ds.to_pandas_dataframe().dropna()

run = Run.get_context()

y_df = x_df.pop("DEATH_EVENT")

x_train, x_test, y_train, y_test = train_test_split(x_df,
                                                    y_df,
                                                    test_size=0.3,
                                                    random_state=123)


def main():
    # Add arguments to script
    parser = argparse.ArgumentParser()
import os
import numpy as np
from sklearn.metrics import mean_squared_error
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
from azureml.core.run import Run
from azureml.data.dataset_factory import TabularDatasetFactory

# TODO: Create TabularDataset using TabularDatasetFactory
# Data is located at:
# "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"
DATA_URL = "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"

ds = TabularDatasetFactory.from_delimited_files(DATA_URL)


def clean_data(data):
    # Dict for cleaning data
    months = {
        "jan": 1,
        "feb": 2,
        "mar": 3,
        "apr": 4,
        "may": 5,
        "jun": 6,
        "jul": 7,
        "aug": 8,
        "sep": 9,
        "oct": 10,
Example #27
from sklearn.linear_model import LogisticRegression
import argparse
import os
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import pandas as pd
from azureml.core.run import Run
from azureml.data.dataset_factory import TabularDatasetFactory

url = "https://raw.githubusercontent.com/benbot8/azure_capstone/f54d4da8ea6c2cd5c9082b6703271c25d5f16b40/starter_file/train.csv"

df = TabularDatasetFactory.from_delimited_files(path=url)

#df = pd.read_csv("train.csv")


def clean_data(data):
    #clean data and convert categorical to indicator variables
    x_df = data.to_pandas_dataframe().dropna()
    x_df.reset_index(drop=True, inplace=True)
    x_df.drop(['state', 'account_length', 'area_code'], axis=1, inplace=True)
    x_df['international_plan'] = x_df.international_plan.apply(
        lambda s: 1 if s == "yes" else 0)
    x_df['voice_mail_plan'] = x_df.voice_mail_plan.apply(lambda s: 1
                                                         if s == "yes" else 0)
    x_df['churn'] = x_df.churn.apply(lambda s: 1 if s == "yes" else 0)
    x_df.rename(columns={"churn": "y"}, inplace=True)
    y_df = x_df.pop("y")
    return x_df, y_df
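The churn excerpt ends with clean_data; a plausible continuation (not shown in the snippet, split ratio assumed) would clean and split the data using the imports already present:

x, y = clean_data(df)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)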
# Azure imports e.g., Workspace,...
from azureml.core.run import Run
from azureml.data.dataset_factory import TabularDatasetFactory

# VRK: Create TabularDataset using TabularDatasetFactory
# Data is located at:
# "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"

# create bank marketing tabular dataset from public azure ml.
bankmarketing_webpath = [
    'https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv'
]

#create bankmarketing data set in tabular format using TabularDatasetFactory
bankmarketing_dataset = TabularDatasetFactory.from_delimited_files(
    path=bankmarketing_webpath)


def clean_data(data):
    # Dict for cleaning data
    months = {
        "jan": 1,
        "feb": 2,
        "mar": 3,
        "apr": 4,
        "may": 5,
        "jun": 6,
        "jul": 7,
        "aug": 8,
        "sep": 9,
        "oct": 10,
Example #29
    x_df["day_of_week"] = x_df.day_of_week.map(weekdays)
    x_df["poutcome"] = x_df.poutcome.apply(lambda s: 1
                                           if s == "success" else 0)

    y_df = x_df.pop("y").apply(lambda s: 1 if s == "yes" else 0)
    return (x_df, y_df)  # add the missing return (features and target)


# TODO: Create TabularDataset using TabularDatasetFactory
# Data is located at:
# "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"

#creating a tabularDataset
ds = TabularDatasetFactory.from_delimited_files(
    'https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv'
)  ### YOUR CODE HERE ###
#cleaning data
x, y = clean_data(ds)

# TODO: Split data into train and test sets.
#splitting data --> 80-20
x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=42)

run = Run.get_context()


def main():
Example #30
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from azureml.core.run import Run
import argparse
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from azureml.data.dataset_factory import TabularDatasetFactory

#-----------------------------------------------------------------------------
#Import data (out of Azure):
#data = pp.load_data('https://www.openml.org/data/get_csv/16826755/phpMYEkMl')

#Import data (Within Azure):
data = TabularDatasetFactory.from_delimited_files(
    'https://www.openml.org/data/get_csv/16826755/phpMYEkMl')
dataset = data.to_pandas_dataframe()


#-----------------------------------------------------------------------------
#Clean Data
#Define clean data function
def clean_data(df):

    df = pp.replace_question_marks(df)

    df['cabin'] = df['cabin'].apply(pp.get_first_cabin)

    df['title'] = df['name'].apply(pp.get_title)

    # cast numerical variables as floats