Example #1
def main():
    # Add arguments to script
    parser = argparse.ArgumentParser()

    parser.add_argument('--C', type=float, default=1.0, help="Inverse of regularization strength. Smaller values cause stronger regularization")
    parser.add_argument('--max_iter', type=int, default=100, help="Maximum number of iterations to converge")

    args = parser.parse_args()
    
    # Fetch the data
    url = "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"
    ds = TabularDatasetFactory.from_delimited_files(url)

    # Separate features and target
    x, y = clean_data(ds)

    # Split data into train and test sets.
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

    # Setup the run
    run = Run.get_context()
    run.log("Regularization Strength:", np.float(args.C))
    run.log("Max iterations:", np.int(args.max_iter))

    # Setup the model
    model = LogisticRegression(C=args.C, max_iter=args.max_iter, solver='liblinear').fit(x_train, y_train)

    # Log the accuracy
    accuracy = model.score(x_test, y_test)
    run.log("Accuracy", np.float(accuracy))
    
    # Save model
    os.makedirs('outputs', exist_ok=True)
    joblib.dump(model, 'outputs/model.pkl')
Example #2
def main():
    # Add arguments to script
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--C',
        type=float,
        default=1.0,
        help=
        "Inverse of regularization strength. Smaller values cause stronger regularization"
    )
    parser.add_argument('--max_iter',
                        type=int,
                        default=100,
                        help="Maximum number of iterations to converge")
    args = parser.parse_args()
    ds = TabularDatasetFactory.from_delimited_files(path=URL)  # URL is assumed to be defined elsewhere in the script
    x, y = split_data(ds)
    X_train, X_test, y_train, y_test = train_test_split(x,
                                                        y,
                                                        test_size=0.3,
                                                        random_state=42)
    run = Run.get_context()
    run.log("Regularization Strength:", np.float(args.C))
    run.log("Max iterations:", np.int(args.max_iter))
    model = LogisticRegression(C=args.C,
                               max_iter=args.max_iter).fit(X_train, y_train)
    os.makedirs('outputs', exist_ok=True)
    joblib.dump(model, 'outputs/model.joblib')
    accuracy = model.score(X_test, y_test)
    run.log("Accuracy", np.float(accuracy))
Example #3
def main():
    
    data_url = "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"
    ds = TabularDatasetFactory.from_delimited_files(data_url)

    run = Run.get_context()
    
    x, y = clean_data(ds)

    # TODO: Split data into train and test sets.
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.30, random_state=0)
    
    # Add arguments to script
    parser = argparse.ArgumentParser()

    parser.add_argument('--C', type=float, default=1.0, help="Inverse of regularization strength. Smaller values cause stronger regularization")
    parser.add_argument('--max_iter', type=int, default=100, help="Maximum number of iterations to converge")

    args = parser.parse_args()

    run.log("Regularization Strength:", np.float(args.C))
    run.log("Max iterations:", np.int(args.max_iter))

    model = LogisticRegression(C=args.C, max_iter=args.max_iter).fit(x_train, y_train)

    accuracy = model.score(x_test, y_test)
    run.log("Accuracy", np.float(accuracy))
Example #4
def main():
    
    url = 'https://raw.githubusercontent.com/AnshuTrivedi/Capstone-Project---Azure-Machine-Learning-Engineer/main/mobile_sales_data.csv'
    data = TabularDatasetFactory.from_delimited_files(url)
    x = data.to_pandas_dataframe()
    y = x.pop("price_range")    
    
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

    parser = argparse.ArgumentParser()

    parser.add_argument('--C', type=float, default=1.0, help="Inverse of regularization strength. Smaller values cause stronger regularization")
    parser.add_argument('--max_iter', type=int, default=100, help="Maximum number of iterations to converge")
    parser.add_argument('--solver', type=str, default='lbfgs', help="chose the algorithm to train the model")

    args = parser.parse_args()

    run.log("Regularization Strength:", np.float(args.C))
    run.log("Max iterations:", np.int(args.max_iter))
    run.log("Algorithm: ", args.solver)

    model = LogisticRegression(solver=args.solver, C=args.C, max_iter=args.max_iter).fit(x_train, y_train)

    accuracy = model.score(x_test, y_test)
    run.log("Accuracy", np.float(accuracy))

    os.makedirs('outputs', exist_ok=True)
    joblib.dump(model,'outputs/model.joblib')
Example #5
def main():
    # Add arguments to the script
    parser = argparse.ArgumentParser()

    parser.add_argument('--C', type=float, default=1.0, help="Inverse of regularization strength. Smaller values cause stronger regularization")
    parser.add_argument('--max_iter', type=int, default=100, help="Maximum number of iterations to converge")

    args = parser.parse_args()
    
    # Create TabularDataset using TabularDatasetFactory
    # Data is located at:
   
    path_file="https://gist.githubusercontent.com/Nwaneto/0d1477bd10c92f8b16ab19306d21a17f/raw/0af3078c0d174e26039ab31525487ceaceda77b0/parkinson-classification-data.csv

    ds =TabularDatasetFactory.from_delimited_files(path=path_file)

    x, y = clean_data(ds)

    # Split data into train and test sets.

    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.30)

    run = Run.get_context()

    run.log("Regularization Strength:", np.float(args.C))
    run.log("Max iterations:", np.int(args.max_iter))

    model = LogisticRegression(C=args.C, max_iter=args.max_iter).fit(x_train, y_train)

    accuracy = model.score(x_test, y_test)
    run.log("Accuracy", np.float(accuracy))
Example #6
def main():
    # Add arguments to script
    parser = argparse.ArgumentParser()

    parser.add_argument('--C', type=float, default=1.0, help="Inverse of regularization strength. Smaller values cause stronger regularization")
    parser.add_argument('--max_iter', type=int, default=100, help="Maximum number of iterations to converge")

    args = parser.parse_args()

    run.log("Regularization Strength:", np.float(args.C))
    run.log("Max iterations:", np.int(args.max_iter))

    data_path = "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"

    ds = TabularDatasetFactory.from_delimited_files(path=data_path)

    x, y = get_labels_and_data(ds)

    x_train, x_test, y_train, y_test = train_test_split(x,y)

    model = LogisticRegression(C=args.C, max_iter=args.max_iter).fit(x_train, y_train)

    os.makedirs('./outputs', exist_ok=True)
    joblib.dump(model, './outputs/model.joblib')

    accuracy = model.score(x_test, y_test)
    run.log("Accuracy", np.float(accuracy))
def get_cleaned_dataset(ws):
    found = False
    ds_key = "machine-cpu"
    description_text = "CPU performance dataset (UCI)."

    if ds_key in ws.datasets.keys():
        found = True
        ds_cleaned = ws.datasets[ds_key]

    # Otherwise, create it from the file
    if not found:

        with zipfile.ZipFile("./data/machine.zip", "r") as zip_ref:
            zip_ref.extractall("data")

        # Read the CSV file into a DataFrame
        data = pd.read_csv('./data/machine.csv')
        # DataFrame with cleaned data
        cleaned_data = clean_data(data)
        exported_df = 'cleaned-machine-cpu.parquet'
        cleaned_data.to_parquet(exported_df)
        # Register Dataset in Workspace using the experimental functionality to upload and register a pandas dataframe in one step
        ds_cleaned = TabularDatasetFactory.register_pandas_dataframe(
            dataframe=cleaned_data,
            target=(ws.get_default_datastore(), exported_df),
            name=ds_key,
            description=description_text,
            show_progress=True)
    return ds_cleaned
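The helper above looks the dataset up by key in the workspace and only registers it when it is missing. A minimal usage sketch, assuming azureml-core is installed, a local config.json describes the workspace, ./data/machine.zip is present, and clean_data is the project's own helper (the lines below are not part of the original snippet):

from azureml.core import Workspace

ws = Workspace.from_config()      # loads the workspace from config.json
cpu_ds = get_cleaned_dataset(ws)  # registers the dataset on first call, reuses it afterwards
print(cpu_ds.to_pandas_dataframe().head())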
Example #8
def main():

    # TODO: Create TabularDataset using TabularDatasetFactory
    ds = TabularDatasetFactory.from_delimited_files(path=csv_path)  # csv_path is assumed to be defined elsewhere in the script

    # Call clean_data to preprocess the dataset
    x, y = clean_data(ds)

    # TODO: Split data into train and test sets.
    x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.8, random_state=42)

    run = Run.get_context()

    # Add arguments to script
    parser = argparse.ArgumentParser()

    parser.add_argument('--C', type=float, default=1.0, help="Inverse of regularization strength. Smaller values cause stronger regularization")
    parser.add_argument('--max_iter', type=int, default=100, help="Maximum number of iterations to converge")

    args = parser.parse_args()

    run.log("Regularization Strength:", np.float(args.C))
    run.log("Max iterations:", np.int(args.max_iter))

    model = LogisticRegression(C=args.C, max_iter=args.max_iter).fit(x_train, y_train)

    accuracy = model.score(x_test, y_test)
    run.log("Accuracy", np.float(accuracy))
Example #9
def main():
    # Add arguments to script
    parser = argparse.ArgumentParser()

    parser.add_argument('--C', type=float, default=1.0, help="Inverse of regularization strength. Smaller values cause stronger regularization")
    parser.add_argument('--max_iter', type=int, default=100, help="Maximum number of iterations to converge")

    args = parser.parse_args()

    run.log("Regularization Strength:", np.float(args.C))
    run.log("Max iterations:", np.int(args.max_iter))

    ds = TabularDatasetFactory.from_delimited_files(path="https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv")
    x, y = clean_data(ds)

    # TODO: Split data into train and test sets.

    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
    model = LogisticRegression(C=args.C, max_iter=args.max_iter).fit(x_train, y_train)

    accuracy = model.score(x_test, y_test)
    run.log("Accuracy", np.float(accuracy))
    os.makedirs('outputs', exist_ok=True)
    # files saved in the "outputs" folder are automatically uploaded into run history
    joblib.dump(model, 'outputs/model.joblib')
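Because anything written to the "outputs" folder is uploaded with the run, the saved file can later be registered as a model straight from the run object. A small follow-up sketch, assuming the run has completed and that the model name is chosen freely (this is not part of the original script):

# Hypothetical follow-up, e.g. from the notebook that submitted the run
# (or from the best run of a hyperparameter sweep):
registered = run.register_model(model_name='bankmarketing-logit',
                                model_path='outputs/model.joblib')
print(registered.name, registered.version)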
Example #10
def main():
    # Add arguments to script
    parser = argparse.ArgumentParser()

    parser.add_argument('--C', type=float, default=1.0, help="Inverse of regularization strength. Smaller values cause stronger regularization")
    parser.add_argument('--max_iter', type=int, default=100, help="Maximum number of iterations to converge")

    args = parser.parse_args()

    run.log("Regularization Strength:", np.float(args.C))
    run.log("Max iterations:", np.int(args.max_iter))

    # TODO: Create TabularDataset using TabularDatasetFactory
    # Data is located at:
    # "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"
    ### YOUR CODE HERE ###
    
    ## Added code to import TabularDataset using the TabularDatasetFactory class
    factory = TabularDatasetFactory()
    train_data_path = "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"
    ds = factory.from_delimited_files(path=train_data_path)

    # Clean the data
    x, y = clean_data(ds)

    # TODO: Split data into train and test sets.
    ### YOUR CODE HERE ###
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

    # Fit model using Logistic Regression with input argument C -> regularization strength
    # Importance of Regularization in Logistic regression: 
    # https://stackoverflow.com/questions/22851316/what-is-the-inverse-of-regularization-strength-in-logistic-regression-how-shoul
    # https://www.coursera.org/lecture/machine-learning/regularized-logistic-regression-4BHEy
    
    model = LogisticRegression(C=args.C, max_iter=args.max_iter).fit(x_train, y_train)

    # Save the model using the joblib library
    os.makedirs('outputs', exist_ok=True)
    joblib.dump(model, 'outputs/model.joblib')
    
    accuracy = model.score(x_test, y_test)
    run.log("Accuracy", np.float(accuracy))
def main():
    # Add arguments to script
    parser = argparse.ArgumentParser()

    parser.add_argument(
        '--C',
        type=float,
        default=1.0,
        help=
        "Inverse of regularization strength. Smaller values cause stronger regularization"
    )
    parser.add_argument('--max_iter',
                        type=int,
                        default=100,
                        help="Maximum number of iterations to converge")

    args = parser.parse_args()

    run.log("Regularization Strength:", np.float(args.C))
    run.log("Max iterations:", np.int(args.max_iter))

    path = "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"
    factory = TabularDatasetFactory()
    ds = factory.from_delimited_files(path)

    X, y = clean_data(ds)

    # Split data into train and test sets.

    train_data, test_data, train_label, test_label = train_test_split(
        X, y, test_size=0.3, random_state=42)

    model = LogisticRegression(C=args.C, max_iter=args.max_iter).fit(
        train_data, train_label)

    accuracy = model.score(test_data, test_label)
    run.log("Accuracy", np.float(accuracy))

    os.makedirs('outputs', exist_ok=True)

    # Save model
    joblib.dump(model, 'outputs/model.joblib')
Example #12
def main():
    # Add arguments to script
    parser = argparse.ArgumentParser()

    parser.add_argument(
        '--C',
        type=float,
        default=1.0,
        help=
        "Inverse of regularization strength. Smaller values cause stronger regularization"
    )
    parser.add_argument('--max_iter',
                        type=int,
                        default=100,
                        help="Maximum number of iterations to converge")

    args = parser.parse_args()

    run = Run.get_context()
    run.log("Regularization Strength:", np.float(args.C))
    run.log("Max iterations:", np.int(args.max_iter))

    # Create TabularDataset using TabularDatasetFactory
    # from web url:
    # "https://raw.githubusercontent.com/atan4583/datasets/master/train.csv"

    wurl = 'https://raw.githubusercontent.com/atan4583/datasets/master/train.csv'
    ds = TabularDatasetFactory.from_delimited_files(wurl)

    x, y = clean_data(ds)
    print(
        f'x null chk: \n{x.isnull().sum()}\n \ny null chk: \n{y.isnull().sum()}\n'
    )
    # Split data into train and test sets.
    x_train, x_test, y_train, y_test = train_test_split(x,
                                                        y,
                                                        stratify=y,
                                                        random_state=42)
    print(
        f'x_train null chk: \n{x_train.isnull().sum()}\n \ny_train null chk: \n{y_train.isnull().sum()}\n'
    )
    print(
        f'x_test null chk: \n{x_test.isnull().sum()}\n \ny_test null chk: \n{y_test.isnull().sum()}\n'
    )

    model = LogisticRegression(C=args.C,
                               max_iter=args.max_iter).fit(x_train, y_train)

    accuracy = model.score(x_test, y_test)
    run.log("Accuracy", np.float(accuracy))

    os.makedirs('outputs', exist_ok=True)
    joblib.dump(model, 'outputs/model.pkl')
def main():

    parser = argparse.ArgumentParser()

    parser.add_argument('--n_estimators',
                        type=int,
                        default=100,
                        help="Number of trees in the forest")
    parser.add_argument(
        '--min_samples_split',
        type=int,
        default=2,
        help="Minimum number of samples required to split an internal node")
    parser.add_argument('--max_features',
                        type=str,
                        default='sqrt',
                        help="{'sqrt', 'log2'}; 'sqrt' matches the old 'auto' default, which newer scikit-learn versions reject")
    parser.add_argument('--bootstrap',
                        # bool() would treat any non-empty string (including "False") as True,
                        # so parse the flag value explicitly
                        type=lambda s: str(s).lower() in ('true', '1', 'yes'),
                        default=True,
                        help="Whether bootstrap samples are used or not")

    args = parser.parse_args()

    ds = TabularDatasetFactory.from_delimited_files(path=web_path)  # web_path is assumed to be defined elsewhere in the script

    x, y = split_data(ds)

    X_train, X_test, y_train, y_test = train_test_split(x,
                                                        y,
                                                        test_size=0.3,
                                                        random_state=42)

    run = Run.get_context()

    run.log("No of Estimators:", np.int(args.n_estimators))
    run.log("Min No of Samples to Split:", np.int(args.min_samples_split))
    run.log("No of Features Considered:", np.str(args.max_features))
    run.log("Bootstrap:", np.bool(args.bootstrap))

    model = RandomForestClassifier(n_estimators=args.n_estimators,
                                   min_samples_split=args.min_samples_split,
                                   bootstrap=args.bootstrap,
                                   max_features=args.max_features).fit(
                                       X_train, y_train)

    accuracy = model.score(X_test, y_test)
    run.log("Accuracy", np.float(accuracy))

    os.makedirs('outputs', exist_ok=True)
    joblib.dump(value=model, filename='outputs/model.pkl')
Example #14
def main():
    # Add arguments to script
    parser = argparse.ArgumentParser()

    parser.add_argument(
        '--C',
        type=float,
        default=1.0,
        help=
        "Inverse of regularization strength. Smaller values cause stronger regularization"
    )
    parser.add_argument('--max_iter',
                        type=int,
                        default=100,
                        help="Maximum number of iterations to converge")

    args = parser.parse_args()
    run.log("Regularization Strength:", np.float(args.C))
    run.log("Max iterations:", np.int(args.max_iter))

    factory = TabularDatasetFactory()
    train_data_path = "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"
    valid_data_path = "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_validate.csv"

    train_ds = factory.from_delimited_files(train_data_path)
    valid_ds = factory.from_delimited_files(valid_data_path)

    X_train, y_train = clean_data(train_ds)
    X_valid, y_valid = clean_data(valid_ds)

    model = LogisticRegression(C=args.C,
                               max_iter=args.max_iter).fit(X_train, y_train)

    accuracy = model.score(X_valid, y_valid)
    run.log("Accuracy", np.float(accuracy))
    os.makedirs('outputs', exist_ok=True)

    joblib.dump(model, 'outputs/bankmarketing-logit-model.joblib')
def main():
    # Add arguments to script
    parser = argparse.ArgumentParser()

    parser.add_argument('--C', type=float, default=1.0, help="Inverse of regularization strength. Smaller values cause stronger regularization")
    parser.add_argument('--max_iter', type=int, default=100, help="Maximum number of iterations to converge")

    args = parser.parse_args()

    # 1. Create TabularDataset using TabularDatasetFactory
    # Data is located at:
    # "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"
    # Useful reference:
    # https://docs.microsoft.com/en-us/python/api/azureml-core/azureml.data.dataset_factory.tabulardatasetfactory
    raw_data_url = "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"
    ds = TabularDatasetFactory.from_delimited_files(raw_data_url)
    x, y = clean_data(ds)

    # 2. Split data into train and test sets.
    # Useful reference which explains how this works and can guide parameter choice:
    # https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=8, stratify=y)
    # Note: Using the stratify parameter ensures that the split contains the same distribution of target values in the
    # training and test sets as the proportion of values in the entire dataset when the target data for the entire
    # dataset (in this case: y) is passed to the stratify parameter.

    # Useful reference:
    # https://docs.microsoft.com/en-us/python/api/azureml-core/azureml.core.run(class)?view=azure-ml-py
    run = Run.get_context()

    run.log("Regularization Strength:", np.float(args.C))
    run.log("Max iterations:", np.int(args.max_iter))

    # Note: It may also be worth investigating the 'class_weight' parameter in the LogisticRegression model to deal with
    # the dataset imbalance:
    # https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
    model = LogisticRegression(C=args.C, max_iter=args.max_iter).fit(x_train, y_train)

    # Note: could be worth trying out a different performance metric, e.g. AUC, due to dataset imbalance
    # (88% target outputs: 'no')
    accuracy = model.score(x_test, y_test)
    run.log("Accuracy", np.float(accuracy))

    # Save the model
    # See these links for useful information:
    # https://knowledge.udacity.com/questions/424266
    # https://www.kaggle.com/pankaj1234/azure-machine-learning-model-training
    # https://towardsdatascience.com/azure-machine-learning-service-train-a-model-df72c6b5dc
    os.makedirs("outputs", exist_ok=True)  # Precautionary, creation should be automatic
    joblib.dump(value=model, filename="./outputs/my_model.joblib")
def main():
    # Add arguments to script
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "--C",
        type=float,
        default=1.0,
        help=
        "Inverse of regularization strength. Smaller values cause stronger regularization",
    )
    parser.add_argument(
        "--max_iter",
        type=int,
        default=100,
        help="Maximum number of iterations to converge",
    )

    args = parser.parse_args()
    run.log("Regularization Strength:", np.float(args.C))
    run.log("Max iterations:", np.int(args.max_iter))

    ds = TabularDatasetFactory().from_delimited_files(
        path=
        "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"
    )

    x, y = clean_data(ds)

    # Split data into train and test sets.

    x_train, x_test, y_train, y_test = train_test_split(x,
                                                        y,
                                                        test_size=0.33,
                                                        random_state=7)

    model = LogisticRegression(C=args.C,
                               max_iter=args.max_iter).fit(x_train, y_train)

    accuracy = model.score(x_test, y_test)
    run.log("Accuracy", np.float(accuracy))

    pred_prob = model.predict_proba(x_test)
    auc_score = roc_auc_score(y_test, pred_prob[:, 1], average="weighted")
    run.log("AUC", np.float(auc_score))

    # files saved in the "outputs" folder are automatically uploaded into run history
    os.makedirs("outputs", exist_ok=True)
    joblib.dump(model, "./outputs/model.joblib")
def main():
    # Add arguments to script
    parser = argparse.ArgumentParser()

    parser.add_argument(
        '--C',
        type=float,
        default=1.0,
        help=
        "Inverse of regularization strength. Smaller values cause stronger regularization"
    )
    parser.add_argument('--max_iter',
                        type=int,
                        default=100,
                        help="Maximum number of iterations to converge")

    args = parser.parse_args()

    run = Run.get_context()

    run.log("Regularization Strength:", np.float(args.C))
    run.log("Max iterations:", np.int(args.max_iter))

    # Create TabularDataset using TabularDatasetFactory
    # https://docs.microsoft.com/en-us/python/api/azureml-core/azureml.data.dataset_factory.tabulardatasetfactory?view=azure-ml-py
    dataset_path = "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"
    ds = TabularDatasetFactory.from_delimited_files(path=dataset_path)

    x, y = clean_data(ds)

    # Split data into train and test sets.
    # https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html
    x_train, x_test, y_train, y_test = train_test_split(x,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=53)

    # https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
    model = LogisticRegression(C=args.C,
                               max_iter=args.max_iter,
                               solver='lbfgs').fit(x_train, y_train)

    accuracy = model.score(x_test, y_test)
    run.log("Accuracy", np.float(accuracy))

    # save model
    os.makedirs('outputs', exist_ok=True)
    joblib.dump(model, 'outputs/model.joblib')
def main():
    # Add arguments to script
    parser = argparse.ArgumentParser()

    parser.add_argument(
        '--C',
        type=float,
        default=1.0,
        help=
        "Inverse of regularization strength. Smaller values cause stronger regularization"
    )
    parser.add_argument('--max_iter',
                        type=int,
                        default=100,
                        help="Maximum number of iterations to converge")

    args = parser.parse_args()

    # TODO: Create TabularDataset using TabularDatasetFactory
    # Data is located at:
    # "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"

    ### YOUR CODE HERE ###
    ds = TabularDatasetFactory.from_delimited_files(
        path=
        'https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv'
    )

    x, y = clean_data(ds)

    # TODO: Split data into train and test sets.
    ### YOUR CODE HERE ###
    x_train, x_test, y_train, y_test = train_test_split(x, y)

    run = Run.get_context()

    run.log("Regularization Strength:", np.float(args.C))
    run.log("Max iterations:", np.int(args.max_iter))

    model = LogisticRegression(C=args.C,
                               max_iter=args.max_iter).fit(x_train, y_train)

    accuracy = model.score(x_test, y_test)
    run.log("Accuracy", np.float(accuracy))

    os.makedirs('outputs', exist_ok=True)
    joblib.dump(model, 'outputs/model.joblib')
def main():
    # Add arguments to script
    parser = argparse.ArgumentParser()

    parser.add_argument(
        '--C',
        type=float,
        default=1.0,
        help=
        "Inverse of regularization strength. Smaller values cause stronger regularization"
    )
    parser.add_argument('--max_iter',
                        type=int,
                        default=100,
                        help="Maximum number of iterations to converge")

    args = parser.parse_args()

    run = Run.get_context()

    run.log("Regularization Strength:", np.float(args.C))
    run.log("Max iterations:", np.int(args.max_iter))

    # TODO: Create TabularDataset using TabularDatasetFactory
    # Data is located at:
    # "https://archive.ics.uci.edu/ml/machine-learning-databases/00451/dataR2.csv"

    path_train = "https://archive.ics.uci.edu/ml/machine-learning-databases/00451/dataR2.csv"
    ds = TabularDatasetFactory.from_delimited_files(path=path_train)
    data = ds.to_pandas_dataframe().dropna()
    y = data['Classification']
    x = data
    x.drop("Classification", inplace=True, axis=1)
    # TODO: Split data into train and test sets.

    ### YOUR CODE HERE ###
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.261)

    model = LogisticRegression(C=args.C,
                               max_iter=args.max_iter).fit(x_train, y_train)

    predictions = model.predict(x_test)
    avg_prec_sc = average_precision_score(y_test,
                                          predictions,
                                          average='weighted')
    run.log("average_precision_score_weighted", np.float(avg_prec_sc))
Example #20
def main():
    # TODO: Create TabularDataset using TabularDatasetFactory
    # Data is located at:
    # "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"
    datapath = "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"
    ds = TabularDatasetFactory.from_delimited_files(datapath)

    x, y = clean_data(ds)

    # TODO: Split data into train and test sets.
    x_train, x_test, y_train, y_test = train_test_split(x, y)

    run = Run.get_context()
    # Add arguments to script
    parser = argparse.ArgumentParser()

    parser.add_argument(
        '--C',
        type=float,
        default=1.0,
        help=
        "Inverse of regularization strength. Smaller values cause stronger regularization"
    )
    parser.add_argument('--max_iter',
                        type=int,
                        default=100,
                        help="Maximum number of iterations to converge")

    args = parser.parse_args()

    run.log("Regularization Strength:", np.float(args.C))
    run.log("Max iterations:", np.int(args.max_iter))

    model = LogisticRegression(C=args.C,
                               max_iter=args.max_iter).fit(x_train, y_train)

    accuracy = model.score(x_test, y_test)
    run.log("Accuracy", np.float(accuracy))

    # Save model
    filename = "bankmarketing_model.pkl"
    output_dir = './outputs/model'
    os.makedirs(output_dir, exist_ok=True)
    full_path = os.path.join(output_dir, filename)
    joblib.dump(value=model, filename=full_path)
    print("model saved in {}".format(full_path))
Example #21
def main():

    url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00451/dataR2.csv'
    data = TabularDatasetFactory.from_delimited_files(url)
    x = data.to_pandas_dataframe()
    y = x.pop("Classification")

    x_train, x_test, y_train, y_test = train_test_split(x,
                                                        y,
                                                        test_size=0.33,
                                                        random_state=200)

    parser = argparse.ArgumentParser()

    parser.add_argument(
        '--C',
        type=float,
        default=1.0,
        help=
        "Inverse of regularization strength. Smaller values cause stronger regularization"
    )
    parser.add_argument('--max_iter',
                        type=int,
                        default=100,
                        help="Maximum number of iterations to converge")
    parser.add_argument('--solver',
                        type=str,
                        default='lbfgs',
                        help="chose the algorithm to train the model")

    args = parser.parse_args()

    run.log("Regularization Strength:", np.float(args.C))
    run.log("Max iterations:", np.int(args.max_iter))
    run.log("Algorithm: ", args.solver)

    model = LogisticRegression(solver=args.solver,
                               C=args.C,
                               max_iter=args.max_iter).fit(x_train, y_train)

    pred_prob = model.predict_proba(x_test)
    AUC = roc_auc_score(y_test, pred_prob[:, 1])
    run.log("AUC", np.float(AUC))

    os.makedirs('outputs', exist_ok=True)
    joblib.dump(model, 'outputs/model.joblib')
def main():
    url_path = "https://raw.githubusercontent.com/maulingogri/Azure-Udacity-MLE-ND-Capstone/master/data/heart_failure_clinical_records_dataset.csv"
    ds = TabularDatasetFactory.from_delimited_files(path=url_path)

    # Split data into train and score sets
    # train, score = ds.random_split(percentage=0.75, seed=121)

    x, y = clean_data(ds)

    # TODO: Split data into train and test sets.
    x_train, x_test, y_train, y_test = train_test_split(x,
                                                        y,
                                                        test_size=0.25,
                                                        random_state=121)

    run = Run.get_context()

    # Add arguments to script
    parser = argparse.ArgumentParser()

    parser.add_argument(
        '--C',
        type=float,
        default=1.0,
        help=
        "Inverse of regularization strength. Smaller values cause stronger regularization"
    )
    parser.add_argument('--max_iter',
                        type=int,
                        default=100,
                        help="Maximum number of iterations to converge")

    args = parser.parse_args()

    run.log("Regularization Strength:", np.float(args.C))
    run.log("Max iterations:", np.int(args.max_iter))

    model = LogisticRegression(C=args.C,
                               max_iter=args.max_iter).fit(x_train, y_train)
    os.makedirs('outputs', exist_ok=True)
    joblib.dump(model, os.path.join('outputs', 'hd_model.joblib'))

    accuracy = model.score(x_test, y_test)
    run.log("Accuracy", np.float(accuracy))
Example #23
def main():
    # Add arguments to script
    parser = argparse.ArgumentParser()

    parser.add_argument(
        '--C',
        type=float,
        default=1.0,
        help=
        "Inverse of regularization strength. Smaller values cause stronger regularization"
    )
    parser.add_argument('--max_iter',
                        type=int,
                        default=100,
                        help="Maximum number of iterations to converge")

    args = parser.parse_args()

    run = Run.get_context()

    run.log("Regularization Strength:", np.float(args.C))
    run.log("Max iterations:", np.int(args.max_iter))

    #Create TabularDataset using TabularDatasetFactory
    # Data is located at:
    # https://archive.ics.uci.edu/ml/machine-learning-databases/00383/risk_factors_cervical_cancer.csv

    path_train = "https://archive.ics.uci.edu/ml/machine-learning-databases/00383/risk_factors_cervical_cancer.csv"
    ds = TabularDatasetFactory.from_delimited_files(path=path_train)

    x, y = clean_data(ds)

    #Split data into train and test sets.
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25)

    model = LogisticRegression(C=args.C,
                               max_iter=args.max_iter).fit(x_train, y_train)

    predictions = model.predict(x_test)
    avg_prec_sc = average_precision_score(y_test,
                                          predictions,
                                          average='weighted')
    run.log("average_precision_score_weighted", np.float(avg_prec_sc))
Example #24
def main():

    url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00519/heart_failure_clinical_records_dataset.csv'
    data = TabularDatasetFactory.from_delimited_files(url)
    x = data.to_pandas_dataframe()
    y = x.pop("DEATH_EVENT")

    x_train, x_test, y_train, y_test = train_test_split(x, y)

    parser = argparse.ArgumentParser()

    parser.add_argument(
        '--C',
        type=float,
        default=1.0,
        help=
        "Inverse of regularization strength. Smaller values cause stronger regularization"
    )
    parser.add_argument('--max_iter',
                        type=int,
                        default=100,
                        help="Maximum number of iterations to converge")
    parser.add_argument('--solver',
                        type=str,
                        default='lbfgs',
                        help="chose the algorithm to train the model")

    args = parser.parse_args()

    run.log("Regularization Strength:", np.float(args.C))
    run.log("Max iterations:", np.int(args.max_iter))
    run.log("Algorithm: ", args.solver)

    model = LogisticRegression(solver=args.solver,
                               C=args.C,
                               max_iter=args.max_iter).fit(x_train, y_train)

    accuracy = model.score(x_test, y_test)
    run.log("Accuracy", np.float(accuracy))

    os.makedirs('outputs', exist_ok=True)
    joblib.dump(model, 'outputs/model.joblib')
Example #25
def main():
    # Add arguments to script
    parser = argparse.ArgumentParser()

    parser.add_argument(
        '--C',
        type=float,
        default=1.0,
        help=
        "Inverse of regularization strength. Smaller values cause stronger regularization"
    )
    parser.add_argument('--max_iter',
                        type=int,
                        default=100,
                        help="Maximum number of iterations to converge")

    args = parser.parse_args()

    run = Run.get_context()

    run.log("Regularization Strength:", np.float(args.C))
    run.log("Max iterations:", np.int(args.max_iter))

    web_path = "https://raw.githubusercontent.com/MonishkaDas/nd00333-capstone/master/starter_file/cardio_train.csv"

    ds = TabularDatasetFactory.from_delimited_files(path=web_path,
                                                    separator=";")

    x, y = clean_data(ds)

    #Split data into train and test sets.
    x_train, x_test, y_train, y_test = train_test_split(x,
                                                        y,
                                                        test_size=0.30,
                                                        random_state=42,
                                                        shuffle=True)

    model = LogisticRegression(C=args.C,
                               max_iter=args.max_iter).fit(x_train, y_train)

    accuracy = model.score(x_test, y_test)
    run.log("Accuracy", np.float(accuracy))
def infer_forecasting_dataset_tcn(X_test,
                                  y_test,
                                  model,
                                  output_path,
                                  output_dataset_name="results"):

    y_pred, df_all = model.forecast(X_test, y_test)

    run = Run.get_context()

    registered_train = TabularDatasetFactory.register_pandas_dataframe(
        df_all,
        target=(
            run.experiment.workspace.get_default_datastore(),
            datetime.now().strftime("%Y-%m-%d-") + str(uuid.uuid4())[:6],
        ),
        name=output_dataset_name,
    )
    df_all.to_csv(os.path.join(output_path, output_dataset_name + ".csv"),
                  index=False)
Example #27
def main():
    # Add arguments to the script
    parser = argparse.ArgumentParser()

    parser.add_argument(
        '--C',
        type=float,
        default=1.0,
        help=
        "Inverse of regularization strength. Smaller values cause stronger regularization"
    )
    parser.add_argument('--max_iter',
                        type=int,
                        default=100,
                        help="Maximum number of iterations to converge")

    args = parser.parse_args()

    # Create TabularDataset using TabularDatasetFactory
    # Data is located at:

    path_file = "https://raw.githubusercontent.com/hananeouhammouch/Parkinsons-detection/master/parkinsons.data"

    ds = TabularDatasetFactory.from_delimited_files(path=path_file)

    x, y = clean_data(ds)

    # Split data into train and test sets.

    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.30)

    run = Run.get_context()

    run.log("Regularization Strength:", np.float(args.C))
    run.log("Max iterations:", np.int(args.max_iter))

    model = LogisticRegression(C=args.C,
                               max_iter=args.max_iter).fit(x_train, y_train)

    accuracy = model.score(x_test, y_test)
    run.log("Accuracy", np.float(accuracy))
Example #28
def create_dataset(ws):  
    kaggle_api.dataset_download_file('divg07/malware-analysis-dataset', 'data.csv')  # kaggle_api is assumed to be an authenticated Kaggle API client defined elsewhere

    data = pd.read_csv(
            './data.csv.zip',
            compression='zip',
            sep='|'
        )

    # Clean dataset 
    data = clean_data(data)

    # Register Dataset in Workspace
    datastore = Datastore(ws)
    name = "Malware Dataset"
    description_text = "Malware DataSet for Udacity Capstone Project"
    dataset = TabularDatasetFactory.register_pandas_dataframe(data,
                               datastore,
                               name,
                               description=description_text)
    
    return dataset
def dataset_register_tabular(args):
    """
    Register a tabular dataset into the workspace
    """
    workspace = package_utils.get_workspace()

    datastore_path, target_path = datastore_upload_files(args)

    kwargs = {"path": datastore_path, "set_column_types": DATA_TYPES}
    logger.info(msg="TabularDatasetFactory.from_delimited_files",
                extra={"kwargs": kwargs})
    if not args.dry_run:
        tabular = TabularDatasetFactory.from_delimited_files(**kwargs)

    kwargs = {
        "workspace": workspace,
        "name": target_path,
        "create_new_version": False
    }
    logger.info(msg="tabular.register", extra={"kwargs": kwargs})
    if not args.dry_run:
        _ = tabular.register(**kwargs)
Example #30
def main():

    dataset_path = "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"
    ds = TabularDatasetFactory.from_delimited_files(dataset_path)

    x, y = clean_data(ds)

    run = Run.get_context()

    # TODO: Split data into train and test sets.
    ### YOUR CODE HERE ###
    x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=625, shuffle=True)


    # Add arguments to script
    # gets the arguments for regularization strength and max iterations
    parser = argparse.ArgumentParser()

    parser.add_argument('--C', type=float, default=1.0, help="Inverse of regularization strength. Smaller values cause stronger regularization")
    parser.add_argument('--max_iter', type=int, default=100, help="Maximum number of iterations to converge")
    parser.add_argument('--solver', type=str, default='lbfgs', help="chose the algorithm to train the model")

    args = parser.parse_args()

    run.log("Regularization Strength:", np.float(args.C))
    run.log("Max iterations:", np.int(args.max_iter))

    model = LogisticRegression(solver=args.solver, C=args.C, max_iter=args.max_iter).fit(x_train, y_train)

    accuracy = model.score(x_test, y_test)
    run.log("Accuracy", np.float(accuracy))
    
    os.makedirs('outputs', exist_ok=True)
    joblib.dump(model, 'outputs/model.joblib')


if __name__ == '__main__':
    main()