def get_cleaned_dataset(ws):
    found = False
    ds_key = "machine-cpu"
    description_text = "CPU performance dataset (UCI)."

    # Reuse the dataset if it is already registered in the workspace
    if ds_key in ws.datasets.keys():
        found = True
        ds_cleaned = ws.datasets[ds_key]

    # Otherwise, create it from the file
    if not found:
        with zipfile.ZipFile("./data/machine.zip", "r") as zip_ref:
            zip_ref.extractall("data")

        # Read the extracted CSV file into a DataFrame
        data = pd.read_csv('./data/machine.csv')

        # DataFrame with cleaned data
        cleaned_data = clean_data(data)
        exported_df = 'cleaned-machine-cpu.parquet'
        cleaned_data.to_parquet(exported_df)

        # Register the dataset in the workspace using the experimental functionality
        # that uploads and registers a pandas DataFrame in one step
        ds_cleaned = TabularDatasetFactory.register_pandas_dataframe(
            dataframe=cleaned_data,
            target=(ws.get_default_datastore(), exported_df),
            name=ds_key,
            description=description_text,
            show_progress=True)

    return ds_cleaned
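# A minimal usage sketch for get_cleaned_dataset, assuming an Azure ML config.json
# is available locally and ./data/machine.zip exists; the call either reuses the
# registered "machine-cpu" dataset or creates and registers it.
from azureml.core import Workspace

ws = Workspace.from_config()
cpu_dataset = get_cleaned_dataset(ws)
print(cpu_dataset.to_pandas_dataframe().head())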
def infer_forecasting_dataset_tcn(X_test, y_test, model, output_path, output_dataset_name="results"):
    # Produce forecasts; df_all contains the input rows joined with the predictions
    y_pred, df_all = model.forecast(X_test, y_test)

    run = Run.get_context()

    # Register the prediction results as a tabular dataset in the default datastore,
    # using a date prefix plus a short random suffix as the target folder name
    registered_train = TabularDatasetFactory.register_pandas_dataframe(
        df_all,
        target=(
            run.experiment.workspace.get_default_datastore(),
            datetime.now().strftime("%Y-%m-%d-") + str(uuid.uuid4())[:6],
        ),
        name=output_dataset_name,
    )

    # Also write the results to a local CSV file
    df_all.to_csv(os.path.join(output_path, output_dataset_name + ".csv"), index=False)
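# An illustrative call to infer_forecasting_dataset_tcn inside an Azure ML run.
# It assumes a fitted forecasting model exposing .forecast() has already been
# loaded and that X_test / y_test are prepared; the variable names fitted_model,
# X_test, y_test and the output path are assumptions, not part of the original code.
import os

output_path = "./outputs/predictions"
os.makedirs(output_path, exist_ok=True)
infer_forecasting_dataset_tcn(X_test, y_test, fitted_model, output_path, output_dataset_name="tcn-results")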
def create_dataset(ws):
    # Download the raw CSV from Kaggle (the file arrives as a zip archive)
    kaggle_api.dataset_download_file('divg07/malware-analysis-dataset', 'data.csv')
    data = pd.read_csv('./data.csv.zip', compression='zip', sep='|')

    # Clean dataset
    data = clean_data(data)

    # Register the dataset in the workspace's default datastore
    datastore = Datastore(ws)
    name = "Malware Dataset"
    description_text = "Malware DataSet for Udacity Capstone Project"
    dataset = TabularDatasetFactory.register_pandas_dataframe(data, datastore, name, description=description_text)

    return dataset
def main():
    run = Run.get_context()
    ws = run.experiment.workspace

    found = False
    key = "wine-quality"
    description_text = "Wine Quality Dataset for Udacity Course 3"

    # Reuse the registered dataset if it already exists in the workspace
    if key in ws.datasets.keys():
        found = True
        input_data = ws.datasets[key]
        features = input_data.to_pandas_dataframe()

    if not found:
        # Create AML Dataset and register it into the Workspace
        url_white = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv"
        url_red = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"
        white_data = TabularDatasetFactory.from_delimited_files(url_white, separator=";")
        red_data = TabularDatasetFactory.from_delimited_files(url_red, separator=";")
        features, target = clean_data(white_data, red_data)
        features.loc[:, "quality"] = target

        ds = ws.get_default_datastore()
        input_data = TabularDatasetFactory.register_pandas_dataframe(
            dataframe=features,
            target=ds,
            name=key,
            description=description_text)

    # Split the label off the features and encode it as integers
    target = features.pop("quality")
    target = target.replace({"BAD": -1, "MEDIUM": 0, "GOOD": 1})
    x_train, x_test, y_train, y_test = train_test_split(features, target, random_state=0)

    # Add arguments to script
    parser = argparse.ArgumentParser()
    parser.add_argument("--max_depth", type=int, default=6, help="Maximum depth of a tree")
    parser.add_argument("--alpha", type=float, default=0, help="L1 regularization term on weights")
    parser.add_argument("--learning_rate", type=float, default=0.1, help="Learning rate")
    parser.add_argument("--gamma", type=float, default=0.0, help="Minimum loss reduction required to make a split")
    args = parser.parse_args()

    # Log the hyperparameters for this run
    run.log("Max Depth:", int(args.max_depth))
    run.log("Alpha:", float(args.alpha))
    run.log("Learning rate:", float(args.learning_rate))
    run.log("Gamma:", float(args.gamma))

    model = XGBClassifier(booster="gbtree",
                          objective="multi:softmax",
                          subsample=0.8,
                          tree_method="auto",
                          n_estimators=500,
                          max_depth=args.max_depth,
                          reg_alpha=args.alpha,
                          learning_rate=args.learning_rate,
                          gamma=args.gamma)
    model.fit(x_train, y_train)

    # Weighted one-vs-rest AUC needs class probabilities, not hard predictions
    y_pred = model.predict_proba(x_test)
    auc = roc_auc_score(y_test, y_pred, average="weighted", multi_class="ovr", labels=model.classes_)

    os.makedirs("./outputs", exist_ok=True)
    joblib.dump(model, filename="./outputs/wine-quality-model.pkl")
    run.log("AUC_weighted", float(auc))
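# Assuming main() above is the entry point of a standalone training script
# (e.g. one submitted to Azure ML via ScriptRunConfig), the usual guard below
# runs it when the file is executed directly.
if __name__ == "__main__":
    main()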
import pandas as pd

# In the original dataset, values and labels were split into separate files;
# here we combine them again into a single pandas DataFrame
original_data = pd.read_csv('data/train_values.csv')
original_data_labels = pd.read_csv('data/train_labels.csv')
original_data['rate_spread'] = original_data_labels['rate_spread']

print(f"{len(original_data)} total rows")
print(original_data.sample(10))

# %%
# Upload dataset to Azure
uncleaned_dataset_name = "UncleanedMortgageSpread"
print(f"Uploading uncleaned dataset to {uncleaned_dataset_name}...")
datastore = ws.get_default_datastore()
registered_set = TabularDatasetFactory.register_pandas_dataframe(original_data, datastore, uncleaned_dataset_name)
print("Done")

# %%
print("Loading cleaned data sets...")
import zipfile
import io

dataset_zip = zipfile.ZipFile("data/cleanedEngineeredData.zip", "r")
engineered_data = pd.read_csv(io.BytesIO(dataset_zip.read("train_cleaned.csv")))

# %%
print(f"{len(engineered_data)} total rows")
print(engineered_data.columns)
print(engineered_data.sample(10))
def register_dataset_to_store(ws, df, name):
    datastore = Datastore.get_default(ws)
    TabularDatasetFactory.register_pandas_dataframe(df, datastore, name=name)
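# A minimal usage sketch for register_dataset_to_store, assuming an Azure ML
# config.json is available locally; the DataFrame contents and dataset name
# below are illustrative only.
from azureml.core import Workspace
import pandas as pd

ws = Workspace.from_config()
sample_df = pd.DataFrame({"feature": [1, 2, 3], "label": [0, 1, 0]})
register_dataset_to_store(ws, sample_df, name="sample-dataset")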
def create_DDoS_datasets(ws):
    # Column names for the source CSV; the keys of this mapping drive the column
    # selection in read_csv below
    dtypes = {
        'Src IP': 'category', 'Src Port': 'uint16', 'Dst IP': 'category', 'Dst Port': 'uint16',
        'Protocol': 'category', 'Flow Duration': 'uint32', 'Tot Fwd Pkts': 'uint32', 'Tot Bwd Pkts': 'uint32',
        'TotLen Fwd Pkts': 'float32', 'TotLen Bwd Pkts': 'float32',
        'Fwd Pkt Len Max': 'float32', 'Fwd Pkt Len Min': 'float32', 'Fwd Pkt Len Mean': 'float32', 'Fwd Pkt Len Std': 'float32',
        'Bwd Pkt Len Max': 'float32', 'Bwd Pkt Len Min': 'float32', 'Bwd Pkt Len Mean': 'float32', 'Bwd Pkt Len Std': 'float32',
        'Flow Byts/s': 'float32', 'Flow Pkts/s': 'float32',
        'Flow IAT Mean': 'float32', 'Flow IAT Std': 'float32', 'Flow IAT Max': 'float32', 'Flow IAT Min': 'float32',
        'Fwd IAT Tot': 'float32', 'Fwd IAT Mean': 'float32', 'Fwd IAT Std': 'float32', 'Fwd IAT Max': 'float32', 'Fwd IAT Min': 'float32',
        'Bwd IAT Tot': 'float32', 'Bwd IAT Mean': 'float32', 'Bwd IAT Std': 'float32', 'Bwd IAT Max': 'float32', 'Bwd IAT Min': 'float32',
        'Fwd PSH Flags': 'category', 'Bwd PSH Flags': 'category', 'Fwd URG Flags': 'category', 'Bwd URG Flags': 'category',
        'Fwd Header Len': 'uint32', 'Bwd Header Len': 'uint32', 'Fwd Pkts/s': 'float32', 'Bwd Pkts/s': 'float32',
        'Pkt Len Min': 'float32', 'Pkt Len Max': 'float32', 'Pkt Len Mean': 'float32', 'Pkt Len Std': 'float32', 'Pkt Len Var': 'float32',
        'FIN Flag Cnt': 'category', 'SYN Flag Cnt': 'category', 'RST Flag Cnt': 'category', 'PSH Flag Cnt': 'category',
        'ACK Flag Cnt': 'category', 'URG Flag Cnt': 'category', 'CWE Flag Count': 'category', 'ECE Flag Cnt': 'category',
        'Down/Up Ratio': 'float32', 'Pkt Size Avg': 'float32', 'Fwd Seg Size Avg': 'float32', 'Bwd Seg Size Avg': 'float32',
        'Fwd Byts/b Avg': 'uint32', 'Fwd Pkts/b Avg': 'uint32', 'Fwd Blk Rate Avg': 'uint32',
        'Bwd Byts/b Avg': 'uint32', 'Bwd Pkts/b Avg': 'uint32', 'Bwd Blk Rate Avg': 'uint32',
        'Subflow Fwd Pkts': 'uint32', 'Subflow Fwd Byts': 'uint32', 'Subflow Bwd Pkts': 'uint32', 'Subflow Bwd Byts': 'uint32',
        'Init Fwd Win Byts': 'uint32', 'Init Bwd Win Byts': 'uint32', 'Fwd Act Data Pkts': 'uint32', 'Fwd Seg Size Min': 'uint32',
        'Active Mean': 'float32', 'Active Std': 'float32', 'Active Max': 'float32', 'Active Min': 'float32',
        'Idle Mean': 'float32', 'Idle Std': 'float32', 'Idle Max': 'float32', 'Idle Min': 'float32',
        'Label': 'category'
    }

    data = pd.read_csv(
        './final_dataset.csv',
        parse_dates=['Timestamp'],
        usecols=[*dtypes.keys(), 'Timestamp'],
        engine='c',
        low_memory=True,
        na_values=np.inf
    )

    # The original dataset has over 12 million rows, which is far too slow to work
    # with for this project, so randomly sample only 0.5% of the data
    data = data.sample(frac=0.005)

    # Register the base dataset in the workspace
    datastore = Datastore(ws)
    name = "DDoS Dataset"
    description_text = "DDoS DataSet for Udacity Capstone Project"
    dataset = TabularDatasetFactory.register_pandas_dataframe(data, datastore, name, description=description_text)

    # Clean the dataset and register the clean version
    cleaned_data = clean_data(data)
    clean_dataset_name = "Clean DDoS Dataset"
    clean_description_text = description_text + " that has been cleaned"
    clean_dataset = TabularDatasetFactory.register_pandas_dataframe(cleaned_data, datastore, clean_dataset_name, description=clean_description_text)

    return dataset, clean_dataset