def get_project_key(con: h2o.Client, project_name: str) -> str:
    """
    Return the key of the project whose name equals ``project_name``.

    Projects are listed page by page until a match is found or a page comes
    back shorter than the page size, so a project is still found when the
    server holds more projects than fit in one page. If no project has that
    name, a new one is created (``project_name`` is passed for both of
    ``create_project``'s arguments) and the value returned by
    ``create_project`` is handed back.

    :param con: Client to H2O Driverless AI
    :param project_name: Name of the project
    :return: Key of the matching project, or of the newly created one
    """
    page_size = 1000
    offset = 0
    while True:
        page = list(con.list_projects(offset=offset, limit=page_size))
        for project in page:
            if project.name == project_name:
                return project.key
        # A short (or empty) page means there is nothing left to list.
        if len(page) < page_size:
            break
        offset += page_size
    return con.create_project(project_name, project_name)
def connect(dai_url: str = "http://IPADDRESS:12345",
            dai_user: str = "******",
            dai_pwd: str = "Password"):
    """
    Build a Driverless AI ``Client`` from the given login info.

    Called without arguments it behaves as before and uses the placeholder
    values below; pass real values instead of editing this function.

    :param dai_url: Address of the Driverless AI server
    :param dai_user: User name to log in with
    :param dai_pwd: Password to log in with
    :return: The ``Client`` constructed from the three values
    """
    return Client(dai_url, dai_user, dai_pwd)
def upload_dataset_to_project(con: h2o.Client, project_key: str, dataset_file: str, dataset_type: str):
    """
    Make sure the file at ``dataset_file`` is linked to the project and
    return the key of the corresponding dataset.

    The datasets of the given type already linked to the project are compared
    by name against the base name of ``dataset_file``. On the first match that
    dataset's key is returned and nothing is uploaded. Otherwise the file is
    uploaded, linked to the project under ``dataset_type`` and the new
    dataset's key is returned.

    :param con: Connection to H2O Driverless AI
    :param project_key: Key of the project to link the dataset to
    :param dataset_file: File path of the dataset to upload and link to project
    :param dataset_type: Either 'Training' or 'Testing'
    :return: dataset_key
    """
    wanted_name = os.path.basename(dataset_file)

    # Reuse a dataset that is already linked under the same file name.
    for linked in con.get_datasets_for_project(project_key, dataset_type):
        if linked.name == wanted_name:
            return linked.key

    uploaded = con.upload_dataset_sync(file_path=dataset_file)
    con.link_dataset_to_project(
        project_key=project_key,
        dataset_key=uploaded.key,
        dataset_type=dataset_type,
    )
    return uploaded.key
## Set up all of the training experiments
import h2oai_client
import numpy as np
import pandas as pd
import requests
import math
from h2oai_client import Client, ModelParameters, InterpretParameters

# Connection used by every helper below.
address = 'http://52.90.67.220:12345'
username = '******'
password = '******'
h2oai = Client(address=address, username=username, password=password)


def splitTrainingData(dataPath, basename, target, ratio=0.8, time=''):
    """
    Create a dataset from ``dataPath`` and split it into two datasets.

    The outputs are named ``<basename>_train`` and ``<basename>_test``.
    The value returned by ``make_dataset_split`` is passed to
    ``get_dataset_split_job`` and that job's ``entity`` is returned, matching
    the other definition of this helper in the file.

    :param dataPath: Path handed to ``create_dataset_sync``
    :param basename: Prefix for the two output dataset names
    :param target: Target column passed to the split
    :param ratio: Split ratio passed to the split (default 0.8)
    :param time: Time column passed to the split (default: empty string)
    :return: ``entity`` of the split job; index 0 is train, index 1 is test
    """
    data = h2oai.create_dataset_sync(dataPath)

    # Split the data
    split_data = h2oai.make_dataset_split(
        dataset_key=data.key,
        output_name1=basename + '_train',
        output_name2=basename + '_test',
        target=target,
        fold_col='',
        time_col=time,
        ratio=ratio,
    )

    # key[0] is train, key[1] is test
    key = h2oai.get_dataset_split_job(split_data).entity
    return key
def test_debug_pyclient():
    """
    Exercise a custom transformer end to end against a Driverless AI server.

    Steps: connect, reuse or upload the data set, upload the custom
    transformer recipe, run a short experiment with the listed transformers
    excluded, print the experiment's feature summary, run a batch
    fit-transform and print the head of the transformed training and
    validation frames, then delete everything that was downloaded.

    Takes no arguments and returns nothing. Calls ``sys.exit()`` when the
    recipe upload reports failure.
    """
    # Imported locally so the exit and clean-up calls below do not depend on
    # module-level imports being present.
    import os
    import shutil
    import sys
    import zipfile

    from h2oai_client import Client

    pd.set_option('display.max_rows', 50)
    pd.set_option('display.max_columns', 500)
    pd.set_option('display.width', 1000)

    # Login info
    dai_url = "http://****:12345"
    dai_user = "******"
    dai_pwd = "****"

    # Data Information
    data_file_name = "****.csv"
    y = "****"

    # Transformers information
    transformer_file_name = "****.py"

    # All Official Transformers
    transformers_noncustom = [
        'CVCatNumEncode', 'CVTargetEncode',
        'CatOriginalTransformer', 'ClusterDistTransformer',
        'ClusterIdTransformer', 'ClusterTETransformer', 'DatesTransformer',
        'EwmaLagsTransformer', 'FrequentTransformer', 'InteractionsTransformer',
        'IsHolidayTransformer', 'LagsAggregatesTransformer', 'LagsInteractionTransformer',
        'LagsTransformer', 'LexiLabelEncoder', 'NumCatTETransformer', 'NumToCatTETransformer',
        'NumToCatWoEMonotonicTransformer', 'NumToCatWoETransformer', 'OneHotEncodingTransformer',
        'OriginalTransformer', 'SortedLETransformer', 'StrFeatureTransformer', 'TextClustDistTransformer',
        'TextClustTETransformer', 'TextLinModelTransformer', 'TextTransformer', 'TruncSVDNumTransformer',
        'WeightOfEvidenceTransformer',
    ]

    # Any Installed Custom Transformers you don't want to test
    transformers_custom_nontesting = ['MyLogTransformer']

    all_nontest_transformers = transformers_noncustom + transformers_custom_nontesting

    # STEP ZERO: Connect to Driverless AI
    h2oai = Client(dai_url, dai_user, dai_pwd)

    # STEP ONE: Load data set (and related tasks)
    # view all data sets in DAI
    existing_data_sets = list(h2oai.list_datasets(0, 100))
    all_data_sets = pd.DataFrame({
        'key': [d.key for d in existing_data_sets],
        'name': [d.name for d in existing_data_sets],
    })
    print("PRE-LOADED DATASETS:")
    print(all_data_sets)

    # check if data was pre-loaded - if so use that data set - if not load data
    # The listing is searched directly: a label lookup ``[0]`` on the filtered
    # frame only works when the match happens to sit at row label 0, and a
    # numpy membership test on an empty column triggers a type-mismatch
    # warning.
    data_key = next(
        (d.key for d in existing_data_sets if d.name == data_file_name), None
    )
    if data_key is not None:
        print("\nData already loaded ", data_file_name)
        data_load_job = h2oai.get_dataset_job(data_key).entity
    else:
        print("\nLoading file ", data_file_name)
        data_load_job = h2oai.upload_dataset_sync(data_file_name)
        data_key = data_load_job.key

    # STEP TWO: Load custom transformer (and related tasks)
    # probably not good to just upload every time
    # no function to delete from python, only from ssh-ing in
    # rm tmp/contrib/transformers/[function]_randomletters_content.py
    print("\nUploading Transformer ", transformer_file_name)
    my_transformer = h2oai.upload_custom_recipe_sync(transformer_file_name)

    # returns true or false - exit if fails - check DAI UI for error message
    if my_transformer:
        print("\nTransformer uploaded successfully\n")
    else:
        print("\nTransformer uploaded failed, exiting program.\n")
        sys.exit()

    # STEP THREE: Run experiment (and related tasks)
    print("\nStarting Experiment\n")
    experiment = h2oai.start_experiment_sync(
        dataset_key=data_key,
        target_col=y,
        is_classification=True,
        accuracy=1,
        time=1,
        interpretability=10,
        scorer="F1",
        score_f_name=None,
        # One override per line; the exclusion list is rendered from the
        # Python list of transformer names.
        config_overrides="""
        feature_brain_level=0
        exclude_transformers={dont_use}
        """.format(dont_use=all_nontest_transformers)
    )

    # experiment = h2oai.get_model_job("lomotare").entity

    # STEP FOUR: Check the transformation was used
    # Download Summary
    summary_path = h2oai.download(src_path=experiment.summary_path, dest_dir=".")
    dir_path = "h2oai_experiment_summary_" + experiment.key
    with zipfile.ZipFile(summary_path, 'r') as z:
        z.extractall(dir_path)

    # View Features
    features = pd.read_table(dir_path + "/features.txt", sep=',', skipinitialspace=True)
    print(features)

    # STEP FIVE: Transform data and ensure it looks as expected
    transform = h2oai.fit_transform_batch_sync(
        model_key=experiment.key,
        training_dataset_key=data_key,
        validation_dataset_key=None,
        test_dataset_key=None,
        validation_split_fraction=0.25,
        seed=1234,
        fold_column=None,
    )

    # Download the training and validation transformed data
    transform_train_path = h2oai.download(src_path=transform.training_output_csv_path, dest_dir=".")
    transform_validate_path = h2oai.download(src_path=transform.validation_output_csv_path, dest_dir=".")

    transform_train = pd.read_table(transform_train_path, sep=',', skipinitialspace=True)
    transform_validate = pd.read_table(transform_validate_path, sep=',', skipinitialspace=True)

    print(transform_train.head())
    print(transform_validate.head())

    # STEP 1000: Clean up
    os.remove(summary_path)
    os.remove(transform_train_path)
    os.remove(transform_validate_path)
    shutil.rmtree(dir_path)
# TODO: re-write the already uploaded data check to account for numpy warning of type mismatch
import warnings

# Until the check above is rewritten, FutureWarning messages are suppressed.
warnings.simplefilter("ignore", FutureWarning)

# Print and Debug Nicely
for _option, _value in (
    ("display.max_rows", 50),
    ("display.max_columns", 500),
    ("display.width", 1000),
):
    pd.set_option(_option, _value)

# The following are parameters that need to be set to run these functions
# TODO: redo this in a nicer way

# Connect to Driverless AI (fill in the three Client arguments)
h2oai = Client("", "", "")

# Data Information
data_file_name = ""
data_file_location = "" + data_file_name  # location prefix + file name
y = ""

# Transformers Information
transformer_name = ""
transformer_file_name = ""
transformer_file_location = "" + transformer_file_name  # location prefix + file name

# Location to Download Files
download_file_location = ""
import h2oai_client
from h2oai_client import Client

# Connect to the Driverless AI server.
h2oai = Client(
    address="http://129.213.63.69:12345",
    username="******",
    password="******",
)

# Create the training and test datasets from their paths.
train = h2oai.create_dataset_sync("/train.csv")
test = h2oai.create_dataset_sync("/test.csv")

# Run a time-series classification experiment on LABEL, ordered by DATE,
# and wait for it to finish.
experiment = h2oai.start_experiment_sync(
    dataset_key=train.key,
    testset_key=test.key,
    accuracy=10,
    time=10,
    interpretability=1,
    is_classification=True,
    target_col="LABEL",
    is_timeseries=True,
    time_col="DATE",
    num_gap_periods=1,
    num_prediction_periods=1,
)

# Report both scores rounded to three decimals.
print(f"Final Model Score on Validation Data: {round(experiment.valid_score, 3)}")
print(f"Final Model Score on Test Data: {round(experiment.test_score, 3)}")
import h2oai_client
import numpy as np
import pandas as pd
import requests
import math
from h2oai_client import Client, ModelParameters, InterpretParameters

# Connection settings
ip = '35.175.227.14'
address = 'http://' + ip + ':12345'
username = '******'
password = '******'
h2oai = Client(address=address, username=username, password=password)

### Amazon Reviews
dataPath = '/data/Training/AmazonFineFoodReviews.csv'
basename = 'Reviews'
target = 'PositiveReview'
ratio = 0.8

reviews_data = h2oai.create_dataset_sync(dataPath)

# Split the data
# NOTE(review): the call was cut off after ``ratio``; ``seed=1234`` and the
# closing parenthesis are restored from the sibling dataset scripts in this
# file - confirm the seed.
reviews_split_data = h2oai.make_dataset_split(
    dataset_key=reviews_data.key,
    output_name1=basename + "_train",
    output_name2=basename + "_test",
    target=target,
    fold_col="",
    time_col="",
    ratio=ratio,
    seed=1234,
)
import h2oai_client
import numpy as np
import pandas as pd
import requests
import math
from h2oai_client import Client, ModelParameters, InterpretParameters

# Connection settings
ip = '35.175.227.14'
address = 'http://' + ip + ':12345'
username = '******'
password = '******'
h2oai = Client(address=address, username=username, password=password)

# Credit card dataset
dataPath = '/data/Training/CreditCard.csv'
basename = 'Card'
target = 'Default'
ratio = 0.8
dropped = []

card_data = h2oai.create_dataset_sync(dataPath)

# Split the data
# NOTE(review): the call was cut off after ``fold_col``; ``time_col``,
# ``ratio``, ``seed`` and the closing parenthesis are restored from the
# sibling dataset scripts in this file - confirm these values.
card_split_data = h2oai.make_dataset_split(
    dataset_key=card_data.key,
    output_name1=basename + "_train",
    output_name2=basename + "_test",
    target=target,
    fold_col="",
    time_col="",
    ratio=ratio,
    seed=1234,
)
import h2oai_client
import numpy as np
import pandas as pd
import requests
import math
from h2oai_client import (
    Client,
    ModelParameters,
    InterpretParameters,
)

# Connection settings
ip = "35.175.227.14"
address = f"http://{ip}:12345"
username = "******"
password = "******"
h2oai = Client(address=address, username=username, password=password)

# Boston housing dataset
dataPath = "/data/Training/BostonHousing.csv"
basename = "Housing"
target = "VALUE"
ratio = 0.8

boston_data = h2oai.create_dataset_sync(dataPath)

# Split the dataset into "<basename>_train" and "<basename>_test".
boston_split_data = h2oai.make_dataset_split(
    dataset_key=boston_data.key,
    output_name1=f"{basename}_train",
    output_name2=f"{basename}_test",
    target=target,
    fold_col="",
    time_col="",
    ratio=ratio,
    seed=1234,
)
import h2oai_client
import numpy as np
import pandas as pd
import requests
import math
from h2oai_client import Client, ModelParameters, InterpretParameters

# Connection settings
ip = '35.175.227.14'
address = 'http://' + ip + ':12345'
username = '******'
password = '******'
h2oai = Client(address=address, username=username, password=password)

### Diabetes Models
dataPath = '/data/Training/PimaDiabetes.csv'
basename = 'Diabetes'
target = 'Outcome'
ratio = 0.8

diabetes_data = h2oai.create_dataset_sync(dataPath)

# Split the data
# NOTE(review): the call was cut off after ``ratio``; ``seed=1234`` and the
# closing parenthesis are restored from the sibling dataset scripts in this
# file - confirm the seed.
diabetes_split_data = h2oai.make_dataset_split(
    dataset_key=diabetes_data.key,
    output_name1=basename + "_train",
    output_name2=basename + "_test",
    target=target,
    fold_col="",
    time_col="",
    ratio=ratio,
    seed=1234,
)
import h2oai_client
import numpy as np
import pandas as pd
# import h2o
import requests
import math
from h2oai_client import (
    Client,
    ModelParameters,
    InterpretParameters,
)

# Connection shared by the helper below.
address = "http://18.234.58.12:12345"
username = "******"
password = "******"
h2oai = Client(address=address, username=username, password=password)


def splitTrainingData(dataPath, basename, target, ratio=0.8, time=''):
    """
    Create a dataset from ``dataPath``, split it in two and return the
    split job's ``entity``.

    The two outputs are named ``<basename>_train`` and ``<basename>_test``.

    :param dataPath: Path handed to ``create_dataset_sync``
    :param basename: Prefix for the two output dataset names
    :param target: Target column passed to the split
    :param ratio: Split ratio passed to the split (default 0.8)
    :param time: Time column passed to the split (default: empty string)
    :return: ``entity`` of the split job; index 0 is train, index 1 is test
    """
    source = h2oai.create_dataset_sync(dataPath)

    # Start the split; the returned value identifies the split job.
    split_job = h2oai.make_dataset_split(
        dataset_key=source.key,
        output_name1=basename + "_train",
        output_name2=basename + "_test",
        target=target,
        fold_col="",
        time_col=time,
        ratio=ratio,
    )

    # entity[0] is train, entity[1] is test
    return h2oai.get_dataset_split_job(split_job).entity
import h2oai_client
import numpy as np
import pandas as pd
import requests
import math
from h2oai_client import (
    Client,
    ModelParameters,
    InterpretParameters,
)

# Connection settings
ip = "35.175.227.14"
address = f"http://{ip}:12345"
username = "******"
password = "******"
h2oai = Client(address=address, username=username, password=password)

### Titanic Models
dataPath = "/data/Training/Titanic.csv"
basename = "Titanic"
target = "survived"
ratio = 0.8

titanic_data = h2oai.create_dataset_sync(dataPath)

# Split the dataset into "<basename>_train" and "<basename>_test".
titanic_split_data = h2oai.make_dataset_split(
    dataset_key=titanic_data.key,
    output_name1=f"{basename}_train",
    output_name2=f"{basename}_test",
    target=target,
    fold_col="",
    time_col="",
    ratio=ratio,
    seed=1234,
)