def _create_tmp_folder(self, logger):
    # Create a temp folder to store xnn files
    # Set the default value without context available (required to pass acceptance test)
    tmp_folder = os.path.join(user_dir(), "%s_xnn_model_folder" % uuid.uuid4())
    # Make a real tmp folder when experiment is available
    if self.context and self.context.experiment_id:
        tmp_folder = os.path.join(self.context.experiment_tmp_dir, "%s_xnn_model_folder" % uuid.uuid4())

    # Now let's try to create that folder
    try:
        os.mkdir(tmp_folder)
    except PermissionError:
        # This should not occur, so log a warning
        loggerwarning(logger, "XNN was denied temp folder creation rights")
        tmp_folder = os.path.join(user_dir(), "%s_xnn_model_folder" % uuid.uuid4())
        os.mkdir(tmp_folder)
    except FileExistsError:
        # We should never be here since temp dir name is expected to be unique
        loggerwarning(logger, "XNN temp folder already exists")
        tmp_folder = os.path.join(self.context.experiment_tmp_dir, "%s_xnn_model_folder" % uuid.uuid4())
        os.mkdir(tmp_folder)
    except:
        # Revert to temporary file path
        tmp_folder = os.path.join(user_dir(), "%s_xnn_model_folder" % uuid.uuid4())
        os.mkdir(tmp_folder)

    loggerinfo(logger, "XNN temp folder {}".format(tmp_folder))
    return tmp_folder
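# --- Hedged standalone illustration (not part of the recipe above) of the same
# "try the preferred directory, fall back on failure" pattern, using only the
# standard library; the /tmp and home-directory paths are hypothetical.
import os
import uuid

preferred = os.path.join("/tmp", "%s_xnn_model_folder" % uuid.uuid4())
try:
    os.mkdir(preferred)
    tmp_folder = preferred
except (PermissionError, FileExistsError, OSError):
    # fall back to a folder under the user's home directory
    tmp_folder = os.path.join(os.path.expanduser("~"), "%s_xnn_model_folder" % uuid.uuid4())
    os.mkdir(tmp_folder)
print(tmp_folder)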
def set_tagger(self):
    import nltk
    nltk_data_path = os.path.join(user_dir(), config.contrib_env_relative_directory, "nltk_data")
    nltk_temp_path = os.path.join(user_dir(), "nltk_data")
    nltk.data.path.append(nltk_data_path)
    nltk.download('averaged_perceptron_tagger', download_dir=nltk_data_path)
    try:
        self.pos_tagger = nltk.pos_tag
        self.pos_tagger("test")
    except LookupError:
        os.makedirs(nltk_data_path, exist_ok=True)
        os.makedirs(nltk_temp_path, exist_ok=True)
        tagger_path = os.path.join(nltk_data_path, "taggers")
        os.makedirs(tagger_path, exist_ok=True)
        file1 = download("https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/taggers/averaged_perceptron_tagger.zip",
                         dest_path=nltk_temp_path)
        file2 = download("https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/taggers/maxent_treebank_pos_tagger.zip",
                         dest_path=nltk_temp_path)
        self.unzip_file(file1, tagger_path)
        self.unzip_file(file2, tagger_path)
        self.atomic_copy(file1, tagger_path)
        self.atomic_copy(file2, tagger_path)
        self.pos_tagger = nltk.pos_tag
        self.pos_tagger("test")
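# --- Hedged usage sketch (standalone, not the recipe's code): the same pattern of pointing
# NLTK at a custom data directory before tagging. The "/tmp/nltk_data" path and the sample
# sentence are hypothetical; assumes the nltk package and internet access.
import os
import nltk

nltk_data_path = os.path.join("/tmp", "nltk_data")
os.makedirs(nltk_data_path, exist_ok=True)
nltk.data.path.append(nltk_data_path)
nltk.download('averaged_perceptron_tagger', download_dir=nltk_data_path)
nltk.download('punkt', download_dir=nltk_data_path)
print(nltk.pos_tag(nltk.word_tokenize("NLTK tags parts of speech")))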
def preprocess_image(self, source_img_path, check_only=False):
    try:
        final_img_path = os.path.join(user_dir(), self.uuid, os.path.basename(source_img_path))
    except:  # we are sometimes getting np.float32, why?
        return None

    delete = False
    if not os.path.exists(final_img_path):
        if not os.path.exists(source_img_path):
            try:
                self.download(source_img_path, final_img_path)
            except requests.RequestException as e:
                # print_debug("Error: %s for source_img_path: %s" % (str(e), str(source_img_path)))
                return None
            delete = False  # True to avoid re-download or a race condition between multiple procs
        else:
            final_img_path = source_img_path

    if not check_only:
        import h2oaicore.keras as keras
        importlib.reload(keras)
        img = keras.preprocessing.image.load_img(final_img_path, target_size=(224, 224))
        if delete:
            remove(final_img_path)
        x = keras.preprocessing.image.img_to_array(img)
        x = np.expand_dims(x, axis=0)
        x = keras.applications.resnet50.preprocess_input(x)
        return x
    else:
        return True
def fit(self, X, y, sample_weight=None, eval_set=None, sample_weight_eval_set=None, **kwargs):
    lb = LabelEncoder()
    lb.fit(self.labels)
    y = lb.transform(y)
    orig_cols = list(X.names)
    XX = X.to_pandas()
    params = {
        'train_dir': user_dir(),
        'allow_writing_files': False,
        'thread_count': 10,
        # 'loss_function': 'Logloss'
    }
    from catboost import CatBoostClassifier
    model = CatBoostClassifier(**params)
    model.fit(XX, y=y,
              sample_weight=sample_weight,
              verbose=False,
              cat_features=list(X[:, [str, int]].names))  # Amazon specific, also no early stopping
    # must always set best_iterations
    self.set_model_properties(model=model,
                              features=orig_cols,
                              importances=model.feature_importances_,
                              iterations=0)
def predict(self, X, **kwargs):
    model, _, _, _ = self.get_model_properties()
    X = dt.Frame(X)
    X = self.inf_impute(X)
    h2o.init(port=config.h2o_recipes_port, log_dir=self.my_log_dir)
    model_path = os.path.join(user_dir(), self.id)
    model_file = os.path.join(model_path, "h2o_model." + str(uuid.uuid4()) + ".bin")
    os.makedirs(model_path, exist_ok=True)
    with open(model_file, "wb") as f:
        f.write(model)
    model = h2o.load_model(os.path.abspath(model_file))
    test_frame = h2o.H2OFrame(X.to_pandas(), column_types=self.col_types)
    preds_frame = None

    try:
        if kwargs.get("pred_contribs"):
            return model.predict_contributions(test_frame).as_data_frame(header=False).values
        preds_frame = model.predict(test_frame)
        preds = preds_frame.as_data_frame(header=False)

        if self.num_classes == 1:
            return preds.values.ravel()
        elif self.num_classes == 2:
            return preds.iloc[:, -1].values.ravel()
        else:
            return preds.iloc[:, 1:].values
    finally:
        # h2o.remove(self.id)  # Cannot remove id, do multiple predictions on same model
        h2o.remove(test_frame)
        remove(model_file)
        if preds_frame is not None:
            h2o.remove(preds_frame)
def create_data(X: dt.Frame = None) -> Union[str, List[str],
                                             dt.Frame, List[dt.Frame],
                                             np.ndarray, List[np.ndarray],
                                             pd.DataFrame, List[pd.DataFrame]]:
    import os
    from h2oaicore.systemutils_more import download
    from h2oaicore.systemutils import config
    import bz2

    def extract_bz2(file, output_file):
        zipfile = bz2.BZ2File(file)
        data = zipfile.read()
        open(output_file, 'wb').write(data)

    temp_path = os.path.join(user_dir(), config.contrib_relative_directory, "airlines")
    os.makedirs(temp_path, exist_ok=True)

    link = AirlinesData.base_url + "1990.csv.bz2"
    file = download(link, dest_path=temp_path)
    output_file1 = file.replace(".bz2", "")
    print("%s %s" % (file, output_file1))
    extract_bz2(file, output_file1)

    link = AirlinesData.base_url + "1991.csv.bz2"
    file = download(link, dest_path=temp_path)
    output_file2 = file.replace(".bz2", "")
    print("%s %s" % (file, output_file2))
    extract_bz2(file, output_file2)

    return [output_file1, output_file2]
def predict(self, X, **kwargs):
    model, _, _, _ = self.get_model_properties()
    X = dt.Frame(X)
    h2o.init(port=config.h2o_recipes_port, log_dir=self.my_log_dir)
    model_path = os.path.join(user_dir(), self.id)
    with open(model_path, "wb") as f:
        f.write(model)
    model = h2o.load_model(os.path.abspath(model_path))
    remove(model_path)
    test_frame = h2o.H2OFrame(X.to_pandas(), column_types=self.col_types)
    preds_frame = None

    try:
        if kwargs.get("pred_contribs"):
            return model.predict_contributions(test_frame).as_data_frame(header=False).values
        preds_frame = model.predict(test_frame)
        preds = preds_frame.as_data_frame(header=False)

        if self.num_classes == 1:
            return preds.values.ravel()
        elif self.num_classes == 2:
            return preds.iloc[:, -1].values.ravel()
        else:
            return preds.iloc[:, 1:].values
    finally:
        h2o.remove(self.id)
        h2o.remove(test_frame)
        if preds_frame is not None:
            h2o.remove(preds_frame)
def create_data(X: dt.Frame = None) -> Union[str, List[str],
                                             dt.Frame, List[dt.Frame],
                                             np.ndarray, List[np.ndarray],
                                             pd.DataFrame, List[pd.DataFrame],
                                             Dict[str, str],  # {data set names : paths}
                                             Dict[str, dt.Frame],  # {data set names : dt frames}
                                             Dict[str, np.ndarray],  # {data set names : np arrays}
                                             Dict[str, pd.DataFrame],  # {data set names : pd frames}
                                             ]:
    # Download files
    # Location in DAI file system where we will save the data set
    temp_path = os.path.join(user_dir(), config.contrib_relative_directory)
    os.makedirs(temp_path, exist_ok=True)

    # URLs of the desired IMDb data sets
    link_basics = "https://datasets.imdbws.com/title.basics.tsv.gz"
    link_ratings = "https://datasets.imdbws.com/title.ratings.tsv.gz"
    link_episodes = "https://datasets.imdbws.com/title.episode.tsv.gz"

    # Download the files
    file_basics = download(link_basics, dest_path=temp_path)
    file_ratings = download(link_ratings, dest_path=temp_path)
    file_episodes = download(link_episodes, dest_path=temp_path)

    # Parse the files with datatable
    basics = dt.fread(file_basics, fill=True)
    ratings = dt.fread(file_ratings, fill=True)
    episodes = dt.fread(file_episodes, na_strings=['\\N'], fill=True)

    # remove files
    os.remove(file_basics)
    os.remove(file_ratings)
    os.remove(file_episodes)

    # Create Title with Ratings dataset
    # join titles with non-null ratings
    ratings = ratings[~dt.isna(dt.f.averageRating), :]
    ratings.key = "tconst"
    basics_ratings = basics[:, :, dt.join(ratings)]

    # Create Episodes dataset
    episodes = episodes[~dt.isna(dt.f.seasonNumber) & ~dt.isna(dt.f.episodeNumber), :]
    episode_ratings = episodes[:, :, dt.join(ratings)]
    episode_ratings.names = {'tconst': 'episodeTconst', 'parentTconst': 'tconst',
                             'averageRating': 'episodeAverageRating', 'numVotes': 'episodeNumVotes'}
    basics_ratings.key = 'tconst'
    title_episode_ratings = episode_ratings[:, :, dt.join(basics_ratings)]

    # enumerate series episodes from 1 to N
    title_episode_ratings = title_episode_ratings[:, :, dt.sort(dt.f.tconst, dt.f.seasonNumber, dt.f.episodeNumber)]
    result = title_episode_ratings[:, dt.count(), dt.by(dt.f.tconst)][:, 'count'].to_list()
    from itertools import chain
    cumcount = chain.from_iterable([i + 1 for i in range(n)] for n in result[0])
    title_episode_ratings['episodeSequence'] = dt.Frame(tuple(cumcount))

    # return datasets
    return {"imdb_title_ratings": basics_ratings,
            "imdb_episode_ratings": title_episode_ratings}
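# --- Hedged standalone sketch of the per-group enumeration trick used above (assumes only the
# datatable package): count rows per group, then expand each count n into the sequence 1..n.
# The toy column names mirror the recipe; the data is made up.
import datatable as dt
from itertools import chain

df = dt.Frame(tconst=["a", "a", "a", "b", "b"], episodeNumber=[1, 2, 3, 1, 2])
df = df[:, :, dt.sort(dt.f.tconst, dt.f.episodeNumber)]
counts = df[:, dt.count(), dt.by(dt.f.tconst)][:, 'count'].to_list()
cumcount = chain.from_iterable([i + 1 for i in range(n)] for n in counts[0])
df['episodeSequence'] = dt.Frame(tuple(cumcount))
print(df)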
def __init__(self, **kwargs):
    super().__init__(**kwargs)
    self.id = None
    self.target = "__target__"
    self.weight = "__weight__"
    self.col_types = None
    self.my_log_dir = os.path.abspath(os.path.join(user_dir(), config.contrib_relative_directory, "h2o_log"))
    if not os.path.isdir(self.my_log_dir):
        os.makedirs(self.my_log_dir, exist_ok=True)
def create_data(X: dt.Frame = None) -> Union[str, List[str],
                                             dt.Frame, List[dt.Frame],
                                             np.ndarray, List[np.ndarray],
                                             pd.DataFrame, List[pd.DataFrame]]:
    import os
    from h2oaicore.systemutils_more import download
    from h2oaicore.systemutils import config

    temp_path = os.path.join(user_dir(), config.contrib_relative_directory, "testdata_%s" % str(uuid.uuid4()))
    os.makedirs(temp_path, exist_ok=True)
    link = TestData.url
    file = download(link, dest_path=temp_path)
    return file
def __init__(self, batch_size=32, **kwargs):
    super().__init__(**kwargs)
    self.batch_size = batch_size
    self.model_name = "resnet_keras.h5p"
    self.uuid = "%s-img-data-" % self.__class__.__name__ + self.model_name  # + str(uuid.uuid4())[:6]  # no, keeps changing and re-loading every init
    self.uuid_tmp = str(uuid.uuid4())[:6]
    self.col_name = self.input_feature_names[0]
    self.model_path = os.path.join(user_dir(), self.uuid + ".model")
    self.model_tmp_path = self.model_path + "_" + self.uuid_tmp + ".tmp"
    if not os.path.exists(self.model_path):
        self.download(
            url="http://s3.amazonaws.com/artifacts.h2o.ai/releases/ai/h2o/recipes/transformers/img/%s" % self.model_name,
            dest=self.model_path)
    with open(self.model_path, 'rb') as f:
        self.model_bytes = f.read()
def transform(self, X: dt.Frame):
    h2o.init(port=config.h2o_recipes_port)
    model_path = os.path.join(user_dir(), self.id)
    with open(model_path, "wb") as f:
        f.write(self.raw_model_bytes)
    model = h2o.load_model(os.path.abspath(model_path))
    remove(model_path)
    frame = h2o.H2OFrame(X.to_pandas())
    anomaly_frame = None

    try:
        anomaly_frame = model.anomaly(frame)
        anomaly_frame_df = anomaly_frame.as_data_frame(header=False)
        return anomaly_frame_df
    finally:
        h2o.remove(self.id)
        h2o.remove(anomaly_frame)
def create_data(X: dt.Frame = None) -> Union[str, List[str],
                                             dt.Frame, List[dt.Frame],
                                             np.ndarray, List[np.ndarray],
                                             pd.DataFrame, List[pd.DataFrame]]:
    import os
    from h2oaicore.systemutils_more import download
    from h2oaicore.systemutils import config

    temp_path = os.path.join(user_dir(), config.contrib_relative_directory, "testdata_%s" % str(uuid.uuid4()))
    os.makedirs(temp_path, exist_ok=True)

    link = "http://data.un.org/_Docs/SYB/CSV/SYB63_226_202009_Net%20Disbursements%20from%20Official%20ODA%20to%20Recipients.csv"
    output_file1 = download(link, dest_path=temp_path)
    link = "http://data.un.org/_Docs/SYB/CSV/SYB63_223_202009_Net%20Disbursements%20from%20Official%20ODA%20from%20Donors.csv"
    output_file2 = download(link, dest_path=temp_path)

    return [output_file1, output_file2]
def create_data(X: dt.Frame = None) -> Union[str, List[str],
                                             dt.Frame, List[dt.Frame],
                                             np.ndarray, List[np.ndarray],
                                             pd.DataFrame, List[pd.DataFrame]]:
    # exit gracefully if method is called as a data upload rather than data modify
    if X is None:
        return []

    import os
    from h2oaicore.systemutils import config

    # Change to pandas -> we can rewrite this as dt at a later date
    rain_raw = X.to_pandas()

    # Set index and pivot the data
    # Rows go from one row each month to one row each month & gauge
    rain_raw = rain_raw.set_index("date")
    rain_pivot = rain_raw.unstack().reset_index(name="rain_inches")
    rain_pivot.rename(columns={'level_0': 'rain_gauge', 'date': 'end_of_month'}, inplace=True)

    # Format date appropriately
    rain_pivot['end_of_month'] = pd.to_datetime(rain_pivot['end_of_month'])

    # Split data into train and test by date
    # Train on 7 years of data, test on 1 year of data
    train_py = rain_pivot[(rain_pivot['end_of_month'] >= '2009-01-01') & (rain_pivot['end_of_month'] <= '2016-01-01')]
    test_py = rain_pivot[rain_pivot['end_of_month'].dt.year == 2016]

    # Set up to save to disk
    temp_path = os.path.join(user_dir(), config.contrib_relative_directory)
    os.makedirs(temp_path, exist_ok=True)

    # Save files to disk
    file_train = os.path.join(temp_path, "seattle_rain_train.csv")
    train_py.to_csv(file_train)
    file_test = os.path.join(temp_path, "seattle_rain_test.csv")
    test_py.to_csv(file_test)

    return [file_train, file_test]
def create_data(X: dt.Frame = None) -> Union[str, List[str],
                                             dt.Frame, List[dt.Frame],
                                             np.ndarray, List[np.ndarray],
                                             pd.DataFrame, List[pd.DataFrame]]:
    import os
    from h2oaicore.systemutils_more import download
    from h2oaicore.systemutils import config

    temp_path = os.path.join(user_dir(), config.contrib_relative_directory, "testdata_%s" % str(uuid.uuid4()))
    os.makedirs(temp_path, exist_ok=True)

    link = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
    output_file1 = download(link, dest_path=temp_path)
    link = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/bezdekIris.data"
    output_file2 = download(link, dest_path=temp_path)

    return [output_file1, output_file2]
def transform(self, X: dt.Frame):
    h2o.init(port=config.h2o_recipes_port)
    model_path = os.path.join(user_dir(), self.id)
    model_file = os.path.join(model_path, "h2o_model." + str(uuid.uuid4()) + ".bin")
    os.makedirs(model_path, exist_ok=True)
    with open(model_file, "wb") as f:
        f.write(self.raw_model_bytes)
    model = h2o.load_model(os.path.abspath(model_file))
    frame = h2o.H2OFrame(X.to_pandas())
    anomaly_frame = None

    try:
        anomaly_frame = model.anomaly(frame)
        anomaly_frame_df = anomaly_frame.as_data_frame(header=False)
        return anomaly_frame_df
    finally:
        remove(model_file)
        h2o.remove(self.id)
        h2o.remove(anomaly_frame)
def fit_transform(self, X: dt.Frame, y: np.array = None):
    h2o.init(port=config.h2o_recipes_port)
    model = H2OAutoEncoderEstimator(activation='tanh', epochs=1, hidden=[50, 50], reproducible=True, seed=1234)
    frame = h2o.H2OFrame(X.to_pandas())
    model_path = None

    try:
        model.train(x=list(range(X.ncols)), training_frame=frame)
        self.id = model.model_id
        model_path = os.path.join(user_dir(), "h2o_model." + str(uuid.uuid4()))
        model_path = h2o.save_model(model=model, path=model_path)
        with open(model_path, "rb") as f:
            self.raw_model_bytes = f.read()
        return model.anomaly(frame).as_data_frame(header=False)
    finally:
        if model_path is not None:
            remove(model_path)
        h2o.remove(model)
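# --- Hedged standalone sketch (assumes a reachable H2O cluster and the h2o package) of the
# save_model -> read-bytes round trip the recipes above use to stash a binary model with the
# transformer/model state. The tiny frame and the /tmp path are hypothetical.
import os
import uuid
import h2o
from h2o.estimators import H2OAutoEncoderEstimator

h2o.init()
frame = h2o.H2OFrame({"x1": [1, 2, 3, 4, 5], "x2": [5, 4, 3, 2, 1]})
ae = H2OAutoEncoderEstimator(activation='tanh', epochs=1, hidden=[2], reproducible=True, seed=1234)
ae.train(x=["x1", "x2"], training_frame=frame)

saved_path = h2o.save_model(model=ae, path=os.path.join("/tmp", "h2o_model." + str(uuid.uuid4())))
with open(saved_path, "rb") as f:
    raw_model_bytes = f.read()  # bytes that can be written back to disk and reloaded at predict time
restored = h2o.load_model(os.path.abspath(saved_path))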
def predict(self, X, **kwargs):
    model, _, _, _ = self.get_model_properties()
    X = dt.Frame(X)
    h2o.init(port=config.h2o_recipes_port, log_dir=self.my_log_dir)
    model_path = os.path.join(user_dir(), self.id)
    with open(model_path, "wb") as f:
        f.write(model)
    model = h2o.load_model(os.path.abspath(model_path))
    remove(model_path)
    test_frame = h2o.H2OFrame(X.to_pandas(), column_types=self.col_types)
    preds_frame = None

    try:
        preds_frame = model.predict(test_frame)
        preds = preds_frame.as_data_frame(header=False)
        return preds.values.ravel()
    finally:
        h2o.remove(self.id)
        h2o.remove(test_frame)
        if preds_frame is not None:
            h2o.remove(preds_frame)
def create_data(X: dt.Frame = None) -> Union[str, List[str],
                                             dt.Frame, List[dt.Frame],
                                             np.ndarray, List[np.ndarray],
                                             pd.DataFrame, List[pd.DataFrame]]:
    import os
    from h2oaicore.systemutils_more import download
    from h2oaicore.systemutils import config

    # Location in DAI file system where we will save the data set
    temp_path = os.path.join(user_dir(), config.contrib_relative_directory)
    os.makedirs(temp_path, exist_ok=True)

    # URL of desired data, this comes from the City of Seattle
    link = "https://data.seattle.gov/resource/rdtp-hzy3.csv"

    # Download the file
    file = download(link, dest_path=temp_path)

    # Give the file a descriptive name for the UI
    output_file = file.replace("rdtp-hzy3", "seattle_monthly_rain_raw")
    os.rename(file, output_file)

    # Return the location on the DAI server for this data set
    return output_file
def create_data( X: dt.Frame = None ) -> Union[str, List[str], dt.Frame, List[dt.Frame], np.ndarray, List[np.ndarray], pd.DataFrame, List[pd.DataFrame]]: import os from h2oaicore.systemutils_more import download from h2oaicore.systemutils import config import bz2 def extract_bz2(file, output_file): zipfile = bz2.BZ2File(file) data = zipfile.read() open(output_file, 'wb').write(data) temp_path = os.path.join(user_dir(), "recipe_tmp", "airlines") os.makedirs(temp_path, exist_ok=True) dt.options.nthreads = 8 # specify which years are used for training and testing training = list(range(2005, 2008)) testing = [2008] # download and unzip files files = [] for f in ["%d.csv.bz2" % year for year in training + testing]: link = AirlinesData.base_url + "%s" % f file = download(link, dest_path=temp_path) output_file = file.replace(".bz2", "") if not os.path.exists(output_file): extract_bz2(file, output_file) files.append(output_file) # parse with datatable X = dt.rbind(*[dt.fread(x) for x in files]) # add date date_col = 'Date' X[:, date_col] = dt.f['Year'] * 10000 + dt.f['Month'] * 100 + dt.f[ 'DayofMonth'] cols_to_keep = ['Date'] # add number of flights in/out for each airport per given interval timeslice_mins = 60 for name, new_col, col, group in [ ("out", "CRSDepTime_mod", "CRSDepTime", "Origin"), ("in", "CRSArrTime_mod", "CRSArrTime", "Dest") ]: X[:, new_col] = X[:, dt.f[col] // timeslice_mins] group_cols = [date_col, group, new_col] new_name = 'flights_%s_per_%d_min' % (name, timeslice_mins) flights = X[:, {new_name: dt.count()}, dt.by(*group_cols)] flights.key = group_cols cols_to_keep.append(new_name) X = X[:, :, dt.join(flights)] # select flights leaving from SFO only X = X[dt.f['Origin'] == 'SFO', :] # Fill NaNs in DepDelay column X[dt.isna(dt.f['DepDelay']), 'DepDelay'] = 0 # create binary target column depdelay_threshold_mins = 15 target = 'DepDelay%dm' % depdelay_threshold_mins X[:, target] = dt.f['DepDelay'] > depdelay_threshold_mins cols_to_keep.extend([ target, 'Year', 'Month', 'DayofMonth', 'DayOfWeek', 'CRSDepTime', 'UniqueCarrier', 'FlightNum', 'TailNum', 'CRSElapsedTime', 'Origin', 'Dest', 'Distance', # Leaks for delay # 'DepTime', # 'ArrTime', #'CRSArrTime', # 'ActualElapsedTime', # 'AirTime', #'ArrDelay', #'DepDelay', # 'TaxiIn', #'TaxiOut', #'Cancelled', #'CancellationCode', #'Diverted', #'CarrierDelay', # #'WeatherDelay', #'NASDelay', #'SecurityDelay', #'LateAircraftDelay', ]) X = X[:, cols_to_keep] # Join in some extra info join_files = [('UniqueCarrier', 'carriers.csv', 'Code'), ('Origin', 'airports.csv', 'iata'), ('Dest', 'airports.csv', 'iata'), ('TailNum', 'plane-data.csv', 'tailnum')] for join_key, file, col in join_files: file = download( 'https://0xdata-public.s3.amazonaws.com/data_recipes_data/%s' % file, dest_path=temp_path) X_join = dt.fread(file, fill=True) X_join.names = {col: join_key} X_join.names = [join_key] + [ join_key + "_" + x for x in X_join.names if x != join_key ] X_join.key = join_key X = X[:, :, dt.join(X_join)] del X[:, join_key] split = True if not split: filename = os.path.join( temp_path, "flight_delays_data_recipe_%d-%d.csv" % (min(training), max(testing))) X.to_csv(filename) return filename else: # prepare splits (by year) and create binary .jay files for import into Driverless AI output_files = [] for condition, name in [ ((min(training) <= dt.f['Year']) & (dt.f['Year'] <= max(training)), 'training'), ((min(testing) <= dt.f['Year']) & (dt.f['Year'] <= max(testing)), 'test'), ]: X_split = X[condition, :] filename = os.path.join( temp_path, 
"augmented_flights_%s-%d_%s.csv" % (X_split[:, 'Year'].min1(), X_split[:, 'Year'].max1(), name)) X_split.to_csv(filename) output_files.append(filename) return output_files
def fit(self, X, y, sample_weight=None, eval_set=None, sample_weight_eval_set=None, **kwargs):
    X = dt.Frame(X)
    X = self.inf_impute(X)
    self.transcribe(X=X)

    h2o.init(port=config.h2o_recipes_port, log_dir=self.my_log_dir)
    model_path = None

    if isinstance(self, H2ONBModel):
        # NB can only handle weights of 0 / 1
        if sample_weight is not None:
            sample_weight = (sample_weight != 0).astype(int)
        if sample_weight_eval_set is not None and len(sample_weight_eval_set) > 0 and sample_weight_eval_set[0] is not None:
            sample_weight_eval_set1 = sample_weight_eval_set[0]
            sample_weight_eval_set1[sample_weight_eval_set1 != 0] = 1
            sample_weight_eval_set1 = sample_weight_eval_set1.astype(int)
            sample_weight_eval_set = [sample_weight_eval_set1]

    train_X = h2o.H2OFrame(X.to_pandas())
    self.col_types = train_X.types
    train_y = h2o.H2OFrame(y,
                           column_names=[self.target],
                           column_types=['categorical' if self.num_classes >= 2 else 'numeric'])
    train_frame = train_X.cbind(train_y)
    if sample_weight is not None:
        train_w = h2o.H2OFrame(sample_weight,
                               column_names=[self.weight],
                               column_types=['numeric'])
        train_frame = train_frame.cbind(train_w)

    valid_frame = None
    valid_X = None
    valid_y = None
    model = None
    if eval_set is not None:
        valid_X = h2o.H2OFrame(eval_set[0][0].to_pandas(), column_types=self.col_types)
        valid_y = h2o.H2OFrame(eval_set[0][1],
                               column_names=[self.target],
                               column_types=['categorical' if self.num_classes >= 2 else 'numeric'])
        valid_frame = valid_X.cbind(valid_y)
        if sample_weight is not None:
            if sample_weight_eval_set is None:
                sample_weight_eval_set = [np.ones(len(eval_set[0][1]))]
            valid_w = h2o.H2OFrame(sample_weight_eval_set[0],
                                   column_names=[self.weight],
                                   column_types=['numeric'])
            valid_frame = valid_frame.cbind(valid_w)

    try:
        train_kwargs = dict()
        params = copy.deepcopy(self.params)
        if not isinstance(self, H2OAutoMLModel):
            # AutoML needs max_runtime_secs in initializer, all others in train() method
            max_runtime_secs = params.pop('max_runtime_secs', 0)
            train_kwargs = dict(max_runtime_secs=max_runtime_secs)
        if valid_frame is not None:
            train_kwargs['validation_frame'] = valid_frame
        if sample_weight is not None:
            train_kwargs['weights_column'] = self.weight

        # Don't ever use the offset column as a feature
        offset_col = None  # if no column is called offset we will pass "None" and not use this feature
        cols_to_train = []  # list of all non-offset columns
        for col in list(train_X.names):
            if not col.lower() == "offset":
                cols_to_train.append(col)
            else:
                offset_col = col

        orig_cols = cols_to_train  # not training on offset

        trials = 2
        for trial in range(0, trials):
            try:
                # Models that can use an offset column
                model = self.make_instance(**params)
                if isinstance(model, H2OGBMModel) | isinstance(model, H2ODLModel) | isinstance(model, H2OGLMModel):
                    model.train(x=cols_to_train, y=self.target, training_frame=train_frame,
                                offset_column=offset_col, **train_kwargs)
                else:
                    model.train(x=train_X.names, y=self.target, training_frame=train_frame, **train_kwargs)
                break
            except Exception as e:
                print(str(e))
                t, v, tb = sys.exc_info()
                ex = ''.join(traceback.format_exception(t, v, tb))
                if 'Training data must have at least 2 features' in str(ex) and X.ncols != 0:
                    # if had non-zero features but h2o-3 saw as constant, ignore h2o-3 in that case
                    raise IgnoreEntirelyError
                elif "min_rows: The dataset size is too small to split for min_rows" in str(e):
                    # then h2o-3 counted as rows some reduced set, since we already protect against actual rows vs. min_rows
                    params['min_rows'] = 1  # go down to lowest value
                    # permit another trial
                else:
                    raise
                if trial == trials - 1:
                    # if at end of trials, raise no matter what
                    raise

        if isinstance(model, H2OAutoML):
            model = model.leader
        self.id = model.model_id
        model_path = os.path.join(user_dir(), "h2o_model." + str(uuid.uuid4()))
        model_path = h2o.save_model(model=model, path=model_path)
        with open(model_path, "rb") as f:
            raw_model_bytes = f.read()
    finally:
        if model_path is not None:
            remove(model_path)
        for xx in [train_frame, train_X, train_y, model, valid_frame, valid_X, valid_y]:
            if xx is not None:
                if isinstance(xx, H2OAutoML):
                    h2o.remove(xx.project_name)
                else:
                    h2o.remove(xx)

    df_varimp = model.varimp(True)
    if df_varimp is None:
        varimp = np.ones(len(orig_cols))
    else:
        df_varimp.index = df_varimp['variable']
        df_varimp = df_varimp.iloc[:, 1]  # relative importance
        for missing in [x for x in orig_cols if x not in list(df_varimp.index)]:
            # h2o3 doesn't handle raw strings all the time, can hit:
            # KeyError: "None of [Index(['0_Str:secret_ChangeTemp'], dtype='object', name='variable')] are in the [index]"
            df_varimp[missing] = 0
        varimp = df_varimp[orig_cols].values  # order by fitted features
        varimp = np.nan_to_num(varimp)

    self.set_model_properties(model=raw_model_bytes,
                              features=orig_cols,
                              importances=varimp,
                              iterations=self.get_iterations(model))
def fit(self, X, y, sample_weight=None, eval_set=None, sample_weight_eval_set=None, **kwargs):
    X = dt.Frame(X)
    h2o.init(port=config.h2o_recipes_port, log_dir=self.my_log_dir)
    model_path = None

    if isinstance(self, H2ONBModel):
        # NB can only handle weights of 0 / 1
        if sample_weight is not None:
            sample_weight = (sample_weight != 0).astype(int)
        if sample_weight_eval_set is not None:
            sample_weight_eval_set = [(sample_weight_eval_set[0] != 0).astype(int)]

    train_X = h2o.H2OFrame(X.to_pandas())
    self.col_types = train_X.types
    train_y = h2o.H2OFrame(y,
                           column_names=[self.target],
                           column_types=['categorical' if self.num_classes >= 2 else 'numeric'])
    train_frame = train_X.cbind(train_y)
    if sample_weight is not None:
        train_w = h2o.H2OFrame(sample_weight,
                               column_names=[self.weight],
                               column_types=['numeric'])
        train_frame = train_frame.cbind(train_w)

    valid_frame = None
    valid_X = None
    valid_y = None
    model = None
    if eval_set is not None:
        valid_X = h2o.H2OFrame(eval_set[0][0].to_pandas(), column_types=self.col_types)
        valid_y = h2o.H2OFrame(eval_set[0][1],
                               column_names=[self.target],
                               column_types=['categorical' if self.num_classes >= 2 else 'numeric'])
        valid_frame = valid_X.cbind(valid_y)
        if sample_weight is not None:
            if sample_weight_eval_set is None:
                sample_weight_eval_set = [np.ones(len(eval_set[0][1]))]
            valid_w = h2o.H2OFrame(sample_weight_eval_set[0],
                                   column_names=[self.weight],
                                   column_types=['numeric'])
            valid_frame = valid_frame.cbind(valid_w)

    try:
        train_kwargs = dict()
        params = copy.deepcopy(self.params)
        if not isinstance(self, H2OAutoMLModel):
            # AutoML needs max_runtime_secs in initializer, all others in train() method
            max_runtime_secs = params.pop('max_runtime_secs')
            train_kwargs = dict(max_runtime_secs=max_runtime_secs)
        if valid_frame is not None:
            train_kwargs['validation_frame'] = valid_frame
        if sample_weight is not None:
            train_kwargs['weights_column'] = self.weight

        model = self.make_instance(**params)

        # Don't ever use the offset column as a feature
        offset_col = None  # if no column is called offset we will pass "None" and not use this feature
        cols_to_train = []  # list of all non-offset columns
        for col in list(train_X.names):
            if not col.lower() == "offset":
                cols_to_train.append(col)
            else:
                offset_col = col

        orig_cols = cols_to_train  # not training on offset

        # Models that can use an offset column
        if isinstance(model, H2OGBMModel) | isinstance(model, H2ODLModel) | isinstance(model, H2OGLMModel):
            model.train(x=cols_to_train, y=self.target, training_frame=train_frame,
                        offset_column=offset_col, **train_kwargs)
        else:
            model.train(x=train_X.names, y=self.target, training_frame=train_frame, **train_kwargs)

        if isinstance(model, H2OAutoML):
            model = model.leader
        self.id = model.model_id
        model_path = os.path.join(user_dir(), "h2o_model." + str(uuid.uuid4()))
        model_path = h2o.save_model(model=model, path=model_path)
        with open(model_path, "rb") as f:
            raw_model_bytes = f.read()
    finally:
        if model_path is not None:
            remove(model_path)
        for xx in [train_frame, train_X, train_y, model, valid_frame, valid_X, valid_y]:
            if xx is not None:
                if isinstance(xx, H2OAutoML):
                    h2o.remove(xx.project_name)
                else:
                    h2o.remove(xx)

    df_varimp = model.varimp(True)
    if df_varimp is None:
        varimp = np.ones(len(orig_cols))
    else:
        df_varimp.index = df_varimp['variable']
        df_varimp = df_varimp.iloc[:, 1]  # relative importance
        for missing in [x for x in orig_cols if x not in list(df_varimp.index)]:
            # h2o3 doesn't handle raw strings all the time, can hit:
            # KeyError: "None of [Index(['0_Str:secret_ChangeTemp'], dtype='object', name='variable')] are in the [index]"
            df_varimp[missing] = 0
        varimp = df_varimp[orig_cols].values  # order by fitted features
        varimp = np.nan_to_num(varimp)

    self.set_model_properties(model=raw_model_bytes,
                              features=orig_cols,
                              importances=varimp,
                              iterations=self.get_iterations(model))
def fit(self, X, y, sample_weight=None, eval_set=None, sample_weight_eval_set=None, **kwargs):
    X = dt.Frame(X)
    h2o.init(port=config.h2o_recipes_port, log_dir=self.my_log_dir)
    model_path = None

    orig_cols = list(X.names)
    train_X = h2o.H2OFrame(X.to_pandas())
    self.col_types = train_X.types
    train_y = h2o.H2OFrame(y,
                           column_names=[self.target],
                           column_types=['categorical' if self.num_classes >= 2 else 'numeric'])
    train_frame = train_X.cbind(train_y)
    if sample_weight is not None:
        train_w = h2o.H2OFrame(sample_weight,
                               column_names=[self.weight],
                               column_types=['numeric'])
        train_frame = train_frame.cbind(train_w)

    valid_frame = None
    valid_X = None
    valid_y = None
    model = None
    if eval_set is not None:
        valid_X = h2o.H2OFrame(eval_set[0][0].to_pandas(), column_types=self.col_types)
        valid_y = h2o.H2OFrame(eval_set[0][1],
                               column_names=[self.target],
                               column_types=['categorical' if self.num_classes >= 2 else 'numeric'])
        valid_frame = valid_X.cbind(valid_y)
        if sample_weight is not None:
            if sample_weight_eval_set is None:
                sample_weight_eval_set = [np.ones(len(eval_set[0][1]))]
            valid_w = h2o.H2OFrame(sample_weight_eval_set[0],
                                   column_names=[self.weight],
                                   column_types=['numeric'])
            valid_frame = valid_frame.cbind(valid_w)

    try:
        train_kwargs = dict()
        max_runtime_secs = self.params.get('max_runtime_secs', 0)
        train_kwargs = dict(max_runtime_secs=max_runtime_secs)
        if valid_frame is not None:
            train_kwargs['validation_frame'] = valid_frame
        if sample_weight is not None:
            train_kwargs['weights_column'] = self.weight

        model = self.make_instance(**self.params)
        model.train(x=train_X.names, y=self.target, training_frame=train_frame, **train_kwargs)
        self.id = model.model_id
        model_path = os.path.join(user_dir(), "h2o_model." + str(uuid.uuid4()))
        model_path = h2o.save_model(model=model, path=model_path)
        with open(model_path, "rb") as f:
            raw_model_bytes = f.read()
    finally:
        if model_path is not None:
            remove(model_path)
        for xx in [train_frame, train_X, train_y, model, valid_frame, valid_X, valid_y]:
            if xx is not None:
                h2o.remove(xx)

    df_varimp = model.varimp(True)
    if df_varimp is None:
        varimp = np.ones(len(orig_cols))
    else:
        df_varimp.index = df_varimp['variable']
        df_varimp = df_varimp.iloc[:, 1]  # relative importance
        for missing in [x for x in orig_cols if x not in list(df_varimp.index)]:
            # h2o3 doesn't handle raw strings all the time, can hit:
            # KeyError: "None of [Index(['0_Str:secret_ChangeTemp'], dtype='object', name='variable')] are in the [index]"
            df_varimp[missing] = 0
        varimp = df_varimp[orig_cols].values  # order by fitted features
        varimp = np.nan_to_num(varimp)

    self.set_model_properties(model=raw_model_bytes,
                              features=orig_cols,
                              importances=varimp,
                              iterations=self.get_iterations(model))
def transcribe_params(self, params=None, **kwargs):
    if params is None:
        params = self.params  # reference
    params = params.copy()  # don't contaminate DAI params, since we know we use lgbm-xgb as base

    has_eval_set = self.have_eval_set(kwargs)  # only needs (and does) operate at fit-time

    from catboost import CatBoostClassifier, CatBoostRegressor, EFstrType
    fullspec_regression = inspect.getfullargspec(CatBoostRegressor)
    kwargs_regression = {k: v for k, v in zip(fullspec_regression.args, fullspec_regression.defaults)}
    fullspec_classification = inspect.getfullargspec(CatBoostClassifier)
    kwargs_classification = {k: v for k, v in zip(fullspec_classification.args, fullspec_classification.defaults)}

    if self.num_classes == 1:
        allowed_params = kwargs_regression
    else:
        allowed_params = kwargs_classification

    params_copy = copy.deepcopy(params)
    for k, v in params_copy.items():
        if k not in allowed_params.keys():
            del params[k]

    # now transcribe
    k = 'boosting_type'
    if k in params:
        params[k] = 'Plain'

    k = 'grow_policy'
    if k in params:
        params[k] = 'Depthwise' if params[k] == 'depthwise' else 'Lossguide'

    k = 'eval_metric'
    if k in params and params[k] is not None and params[k].upper() == 'AUC':
        params[k] = 'AUC'

    map = {'regression': 'RMSE',
           'mse': 'RMSE',
           'mae': 'MAE',
           "mape": 'MAPE',
           "huber": 'Huber',
           "fair": 'FairLoss',
           "rmse": "RMSE",
           "gamma": "RMSE",  # unsupported by catboost
           "tweedie": "Tweedie",
           "poisson": "Poisson",
           "quantile": "Quantile",
           'binary': 'Logloss',
           'auc': 'AUC',
           "xentropy": 'CrossEntropy',
           'multiclass': 'MultiClass'}

    k = 'objective'
    if k in params and params[k] in map.keys():
        params[k] = map[params[k]]

    k = 'eval_metric'
    if k in params and params[k] is not None and params[k] in map.keys():
        params[k] = map[params[k]]

    if 'objective' in params:
        # don't randomly choose these since then model not stable GA -> final
        # but backup shouldn't really be used AFAIK
        if params['objective'] == 'Huber':
            backup = float(config.huber_alpha_list[0])
            params['delta'] = params.pop('alpha', backup)
        if params['objective'] == 'Quantile':
            backup = float(config.quantile_alpha[0])
            params['delta'] = params.pop('alpha', backup)
        if params['objective'] == 'Tweedie':
            backup = float(config.tweedie_variance_power_list[0])
            params['tweedie_variance_power'] = params.pop('tweedie_variance_power', backup)
        if params['objective'] == 'FairLoss':
            backup = float(config.fair_c_list[0])
            params['smoothness'] = params.pop('fair_c', backup)

    params.pop('verbose', None)
    params.pop('verbose_eval', None)
    params.pop('logging_level', None)

    if 'grow_policy' in params:
        if params['grow_policy'] == 'Lossguide':
            params.pop('max_depth', None)
        if params['grow_policy'] == 'Depthwise':
            params.pop('num_leaves', None)
    else:
        params['grow_policy'] = 'SymmetricTree'

    uses_gpus, n_gpus = self.get_uses_gpus(params)

    if params['task_type'] == 'CPU':
        params.pop('grow_policy', None)
        params.pop('num_leaves', None)
        params.pop('max_leaves', None)
        params.pop('min_data_in_leaf', None)
        params.pop('min_child_samples', None)

    if params['task_type'] == 'GPU':
        params.pop('colsample_bylevel', None)  # : 0.35

    if 'grow_policy' in params and params['grow_policy'] in ['Depthwise', 'SymmetricTree']:
        if 'max_depth' in params and params['max_depth'] in [0, -1]:
            params['max_depth'] = max(2, int(np.log(params.get('num_leaves', 2 ** 6))))
    else:
        params.pop('max_depth', None)
        params.pop('depth', None)

    if 'grow_policy' in params and params['grow_policy'] == 'Lossguide':
        # if 'num_leaves' in params and params['num_leaves'] == -1:
        #     params['num_leaves'] = 2 ** params.get('max_depth', 6)
        if 'max_leaves' in params and params['max_leaves'] in [0, -1]:
            params['max_leaves'] = 2 ** params.get('max_depth', 6)
    else:
        params.pop('max_leaves', None)

    if 'num_leaves' in params and 'max_leaves' in params:
        params.pop('num_leaves', None)

    # apply limits
    if 'max_leaves' in params:
        params['max_leaves'] = min(params['max_leaves'], 65536)
    if 'max_depth' in params:
        params['max_depth'] = min(params['max_depth'], 16)

    params.update({'train_dir': user_dir(),
                   'allow_writing_files': False,
                   'thread_count': self.params_base.get('n_jobs', 4)})

    if 'reg_lambda' in params and params['reg_lambda'] <= 0.0:
        params['reg_lambda'] = 3.0  # assume meant unset

    if self._can_handle_categorical:
        if 'max_cat_to_onehot' in params:
            params['one_hot_max_size'] = params['max_cat_to_onehot']
            params.pop('max_cat_to_onehot', None)
        if uses_gpus:
            params['one_hot_max_size'] = min(params.get('one_hot_max_size', 255), 255)
        else:
            params['one_hot_max_size'] = min(params.get('one_hot_max_size', 65535), 65535)

    if 'one_hot_max_size' in params:
        params['one_hot_max_size'] = max(self._min_one_hot_max_size, params['one_hot_max_size'])

    params['max_bin'] = params.get('max_bin', 254)
    if params['task_type'] == 'CPU':
        params['max_bin'] = min(params['max_bin'], 254)  # https://github.com/catboost/catboost/issues/1010
    if params['task_type'] == 'GPU':
        params['max_bin'] = min(params['max_bin'], 127)  # https://github.com/catboost/catboost/issues/1010

    if uses_gpus:
        # https://catboost.ai/docs/features/training-on-gpu.html
        params['devices'] = "%d-%d" % (self.params_base.get('gpu_id', 0),
                                       self.params_base.get('gpu_id', 0) + n_gpus - 1)
        # params['gpu_ram_part'] = 0.3  # per-GPU, assumes GPU locking or no other experiments running

    if self.num_classes > 2:
        params.pop("eval_metric", None)

    params['train_dir'] = self.context.experiment_tmp_dir
    params['allow_writing_files'] = False

    # assume during fit self.params_base could have been updated
    assert 'n_estimators' in params
    assert 'learning_rate' in params
    params['n_estimators'] = self.params_base.get('n_estimators', 100)
    params['learning_rate'] = self.params_base.get('learning_rate', config.min_learning_rate)
    params['learning_rate'] = min(params['learning_rate'], 0.5)  # 1.0 leads to illegal access on GPUs
    params['learning_rate'] = max(config.min_learning_rate,
                                  max(self._min_learning_rate_catboost, params['learning_rate']))

    if 'early_stopping_rounds' not in params and has_eval_set:
        params['early_stopping_rounds'] = 150  # temp fix
        # assert 'early_stopping_rounds' in params

    if uses_gpus:
        params.pop('sampling_frequency', None)

    if not uses_gpus and params['bootstrap_type'] == 'Poisson':
        params['bootstrap_type'] = 'Bayesian'  # revert to default
    if uses_gpus and params['bootstrap_type'] == 'MVS':
        params['bootstrap_type'] = 'Bayesian'  # revert to default

    if 'bootstrap_type' not in params or params['bootstrap_type'] not in ['Poisson', 'Bernoulli']:
        params.pop('subsample', None)  # only allowed for those 2 bootstrap_type settings

    if params['bootstrap_type'] not in ['Bayesian']:
        params.pop('bagging_temperature', None)

    if not (self.num_classes == 2 and params['objective'] == 'Logloss'):
        params.pop('scale_pos_weight', None)

    # go back to some default eval_metric
    if self.num_classes == 1:
        if 'eval_metric' not in params or params['eval_metric'] not in [
                'MAE', 'MAPE', 'Poisson', 'Quantile', 'RMSE', 'LogLinQuantile', 'Lq', 'Huber',
                'Expectile', 'FairLoss', 'NumErrors', 'SMAPE', 'R2', 'MSLE', 'MedianAbsoluteError']:
            params['eval_metric'] = 'RMSE'
    elif self.num_classes == 2:
        if 'eval_metric' not in params or params['eval_metric'] not in [
                'Logloss', 'CrossEntropy', 'Precision', 'Recall', 'F1', 'BalancedAccuracy',
                'BalancedErrorRate', 'MCC', 'Accuracy', 'CtrFactor', 'AUC', 'NormalizedGini',
                'BrierScore', 'HingeLoss', 'HammingLoss', 'ZeroOneLoss', 'Kappa', 'WKappa',
                'LogLikelihoodOfPrediction']:
            params['eval_metric'] = 'Logloss'
    else:
        if 'eval_metric' not in params or params['eval_metric'] not in [
                'MultiClass', 'MultiClassOneVsAll', 'Precision', 'Recall', 'F1', 'TotalF1', 'MCC',
                'Accuracy', 'HingeLoss', 'HammingLoss', 'ZeroOneLoss', 'Kappa', 'WKappa', 'AUC']:
            params['eval_metric'] = 'MultiClass'

    # set system stuff here
    params['silent'] = self.params_base.get('silent', True)
    if config.debug_daimodel_level >= 1:
        params['silent'] = False  # Can enable for tracking improvement in console/dai.log if have access

    params['random_state'] = self.params_base.get('random_state', 1234)
    params['thread_count'] = self.params_base.get('n_jobs', max(1, physical_cores_count))  # -1 is not supported

    return params
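# --- Hedged standalone sketch of the constructor-introspection filter used at the top of
# transcribe_params (assumes only the catboost package): keep just the keys that
# CatBoostRegressor's constructor actually accepts, so stray booster params never reach it.
# The input params dict here is hypothetical.
import inspect
from catboost import CatBoostRegressor

spec = inspect.getfullargspec(CatBoostRegressor)
allowed = set(spec.args)  # parameter names accepted by this catboost version
params = {'learning_rate': 0.05, 'depth': 6, 'not_a_catboost_param': 1}
params = {k: v for k, v in params.items() if k in allowed}
model = CatBoostRegressor(**params)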
import os
import uuid
from collections import OrderedDict
from zipfile import ZipFile

from h2oaicore.data import CustomData
import pandas as pd
import datatable as dt
from h2oaicore.systemutils import user_dir
from h2oaicore.systemutils_more import download

tmp_dir = os.path.join(user_dir(), str(uuid.uuid4())[:6])

path_to_zip = "https://files.slack.com/files-pri/T0329MHH6-F0150BK8L01/download/m5-forecasting-accuracy.zip?pub_secret=acfcbf3386"

holdout_splits = {
    'm5_private': range(1942, 1942 + 28),  # private LB
}


class PrepareM5Data(CustomData):
    """Prepare data for the m5 Kaggle Time-Series Forecast competition"""

    @staticmethod
    def create_data(X: dt.Frame = None):
        file = download(url=path_to_zip, dest_path=tmp_dir)
        with ZipFile(file, 'r') as zip_ref:
            zip_ref.extractall(tmp_dir)

        num_id_cols = 6
        main_data = dt.fread(os.path.join(tmp_dir, "sales_train_evaluation.csv"))
def create_data(X: dt.Frame = None) -> Union[str, List[str],
                                             dt.Frame, List[dt.Frame],
                                             np.ndarray, List[np.ndarray],
                                             pd.DataFrame, List[pd.DataFrame]]:
    import os
    from h2oaicore.systemutils_more import download
    from h2oaicore.systemutils import config

    if kaggle_username == "XXX" or not kaggle_username:
        return []

    os.putenv("KAGGLE_USERNAME", kaggle_username)
    os.putenv("KAGGLE_KEY", kaggle_key)

    # directory to download the competition files into
    temp_path = os.path.join(user_dir(), config.contrib_relative_directory)
    os.makedirs(temp_path, exist_ok=True)
    sub_file_dir = os.path.join(temp_path, "kaggle_%s" % str(uuid.uuid4())[:4])

    cmd_train = f'kaggle competitions download ' \
                f'-c two-sigma-connect-rental-listing-inquiries ' \
                f'-f train.json.zip ' \
                f'-p {sub_file_dir} -q'
    cmd_test = f'kaggle competitions download ' \
               f'-c two-sigma-connect-rental-listing-inquiries ' \
               f'-f test.json.zip ' \
               f'-p {sub_file_dir} -q'

    timeout = 120
    try:
        subprocess.check_output(cmd_train.split(), timeout=timeout).decode("utf-8")
    except TimeoutError:
        raise TimeoutError("Took longer than %s seconds, increase timeout" % timeout)
    try:
        subprocess.check_output(cmd_test.split(), timeout=timeout).decode("utf-8")
    except TimeoutError:
        raise TimeoutError("Took longer than %s seconds, increase timeout" % timeout)

    train = pd.read_json(os.path.join(sub_file_dir, 'train.json.zip'))
    test = pd.read_json(os.path.join(sub_file_dir, 'test.json.zip'))

    for df in [train, test]:
        df['str_features'] = df['features'].apply(lambda x: ' . '.join(x))
        df['nb_features'] = df['features'].apply(len)
        df['nb_photos'] = df['photos'].apply(len)
        df['cat_address'] = df['street_address'] + ' ' + df['display_address']

    features = ['bathrooms', 'bedrooms', 'building_id', 'created',
                'description', 'display_address', 'latitude',
                'listing_id', 'longitude', 'manager_id', 'price',
                'street_address', 'str_features', 'nb_features',
                'nb_photos', 'cat_address']

    return {'two_sigma_train': dt.Frame(train[features + ['interest_level']]),
            'two_sigma_test': dt.Frame(test[features])}
def __init__(self, **kwargs):
    super().__init__(**kwargs)
    self.do_stemming = True  # turn off as needed
    self.do_lemmatization = True  # turn off as needed
    self.remove_stopwords = True  # turn off as needed

    import nltk
    nltk_data_path = os.path.join(user_dir(), config.contrib_env_relative_directory, "nltk_data")
    nltk_temp_path = os.path.join(user_dir(), "nltk_data")
    nltk.data.path.append(nltk_data_path)
    os.makedirs(nltk_data_path, exist_ok=True)
    nltk_download_lock_file = os.path.join(nltk_data_path, "nltk.lock")
    with filelock.FileLock(nltk_download_lock_file):
        nltk.download('stopwords', download_dir=nltk_data_path)
        nltk.download('punkt', download_dir=nltk_data_path)
        nltk.download('averaged_perceptron_tagger', download_dir=nltk_data_path)
        nltk.download('maxent_treebank_pos_tagger', download_dir=nltk_data_path)
        nltk.download('wordnet', download_dir=nltk_data_path)
        nltk.download('sonoritysequencing', download_dir=nltk_data_path)

    # download resources for stemming if needed
    if self.do_stemming:
        try:
            self.stemmer = nltk.stem.porter.PorterStemmer()
            self.stemmer.stem("test")
        except LookupError:
            os.makedirs(nltk_data_path, exist_ok=True)
            os.makedirs(nltk_temp_path, exist_ok=True)
            tokenizer_path = os.path.join(nltk_data_path, "tokenizers")
            os.makedirs(tokenizer_path, exist_ok=True)
            file1 = download("https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/tokenizers/punkt.zip",
                             dest_path=nltk_temp_path)
            self.unzip_file(file1, tokenizer_path)
            self.atomic_copy(file1, tokenizer_path)
            self.stemmer = nltk.stem.porter.PorterStemmer()
            self.stemmer.stem("test")

    # download resources for lemmatization if needed
    if self.do_lemmatization:
        try:
            from nltk.corpus import wordnet
            self.lemmatizer = nltk.stem.WordNetLemmatizer()
            self.pos_tagger = nltk.pos_tag
            self.lemmatizer.lemmatize("test", wordnet.NOUN)
            self.pos_tagger("test")
        except LookupError:
            os.makedirs(nltk_data_path, exist_ok=True)
            os.makedirs(nltk_temp_path, exist_ok=True)
            tagger_path = os.path.join(nltk_data_path, "taggers")
            corpora_path = os.path.join(nltk_data_path, "corpora")
            os.makedirs(tagger_path, exist_ok=True)
            os.makedirs(corpora_path, exist_ok=True)
            file1 = download("https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/taggers/averaged_perceptron_tagger.zip",
                             dest_path=nltk_temp_path)
            file2 = download("https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/taggers/maxent_treebank_pos_tagger.zip",
                             dest_path=nltk_temp_path)
            file3 = download("https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/wordnet.zip",
                             dest_path=nltk_temp_path)
            self.unzip_file(file1, tagger_path)
            self.unzip_file(file2, tagger_path)
            self.unzip_file(file3, corpora_path)
            self.atomic_copy(file1, tagger_path)
            self.atomic_copy(file2, tagger_path)
            self.atomic_copy(file3, corpora_path)
            from nltk.corpus import wordnet
            self.lemmatizer = nltk.stem.WordNetLemmatizer()
            self.pos_tagger = nltk.pos_tag
            self.lemmatizer.lemmatize("test", wordnet.NOUN)
            self.pos_tagger("test")

        self.wordnet_map = {"N": wordnet.NOUN,
                            "V": wordnet.VERB,
                            "J": wordnet.ADJ,
                            "R": wordnet.ADV,
                            "O": wordnet.NOUN}

    # download resources for stopwords if needed
    if self.remove_stopwords:
        try:
            self.stopwords = set(nltk.corpus.stopwords.words('english'))
        except LookupError:
            os.makedirs(nltk_data_path, exist_ok=True)
            os.makedirs(nltk_temp_path, exist_ok=True)
            corpora_path = os.path.join(nltk_data_path, "corpora")
            os.makedirs(corpora_path, exist_ok=True)
            file1 = download("https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/stopwords.zip",
                             dest_path=nltk_temp_path)
            self.unzip_file(file1, corpora_path)
            self.atomic_copy(file1, corpora_path)
            self.stopwords = set(nltk.corpus.stopwords.words('english'))
def _setup_recipe():
    # for DAI 1.7.0 one is required to run this function manually
    # in DAI >=1.7.1, this function will be run by DAI itself
    import os
    from h2oaicore.systemutils_more import extract, download
    from h2oaicore.systemutils import config, remove
    from h2oaicore.systemutils import user_dir
    import shutil
    from h2oaicore.systemutils_more import arch_type  # don't remove this import, setup_recipe parsed-out separately

    return True  # WIP: Disable daal for now in general, just leave recipe floating there for migration purposes

    if arch_type == "ppc64le":
        if config.hard_asserts:
            # in CI testing just ignore
            return True
        else:
            # for user use, raise
            raise RuntimeError("Cannot use daal on PPC")

    daal_is_installed_path = os.path.join(user_dir(), config.contrib_env_relative_directory, "daal")
    daal_is_installed_file = os.path.join(daal_is_installed_path, "daal_is_installed")
    if not os.path.isfile(daal_is_installed_file):
        daal_temp_path = os.path.join(user_dir(), config.contrib_relative_directory, "daal")
        os.makedirs(daal_temp_path, exist_ok=True)
        prefix = "https://anaconda.org/intel"
        try:
            file1 = download("%s/daal4py/2021.2.0/download/linux-64/daal4py-2021.2.0-py38_intel_358.tar.bz2" % prefix,
                             dest_path=daal_temp_path)
            file2 = download("%s/impi_rt/2021.2.0/download/linux-64/impi_rt-2021.2.0-intel_215.tar.bz2" % prefix,
                             dest_path=daal_temp_path)
            file3 = download("%s/daal/2021.2.0/download/linux-64/daal-2021.2.0-intel_358.tar.bz2" % prefix,
                             dest_path=daal_temp_path)
            file4 = download("https://github.com/intel/daal/releases/download/2019_u4/l_daal_oss_p_2019.4.007.tgz",
                             dest_path=daal_temp_path)
        except:
            file1 = download("https://0xdata-public.s3.amazonaws.com/daal4py-2019.4-py36h7b7c402_6.tar.bz2",
                             dest_path=daal_temp_path)
            file2 = download("https://0xdata-public.s3.amazonaws.com/impi_rt-2019.4-intel_243.tar.bz2",
                             dest_path=daal_temp_path)
            file3 = download("https://0xdata-public.s3.amazonaws.com/daal-2019.4-intel_243.tar.bz2",
                             dest_path=daal_temp_path)
            file4 = download("https://0xdata-public.s3.amazonaws.com/l_daal_oss_p_2019.4.007.tgz",
                             dest_path=daal_temp_path)

        temp_path = os.path.join(user_dir(), config.contrib_env_relative_directory, "info")
        os.makedirs(temp_path, exist_ok=True)
        python_site_packages_path = os.path.join(user_dir(), config.contrib_env_relative_directory)
        extract(file1, python_site_packages_path)
        python_site_packages_path2 = os.path.join(user_dir(), config.contrib_env_relative_directory)
        extract(file2, python_site_packages_path2)
        extract(file3, python_site_packages_path2)
        extract(file4, python_site_packages_path2, "gz")

        other_path = os.path.join(python_site_packages_path2, "lib/libfabric/")
        import glob
        for file in glob.glob(os.path.join(other_path, "*.so*")):
            new_file = os.path.join(python_site_packages_path2, "lib", os.path.basename(file))
            if not os.path.isfile(new_file):
                shutil.copy(file, new_file)

        other_path = os.path.join(python_site_packages_path2,
                                  "l_daal_oss_p_2019.4.007/daal_prebuild/linux/tbb/lib/intel64_lin/gcc4.4/")
        for file in glob.glob(os.path.join(other_path, "*.so*")):
            new_file = os.path.join(python_site_packages_path2, "lib", os.path.basename(file))
            if not os.path.isfile(new_file):
                shutil.copy(file, new_file)

        os.makedirs(daal_is_installed_path, exist_ok=True)
        with open(daal_is_installed_file, "wt") as f:
            f.write("DONE")
        remove(file1)
        remove(file2)
        remove(file3)
        remove(file4)
    return True
def fit(self, X, y, sample_weight=None, eval_set=None, sample_weight_eval_set=None, **kwargs):
    # Get column names
    orig_cols = list(X.names)

    from h2oaicore.tensorflow_dynamic import got_cpu_tf, got_gpu_tf
    import tensorflow as tf
    import shap
    import scipy
    import pandas as pd

    self.setup_keras_session()

    import h2oaicore.keras as keras
    import matplotlib.pyplot as plt

    if not hasattr(self, 'save_model_path'):
        model_id = str(uuid.uuid4())[:8]
        self.save_model_path = os.path.join(user_dir(), "custom_xnn_model.hdf5")

    np.random.seed(self.random_state)
    my_init = keras.initializers.RandomUniform(seed=self.random_state)

    # Get the logger if it exists
    logger = None
    if self.context and self.context.experiment_id:
        logger = make_experiment_logger(experiment_id=self.context.experiment_id,
                                        tmp_dir=self.context.tmp_dir,
                                        experiment_tmp_dir=self.context.experiment_tmp_dir)

    # Set up temp folder
    tmp_folder = self._create_tmp_folder(logger)

    # define base model
    def xnn_initialize(features, ridge_functions=3, arch=[20, 12], learning_rate=0.01,
                       bg_samples=100, beta1=0.9, beta2=0.999, dec=0.0, ams=True,
                       bseed=None, is_categorical=False):
        #
        # Prepare model architecture
        #
        # Input to the network, our observation containing all the features
        input = keras.layers.Input(shape=(features,), name='main_input')

        # Record current column names
        loggerinfo(logger, "XNN LOG")
        loggerdata(logger, "Feature list:")
        loggerdata(logger, str(orig_cols))

        # Input to ridge function number i is the dot product of our original input vector times coefficients
        ridge_input = keras.layers.Dense(ridge_functions, name="projection_layer", activation='linear')(input)

        ridge_networks = []
        # Each subnetwork uses only 1 neuron from the projection layer as input so we need to split it
        ridge_inputs = SplitLayer(ridge_functions)(ridge_input)
        for i, ridge_input in enumerate(ridge_inputs):
            # Generate subnetwork i
            mlp = _mlp(ridge_input, i, arch)
            ridge_networks.append(mlp)

        added = keras.layers.Concatenate(name='concatenate_1')(ridge_networks)

        # Add the correct output layer for the problem
        if is_categorical:
            out = keras.layers.Dense(1, activation='sigmoid', input_shape=(ridge_functions,),
                                     name='main_output')(added)
        else:
            out = keras.layers.Dense(1, activation='linear', input_shape=(ridge_functions,),
                                     name='main_output')(added)

        model = keras.models.Model(inputs=input, outputs=out)
        optimizer = keras.optimizers.Adam(lr=learning_rate, beta_1=beta1, beta_2=beta2, decay=dec, amsgrad=ams)

        # Use the correct loss for the problem
        if is_categorical:
            model.compile(loss={'main_output': 'binary_crossentropy'}, optimizer=optimizer)
        else:
            model.compile(loss={'main_output': 'mean_squared_error'}, optimizer=optimizer)

        return model

    def _mlp(input, idx, arch=[20, 12], activation='relu'):
        # Set up a subnetwork

        # Hidden layers
        mlp = keras.layers.Dense(arch[0], activation=activation,
                                 name='mlp_{}_dense_0'.format(idx),
                                 kernel_initializer=my_init)(input)
        for i, layer in enumerate(arch[1:]):
            mlp = keras.layers.Dense(layer, activation=activation,
                                     name='mlp_{}_dense_{}'.format(idx, i + 1),
                                     kernel_initializer=my_init)(mlp)

        # Output of the MLP
        mlp = keras.layers.Dense(1,
                                 activation='linear',
                                 name='mlp_{}_dense_last'.format(idx),
                                 kernel_regularizer=keras.regularizers.l1(1e-3),
                                 kernel_initializer=my_init)(mlp)
        return mlp

    def get_shap(X, model):
        # Calculate the Shap values
        np.random.seed(24)
        bg_samples = min(X.shape[0], 1000)
        if isinstance(X, pd.DataFrame):
            background = X.iloc[np.random.choice(X.shape[0], bg_samples, replace=False)]
        else:
            background = X[np.random.choice(X.shape[0], bg_samples, replace=False)]

        # Explain predictions of the model on the subset
        explainer = shap.DeepExplainer(model, background)
        shap_values = explainer.shap_values(X)

        # Return the mean absolute value of each shap value for each dataset
        xnn_shap = np.abs(shap_values[0]).mean(axis=0)

        return xnn_shap

    # Initialize the xnn's
    features = X.shape[1]
    orig_cols = list(X.names)

    if self.num_classes >= 2:
        lb = LabelEncoder()
        lb.fit(self.labels)
        y = lb.transform(y)
        self.is_cat = True
        xnn1 = xnn_initialize(features=features, ridge_functions=features, arch=self.params["arch"],
                              learning_rate=self.params["lr"], beta1=self.params["beta_1"],
                              beta2=self.params["beta_1"], dec=self.params["decay"],
                              ams=self.params["amsgrad"], is_categorical=self.is_cat)
        xnn = xnn_initialize(features=features, ridge_functions=features, arch=self.params["arch"],
                             learning_rate=self.params["lr"], beta1=self.params["beta_1"],
                             beta2=self.params["beta_1"], dec=self.params["decay"],
                             ams=self.params["amsgrad"], is_categorical=self.is_cat)
    else:
        self.is_cat = False
        xnn1 = xnn_initialize(features=features, ridge_functions=features, arch=self.params["arch"],
                              learning_rate=self.params["lr"], beta1=self.params["beta_1"],
                              beta2=self.params["beta_1"], dec=self.params["decay"],
                              ams=self.params["amsgrad"], is_categorical=self.is_cat)
        xnn = xnn_initialize(features=features, ridge_functions=features, arch=self.params["arch"],
                             learning_rate=self.params["lr"], beta1=self.params["beta_1"],
                             beta2=self.params["beta_1"], dec=self.params["decay"],
                             ams=self.params["amsgrad"], is_categorical=self.is_cat)

    # Replace missing values with a value smaller than all observed values
    self.min = dict()
    for col in X.names:
        XX = X[:, col]
        self.min[col] = XX.min1()
        if self.min[col] is None or np.isnan(self.min[col]):
            self.min[col] = -1e10
        else:
            self.min[col] -= 1
        XX.replace(None, self.min[col])
        X[:, col] = XX
        assert X[dt.isna(dt.f[col]), col].nrows == 0
    X = X.to_numpy()

    inputs = {'main_input': X}
    validation_set = 0
    verbose = 0

    # Train the neural network once with early stopping and a validation set
    history = keras.callbacks.History()
    es = keras.callbacks.EarlyStopping(monitor='val_loss', mode='min')

    history = xnn1.fit(inputs, y, epochs=self.params["n_estimators"], batch_size=self.params["batch_size"],
                       validation_split=0.3, verbose=verbose, callbacks=[history, es])

    # Train again on the full data
    number_of_epochs_it_ran = len(history.history['loss'])
    xnn.fit(inputs, y, epochs=number_of_epochs_it_ran, batch_size=self.params["batch_size"],
            validation_split=0.0, verbose=verbose)

    # Get the mean absolute Shapley values
    importances = np.array(get_shap(X, xnn))

    int_output = {}
    int_weights = {}
    int_bias = {}
    int_input = {}

    original_activations = {}

    x_labels = list(map(lambda x: 'x' + str(x), range(features)))

    intermediate_output = []

    # Record and plot the projection weights
    weight_list = []
    for layer in xnn.layers:
        layer_name = layer.get_config()['name']
        if layer_name != "main_input":
            print(layer_name)
            weights = layer.get_weights()

            # Record the biases
            try:
                bias = layer.get_weights()[1]
                int_bias[layer_name] = bias
            except:
                print("No Bias")

            # Record outputs for the test set
            intermediate_layer_model = keras.models.Model(inputs=xnn.input,
                                                          outputs=xnn.get_layer(layer_name).output)

            # Record the outputs from the training set
            if self.is_cat and (layer_name == 'main_output'):
                original_activations[layer_name] = scipy.special.logit(intermediate_layer_model.predict(X))
                original_activations[layer_name + "_p"] = intermediate_layer_model.predict(X)
            else:
                original_activations[layer_name] = intermediate_layer_model.predict(X)

            # Record other weights, inputs, and outputs
            int_weights[layer_name] = weights
            int_input[layer_name] = layer.input
            int_output[layer_name] = layer.output

        # Plot the projection layers
        if "projection_layer" in layer.get_config()['name']:
            # print(layer.get_config()['name'])

            # Record the weights for each projection layer
            weights = [np.transpose(layer.get_weights()[0])]

            weight_list2 = []
            for i, weight in enumerate(weights[0]):
                weight_list.append(weight)
                weight_list2.append(list(np.reshape(weight, (1, features))[0]))

                # Plot weights
                plt.bar(orig_cols, abs(np.reshape(weight, (1, features))[0]), 1, color="blue")
                plt.ylabel("Coefficient value")
                plt.title("Projection Layer Weights {}".format(i), fontdict={'fontsize': 10})
                plt.xticks(rotation=90)
                plt.show()
                plt.savefig(os.path.join(tmp_folder, 'projection_layer_' + str(i) + '.png'),
                            bbox_inches="tight")
                plt.clf()

        if "main_output" in layer.get_config()['name']:
            weights_main = layer.get_weights()
            print(weights_main)

    pd.DataFrame(weight_list2).to_csv(os.path.join(tmp_folder, "projection_data.csv"), index=False)

    intermediate_output = []

    for feature_num in range(features):
        intermediate_layer_model = keras.models.Model(inputs=xnn.input,
                                                      outputs=xnn.get_layer('mlp_' + str(feature_num) + '_dense_last').output)
        intermediate_output.append(intermediate_layer_model.predict(X))

    # Record and plot the ridge functions
    ridge_x = []
    ridge_y = []
    for weight_number in range(len(weight_list)):
        ridge_x.append(list(sum(X[:, ii] * weight_list[weight_number][ii] for ii in range(features))))
        ridge_y.append(list(intermediate_output[weight_number]))

        plt.plot(sum(X[:, ii] * weight_list[weight_number][ii] for ii in range(features)),
                 intermediate_output[weight_number], 'o')
        plt.xlabel("Input")
        plt.ylabel("Subnetwork " + str(weight_number))
        plt.title("Ridge Function {}".format(i), fontdict={'fontsize': 10})
        plt.show()
        plt.savefig(os.path.join(tmp_folder, 'ridge_' + str(weight_number) + '.png'))
        plt.clf()

    # Output the ridge function importance
    weights2 = np.array([item[0] for item in list(weights)[0]])

    output_activations = np.abs(np.array([item * weights2 for item in
                                          list(original_activations["concatenate_1"])])).mean(axis=0)
    loggerinfo(logger, str(output_activations))
    pd.DataFrame(output_activations).to_csv(os.path.join(tmp_folder, "ridge_weights.csv"), index=False)

    plt.bar(x_labels, output_activations, 1, color="blue")
    plt.xlabel("Ridge function number")
    plt.ylabel("Feature importance")
    plt.title("Ridge function importance", fontdict={'fontsize': 10})
    plt.show()
    plt.savefig(os.path.join(tmp_folder, 'Ridge_function_importance.png'))

    pd.DataFrame(ridge_y).applymap(lambda x: x[0]).to_csv(os.path.join(tmp_folder, "ridge_y.csv"), index=False)
    pd.DataFrame(ridge_x).to_csv(os.path.join(tmp_folder, "ridge_x.csv"), index=False)
    pd.DataFrame(orig_cols).to_csv(os.path.join(tmp_folder, "input_columns.csv"), index=False)

    self.set_model_properties(model=xnn,
                              features=orig_cols,
                              importances=importances.tolist(),
                              iterations=self.params['n_estimators'])