Example #1
    def _create_tmp_folder(self, logger):
        # Create a temp folder to store xnn files
        # Set the default value without context available (required to pass acceptance test)
        tmp_folder = os.path.join(user_dir(),
                                  "%s_xnn_model_folder" % uuid.uuid4())
        # Make a real tmp folder when experiment is available
        if self.context and self.context.experiment_id:
            tmp_folder = os.path.join(self.context.experiment_tmp_dir,
                                      "%s_xnn_model_folder" % uuid.uuid4())

        # Now let's try to create that folder
        try:
            os.mkdir(tmp_folder)
        except PermissionError:
            # This should not occur, so log a warning
            loggerwarning(logger, "XNN was denied temp folder creation rights")
            tmp_folder = os.path.join(user_dir(),
                                      "%s_xnn_model_folder" % uuid.uuid4())
            os.mkdir(tmp_folder)
        except FileExistsError:
            # We should never be here since temp dir name is expected to be unique
            loggerwarning(logger, "XNN temp folder already exists")
            tmp_folder = os.path.join(self.context.experiment_tmp_dir,
                                      "%s_xnn_model_folder" % uuid.uuid4())
            os.mkdir(tmp_folder)
        except:
            # Revert to temporary file path
            tmp_folder = os.path.join(user_dir(),
                                      "%s_xnn_model_folder" % uuid.uuid4())
            os.mkdir(tmp_folder)

        loggerinfo(logger, "XNN temp folder {}".format(tmp_folder))
        return tmp_folder
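Below is a minimal standalone sketch of the same "create a unique folder, fall back on failure" pattern using only the standard library; tempfile.gettempdir() stands in for user_dir(), and create_tmp_folder is an illustrative name, not part of the recipe API.

import os
import tempfile
import uuid


def create_tmp_folder(base_dir=None):
    # Prefer the caller-supplied base directory, else use the system temp dir
    base_dir = base_dir or tempfile.gettempdir()
    tmp_folder = os.path.join(base_dir, "%s_xnn_model_folder" % uuid.uuid4())
    try:
        os.mkdir(tmp_folder)
    except (PermissionError, FileExistsError):
        # Retry once with a fresh unique name under the system temp dir
        tmp_folder = os.path.join(tempfile.gettempdir(),
                                  "%s_xnn_model_folder" % uuid.uuid4())
        os.mkdir(tmp_folder)
    return tmp_folder


print(create_tmp_folder())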
Example #2
 def set_tagger(self):
     import nltk
     nltk_data_path = os.path.join(user_dir(),
                                   config.contrib_env_relative_directory,
                                   "nltk_data")
     nltk_temp_path = os.path.join(user_dir(), "nltk_data")
     nltk.data.path.append(nltk_data_path)
     nltk.download('averaged_perceptron_tagger',
                   download_dir=nltk_data_path)
     try:
         self.pos_tagger = nltk.pos_tag
         self.pos_tagger("test")
     except LookupError:
         os.makedirs(nltk_data_path, exist_ok=True)
         os.makedirs(nltk_temp_path, exist_ok=True)
         tagger_path = os.path.join(nltk_data_path, "taggers")
         os.makedirs(tagger_path, exist_ok=True)
         file1 = download(
             "https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/taggers/averaged_perceptron_tagger.zip",
             dest_path=nltk_temp_path)
         file2 = download(
             "https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/taggers/maxent_treebank_pos_tagger.zip",
             dest_path=nltk_temp_path)
         self.unzip_file(file1, tagger_path)
         self.unzip_file(file2, tagger_path)
         self.atomic_copy(file1, tagger_path)
         self.atomic_copy(file2, tagger_path)
         self.pos_tagger = nltk.pos_tag
         self.pos_tagger("test")
 def preprocess_image(self, source_img_path, check_only=False):
     try:
         final_img_path = os.path.join(user_dir(), self.uuid,
                                       os.path.basename(source_img_path))
     except:  # we are sometimes getting np.float32, why?
         return None
     delete = False
     if not os.path.exists(final_img_path):
         if not os.path.exists(source_img_path):
             try:
                 self.download(source_img_path, final_img_path)
             except requests.RequestException as e:
                 # print_debug("Error: %s for source_img_path: %s" % (str(e), str(source_img_path)))
                 return None
             delete = False  # True to avoid re-download or a race condition between multiple procs
         else:
             final_img_path = source_img_path
     if not check_only:
         import h2oaicore.keras as keras
         importlib.reload(keras)
         img = keras.preprocessing.image.load_img(final_img_path,
                                                  target_size=(224, 224))
         if delete:
             remove(final_img_path)
         x = keras.preprocessing.image.img_to_array(img)
         x = np.expand_dims(x, axis=0)
         x = keras.applications.resnet50.preprocess_input(x)
         return x
     else:
         return True
Example #4
    def fit(self,
            X,
            y,
            sample_weight=None,
            eval_set=None,
            sample_weight_eval_set=None,
            **kwargs):
        lb = LabelEncoder()
        lb.fit(self.labels)
        y = lb.transform(y)
        orig_cols = list(X.names)
        XX = X.to_pandas()
        params = {
            'train_dir': user_dir(),
            'allow_writing_files': False,
            'thread_count': 10,
            # 'loss_function': 'Logloss'
        }
        from catboost import CatBoostClassifier
        model = CatBoostClassifier(**params)
        model.fit(XX,
                  y=y,
                  sample_weight=sample_weight,
                  verbose=False,
                  cat_features=list(X[:, [str, int]].names)
                  )  # Amazon specific, also no early stopping

        # must always set best_iterations
        self.set_model_properties(model=model,
                                  features=orig_cols,
                                  importances=model.feature_importances_,
                                  iterations=0)
    def predict(self, X, **kwargs):
        model, _, _, _ = self.get_model_properties()
        X = dt.Frame(X)
        X = self.inf_impute(X)
        h2o.init(port=config.h2o_recipes_port, log_dir=self.my_log_dir)
        model_path = os.path.join(user_dir(), self.id)
        model_file = os.path.join(model_path,
                                  "h2o_model." + str(uuid.uuid4()) + ".bin")
        os.makedirs(model_path, exist_ok=True)
        with open(model_file, "wb") as f:
            f.write(model)
        model = h2o.load_model(os.path.abspath(model_file))
        test_frame = h2o.H2OFrame(X.to_pandas(), column_types=self.col_types)
        preds_frame = None

        try:
            if kwargs.get("pred_contribs"):
                return model.predict_contributions(test_frame).as_data_frame(
                    header=False).values
            preds_frame = model.predict(test_frame)
            preds = preds_frame.as_data_frame(header=False)
            if self.num_classes == 1:
                return preds.values.ravel()
            elif self.num_classes == 2:
                return preds.iloc[:, -1].values.ravel()
            else:
                return preds.iloc[:, 1:].values
        finally:
            # h2o.remove(self.id) # Cannot remove id, do multiple predictions on same model
            h2o.remove(test_frame)
            remove(model_file)
            if preds_frame is not None:
                h2o.remove(preds_frame)
    def create_data(
        X: dt.Frame = None
    ) -> Union[str, List[str], dt.Frame, List[dt.Frame], np.ndarray,
               List[np.ndarray], pd.DataFrame, List[pd.DataFrame]]:
        import os
        from h2oaicore.systemutils_more import download
        from h2oaicore.systemutils import config
        import bz2

        def extract_bz2(file, output_file):
            zipfile = bz2.BZ2File(file)
            data = zipfile.read()
            open(output_file, 'wb').write(data)

        temp_path = os.path.join(user_dir(), config.contrib_relative_directory,
                                 "airlines")
        os.makedirs(temp_path, exist_ok=True)

        link = AirlinesData.base_url + "1990.csv.bz2"
        file = download(link, dest_path=temp_path)
        output_file1 = file.replace(".bz2", "")
        print("%s %s" % (file, output_file1))
        extract_bz2(file, output_file1)

        link = AirlinesData.base_url + "1991.csv.bz2"
        file = download(link, dest_path=temp_path)
        output_file2 = file.replace(".bz2", "")
        print("%s %s" % (file, output_file2))
        extract_bz2(file, output_file2)

        return [output_file1, output_file2]
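The extract_bz2 helper above reads the whole archive into memory before writing it out. A hedged alternative (standard library only, same call signature) streams the decompressed bytes in chunks instead:

import bz2
import shutil


def extract_bz2_streaming(file, output_file, chunk_size=1 << 20):
    # Decompress in chunks so large archives do not have to fit in memory
    with bz2.open(file, "rb") as src, open(output_file, "wb") as dst:
        shutil.copyfileobj(src, dst, chunk_size)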
    def predict(self, X, **kwargs):
        model, _, _, _ = self.get_model_properties()
        X = dt.Frame(X)
        h2o.init(port=config.h2o_recipes_port, log_dir=self.my_log_dir)
        model_path = os.path.join(user_dir(), self.id)
        with open(model_path, "wb") as f:
            f.write(model)
        model = h2o.load_model(os.path.abspath(model_path))
        remove(model_path)
        test_frame = h2o.H2OFrame(X.to_pandas(), column_types=self.col_types)
        preds_frame = None

        try:
            if kwargs.get("pred_contribs"):
                return model.predict_contributions(test_frame).as_data_frame(
                    header=False).values
            preds_frame = model.predict(test_frame)
            preds = preds_frame.as_data_frame(header=False)
            if self.num_classes == 1:
                return preds.values.ravel()
            elif self.num_classes == 2:
                return preds.iloc[:, -1].values.ravel()
            else:
                return preds.iloc[:, 1:].values
        finally:
            h2o.remove(self.id)
            h2o.remove(test_frame)
            if preds_frame is not None:
                h2o.remove(preds_frame)
Example #8
    def create_data(X: dt.Frame = None) -> Union[
        str, List[str],
        dt.Frame, List[dt.Frame],
        np.ndarray, List[np.ndarray],
        pd.DataFrame, List[pd.DataFrame],
        Dict[str, str],  # {data set names : paths}
        Dict[str, dt.Frame],  # {data set names : dt frames}
        Dict[str, np.ndarray],  # {data set names : np arrays}
        Dict[str, pd.DataFrame],  # {data set names : pd frames}
    ]:
        # Download files
        # Location in DAI file system where we will save the data set
        temp_path = os.path.join(user_dir(), config.contrib_relative_directory)
        os.makedirs(temp_path, exist_ok=True)

        # URLs of the desired data; these come from the public IMDb datasets
        link_basics = "https://datasets.imdbws.com/title.basics.tsv.gz"
        link_ratings = "https://datasets.imdbws.com/title.ratings.tsv.gz"
        link_episodes = "https://datasets.imdbws.com/title.episode.tsv.gz"

        # Download the files
        file_basics = download(link_basics, dest_path=temp_path)
        file_ratings = download(link_ratings, dest_path=temp_path)
        file_episodes = download(link_episodes, dest_path=temp_path)

        # read the downloaded IMDb files with datatable
        basics = dt.fread(file_basics, fill=True)
        ratings = dt.fread(file_ratings, fill=True)
        episodes = dt.fread(file_episodes, na_strings=['\\N'], fill=True)

        # remove files
        os.remove(file_basics)
        os.remove(file_ratings)
        os.remove(file_episodes)

        # Create Title with Ratings dataset
        # join titles with non-null ratings
        ratings = ratings[~dt.isna(dt.f.averageRating), :]
        ratings.key = "tconst"
        basics_ratings = basics[:, :, dt.join(ratings)]

        # Create Episodes dataset
        episodes = episodes[~dt.isna(dt.f.seasonNumber) & ~dt.isna(dt.f.episodeNumber), :]
        episode_ratings = episodes[:, :, dt.join(ratings)]
        episode_ratings.names = {'tconst': 'episodeTconst', 'parentTconst': 'tconst', 'averageRating': 'episodeAverageRating', 'numVotes': 'episodeNumVotes'}
        basics_ratings.key = 'tconst'
        title_episode_ratings = episode_ratings[:, :, dt.join(basics_ratings)]

        # enumerate series episodes from 1 to N
        title_episode_ratings = title_episode_ratings[:, :, dt.sort(dt.f.tconst, dt.f.seasonNumber, dt.f.episodeNumber)]
        result = title_episode_ratings[:, dt.count(), dt.by(dt.f.tconst)][:, 'count'].to_list()
        from itertools import chain
        cumcount = chain.from_iterable([i + 1 for i in range(n)] for n in result[0])
        title_episode_ratings['episodeSequence'] = dt.Frame(tuple(cumcount))

        # return datasets
        return {f"imdb_title_ratings": basics_ratings,
                f"imdb_episode_ratings": title_episode_ratings}
 def __init__(self, **kwargs):
     super().__init__(**kwargs)
     self.id = None
     self.target = "__target__"
     self.weight = "__weight__"
     self.col_types = None
     self.my_log_dir = os.path.abspath(os.path.join(user_dir(),
                                                    config.contrib_relative_directory, "h2o_log"))
     if not os.path.isdir(self.my_log_dir):
         os.makedirs(self.my_log_dir, exist_ok=True)
    def create_data(X: dt.Frame = None) -> Union[str, List[str],
                                                 dt.Frame, List[dt.Frame],
                                                 np.ndarray, List[np.ndarray],
                                                 pd.DataFrame, List[pd.DataFrame]]:
        import os
        from h2oaicore.systemutils_more import download
        from h2oaicore.systemutils import config

        temp_path = os.path.join(user_dir(), config.contrib_relative_directory, "testdata_%s" % str(uuid.uuid4()))
        os.makedirs(temp_path, exist_ok=True)

        link = TestData.url
        file = download(link, dest_path=temp_path)

        return file
Example #11
 def __init__(self, batch_size=32, **kwargs):
     super().__init__(**kwargs)
     self.batch_size = batch_size
     self.model_name = "resnet_keras.h5p"
     self.uuid = "%s-img-data-" % self.__class__.__name__ + self.model_name  # + str(uuid.uuid4())[:6] # no, keeps changing and re-loadeing every init
     self.uuid_tmp = str(uuid.uuid4())[:6]
     self.col_name = self.input_feature_names[0]
     self.model_path = os.path.join(user_dir(), self.uuid + ".model")
     self.model_tmp_path = self.model_path + "_" + self.uuid_tmp + ".tmp"
     if not os.path.exists(self.model_path):
         self.download(
             url="http://s3.amazonaws.com/artifacts.h2o.ai/releases/ai/h2o/recipes/transformers/img/%s" % self.model_name,
             dest=self.model_path)
     with open(self.model_path, 'rb') as f:
         self.model_bytes = f.read()
Example #12
    def transform(self, X: dt.Frame):
        h2o.init(port=config.h2o_recipes_port)
        model_path = os.path.join(user_dir(), self.id)
        with open(model_path, "wb") as f:
            f.write(self.raw_model_bytes)
        model = h2o.load_model(os.path.abspath(model_path))
        remove(model_path)
        frame = h2o.H2OFrame(X.to_pandas())
        anomaly_frame = None

        try:
            anomaly_frame = model.anomaly(frame)
            anomaly_frame_df = anomaly_frame.as_data_frame(header=False)
            return anomaly_frame_df
        finally:
            h2o.remove(self.id)
            if anomaly_frame is not None:
                h2o.remove(anomaly_frame)
Example #13
    def create_data(X: dt.Frame = None) -> Union[str, List[str],
                                                 dt.Frame, List[dt.Frame],
                                                 np.ndarray, List[np.ndarray],
                                                 pd.DataFrame, List[pd.DataFrame]]:
        import os
        from h2oaicore.systemutils_more import download
        from h2oaicore.systemutils import config

        temp_path = os.path.join(user_dir(), config.contrib_relative_directory, "testdata_%s" % str(uuid.uuid4()))
        os.makedirs(temp_path, exist_ok=True)

        link = "http://data.un.org/_Docs/SYB/CSV/SYB63_226_202009_Net%20Disbursements%20from%20Official%20ODA%20to%20Recipients.csv"
        output_file1 = download(link, dest_path=temp_path)

        link = "http://data.un.org/_Docs/SYB/CSV/SYB63_223_202009_Net%20Disbursements%20from%20Official%20ODA%20from%20Donors.csv"
        output_file2 = download(link, dest_path=temp_path)

        return [output_file1, output_file2]
Example #14
    def create_data(
        X: dt.Frame = None
    ) -> Union[str, List[str], dt.Frame, List[dt.Frame], np.ndarray,
               List[np.ndarray], pd.DataFrame, List[pd.DataFrame]]:
        # exit gracefully if method is called as a data upload rather than data modify
        if X is None:
            return []
        import os
        from h2oaicore.systemutils import config

        # Change to pandas -> we can rewrite this as dt at a later date
        rain_raw = X.to_pandas()

        # Set index and pivot the data
        # Rows go from one row each month to one row each month & gauge
        rain_raw = rain_raw.set_index("date")
        rain_pivot = rain_raw.unstack().reset_index(name="rain_inches")
        rain_pivot.rename(columns={
            'level_0': 'rain_gauge',
            'date': 'end_of_month'
        },
                          inplace=True)

        # Format date appropriately
        rain_pivot['end_of_month'] = pd.to_datetime(rain_pivot['end_of_month'])

        # Split data into train and test by date
        # Train on 7 years of data, test on 1 year of data
        train_py = rain_pivot[(rain_pivot['end_of_month'] >= '2009-01-01')
                              & (rain_pivot['end_of_month'] <= '2016-01-01')]
        test_py = rain_pivot[rain_pivot['end_of_month'].dt.year == 2016]

        # Set up to save to disk
        temp_path = os.path.join(user_dir(), config.contrib_relative_directory)
        os.makedirs(temp_path, exist_ok=True)

        # Save files to disk
        file_train = os.path.join(temp_path, "seattle_rain_train.csv")
        train_py.to_csv(file_train)
        file_test = os.path.join(temp_path, "seattle_rain_test.csv")
        test_py.to_csv(file_test)

        return [file_train, file_test]
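The pivot step is easiest to see on a toy frame; here is a hedged miniature of the same unstack-and-rename reshaping (gauge names and values are made up, column names follow the example):

import pandas as pd

rain_raw = pd.DataFrame(
    {"gauge_a": [1.0, 2.0], "gauge_b": [0.5, 0.7]},
    index=pd.Index(["2015-01-31", "2015-02-28"], name="date"),
)
# one row per month & gauge, with the measurement in "rain_inches"
rain_pivot = rain_raw.unstack().reset_index(name="rain_inches")
rain_pivot.rename(columns={"level_0": "rain_gauge", "date": "end_of_month"},
                  inplace=True)
print(rain_pivot)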
    def create_data(
        X: dt.Frame = None
    ) -> Union[str, List[str], dt.Frame, List[dt.Frame], np.ndarray,
               List[np.ndarray], pd.DataFrame, List[pd.DataFrame]]:
        import os
        from h2oaicore.systemutils_more import download
        from h2oaicore.systemutils import config

        temp_path = os.path.join(user_dir(), config.contrib_relative_directory,
                                 "testdata_%s" % str(uuid.uuid4()))
        os.makedirs(temp_path, exist_ok=True)

        link = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
        output_file1 = download(link, dest_path=temp_path)

        link = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/bezdekIris.data"
        output_file2 = download(link, dest_path=temp_path)

        return [output_file1, output_file2]
Example #16
    def transform(self, X: dt.Frame):
        h2o.init(port=config.h2o_recipes_port)
        model_path = os.path.join(user_dir(), self.id)
        model_file = os.path.join(model_path,
                                  "h2o_model." + str(uuid.uuid4()) + ".bin")
        os.makedirs(model_path, exist_ok=True)
        with open(model_file, "wb") as f:
            f.write(self.raw_model_bytes)
        model = h2o.load_model(os.path.abspath(model_file))
        frame = h2o.H2OFrame(X.to_pandas())
        anomaly_frame = None

        try:
            anomaly_frame = model.anomaly(frame)
            anomaly_frame_df = anomaly_frame.as_data_frame(header=False)
            return anomaly_frame_df
        finally:
            remove(model_file)
            h2o.remove(self.id)
            if anomaly_frame is not None:
                h2o.remove(anomaly_frame)
Example #17
 def fit_transform(self, X: dt.Frame, y: np.array = None):
     h2o.init(port=config.h2o_recipes_port)
     model = H2OAutoEncoderEstimator(activation='tanh',
                                     epochs=1,
                                     hidden=[50, 50],
                                     reproducible=True,
                                     seed=1234)
     frame = h2o.H2OFrame(X.to_pandas())
     model_path = None
     try:
         model.train(x=list(range(X.ncols)), training_frame=frame)
         self.id = model.model_id
         model_path = os.path.join(user_dir(),
                                   "h2o_model." + str(uuid.uuid4()))
         model_path = h2o.save_model(model=model, path=model_path)
         with open(model_path, "rb") as f:
             self.raw_model_bytes = f.read()
         return model.anomaly(frame).as_data_frame(header=False)
     finally:
         if model_path is not None:
             remove(model_path)
         h2o.remove(model)
    def predict(self, X, **kwargs):
        model, _, _, _ = self.get_model_properties()
        X = dt.Frame(X)
        h2o.init(port=config.h2o_recipes_port, log_dir=self.my_log_dir)
        model_path = os.path.join(user_dir(), self.id)
        with open(model_path, "wb") as f:
            f.write(model)
        model = h2o.load_model(os.path.abspath(model_path))
        remove(model_path)
        test_frame = h2o.H2OFrame(X.to_pandas(), column_types=self.col_types)
        preds_frame = None

        try:
            preds_frame = model.predict(test_frame)
            preds = preds_frame.as_data_frame(header=False)

            return preds.values.ravel()

        finally:
            h2o.remove(self.id)
            h2o.remove(test_frame)
            if preds_frame is not None:
                h2o.remove(preds_frame)
Example #19
    def create_data(
        X: dt.Frame = None
    ) -> Union[str, List[str], dt.Frame, List[dt.Frame], np.ndarray,
               List[np.ndarray], pd.DataFrame, List[pd.DataFrame]]:
        import os
        from h2oaicore.systemutils_more import download
        from h2oaicore.systemutils import config

        # Location in DAI file system where we will save the data set
        temp_path = os.path.join(user_dir(), config.contrib_relative_directory)
        os.makedirs(temp_path, exist_ok=True)

        # URL of desired data, this comes from the City of Seattle
        link = "https://data.seattle.gov/resource/rdtp-hzy3.csv"

        # Download the file
        file = download(link, dest_path=temp_path)

        # Give the file a descriptive name for the UI
        output_file = file.replace("rdtp-hzy3", "seattle_monthly_rain_raw")
        os.rename(file, output_file)

        # Return the location on the DAI server for this data set
        return output_file
Example #20
    def create_data(
        X: dt.Frame = None
    ) -> Union[str, List[str], dt.Frame, List[dt.Frame], np.ndarray,
               List[np.ndarray], pd.DataFrame, List[pd.DataFrame]]:
        import os
        from h2oaicore.systemutils_more import download
        from h2oaicore.systemutils import config
        import bz2

        def extract_bz2(file, output_file):
            zipfile = bz2.BZ2File(file)
            data = zipfile.read()
            open(output_file, 'wb').write(data)

        temp_path = os.path.join(user_dir(), "recipe_tmp", "airlines")
        os.makedirs(temp_path, exist_ok=True)
        dt.options.nthreads = 8

        # specify which years are used for training and testing
        training = list(range(2005, 2008))
        testing = [2008]

        # download and unzip files
        files = []
        for f in ["%d.csv.bz2" % year for year in training + testing]:
            link = AirlinesData.base_url + "%s" % f
            file = download(link, dest_path=temp_path)
            output_file = file.replace(".bz2", "")
            if not os.path.exists(output_file):
                extract_bz2(file, output_file)
            files.append(output_file)

        # parse with datatable
        X = dt.rbind(*[dt.fread(x) for x in files])

        # add date
        date_col = 'Date'
        X[:, date_col] = dt.f['Year'] * 10000 + dt.f['Month'] * 100 + dt.f[
            'DayofMonth']
        cols_to_keep = ['Date']

        # add number of flights in/out for each airport per given interval
        timeslice_mins = 60
        for name, new_col, col, group in [
            ("out", "CRSDepTime_mod", "CRSDepTime", "Origin"),
            ("in", "CRSArrTime_mod", "CRSArrTime", "Dest")
        ]:
            X[:, new_col] = X[:, dt.f[col] // timeslice_mins]
            group_cols = [date_col, group, new_col]
            new_name = 'flights_%s_per_%d_min' % (name, timeslice_mins)
            flights = X[:, {new_name: dt.count()}, dt.by(*group_cols)]
            flights.key = group_cols
            cols_to_keep.append(new_name)
            X = X[:, :, dt.join(flights)]

        # select flights leaving from SFO only
        X = X[dt.f['Origin'] == 'SFO', :]

        # Fill NaNs in DepDelay column
        X[dt.isna(dt.f['DepDelay']), 'DepDelay'] = 0

        # create binary target column
        depdelay_threshold_mins = 15
        target = 'DepDelay%dm' % depdelay_threshold_mins
        X[:, target] = dt.f['DepDelay'] > depdelay_threshold_mins
        cols_to_keep.extend([
            target,
            'Year',
            'Month',
            'DayofMonth',
            'DayOfWeek',
            'CRSDepTime',
            'UniqueCarrier',
            'FlightNum',
            'TailNum',
            'CRSElapsedTime',
            'Origin',
            'Dest',
            'Distance',
            # Leaks for delay
            # 'DepTime',
            # 'ArrTime', #'CRSArrTime',
            # 'ActualElapsedTime',
            # 'AirTime', #'ArrDelay', #'DepDelay',
            # 'TaxiIn', #'TaxiOut', #'Cancelled', #'CancellationCode', #'Diverted', #'CarrierDelay',
            # #'WeatherDelay', #'NASDelay', #'SecurityDelay', #'LateAircraftDelay',
        ])
        X = X[:, cols_to_keep]

        # Join in some extra info
        join_files = [('UniqueCarrier', 'carriers.csv', 'Code'),
                      ('Origin', 'airports.csv', 'iata'),
                      ('Dest', 'airports.csv', 'iata'),
                      ('TailNum', 'plane-data.csv', 'tailnum')]

        for join_key, file, col in join_files:
            file = download(
                'https://0xdata-public.s3.amazonaws.com/data_recipes_data/%s' %
                file,
                dest_path=temp_path)
            X_join = dt.fread(file, fill=True)
            X_join.names = {col: join_key}
            X_join.names = [join_key] + [
                join_key + "_" + x for x in X_join.names if x != join_key
            ]
            X_join.key = join_key
            X = X[:, :, dt.join(X_join)]
            del X[:, join_key]

        split = True
        if not split:
            filename = os.path.join(
                temp_path, "flight_delays_data_recipe_%d-%d.csv" %
                (min(training), max(testing)))
            X.to_csv(filename)
            return filename
        else:
            # prepare splits (by year) and create binary .jay files for import into Driverless AI
            output_files = []
            for condition, name in [
                ((min(training) <= dt.f['Year']) &
                 (dt.f['Year'] <= max(training)), 'training'),
                ((min(testing) <= dt.f['Year']) &
                 (dt.f['Year'] <= max(testing)), 'test'),
            ]:
                X_split = X[condition, :]
                filename = os.path.join(
                    temp_path, "augmented_flights_%s-%d_%s.csv" %
                    (X_split[:, 'Year'].min1(), X_split[:,
                                                        'Year'].max1(), name))
                X_split.to_csv(filename)
                output_files.append(filename)
            return output_files
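The per-interval flight-count enrichment inside the loop above (group, count, set a key, join back) is a generic datatable pattern; a hedged miniature on a made-up frame:

import datatable as dt

toy = dt.Frame(Date=[20080101, 20080101, 20080101, 20080102],
               Origin=["SFO", "SFO", "OAK", "SFO"])
# count rows per (Date, Origin) group, then join the counts back via a key
flights = toy[:, {"flights_out_per_60_min": dt.count()}, dt.by("Date", "Origin")]
flights.key = ["Date", "Origin"]
toy = toy[:, :, dt.join(flights)]
print(toy)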
    def fit(self,
            X,
            y,
            sample_weight=None,
            eval_set=None,
            sample_weight_eval_set=None,
            **kwargs):
        X = dt.Frame(X)
        X = self.inf_impute(X)
        self.transcribe(X=X)

        h2o.init(port=config.h2o_recipes_port, log_dir=self.my_log_dir)
        model_path = None

        if isinstance(self, H2ONBModel):
            # NB can only handle weights of 0 / 1
            if sample_weight is not None:
                sample_weight = (sample_weight != 0).astype(int)
            if sample_weight_eval_set is not None and len(
                    sample_weight_eval_set
            ) > 0 and sample_weight_eval_set[0] is not None:
                sample_weight_eval_set1 = sample_weight_eval_set[0]
                sample_weight_eval_set1[sample_weight_eval_set1 != 0] = 1
                sample_weight_eval_set1 = sample_weight_eval_set1.astype(int)
                sample_weight_eval_set = [sample_weight_eval_set1]

        train_X = h2o.H2OFrame(X.to_pandas())
        self.col_types = train_X.types
        train_y = h2o.H2OFrame(
            y,
            column_names=[self.target],
            column_types=[
                'categorical' if self.num_classes >= 2 else 'numeric'
            ])
        train_frame = train_X.cbind(train_y)
        if sample_weight is not None:
            train_w = h2o.H2OFrame(sample_weight,
                                   column_names=[self.weight],
                                   column_types=['numeric'])
            train_frame = train_frame.cbind(train_w)
        valid_frame = None
        valid_X = None
        valid_y = None
        model = None
        if eval_set is not None:
            valid_X = h2o.H2OFrame(eval_set[0][0].to_pandas(),
                                   column_types=self.col_types)
            valid_y = h2o.H2OFrame(
                eval_set[0][1],
                column_names=[self.target],
                column_types=[
                    'categorical' if self.num_classes >= 2 else 'numeric'
                ])
            valid_frame = valid_X.cbind(valid_y)
            if sample_weight is not None:
                if sample_weight_eval_set is None:
                    sample_weight_eval_set = [np.ones(len(eval_set[0][1]))]
                valid_w = h2o.H2OFrame(sample_weight_eval_set[0],
                                       column_names=[self.weight],
                                       column_types=['numeric'])
                valid_frame = valid_frame.cbind(valid_w)

        try:
            train_kwargs = dict()
            params = copy.deepcopy(self.params)
            if not isinstance(self, H2OAutoMLModel):
                # AutoML needs max_runtime_secs in initializer, all others in train() method
                max_runtime_secs = params.pop('max_runtime_secs', 0)
                train_kwargs = dict(max_runtime_secs=max_runtime_secs)
            if valid_frame is not None:
                train_kwargs['validation_frame'] = valid_frame
            if sample_weight is not None:
                train_kwargs['weights_column'] = self.weight

            # Don't ever use the offset column as a feature
            offset_col = None  # if no column is called offset we will pass "None" and not use this feature
            cols_to_train = []  # list of all non-offset columns

            for col in list(train_X.names):
                if not col.lower() == "offset":
                    cols_to_train.append(col)
                else:
                    offset_col = col

            orig_cols = cols_to_train  # not training on offset

            trials = 2
            for trial in range(0, trials):
                try:
                    # Models that can use an offset column
                    model = self.make_instance(**params)
                    if isinstance(model, H2OGBMModel) | isinstance(
                            model, H2ODLModel) | isinstance(
                                model, H2OGLMModel):
                        model.train(x=cols_to_train,
                                    y=self.target,
                                    training_frame=train_frame,
                                    offset_column=offset_col,
                                    **train_kwargs)
                    else:
                        model.train(x=train_X.names,
                                    y=self.target,
                                    training_frame=train_frame,
                                    **train_kwargs)
                    break
                except Exception as e:
                    print(str(e))
                    t, v, tb = sys.exc_info()
                    ex = ''.join(traceback.format_exception(t, v, tb))
                    if 'Training data must have at least 2 features' in str(
                            ex) and X.ncols != 0:
                        # if had non-zero features but h2o-3 saw as constant, ignore h2o-3 in that case
                        raise IgnoreEntirelyError
                    elif "min_rows: The dataset size is too small to split for min_rows" in str(
                            e):
                        # then h2o-3 counted as rows some reduced set, since we already protect against actual rows vs. min_rows
                        params['min_rows'] = 1  # go down to lowest value
                        # permit another trial
                    else:
                        raise
                    if trial == trials - 1:
                        # if at end of trials, raise no matter what
                        raise

            if isinstance(model, H2OAutoML):
                model = model.leader
            self.id = model.model_id
            model_path = os.path.join(user_dir(),
                                      "h2o_model." + str(uuid.uuid4()))
            model_path = h2o.save_model(model=model, path=model_path)
            with open(model_path, "rb") as f:
                raw_model_bytes = f.read()

        finally:
            if model_path is not None:
                remove(model_path)
            for xx in [
                    train_frame, train_X, train_y, model, valid_frame, valid_X,
                    valid_y
            ]:
                if xx is not None:
                    if isinstance(xx, H2OAutoML):
                        h2o.remove(xx.project_name)
                    else:
                        h2o.remove(xx)

        df_varimp = model.varimp(True)
        if df_varimp is None:
            varimp = np.ones(len(orig_cols))
        else:
            df_varimp.index = df_varimp['variable']
            df_varimp = df_varimp.iloc[:, 1]  # relative importance
            for missing in [
                    x for x in orig_cols if x not in list(df_varimp.index)
            ]:
                # h2o3 doesn't handle raw strings all the time, can hit:
                # KeyError: "None of [Index(['0_Str:secret_ChangeTemp'], dtype='object', name='variable')] are in the [index]"
                df_varimp[missing] = 0
            varimp = df_varimp[orig_cols].values  # order by fitted features
            varimp = np.nan_to_num(varimp)

        self.set_model_properties(model=raw_model_bytes,
                                  features=orig_cols,
                                  importances=varimp,
                                  iterations=self.get_iterations(model))
    def fit(self, X, y, sample_weight=None, eval_set=None, sample_weight_eval_set=None, **kwargs):
        X = dt.Frame(X)

        h2o.init(port=config.h2o_recipes_port, log_dir=self.my_log_dir)
        model_path = None

        if isinstance(self, H2ONBModel):
            # NB can only handle weights of 0 / 1
            if sample_weight is not None:
                sample_weight = (sample_weight != 0).astype(int)
            if sample_weight_eval_set is not None:
                sample_weight_eval_set = [(sample_weight_eval_set[0] != 0).astype(int)]

        train_X = h2o.H2OFrame(X.to_pandas())
        self.col_types = train_X.types
        train_y = h2o.H2OFrame(y,
                               column_names=[self.target],
                               column_types=['categorical' if self.num_classes >= 2 else 'numeric'])
        train_frame = train_X.cbind(train_y)
        if sample_weight is not None:
            train_w = h2o.H2OFrame(sample_weight,
                                   column_names=[self.weight],
                                   column_types=['numeric'])
            train_frame = train_frame.cbind(train_w)
        valid_frame = None
        valid_X = None
        valid_y = None
        model = None
        if eval_set is not None:
            valid_X = h2o.H2OFrame(eval_set[0][0].to_pandas(), column_types=self.col_types)
            valid_y = h2o.H2OFrame(eval_set[0][1],
                                   column_names=[self.target],
                                   column_types=['categorical' if self.num_classes >= 2 else 'numeric'])
            valid_frame = valid_X.cbind(valid_y)
            if sample_weight is not None:
                if sample_weight_eval_set is None:
                    sample_weight_eval_set = [np.ones(len(eval_set[0][1]))]
                valid_w = h2o.H2OFrame(sample_weight_eval_set[0],
                                       column_names=[self.weight],
                                       column_types=['numeric'])
                valid_frame = valid_frame.cbind(valid_w)

        try:
            train_kwargs = dict()
            params = copy.deepcopy(self.params)
            if not isinstance(self, H2OAutoMLModel):
                # AutoML needs max_runtime_secs in initializer, all others in train() method
                max_runtime_secs = params.pop('max_runtime_secs')
                train_kwargs = dict(max_runtime_secs=max_runtime_secs)
            if valid_frame is not None:
                train_kwargs['validation_frame'] = valid_frame
            if sample_weight is not None:
                train_kwargs['weights_column'] = self.weight
            model = self.make_instance(**params)

            # Don't ever use the offset column as a feature
            offset_col = None  # if no column is called offset we will pass "None" and not use this feature
            cols_to_train = []  # list of all non-offset columns

            for col in list(train_X.names):
                if not col.lower() == "offset":
                    cols_to_train.append(col)
                else:
                    offset_col = col

            orig_cols = cols_to_train  # not training on offset

            # Models that can use an offset column
            if isinstance(model, H2OGBMModel) | isinstance(model, H2ODLModel) | isinstance(model, H2OGLMModel):
                model.train(x=cols_to_train, y=self.target, training_frame=train_frame, offset_column=offset_col,
                            **train_kwargs)
            else:
                model.train(x=train_X.names, y=self.target, training_frame=train_frame, **train_kwargs)

            if isinstance(model, H2OAutoML):
                model = model.leader
            self.id = model.model_id
            model_path = os.path.join(user_dir(), "h2o_model." + str(uuid.uuid4()))
            model_path = h2o.save_model(model=model, path=model_path)
            with open(model_path, "rb") as f:
                raw_model_bytes = f.read()

        finally:
            if model_path is not None:
                remove(model_path)
            for xx in [train_frame, train_X, train_y, model, valid_frame, valid_X, valid_y]:
                if xx is not None:
                    if isinstance(xx, H2OAutoML):
                        h2o.remove(xx.project_name)
                    else:
                        h2o.remove(xx)

        df_varimp = model.varimp(True)
        if df_varimp is None:
            varimp = np.ones(len(orig_cols))
        else:
            df_varimp.index = df_varimp['variable']
            df_varimp = df_varimp.iloc[:, 1]  # relative importance
            for missing in [x for x in orig_cols if x not in list(df_varimp.index)]:
                # h2o3 doesn't handle raw strings all the time, can hit:
                # KeyError: "None of [Index(['0_Str:secret_ChangeTemp'], dtype='object', name='variable')] are in the [index]"
                df_varimp[missing] = 0
            varimp = df_varimp[orig_cols].values  # order by fitted features
            varimp = np.nan_to_num(varimp)

        self.set_model_properties(model=raw_model_bytes,
                                  features=orig_cols,
                                  importances=varimp,
                                  iterations=self.get_iterations(model))
    def fit(self, X, y, sample_weight=None, eval_set=None, sample_weight_eval_set=None, **kwargs):
        X = dt.Frame(X)
        h2o.init(port=config.h2o_recipes_port, log_dir=self.my_log_dir)
        model_path = None

        orig_cols = list(X.names)
        train_X = h2o.H2OFrame(X.to_pandas())
        self.col_types = train_X.types
        train_y = h2o.H2OFrame(y,
                               column_names=[self.target],
                               column_types=['categorical' if self.num_classes >= 2 else 'numeric'])
        train_frame = train_X.cbind(train_y)
        if sample_weight is not None:
            train_w = h2o.H2OFrame(sample_weight,
                                   column_names=[self.weight],
                                   column_types=['numeric'])
            train_frame = train_frame.cbind(train_w)
        valid_frame = None
        valid_X = None
        valid_y = None
        model = None
        if eval_set is not None:
            valid_X = h2o.H2OFrame(eval_set[0][0].to_pandas(), column_types=self.col_types)
            valid_y = h2o.H2OFrame(eval_set[0][1],
                                   column_names=[self.target],
                                   column_types=['categorical' if self.num_classes >= 2 else 'numeric'])
            valid_frame = valid_X.cbind(valid_y)
            if sample_weight is not None:
                if sample_weight_eval_set is None:
                    sample_weight_eval_set = [np.ones(len(eval_set[0][1]))]
                valid_w = h2o.H2OFrame(sample_weight_eval_set[0],
                                       column_names=[self.weight],
                                       column_types=['numeric'])
                valid_frame = valid_frame.cbind(valid_w)

        try:
            train_kwargs = dict()

            max_runtime_secs = self.params.get('max_runtime_secs', 0)
            train_kwargs = dict(max_runtime_secs=max_runtime_secs)

            if valid_frame is not None:
                train_kwargs['validation_frame'] = valid_frame
            if sample_weight is not None:
                train_kwargs['weights_column'] = self.weight
            model = self.make_instance(**self.params)
            model.train(x=train_X.names, y=self.target, training_frame=train_frame, **train_kwargs)
            self.id = model.model_id
            model_path = os.path.join(user_dir(), "h2o_model." + str(uuid.uuid4()))
            model_path = h2o.save_model(model=model, path=model_path)
            with open(model_path, "rb") as f:
                raw_model_bytes = f.read()

        finally:
            if model_path is not None:
                remove(model_path)
            for xx in [train_frame, train_X, train_y, model, valid_frame, valid_X, valid_y]:
                if xx is not None:
                    h2o.remove(xx)

        df_varimp = model.varimp(True)
        if df_varimp is None:
            varimp = np.ones(len(orig_cols))
        else:
            df_varimp.index = df_varimp['variable']
            df_varimp = df_varimp.iloc[:, 1]  # relative importance
            for missing in [x for x in orig_cols if x not in list(df_varimp.index)]:
                # h2o3 doesn't handle raw strings all the time, can hit:
                # KeyError: "None of [Index(['0_Str:secret_ChangeTemp'], dtype='object', name='variable')] are in the [index]"
                df_varimp[missing] = 0
            varimp = df_varimp[orig_cols].values  # order by fitted features
            varimp = np.nan_to_num(varimp)

        self.set_model_properties(model=raw_model_bytes,
                                  features=orig_cols,
                                  importances=varimp,
                                  iterations=self.get_iterations(model))
    def transcribe_params(self, params=None, **kwargs):
        if params is None:
            params = self.params  # reference
        params = params.copy()  # don't contaminate DAI params, since we know we use lgbm-xgb as base

        has_eval_set = self.have_eval_set(kwargs)  # only needs (and does) operate at fit-time
        from catboost import CatBoostClassifier, CatBoostRegressor, EFstrType
        fullspec_regression = inspect.getfullargspec(CatBoostRegressor)
        kwargs_regression = {
            k: v
            for k, v in zip(fullspec_regression.args,
                            fullspec_regression.defaults)
        }
        fullspec_classification = inspect.getfullargspec(CatBoostClassifier)
        kwargs_classification = {
            k: v
            for k, v in zip(fullspec_classification.args,
                            fullspec_classification.defaults)
        }

        if self.num_classes == 1:
            allowed_params = kwargs_regression
        else:
            allowed_params = kwargs_classification

        params_copy = copy.deepcopy(params)
        for k, v in params_copy.items():
            if k not in allowed_params.keys():
                del params[k]

        # now transcribe
        k = 'boosting_type'
        if k in params:
            params[k] = 'Plain'

        k = 'grow_policy'
        if k in params:
            params[k] = 'Depthwise' if params[k] == 'depthwise' else 'Lossguide'

        k = 'eval_metric'
        if k in params and params[k] is not None and params[k].upper() == 'AUC':
            params[k] = 'AUC'

        map = {
            'regression': 'RMSE',
            'mse': 'RMSE',
            'mae': 'MAE',
            "mape": 'MAPE',
            "huber": 'Huber',
            "fair": 'FairLoss',
            "rmse": "RMSE",
            "gamma": "RMSE",  # unsupported by catboost
            "tweedie": "Tweedie",
            "poisson": "Poisson",
            "quantile": "Quantile",
            'binary': 'Logloss',
            'auc': 'AUC',
            "xentropy": 'CrossEntropy',
            'multiclass': 'MultiClass'
        }

        k = 'objective'
        if k in params and params[k] in map.keys():
            params[k] = map[params[k]]

        k = 'eval_metric'
        if k in params and params[k] is not None and params[k] in map.keys():
            params[k] = map[params[k]]

        if 'objective' in params:
            # don't randomly choose these since then model not stable GA -> final
            # but backup shouldn't really be used AFAIK
            if params['objective'] == 'Huber':
                backup = float(config.huber_alpha_list[0])
                params['delta'] = params.pop('alpha', backup)
            if params['objective'] == 'Quantile':
                backup = float(config.quantile_alpha[0])
                params['delta'] = params.pop('alpha', backup)
            if params['objective'] == 'Tweedie':
                backup = float(config.tweedie_variance_power_list[0])
                params['tweedie_variance_power'] = params.pop(
                    'tweedie_variance_power', backup)
            if params['objective'] == 'FairLoss':
                backup = float(config.fair_c_list[0])
                params['smoothness'] = params.pop('fair_c', backup)

        params.pop('verbose', None)
        params.pop('verbose_eval', None)
        params.pop('logging_level', None)

        if 'grow_policy' in params:
            if params['grow_policy'] == 'Lossguide':
                params.pop('max_depth', None)
            if params['grow_policy'] == 'Depthwise':
                params.pop('num_leaves', None)
        else:
            params['grow_policy'] = 'SymmetricTree'

        uses_gpus, n_gpus = self.get_uses_gpus(params)

        if params['task_type'] == 'CPU':
            params.pop('grow_policy', None)
            params.pop('num_leaves', None)
            params.pop('max_leaves', None)
            params.pop('min_data_in_leaf', None)
            params.pop('min_child_samples', None)

        if params['task_type'] == 'GPU':
            params.pop('colsample_bylevel', None)  # : 0.35

        if 'grow_policy' in params and params['grow_policy'] in [
                'Depthwise', 'SymmetricTree'
        ]:
            if 'max_depth' in params and params['max_depth'] in [0, -1]:
                params['max_depth'] = max(
                    2, int(np.log(params.get('num_leaves', 2**6))))
        else:
            params.pop('max_depth', None)
            params.pop('depth', None)
        if 'grow_policy' in params and params['grow_policy'] == 'Lossguide':
            # if 'num_leaves' in params and params['num_leaves'] == -1:
            #    params['num_leaves'] = 2 ** params.get('max_depth', 6)
            if 'max_leaves' in params and params['max_leaves'] in [0, -1]:
                params['max_leaves'] = 2**params.get('max_depth', 6)
        else:
            params.pop('max_leaves', None)
        if 'num_leaves' in params and 'max_leaves' in params:
            params.pop('num_leaves', None)
        # apply limits
        if 'max_leaves' in params:
            params['max_leaves'] = min(params['max_leaves'], 65536)
        if 'max_depth' in params:
            params['max_depth'] = min(params['max_depth'], 16)

        params.update({
            'train_dir': user_dir(),
            'allow_writing_files': False,
            'thread_count': self.params_base.get('n_jobs', 4)
        })

        if 'reg_lambda' in params and params['reg_lambda'] <= 0.0:
            params['reg_lambda'] = 3.0  # assume meant unset

        if self._can_handle_categorical:
            if 'max_cat_to_onehot' in params:
                params['one_hot_max_size'] = params['max_cat_to_onehot']
                params.pop('max_cat_to_onehot', None)
            if uses_gpus:
                params['one_hot_max_size'] = min(
                    params.get('one_hot_max_size', 255), 255)
            else:
                params['one_hot_max_size'] = min(
                    params.get('one_hot_max_size', 65535), 65535)

        if 'one_hot_max_size' in params:
            params['one_hot_max_size'] = max(self._min_one_hot_max_size,
                                             params['one_hot_max_size'])

        params['max_bin'] = params.get('max_bin', 254)
        if params['task_type'] == 'CPU':
            params['max_bin'] = min(
                params['max_bin'],
                254)  # https://github.com/catboost/catboost/issues/1010
        if params['task_type'] == 'GPU':
            params['max_bin'] = min(
                params['max_bin'],
                127)  # https://github.com/catboost/catboost/issues/1010

        if uses_gpus:
            # https://catboost.ai/docs/features/training-on-gpu.html
            params['devices'] = "%d-%d" % (self.params_base.get(
                'gpu_id', 0), self.params_base.get('gpu_id', 0) + n_gpus - 1)
            #params['gpu_ram_part'] = 0.3  # per-GPU, assumes GPU locking or no other experiments running

        if self.num_classes > 2:
            params.pop("eval_metric", None)

        params['train_dir'] = self.context.experiment_tmp_dir
        params['allow_writing_files'] = False

        # assume during fit self.params_base could have been updated
        assert 'n_estimators' in params
        assert 'learning_rate' in params
        params['n_estimators'] = self.params_base.get('n_estimators', 100)
        params['learning_rate'] = self.params_base.get(
            'learning_rate', config.min_learning_rate)
        params['learning_rate'] = min(
            params['learning_rate'],
            0.5)  # 1.0 leads to illegal access on GPUs
        params['learning_rate'] = max(
            config.min_learning_rate,
            max(self._min_learning_rate_catboost, params['learning_rate']))
        if 'early_stopping_rounds' not in params and has_eval_set:
            params['early_stopping_rounds'] = 150  # temp fix
            # assert 'early_stopping_rounds' in params

        if uses_gpus:
            params.pop('sampling_frequency', None)

        if not uses_gpus and params['bootstrap_type'] == 'Poisson':
            params['bootstrap_type'] = 'Bayesian'  # revert to default
        if uses_gpus and params['bootstrap_type'] == 'MVS':
            params['bootstrap_type'] = 'Bayesian'  # revert to default

        if 'bootstrap_type' not in params or params['bootstrap_type'] not in [
                'Poisson', 'Bernoulli'
        ]:
            params.pop('subsample', None)  # only allowed for those 2 bootstrap_type settings

        if params['bootstrap_type'] not in ['Bayesian']:
            params.pop('bagging_temperature', None)

        if not (self.num_classes == 2 and params['objective'] == 'Logloss'):
            params.pop('scale_pos_weight', None)

        # go back to some default eval_metric
        if self.num_classes == 1:
            if 'eval_metric' not in params or params['eval_metric'] not in [
                    'MAE', 'MAPE', 'Poisson', 'Quantile', 'RMSE',
                    'LogLinQuantile', 'Lq', 'Huber', 'Expectile', 'FairLoss',
                    'NumErrors', 'SMAPE', 'R2', 'MSLE', 'MedianAbsoluteError'
            ]:
                params['eval_metric'] = 'RMSE'
        elif self.num_classes == 2:
            if 'eval_metric' not in params or params['eval_metric'] not in [
                    'Logloss', 'CrossEntropy', 'Precision', 'Recall', 'F1',
                    'BalancedAccuracy', 'BalancedErrorRate', 'MCC', 'Accuracy',
                    'CtrFactor', 'AUC', 'NormalizedGini', 'BrierScore',
                    'HingeLoss', 'HammingLoss', 'ZeroOneLoss', 'Kappa',
                    'WKappa', 'LogLikelihoodOfPrediction'
            ]:
                params['eval_metric'] = 'Logloss'
        else:
            if 'eval_metric' not in params or params['eval_metric'] not in [
                    'MultiClass', 'MultiClassOneVsAll', 'Precision', 'Recall',
                    'F1', 'TotalF1', 'MCC', 'Accuracy', 'HingeLoss',
                    'HammingLoss', 'ZeroOneLoss', 'Kappa', 'WKappa', 'AUC'
            ]:
                params['eval_metric'] = 'MultiClass'

        # set system stuff here
        params['silent'] = self.params_base.get('silent', True)
        if config.debug_daimodel_level >= 1:
            params['silent'] = False  # Can enable for tracking improvement in console/dai.log if have access
        params['random_state'] = self.params_base.get('random_state', 1234)
        params['thread_count'] = self.params_base.get(
            'n_jobs', max(1, physical_cores_count))  # -1 is not supported

        return params
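The allow-list filtering at the top of transcribe_params (drop any parameter the CatBoost constructor would not accept) can be sketched on its own; a hedged toy version using inspect.signature, with a stand-in class instead of CatBoost:

import inspect


class ToyBooster:
    # stand-in for CatBoostClassifier / CatBoostRegressor
    def __init__(self, depth=6, learning_rate=0.03, thread_count=4):
        self.depth = depth
        self.learning_rate = learning_rate
        self.thread_count = thread_count


allowed_params = set(inspect.signature(ToyBooster).parameters)

params = {"depth": 8, "learning_rate": 0.1, "not_a_constructor_arg": True}
params = {k: v for k, v in params.items() if k in allowed_params}
print(params)  # -> {'depth': 8, 'learning_rate': 0.1}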
import os
import uuid
from collections import OrderedDict
from zipfile import ZipFile

from h2oaicore.data import CustomData

import pandas as pd
import datatable as dt

from h2oaicore.systemutils import user_dir
from h2oaicore.systemutils_more import download
tmp_dir = os.path.join(user_dir(), str(uuid.uuid4())[:6])
path_to_zip = "https://files.slack.com/files-pri/T0329MHH6-F0150BK8L01/download/m5-forecasting-accuracy.zip?pub_secret=acfcbf3386"

holdout_splits = {
    'm5_private': range(1942, 1942 + 28)  # private LB
}


class PrepareM5Data(CustomData):
    """ Prepare data for m5 Kaggle Time-Series Forecast competition"""
    @staticmethod
    def create_data(X: dt.Frame = None):
        file = download(url=path_to_zip, dest_path=tmp_dir)
        with ZipFile(file, 'r') as zip_ref:
            zip_ref.extractall(tmp_dir)

        num_id_cols = 6
        main_data = dt.fread(
            os.path.join(tmp_dir, "sales_train_evaluation.csv"))
    def create_data(
        X: dt.Frame = None
    ) -> Union[str, List[str], dt.Frame, List[dt.Frame], np.ndarray,
               List[np.ndarray], pd.DataFrame, List[pd.DataFrame]]:
        import os
        from h2oaicore.systemutils_more import download
        from h2oaicore.systemutils import config

        if kaggle_username == "XXX" or not kaggle_username:
            return []

        os.putenv("KAGGLE_USERNAME", kaggle_username)
        os.putenv("KAGGLE_KEY", kaggle_key)

        # find sample submission file
        temp_path = os.path.join(user_dir(), config.contrib_relative_directory)
        os.makedirs(temp_path, exist_ok=True)
        sub_file_dir = os.path.join(temp_path,
                                    "kaggle_%s" % str(uuid.uuid4())[:4])

        cmd_train = f'kaggle competitions download ' \
            f'-c two-sigma-connect-rental-listing-inquiries ' \
            f'-f train.json.zip ' \
            f'-p {sub_file_dir} -q'
        cmd_test = f'kaggle competitions download ' \
            f'-c two-sigma-connect-rental-listing-inquiries ' \
            f'-f test.json.zip ' \
            f'-p {sub_file_dir} -q'

        try:
            subprocess.check_output(cmd_train.split(),
                                    timeout=120).decode("utf-8")
        except subprocess.TimeoutExpired:
            raise TimeoutError("Took longer than %s seconds, increase timeout" % 120)

        try:
            subprocess.check_output(cmd_test.split(),
                                    timeout=120).decode("utf-8")
        except subprocess.TimeoutExpired:
            raise TimeoutError("Took longer than %s seconds, increase timeout" % 120)

        train = pd.read_json(os.path.join(sub_file_dir, 'train.json.zip'))
        test = pd.read_json(os.path.join(sub_file_dir, 'test.json.zip'))

        for df in [train, test]:
            df['str_features'] = df['features'].apply(lambda x: ' . '.join(x))
            df['nb_features'] = df['features'].apply(len)
            df['nb_photos'] = df['photos'].apply(len)
            df['cat_address'] = df['street_address'] + ' ' + df[
                'display_address']

        features = [
            'bathrooms', 'bedrooms', 'building_id', 'created', 'description',
            'display_address', 'latitude', 'listing_id', 'longitude',
            'manager_id', 'price', 'street_address', 'str_features',
            'nb_features', 'nb_photos', 'cat_address'
        ]

        return {
            'two_sigma_train': dt.Frame(train[features + ['interest_level']]),
            'two_sigma_test': dt.Frame(test[features])
        }
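
The derived columns above (str_features, nb_features, nb_photos, cat_address) are plain pandas transformations; the toy frame below illustrates them without the Kaggle download. The data is invented for the example.

# Toy illustration of the derived columns built in the recipe above.
import pandas as pd

toy = pd.DataFrame({
    'features': [['Elevator', 'Dogs Allowed'], []],
    'photos': [['a.jpg', 'b.jpg'], ['c.jpg']],
    'street_address': ['1 Main St', '2 Side Ave'],
    'display_address': ['Main St', 'Side Ave'],
})
toy['str_features'] = toy['features'].apply(lambda x: ' . '.join(x))
toy['nb_features'] = toy['features'].apply(len)
toy['nb_photos'] = toy['photos'].apply(len)
toy['cat_address'] = toy['street_address'] + ' ' + toy['display_address']
print(toy[['str_features', 'nb_features', 'nb_photos', 'cat_address']])
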
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.do_stemming = True  # turn off as needed
        self.do_lemmatization = True  # turn off as needed
        self.remove_stopwords = True  # turn off as needed

        import nltk
        nltk_data_path = os.path.join(user_dir(),
                                      config.contrib_env_relative_directory,
                                      "nltk_data")
        nltk_temp_path = os.path.join(user_dir(), "nltk_data")
        nltk.data.path.append(nltk_data_path)
        os.makedirs(nltk_data_path, exist_ok=True)
        nltk_download_lock_file = os.path.join(nltk_data_path, "nltk.lock")
        with filelock.FileLock(nltk_download_lock_file):
            nltk.download('stopwords', download_dir=nltk_data_path)
            nltk.download('punkt', download_dir=nltk_data_path)
            nltk.download('averaged_perceptron_tagger',
                          download_dir=nltk_data_path)
            nltk.download('maxent_treebank_pos_tagger',
                          download_dir=nltk_data_path)
            nltk.download('wordnet', download_dir=nltk_data_path)
            nltk.download('sonoritysequencing', download_dir=nltk_data_path)

        # download resources for stemming if needed
        if self.do_stemming:
            try:
                self.stemmer = nltk.stem.porter.PorterStemmer()
                self.stemmer.stem("test")
            except LookupError:
                os.makedirs(nltk_data_path, exist_ok=True)
                os.makedirs(nltk_temp_path, exist_ok=True)
                tokenizer_path = os.path.join(nltk_data_path, "tokenizers")
                os.makedirs(tokenizer_path, exist_ok=True)
                file1 = download(
                    "https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/tokenizers/punkt.zip",
                    dest_path=nltk_temp_path)
                self.unzip_file(file1, tokenizer_path)
                self.atomic_copy(file1, tokenizer_path)
                self.stemmer = nltk.stem.porter.PorterStemmer()
                self.stemmer.stem("test")

        # download resources for lemmatization if needed
        if self.do_lemmatization:
            try:
                from nltk.corpus import wordnet
                self.lemmatizer = nltk.stem.WordNetLemmatizer()
                self.pos_tagger = nltk.pos_tag
                self.lemmatizer.lemmatize("test", wordnet.NOUN)
                self.pos_tagger("test")
            except LookupError:
                os.makedirs(nltk_data_path, exist_ok=True)
                os.makedirs(nltk_temp_path, exist_ok=True)
                tagger_path = os.path.join(nltk_data_path, "taggers")
                corpora_path = os.path.join(nltk_data_path, "corpora")
                os.makedirs(tagger_path, exist_ok=True)
                os.makedirs(corpora_path, exist_ok=True)
                file1 = download(
                    "https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/taggers/averaged_perceptron_tagger.zip",
                    dest_path=nltk_temp_path)
                file2 = download(
                    "https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/taggers/maxent_treebank_pos_tagger.zip",
                    dest_path=nltk_temp_path)
                file3 = download(
                    "https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/wordnet.zip",
                    dest_path=nltk_temp_path)
                self.unzip_file(file1, tagger_path)
                self.unzip_file(file2, tagger_path)
                self.unzip_file(file3, corpora_path)
                self.atomic_copy(file1, tagger_path)
                self.atomic_copy(file2, tagger_path)
                self.atomic_copy(file3, corpora_path)
                from nltk.corpus import wordnet
                self.lemmatizer = nltk.stem.WordNetLemmatizer()
                self.pos_tagger = nltk.pos_tag
                self.lemmatizer.lemmatize("test", wordnet.NOUN)
                self.pos_tagger("test")
            self.wordnet_map = {
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "J": wordnet.ADJ,
                "R": wordnet.ADV,
                "O": wordnet.NOUN
            }

        # download resources for stopwords if needed
        if self.remove_stopwords:
            try:
                self.stopwords = set(nltk.corpus.stopwords.words('english'))
            except LookupError:
                os.makedirs(nltk_data_path, exist_ok=True)
                os.makedirs(nltk_temp_path, exist_ok=True)
                corpora_path = os.path.join(nltk_data_path, "corpora")
                os.makedirs(corpora_path, exist_ok=True)
                file1 = download(
                    "https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/stopwords.zip",
                    dest_path=nltk_temp_path)
                self.unzip_file(file1, corpora_path)
                self.atomic_copy(file1, corpora_path)
                self.stopwords = set(nltk.corpus.stopwords.words('english'))
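
Once the resources above are available, the stemmer, lemmatizer, POS tagger and stopword set are typically combined per token. The sketch below shows one such flow; the preprocess helper is illustrative and assumes the NLTK data (punkt, taggers, wordnet, stopwords) has already been downloaded as above.

# Illustrative use of the NLTK resources set up above; assumes the data is
# already present locally (see the downloads in __init__ above).
import nltk
from nltk.corpus import wordnet

wordnet_map = {"N": wordnet.NOUN, "V": wordnet.VERB,
               "J": wordnet.ADJ, "R": wordnet.ADV, "O": wordnet.NOUN}
lemmatizer = nltk.stem.WordNetLemmatizer()
stemmer = nltk.stem.porter.PorterStemmer()
stopwords = set(nltk.corpus.stopwords.words('english'))


def preprocess(text):
    tokens = [t for t in nltk.word_tokenize(text.lower()) if t not in stopwords]
    # map Penn Treebank tags (first letter) to WordNet POS, defaulting to NOUN
    tagged = nltk.pos_tag(tokens)
    lemmas = [lemmatizer.lemmatize(tok, wordnet_map.get(tag[0], wordnet.NOUN))
              for tok, tag in tagged]
    return [stemmer.stem(tok) for tok in lemmas]


print(preprocess("The cats were running quickly"))
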
Exemple #28
0
def _setup_recipe():
    # for DAI 1.7.0, this function must be run manually;
    # in DAI >= 1.7.1 it is run by DAI itself
    import os
    from h2oaicore.systemutils_more import extract, download
    from h2oaicore.systemutils import config, remove
    from h2oaicore.systemutils import user_dir
    import shutil

    from h2oaicore.systemutils_more import arch_type  # don't remove this import, setup_recipe parsed-out separately
    return True  # WIP: daal is disabled for now; the rest of the recipe is kept only for migration purposes
    if arch_type == "ppc64le":
        if config.hard_asserts:
            # in CI testing just ignore
            return True
        else:
            # for user use, raise
            raise RuntimeError("Cannot use daal on PPC")

    daal_is_installed_path = os.path.join(
        user_dir(), config.contrib_env_relative_directory, "daal")
    daal_is_installed_file = os.path.join(daal_is_installed_path,
                                          "daal_is_installed")
    if not os.path.isfile(daal_is_installed_file):
        daal_temp_path = os.path.join(user_dir(),
                                      config.contrib_relative_directory,
                                      "daal")
        os.makedirs(daal_temp_path, exist_ok=True)
        prefix = "https://anaconda.org/intel"
        try:
            file1 = download(
                "%s/daal4py/2021.2.0/download/linux-64/daal4py-2021.2.0-py38_intel_358.tar.bz2"
                % prefix,
                dest_path=daal_temp_path)
            file2 = download(
                "%s/impi_rt/2021.2.0/download/linux-64/impi_rt-2021.2.0-intel_215.tar.bz2"
                % prefix,
                dest_path=daal_temp_path)
            file3 = download(
                "%s/daal/2021.2.0/download/linux-64/daal-2021.2.0-intel_358.tar.bz2"
                % prefix,
                dest_path=daal_temp_path)
            file4 = download(
                "https://github.com/intel/daal/releases/download/2019_u4/l_daal_oss_p_2019.4.007.tgz",
                dest_path=daal_temp_path)
        except:
            file1 = download(
                "https://0xdata-public.s3.amazonaws.com/daal4py-2019.4-py36h7b7c402_6.tar.bz2",
                dest_path=daal_temp_path)
            file2 = download(
                "https://0xdata-public.s3.amazonaws.com/impi_rt-2019.4-intel_243.tar.bz2",
                dest_path=daal_temp_path)
            file3 = download(
                "https://0xdata-public.s3.amazonaws.com/daal-2019.4-intel_243.tar.bz2",
                dest_path=daal_temp_path)
            file4 = download(
                "https://0xdata-public.s3.amazonaws.com/l_daal_oss_p_2019.4.007.tgz",
                dest_path=daal_temp_path)
        temp_path = os.path.join(user_dir(),
                                 config.contrib_env_relative_directory, "info")
        os.makedirs(temp_path, exist_ok=True)
        python_site_packages_path = os.path.join(
            user_dir(), config.contrib_env_relative_directory)
        extract(file1, python_site_packages_path)
        python_site_packages_path2 = os.path.join(
            user_dir(), config.contrib_env_relative_directory)
        extract(file2, python_site_packages_path2)
        extract(file3, python_site_packages_path2)
        extract(file4, python_site_packages_path2, "gz")

        other_path = os.path.join(python_site_packages_path2, "lib/libfabric/")
        import glob
        for file in glob.glob(os.path.join(other_path, "*.so*")):
            new_file = os.path.join(python_site_packages_path2, "lib",
                                    os.path.basename(file))
            if not os.path.isfile(new_file):
                shutil.copy(file, new_file)

        other_path = os.path.join(
            python_site_packages_path2,
            "l_daal_oss_p_2019.4.007/daal_prebuild/linux/tbb/lib/intel64_lin/gcc4.4/"
        )
        for file in glob.glob(os.path.join(other_path, "*.so*")):
            new_file = os.path.join(python_site_packages_path2, "lib",
                                    os.path.basename(file))
            if not os.path.isfile(new_file):
                shutil.copy(file, new_file)
        os.makedirs(daal_is_installed_path, exist_ok=True)
        with open(daal_is_installed_file, "wt") as f:
            f.write("DONE")
        remove(file1)
        remove(file2)
        remove(file3)
        remove(file4)
        return True
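
The daal setup above relies on a marker file so the download-and-extract work runs at most once per environment. The generic sketch below isolates that install-once pattern; the function name and paths are illustrative, not part of the recipe.

# Generic sketch of the marker-file ("install once") pattern used above.
import os


def install_once(install_dir, marker_name, install_fn):
    """Run install_fn only if the marker file is absent, then write the marker."""
    marker = os.path.join(install_dir, marker_name)
    if os.path.isfile(marker):
        return False  # already installed, nothing to do
    install_fn()
    os.makedirs(install_dir, exist_ok=True)
    with open(marker, "wt") as f:
        f.write("DONE")
    return True


# Example usage: the callback would download and extract the real packages.
install_once("/tmp/daal_demo", "daal_is_installed", lambda: print("installing..."))
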
Exemple #29
0
    def fit(self,
            X,
            y,
            sample_weight=None,
            eval_set=None,
            sample_weight_eval_set=None,
            **kwargs):

        # Get column names
        orig_cols = list(X.names)

        from h2oaicore.tensorflow_dynamic import got_cpu_tf, got_gpu_tf
        import tensorflow as tf
        import shap
        import scipy
        import pandas as pd

        self.setup_keras_session()

        import h2oaicore.keras as keras
        import matplotlib.pyplot as plt

        if not hasattr(self, 'save_model_path'):
            # include a short per-model id so concurrent fits do not overwrite each other
            model_id = str(uuid.uuid4())[:8]
            self.save_model_path = os.path.join(
                user_dir(), "custom_xnn_model_%s.hdf5" % model_id)

        np.random.seed(self.random_state)

        my_init = keras.initializers.RandomUniform(seed=self.random_state)

        # Get the logger if it exists
        logger = None
        if self.context and self.context.experiment_id:
            logger = make_experiment_logger(
                experiment_id=self.context.experiment_id,
                tmp_dir=self.context.tmp_dir,
                experiment_tmp_dir=self.context.experiment_tmp_dir)

        # Set up temp folder
        tmp_folder = self._create_tmp_folder(logger)

        # define base model
        def xnn_initialize(features,
                           ridge_functions=3,
                           arch=[20, 12],
                           learning_rate=0.01,
                           bg_samples=100,
                           beta1=0.9,
                           beta2=0.999,
                           dec=0.0,
                           ams=True,
                           bseed=None,
                           is_categorical=False):

            #
            # Prepare model architecture
            #
            # Input to the network, our observation containing all the features
            input = keras.layers.Input(shape=(features, ), name='main_input')

            # Record current column names
            loggerinfo(logger, "XNN LOG")
            loggerdata(logger, "Feature list:")
            loggerdata(logger, str(orig_cols))

            # Input to ridge function i is the dot product of the original input vector with learned coefficients
            ridge_input = keras.layers.Dense(ridge_functions,
                                             name="projection_layer",
                                             activation='linear')(input)

            ridge_networks = []
            # Each subnetwork uses only 1 neuron from the projection layer as input so we need to split it
            ridge_inputs = SplitLayer(ridge_functions)(ridge_input)
            for i, ridge_input in enumerate(ridge_inputs):
                # Generate subnetwork i
                mlp = _mlp(ridge_input, i, arch)
                ridge_networks.append(mlp)

            added = keras.layers.Concatenate(
                name='concatenate_1')(ridge_networks)

            # Add the correct output layer for the problem
            if is_categorical:
                out = keras.layers.Dense(1,
                                         activation='sigmoid',
                                         input_shape=(ridge_functions, ),
                                         name='main_output')(added)
            else:
                out = keras.layers.Dense(1,
                                         activation='linear',
                                         input_shape=(ridge_functions, ),
                                         name='main_output')(added)

            model = keras.models.Model(inputs=input, outputs=out)

            optimizer = keras.optimizers.Adam(lr=learning_rate,
                                              beta_1=beta1,
                                              beta_2=beta2,
                                              decay=dec,
                                              amsgrad=ams)

            # Use the correct loss for the problem
            if is_categorical:
                model.compile(loss={'main_output': 'binary_crossentropy'},
                              optimizer=optimizer)
            else:
                model.compile(loss={'main_output': 'mean_squared_error'},
                              optimizer=optimizer)

            return model

        def _mlp(input, idx, arch=[20, 12], activation='relu'):
            # Set up a subnetwork

            # Hidden layers
            mlp = keras.layers.Dense(arch[0],
                                     activation=activation,
                                     name='mlp_{}_dense_0'.format(idx),
                                     kernel_initializer=my_init)(input)
            for i, layer in enumerate(arch[1:]):
                mlp = keras.layers.Dense(layer,
                                         activation=activation,
                                         name='mlp_{}_dense_{}'.format(
                                             idx, i + 1),
                                         kernel_initializer=my_init)(mlp)

            # Output of the MLP
            mlp = keras.layers.Dense(
                1,
                activation='linear',
                name='mlp_{}_dense_last'.format(idx),
                kernel_regularizer=keras.regularizers.l1(1e-3),
                kernel_initializer=my_init)(mlp)
            return mlp

        def get_shap(X, model):
            # Calculate the Shap values
            np.random.seed(24)
            bg_samples = min(X.shape[0], 1000)

            if isinstance(X, pd.DataFrame):
                background = X.iloc[np.random.choice(X.shape[0],
                                                     bg_samples,
                                                     replace=False)]
            else:
                background = X[np.random.choice(X.shape[0],
                                                bg_samples,
                                                replace=False)]

            # Explain predictions of the model on the subset
            explainer = shap.DeepExplainer(model, background)
            shap_values = explainer.shap_values(X)

            # Return the mean absolute SHAP value for each feature
            xnn_shap = np.abs(shap_values[0]).mean(axis=0)

            return xnn_shap

        # Initialize the XNNs
        features = X.shape[1]
        orig_cols = list(X.names)
        if self.num_classes >= 2:
            lb = LabelEncoder()
            lb.fit(self.labels)
            y = lb.transform(y)
            self.is_cat = True
        else:
            self.is_cat = False

        # Two identical architectures: xnn1 is trained with early stopping to
        # find the epoch count, xnn is then retrained on the full data below
        xnn1 = xnn_initialize(features=features,
                              ridge_functions=features,
                              arch=self.params["arch"],
                              learning_rate=self.params["lr"],
                              beta1=self.params["beta_1"],
                              beta2=self.params["beta_2"],
                              dec=self.params["decay"],
                              ams=self.params["amsgrad"],
                              is_categorical=self.is_cat)
        xnn = xnn_initialize(features=features,
                             ridge_functions=features,
                             arch=self.params["arch"],
                             learning_rate=self.params["lr"],
                             beta1=self.params["beta_1"],
                             beta2=self.params["beta_2"],
                             dec=self.params["decay"],
                             ams=self.params["amsgrad"],
                             is_categorical=self.is_cat)

        # Replace missing values with a value smaller than all observed values
        self.min = dict()
        for col in X.names:
            XX = X[:, col]
            self.min[col] = XX.min1()
            if self.min[col] is None or np.isnan(self.min[col]):
                self.min[col] = -1e10
            else:
                self.min[col] -= 1
            XX.replace(None, self.min[col])
            X[:, col] = XX
            assert X[dt.isna(dt.f[col]), col].nrows == 0
        X = X.to_numpy()

        inputs = {'main_input': X}
        validation_set = 0
        verbose = 0

        # Train the neural network once with early stopping and a validation set
        history = keras.callbacks.History()
        es = keras.callbacks.EarlyStopping(monitor='val_loss', mode='min')

        history = xnn1.fit(inputs,
                           y,
                           epochs=self.params["n_estimators"],
                           batch_size=self.params["batch_size"],
                           validation_split=0.3,
                           verbose=verbose,
                           callbacks=[history, es])

        # Train again on the full data
        number_of_epochs_it_ran = len(history.history['loss'])

        xnn.fit(inputs,
                y,
                epochs=number_of_epochs_it_ran,
                batch_size=self.params["batch_size"],
                validation_split=0.0,
                verbose=verbose)

        # Get the mean absolute Shapley values
        importances = np.array(get_shap(X, xnn))

        int_output = {}
        int_weights = {}
        int_bias = {}
        int_input = {}

        original_activations = {}

        x_labels = list(map(lambda x: 'x' + str(x), range(features)))

        intermediate_output = []

        # Record and plot the projection weights
        #
        weight_list = []
        for layer in xnn.layers:

            layer_name = layer.get_config()['name']
            if layer_name != "main_input":
                print(layer_name)
                weights = layer.get_weights()

                # Record the biases
                try:
                    bias = layer.get_weights()[1]
                    int_bias[layer_name] = bias
                except:
                    print("No Bias")

                # Build a model that exposes this layer's activations
                intermediate_layer_model = keras.models.Model(
                    inputs=xnn.input, outputs=xnn.get_layer(layer_name).output)

                # Record the outputs from the training set
                if self.is_cat and (layer_name == 'main_output'):
                    original_activations[layer_name] = scipy.special.logit(
                        intermediate_layer_model.predict(X))
                    original_activations[
                        layer_name +
                        "_p"] = intermediate_layer_model.predict(X)
                else:
                    original_activations[
                        layer_name] = intermediate_layer_model.predict(X)

                # Record other weights, inputs, and outputs
                int_weights[layer_name] = weights
                int_input[layer_name] = layer.input
                int_output[layer_name] = layer.output

            # Plot the projection layers
            if "projection_layer" in layer.get_config()['name']:

                # print(layer.get_config()['name'])

                # Record the weights for each projection layer
                weights = [np.transpose(layer.get_weights()[0])]

                weight_list2 = []
                for i, weight in enumerate(weights[0]):
                    weight_list.append(weight)
                    weight_list2.append(
                        list(np.reshape(weight, (1, features))[0]))

                    # Plot weights
                    plt.bar(orig_cols,
                            abs(np.reshape(weight, (1, features))[0]),
                            1,
                            color="blue")
                    plt.ylabel("Coefficient value")
                    plt.title("Projection Layer Weights {}".format(i),
                              fontdict={'fontsize': 10})
                    plt.xticks(rotation=90)
                    plt.show()
                    plt.savefig(os.path.join(
                        tmp_folder, 'projection_layer_' + str(i) + '.png'),
                                bbox_inches="tight")
                    plt.clf()

            if "main_output" in layer.get_config()['name']:
                weights_main = layer.get_weights()
                print(weights_main)

        pd.DataFrame(weight_list2).to_csv(os.path.join(tmp_folder,
                                                       "projection_data.csv"),
                                          index=False)

        intermediate_output = []

        for feature_num in range(features):
            intermediate_layer_model = keras.models.Model(
                inputs=xnn.input,
                outputs=xnn.get_layer('mlp_' + str(feature_num) +
                                      '_dense_last').output)
            intermediate_output.append(intermediate_layer_model.predict(X))

        # Record and plot the ridge functions
        ridge_x = []
        ridge_y = []
        for weight_number in range(len(weight_list)):
            ridge_x.append(
                list(
                    sum(X[:, ii] * weight_list[weight_number][ii]
                        for ii in range(features))))
            ridge_y.append(list(intermediate_output[weight_number]))

            plt.plot(
                sum(X[:, ii] * weight_list[weight_number][ii]
                    for ii in range(features)),
                intermediate_output[weight_number], 'o')
            plt.xlabel("Input")
            plt.ylabel("Subnetwork " + str(weight_number))
            plt.title("Ridge Function {}".format(i), fontdict={'fontsize': 10})
            plt.show()
            plt.savefig(
                os.path.join(tmp_folder,
                             'ridge_' + str(weight_number) + '.png'))
            plt.clf()

        # Output the ridge function importance
        weights2 = np.array([item[0] for item in weights_main[0]])

        output_activations = np.abs(
            np.array([
                item * weights2
                for item in list(original_activations["concatenate_1"])
            ])).mean(axis=0)
        loggerinfo(logger, str(output_activations))
        pd.DataFrame(output_activations).to_csv(os.path.join(
            tmp_folder, "ridge_weights.csv"),
                                                index=False)

        plt.bar(x_labels, output_activations, 1, color="blue")
        plt.xlabel("Ridge function number")
        plt.ylabel("Feature importance")
        plt.title("Ridge function importance", fontdict={'fontsize': 10})
        plt.show()
        plt.savefig(os.path.join(tmp_folder, 'Ridge_function_importance.png'))

        pd.DataFrame(ridge_y).applymap(lambda x: x[0]).to_csv(os.path.join(
            tmp_folder, "ridge_y.csv"),
                                                              index=False)
        pd.DataFrame(ridge_x).to_csv(os.path.join(tmp_folder, "ridge_x.csv"),
                                     index=False)

        pd.DataFrame(orig_cols).to_csv(os.path.join(tmp_folder,
                                                    "input_columns.csv"),
                                       index=False)

        self.set_model_properties(model=xnn,
                                  features=orig_cols,
                                  importances=importances.tolist(),
                                  iterations=self.params['n_estimators'])
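
SplitLayer is referenced in the fit method above but not defined in this snippet. The sketch below shows one possible way to implement such a layer with plain tf.keras, purely for illustration; it is not the recipe's own implementation.

# One illustrative SplitLayer: splits a (batch, n) tensor into n (batch, 1) tensors,
# so each ridge subnetwork receives a single projection neuron as input.
import tensorflow as tf


class SplitLayer(tf.keras.layers.Layer):
    def __init__(self, splits, **kwargs):
        super().__init__(**kwargs)
        self.splits = splits

    def call(self, inputs):
        return tf.split(inputs, self.splits, axis=-1)


# Usage: split a 3-neuron projection into three single-neuron inputs.
x = tf.keras.layers.Input(shape=(3,))
pieces = SplitLayer(3)(x)
print([p.shape for p in pieces])  # [(None, 1), (None, 1), (None, 1)]
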