def __init__(self, **kwargs):
    super().__init__(**kwargs)
    import nltk
    nltk_data_path = os.path.join(config.data_directory, config.contrib_env_relative_directory, "nltk_data")
    nltk_temp_path = os.path.join(temporary_files_path, "nltk_data")
    nltk.data.path.append(nltk_data_path)
    try:
        self.pos_tagger = nltk.pos_tag
        self.pos_tagger("test")
    except LookupError:
        os.makedirs(nltk_data_path, exist_ok=True)
        os.makedirs(nltk_temp_path, exist_ok=True)
        tagger_path = os.path.join(nltk_data_path, "taggers")
        os.makedirs(tagger_path, exist_ok=True)
        file1 = download(
            "https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/taggers/averaged_perceptron_tagger.zip",
            dest_path=nltk_temp_path)
        file2 = download(
            "https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/taggers/maxent_treebank_pos_tagger.zip",
            dest_path=nltk_temp_path)
        self.unzip_file(file1, tagger_path)
        self.unzip_file(file2, tagger_path)
        self.atomic_move(file1, tagger_path)
        self.atomic_move(file2, tagger_path)
        self.pos_tagger = nltk.pos_tag
        self.pos_tagger("test")
def set_tagger(self):
    import nltk
    nltk_data_path = os.path.join(user_dir(), config.contrib_env_relative_directory, "nltk_data")
    nltk_temp_path = os.path.join(user_dir(), "nltk_data")
    nltk.data.path.append(nltk_data_path)
    nltk.download('averaged_perceptron_tagger', download_dir=nltk_data_path)
    try:
        self.pos_tagger = nltk.pos_tag
        self.pos_tagger("test")
    except LookupError:
        os.makedirs(nltk_data_path, exist_ok=True)
        os.makedirs(nltk_temp_path, exist_ok=True)
        tagger_path = os.path.join(nltk_data_path, "taggers")
        os.makedirs(tagger_path, exist_ok=True)
        file1 = download(
            "https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/taggers/averaged_perceptron_tagger.zip",
            dest_path=nltk_temp_path)
        file2 = download(
            "https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/taggers/maxent_treebank_pos_tagger.zip",
            dest_path=nltk_temp_path)
        self.unzip_file(file1, tagger_path)
        self.unzip_file(file2, tagger_path)
        self.atomic_copy(file1, tagger_path)
        self.atomic_copy(file2, tagger_path)
        self.pos_tagger = nltk.pos_tag
        self.pos_tagger("test")
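# Illustrative sketch, not part of the recipes above: nltk.pos_tag is normally applied
# to a list of tokens rather than a raw string; the pos_tagger("test") call above only
# serves to trigger a LookupError when the tagger resources are missing. This assumes
# the 'averaged_perceptron_tagger' resource is already available on nltk.data.path,
# and the token list is hypothetical.
import nltk

tokens = ["flights", "out", "of", "SFO", "were", "delayed"]  # hypothetical token list
print(nltk.pos_tag(tokens))  # e.g. [('flights', 'NNS'), ('out', 'IN'), ...]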
def create_data(
        X: dt.Frame = None
) -> Union[str, List[str], dt.Frame, List[dt.Frame], np.ndarray, List[np.ndarray],
           pd.DataFrame, List[pd.DataFrame]]:
    import os
    from h2oaicore.systemutils_more import download
    from h2oaicore.systemutils import config
    import bz2

    def extract_bz2(file, output_file):
        zipfile = bz2.BZ2File(file)
        data = zipfile.read()
        open(output_file, 'wb').write(data)

    temp_path = os.path.join(user_dir(), config.contrib_relative_directory, "airlines")
    os.makedirs(temp_path, exist_ok=True)

    link = AirlinesData.base_url + "1990.csv.bz2"
    file = download(link, dest_path=temp_path)
    output_file1 = file.replace(".bz2", "")
    print("%s %s" % (file, output_file1))
    extract_bz2(file, output_file1)

    link = AirlinesData.base_url + "1991.csv.bz2"
    file = download(link, dest_path=temp_path)
    output_file2 = file.replace(".bz2", "")
    print("%s %s" % (file, output_file2))
    extract_bz2(file, output_file2)

    return [output_file1, output_file2]
def create_data(X: dt.Frame = None) -> Union[
    str, List[str],
    dt.Frame, List[dt.Frame],
    np.ndarray, List[np.ndarray],
    pd.DataFrame, List[pd.DataFrame],
    Dict[str, str],  # {data set names : paths}
    Dict[str, dt.Frame],  # {data set names : dt frames}
    Dict[str, np.ndarray],  # {data set names : np arrays}
    Dict[str, pd.DataFrame],  # {data set names : pd frames}
]:
    # Location in DAI file system where we will save the downloaded files
    temp_path = os.path.join(user_dir(), config.contrib_relative_directory)
    os.makedirs(temp_path, exist_ok=True)

    # URLs of the desired data; these are the public IMDb dataset files
    link_basics = "https://datasets.imdbws.com/title.basics.tsv.gz"
    link_ratings = "https://datasets.imdbws.com/title.ratings.tsv.gz"
    link_episodes = "https://datasets.imdbws.com/title.episode.tsv.gz"

    # Download the files
    file_basics = download(link_basics, dest_path=temp_path)
    file_ratings = download(link_ratings, dest_path=temp_path)
    file_episodes = download(link_episodes, dest_path=temp_path)

    # Parse the downloaded IMDb files with datatable
    basics = dt.fread(file_basics, fill=True)
    ratings = dt.fread(file_ratings, fill=True)
    episodes = dt.fread(file_episodes, na_strings=['\\N'], fill=True)

    # Remove the raw downloaded files
    os.remove(file_basics)
    os.remove(file_ratings)
    os.remove(file_episodes)

    # Create "titles with ratings" dataset: join titles with non-null ratings
    ratings = ratings[~dt.isna(dt.f.averageRating), :]
    ratings.key = "tconst"
    basics_ratings = basics[:, :, dt.join(ratings)]

    # Create episodes dataset
    episodes = episodes[~dt.isna(dt.f.seasonNumber) & ~dt.isna(dt.f.episodeNumber), :]
    episode_ratings = episodes[:, :, dt.join(ratings)]
    episode_ratings.names = {'tconst': 'episodeTconst',
                             'parentTconst': 'tconst',
                             'averageRating': 'episodeAverageRating',
                             'numVotes': 'episodeNumVotes'}
    basics_ratings.key = 'tconst'
    title_episode_ratings = episode_ratings[:, :, dt.join(basics_ratings)]

    # Enumerate series episodes from 1 to N
    title_episode_ratings = title_episode_ratings[:, :, dt.sort(dt.f.tconst, dt.f.seasonNumber, dt.f.episodeNumber)]
    result = title_episode_ratings[:, dt.count(), dt.by(dt.f.tconst)][:, 'count'].to_list()
    from itertools import chain
    cumcount = chain.from_iterable([i + 1 for i in range(n)] for n in result[0])
    title_episode_ratings['episodeSequence'] = dt.Frame(tuple(cumcount))

    # Return the datasets
    return {"imdb_title_ratings": basics_ratings,
            "imdb_episode_ratings": title_episode_ratings}
def create_data(
        X: dt.Frame = None
) -> Union[str, List[str], dt.Frame, List[dt.Frame], np.ndarray, List[np.ndarray],
           pd.DataFrame, List[pd.DataFrame]]:
    import os
    from h2oaicore.systemutils_more import download
    from h2oaicore.systemutils import config
    import bz2

    def extract_bz2(file, output_file):
        zipfile = bz2.BZ2File(file)
        data_file = zipfile.read()
        open(output_file, 'wb').write(data_file)

    temp_path = os.path.join(config.data_directory, config.contrib_relative_directory, "airlines")
    os.makedirs(temp_path, exist_ok=True)

    link = "http://stat-computing.org/dataexpo/2009/1987.csv.bz2"
    file = download(link, dest_path=temp_path)
    output_file = file.replace(".bz2", "")
    print("%s %s" % (file, output_file))
    extract_bz2(file, output_file)

    return output_file
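# Illustrative sketch, not part of the recipes above: extract_bz2 reads the whole
# archive into memory before writing it out. For large files, a streaming variant
# using only the standard library (bz2.open + shutil.copyfileobj) behaves equivalently.
# The name extract_bz2_streaming is hypothetical.
import bz2
import shutil


def extract_bz2_streaming(file, output_file, chunk_size=1 << 20):
    # decompress in fixed-size chunks instead of loading the full archive into memory
    with bz2.open(file, "rb") as src, open(output_file, "wb") as dst:
        shutil.copyfileobj(src, dst, chunk_size)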
def create_data(
        X: dt.Frame = None
) -> Union[str, List[str], dt.Frame, List[dt.Frame], np.ndarray, List[np.ndarray],
           pd.DataFrame, List[pd.DataFrame]]:
    import os
    from h2oaicore.systemutils_more import download
    from h2oaicore.systemutils import config

    # Location in DAI file system where we will save the data set
    temp_path = os.path.join(config.data_directory, config.contrib_relative_directory)
    os.makedirs(temp_path, exist_ok=True)

    # URL of desired data; this comes from the City of Seattle
    link = "https://data.seattle.gov/resource/rdtp-hzy3.csv"

    # Download the file
    file = download(link, dest_path=temp_path)

    # Give the file a descriptive name for the UI
    output_file = file.replace("rdtp-hzy3", "seattle_monthly_rain_raw")
    os.rename(file, output_file)

    # Return the location on the DAI server for this data set
    return output_file
def create_data(X: dt.Frame = None) -> Union[str, List[str],
                                             dt.Frame, List[dt.Frame],
                                             np.ndarray, List[np.ndarray],
                                             pd.DataFrame, List[pd.DataFrame]]:
    import os
    from h2oaicore.systemutils_more import download
    from h2oaicore.systemutils import config

    temp_path = os.path.join(user_dir(), config.contrib_relative_directory, "testdata_%s" % str(uuid.uuid4()))
    os.makedirs(temp_path, exist_ok=True)

    link = "http://data.un.org/_Docs/SYB/CSV/SYB63_226_202009_Net%20Disbursements%20from%20Official%20ODA%20to%20Recipients.csv"
    output_file1 = download(link, dest_path=temp_path)
    link = "http://data.un.org/_Docs/SYB/CSV/SYB63_223_202009_Net%20Disbursements%20from%20Official%20ODA%20from%20Donors.csv"
    output_file2 = download(link, dest_path=temp_path)

    return [output_file1, output_file2]
def create_data(
        X: dt.Frame = None
) -> Union[str, List[str], dt.Frame, List[dt.Frame], np.ndarray, List[np.ndarray],
           pd.DataFrame, List[pd.DataFrame]]:
    import os
    from h2oaicore.systemutils_more import download
    from h2oaicore.systemutils import config

    temp_path = os.path.join(user_dir(), config.contrib_relative_directory, "testdata_%s" % str(uuid.uuid4()))
    os.makedirs(temp_path, exist_ok=True)

    link = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
    output_file1 = download(link, dest_path=temp_path)
    link = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/bezdekIris.data"
    output_file2 = download(link, dest_path=temp_path)

    return [output_file1, output_file2]
def create_data(X: dt.Frame = None) -> Union[str, List[str],
                                             dt.Frame, List[dt.Frame],
                                             np.ndarray, List[np.ndarray],
                                             pd.DataFrame, List[pd.DataFrame]]:
    import os
    from h2oaicore.systemutils_more import download
    from h2oaicore.systemutils import config

    temp_path = os.path.join(user_dir(), config.contrib_relative_directory, "testdata_%s" % str(uuid.uuid4()))
    os.makedirs(temp_path, exist_ok=True)

    link = TestData.url
    file = download(link, dest_path=temp_path)
    return file
def create_data(
        X: dt.Frame = None
) -> Union[str, List[str], dt.Frame, List[dt.Frame], np.ndarray, List[np.ndarray],
           pd.DataFrame, List[pd.DataFrame]]:
    import os
    from h2oaicore.systemutils_more import download
    from h2oaicore.systemutils import config
    import bz2

    def extract_bz2(file, output_file):
        zipfile = bz2.BZ2File(file)
        data = zipfile.read()
        open(output_file, 'wb').write(data)

    temp_path = os.path.join(user_dir(), "recipe_tmp", "airlines")
    os.makedirs(temp_path, exist_ok=True)
    dt.options.nthreads = 8

    # specify which years are used for training and testing
    training = list(range(2005, 2008))
    testing = [2008]

    # download and unzip files
    files = []
    for f in ["%d.csv.bz2" % year for year in training + testing]:
        link = AirlinesData.base_url + "%s" % f
        file = download(link, dest_path=temp_path)
        output_file = file.replace(".bz2", "")
        if not os.path.exists(output_file):
            extract_bz2(file, output_file)
        files.append(output_file)

    # parse with datatable
    X = dt.rbind(*[dt.fread(x) for x in files])

    # add date
    date_col = 'Date'
    X[:, date_col] = dt.f['Year'] * 10000 + dt.f['Month'] * 100 + dt.f['DayofMonth']
    cols_to_keep = ['Date']

    # add number of flights in/out for each airport per given interval
    timeslice_mins = 60
    for name, new_col, col, group in [
        ("out", "CRSDepTime_mod", "CRSDepTime", "Origin"),
        ("in", "CRSArrTime_mod", "CRSArrTime", "Dest")
    ]:
        X[:, new_col] = X[:, dt.f[col] // timeslice_mins]
        group_cols = [date_col, group, new_col]
        new_name = 'flights_%s_per_%d_min' % (name, timeslice_mins)
        flights = X[:, {new_name: dt.count()}, dt.by(*group_cols)]
        flights.key = group_cols
        cols_to_keep.append(new_name)
        X = X[:, :, dt.join(flights)]

    # select flights leaving from SFO only
    X = X[dt.f['Origin'] == 'SFO', :]

    # Fill NaNs in DepDelay column
    X[dt.isna(dt.f['DepDelay']), 'DepDelay'] = 0

    # create binary target column
    depdelay_threshold_mins = 15
    target = 'DepDelay%dm' % depdelay_threshold_mins
    X[:, target] = dt.f['DepDelay'] > depdelay_threshold_mins
    cols_to_keep.extend([
        target,
        'Year', 'Month', 'DayofMonth', 'DayOfWeek', 'CRSDepTime',
        'UniqueCarrier', 'FlightNum', 'TailNum', 'CRSElapsedTime',
        'Origin', 'Dest', 'Distance',
        # Leaks for delay
        # 'DepTime',
        # 'ArrTime', 'CRSArrTime',
        # 'ActualElapsedTime',
        # 'AirTime', 'ArrDelay', 'DepDelay',
        # 'TaxiIn', 'TaxiOut', 'Cancelled', 'CancellationCode', 'Diverted', 'CarrierDelay',
        # 'WeatherDelay', 'NASDelay', 'SecurityDelay', 'LateAircraftDelay',
    ])
    X = X[:, cols_to_keep]

    # Join in some extra info
    join_files = [('UniqueCarrier', 'carriers.csv', 'Code'),
                  ('Origin', 'airports.csv', 'iata'),
                  ('Dest', 'airports.csv', 'iata'),
                  ('TailNum', 'plane-data.csv', 'tailnum')]

    for join_key, file, col in join_files:
        file = download('https://0xdata-public.s3.amazonaws.com/data_recipes_data/%s' % file,
                        dest_path=temp_path)
        X_join = dt.fread(file, fill=True)
        X_join.names = {col: join_key}
        X_join.names = [join_key] + [join_key + "_" + x for x in X_join.names if x != join_key]
        X_join.key = join_key
        X = X[:, :, dt.join(X_join)]
        del X[:, join_key]

    split = True

    if not split:
        filename = os.path.join(temp_path,
                                "flight_delays_data_recipe_%d-%d.csv" % (min(training), max(testing)))
        X.to_csv(filename)
        return filename
    else:
        # prepare splits (by year) and create binary .jay files for import into Driverless AI
        output_files = []
        for condition, name in [
            ((min(training) <= dt.f['Year']) & (dt.f['Year'] <= max(training)), 'training'),
            ((min(testing) <= dt.f['Year']) & (dt.f['Year'] <= max(testing)), 'test'),
        ]:
            X_split = X[condition, :]
            filename = os.path.join(temp_path,
                                    "augmented_flights_%s-%d_%s.csv" % (
                                        X_split[:, 'Year'].min1(), X_split[:, 'Year'].max1(), name))
            X_split.to_csv(filename)
            output_files.append(filename)
        return output_files
def __init__(self, **kwargs):
    super().__init__(**kwargs)
    self.do_stemming = True  # turn off as needed
    self.do_lemmatization = True  # turn off as needed
    self.remove_stopwords = True  # turn off as needed

    import nltk
    nltk_data_path = os.path.join(config.data_directory, config.contrib_env_relative_directory, "nltk_data")
    nltk_temp_path = os.path.join(temporary_files_path, "nltk_data")
    nltk.data.path.append(nltk_data_path)

    # download resources for stemming if needed
    if self.do_stemming:
        try:
            self.stemmer = nltk.stem.porter.PorterStemmer()
            self.stemmer.stem("test")
        except LookupError:
            os.makedirs(nltk_data_path, exist_ok=True)
            os.makedirs(nltk_temp_path, exist_ok=True)
            tokenizer_path = os.path.join(nltk_data_path, "tokenizers")
            os.makedirs(tokenizer_path, exist_ok=True)
            file1 = download(
                "https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/tokenizers/punkt.zip",
                dest_path=nltk_temp_path)
            self.unzip_file(file1, tokenizer_path)
            self.atomic_move(file1, tokenizer_path)
            self.stemmer = nltk.stem.porter.PorterStemmer()
            self.stemmer.stem("test")

    # download resources for lemmatization if needed
    if self.do_lemmatization:
        try:
            from nltk.corpus import wordnet
            self.lemmatizer = nltk.stem.WordNetLemmatizer()
            self.pos_tagger = nltk.pos_tag
            self.lemmatizer.lemmatize("test", wordnet.NOUN)
            self.pos_tagger("test")
        except LookupError:
            os.makedirs(nltk_data_path, exist_ok=True)
            os.makedirs(nltk_temp_path, exist_ok=True)
            tagger_path = os.path.join(nltk_data_path, "taggers")
            corpora_path = os.path.join(nltk_data_path, "corpora")
            os.makedirs(tagger_path, exist_ok=True)
            os.makedirs(corpora_path, exist_ok=True)
            file1 = download(
                "https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/taggers/averaged_perceptron_tagger.zip",
                dest_path=nltk_temp_path)
            file2 = download(
                "https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/taggers/maxent_treebank_pos_tagger.zip",
                dest_path=nltk_temp_path)
            file3 = download(
                "https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/wordnet.zip",
                dest_path=nltk_temp_path)
            self.unzip_file(file1, tagger_path)
            self.unzip_file(file2, tagger_path)
            self.unzip_file(file3, corpora_path)
            self.atomic_move(file1, tagger_path)
            self.atomic_move(file2, tagger_path)
            self.atomic_move(file3, corpora_path)
            from nltk.corpus import wordnet
            self.lemmatizer = nltk.stem.WordNetLemmatizer()
            self.pos_tagger = nltk.pos_tag
            self.lemmatizer.lemmatize("test", wordnet.NOUN)
            self.pos_tagger("test")
        self.wordnet_map = {"N": wordnet.NOUN,
                            "V": wordnet.VERB,
                            "J": wordnet.ADJ,
                            "R": wordnet.ADV,
                            "O": wordnet.NOUN}

    # download resources for stopwords if needed
    if self.remove_stopwords:
        try:
            self.stopwords = set(nltk.corpus.stopwords.words('english'))
        except LookupError:
            os.makedirs(nltk_data_path, exist_ok=True)
            os.makedirs(nltk_temp_path, exist_ok=True)
            corpora_path = os.path.join(nltk_data_path, "corpora")
            os.makedirs(corpora_path, exist_ok=True)
            file1 = download(
                "https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/stopwords.zip",
                dest_path=nltk_temp_path)
            self.unzip_file(file1, corpora_path)
            self.atomic_move(file1, corpora_path)
            self.stopwords = set(nltk.corpus.stopwords.words('english'))
def _setup_recipe():
    # for DAI 1.7.0 one is required to run this function manually
    # in DAI >=1.7.1, this function will be run by DAI itself
    import os
    from h2oaicore.systemutils_more import extract, download
    from h2oaicore.systemutils import config, remove
    from h2oaicore.systemutils import user_dir
    import shutil
    from h2oaicore.systemutils_more import arch_type  # don't remove this import, setup_recipe parsed-out separately

    return True  # WIP: Disable daal for now in general, just leave recipe floating there for migration purposes

    if arch_type == "ppc64le":
        if config.hard_asserts:
            # in CI testing just ignore
            return True
        else:
            # for user use, raise
            raise RuntimeError("Cannot use daal on PPC")

    daal_is_installed_path = os.path.join(user_dir(), config.contrib_env_relative_directory, "daal")
    daal_is_installed_file = os.path.join(daal_is_installed_path, "daal_is_installed")
    if not os.path.isfile(daal_is_installed_file):
        daal_temp_path = os.path.join(user_dir(), config.contrib_relative_directory, "daal")
        os.makedirs(daal_temp_path, exist_ok=True)
        prefix = "https://anaconda.org/intel"
        try:
            file1 = download("%s/daal4py/2021.2.0/download/linux-64/daal4py-2021.2.0-py38_intel_358.tar.bz2" % prefix,
                             dest_path=daal_temp_path)
            file2 = download("%s/impi_rt/2021.2.0/download/linux-64/impi_rt-2021.2.0-intel_215.tar.bz2" % prefix,
                             dest_path=daal_temp_path)
            file3 = download("%s/daal/2021.2.0/download/linux-64/daal-2021.2.0-intel_358.tar.bz2" % prefix,
                             dest_path=daal_temp_path)
            file4 = download("https://github.com/intel/daal/releases/download/2019_u4/l_daal_oss_p_2019.4.007.tgz",
                             dest_path=daal_temp_path)
        except:
            file1 = download("https://0xdata-public.s3.amazonaws.com/daal4py-2019.4-py36h7b7c402_6.tar.bz2",
                             dest_path=daal_temp_path)
            file2 = download("https://0xdata-public.s3.amazonaws.com/impi_rt-2019.4-intel_243.tar.bz2",
                             dest_path=daal_temp_path)
            file3 = download("https://0xdata-public.s3.amazonaws.com/daal-2019.4-intel_243.tar.bz2",
                             dest_path=daal_temp_path)
            file4 = download("https://0xdata-public.s3.amazonaws.com/l_daal_oss_p_2019.4.007.tgz",
                             dest_path=daal_temp_path)
        temp_path = os.path.join(user_dir(), config.contrib_env_relative_directory, "info")
        os.makedirs(temp_path, exist_ok=True)
        python_site_packages_path = os.path.join(user_dir(), config.contrib_env_relative_directory)
        extract(file1, python_site_packages_path)
        python_site_packages_path2 = os.path.join(user_dir(), config.contrib_env_relative_directory)
        extract(file2, python_site_packages_path2)
        extract(file3, python_site_packages_path2)
        extract(file4, python_site_packages_path2, "gz")
        other_path = os.path.join(python_site_packages_path2, "lib/libfabric/")
        import glob
        for file in glob.glob(os.path.join(other_path, "*.so*")):
            new_file = os.path.join(python_site_packages_path2, "lib", os.path.basename(file))
            if not os.path.isfile(new_file):
                shutil.copy(file, new_file)
        other_path = os.path.join(
            python_site_packages_path2,
            "l_daal_oss_p_2019.4.007/daal_prebuild/linux/tbb/lib/intel64_lin/gcc4.4/")
        import glob
        for file in glob.glob(os.path.join(other_path, "*.so*")):
            new_file = os.path.join(python_site_packages_path2, "lib", os.path.basename(file))
            if not os.path.isfile(new_file):
                shutil.copy(file, new_file)
        os.makedirs(daal_is_installed_path, exist_ok=True)
        with open(daal_is_installed_file, "wt") as f:
            f.write("DONE")
        remove(file1)
        remove(file2)
        remove(file3)
        remove(file4)
    return True
def _setup_recipe():
    # for DAI 1.7.0 one is required to run this function manually
    # in DAI >=1.7.1, this function will be run by DAI itself
    import os
    from h2oaicore.systemutils_more import extract, download
    from h2oaicore.systemutils import config
    import shutil

    daal_is_installed_path = os.path.join(config.data_directory, config.contrib_env_relative_directory, "daal")
    daal_is_installed_file = os.path.join(daal_is_installed_path, "daal_is_installed")
    if not os.path.isfile(daal_is_installed_file):
        daal_temp_path = os.path.join(config.data_directory, config.contrib_relative_directory, "daal")
        os.makedirs(daal_temp_path, exist_ok=True)
        prefix = "https://anaconda.org/intel"
        file1 = download("%s/daal4py/2019.4/download/linux-64/daal4py-2019.4-py36h7b7c402_6.tar.bz2" % prefix,
                         dest_path=daal_temp_path)
        file2 = download("%s/impi_rt/2019.4/download/linux-64/impi_rt-2019.4-intel_243.tar.bz2" % prefix,
                         dest_path=daal_temp_path)
        file3 = download("%s/daal/2019.4/download/linux-64/daal-2019.4-intel_243.tar.bz2" % prefix,
                         dest_path=daal_temp_path)
        file4 = download("https://github.com/intel/daal/releases/download/2019_u1.1/l_daal_oss_p_2019.1.004.tgz",
                         dest_path=daal_temp_path)
        temp_path = os.path.join(config.data_directory, config.contrib_env_relative_directory, "info")
        os.makedirs(temp_path, exist_ok=True)
        python_site_packages_path = os.path.join(config.data_directory, config.contrib_env_relative_directory)
        extract(file1, python_site_packages_path)
        python_site_packages_path2 = os.path.join(config.data_directory, config.contrib_env_relative_directory)
        extract(file2, python_site_packages_path2)
        extract(file3, python_site_packages_path2)
        extract(file4, python_site_packages_path2, "gz")
        other_path = os.path.join(python_site_packages_path2, "lib/libfabric/")
        import glob
        for file in glob.glob(os.path.join(other_path, "*.so*")):
            new_file = os.path.join(python_site_packages_path2, "lib", os.path.basename(file))
            if not os.path.isfile(new_file):
                shutil.copy(file, new_file)
        other_path = os.path.join(
            python_site_packages_path2,
            "l_daal_oss_p_2019.1.004/daal_prebuild/linux/tbb/lib/intel64_lin/gcc4.4/")
        import glob
        for file in glob.glob(os.path.join(other_path, "*.so*")):
            new_file = os.path.join(python_site_packages_path2, "lib", os.path.basename(file))
            if not os.path.isfile(new_file):
                shutil.copy(file, new_file)
        os.makedirs(daal_is_installed_path, exist_ok=True)
        with open(daal_is_installed_file, "wt") as f:
            f.write("DONE")
    return True
def create_data(X: dt.Frame = None):
    # path_to_zip, tmp_dir, and holdout_splits are assumed to be defined elsewhere
    # in the recipe (e.g., as module-level constants or class attributes)
    file = download(url=path_to_zip, dest_path=tmp_dir)
    with ZipFile(file, 'r') as zip_ref:
        zip_ref.extractall(tmp_dir)

    num_id_cols = 6
    main_data = dt.fread(os.path.join(tmp_dir, "sales_train_evaluation.csv"))
    all_cols = list(main_data.names)
    id_cols = all_cols[:num_id_cols]
    date_cols = all_cols[num_id_cols + 1125:]

    # training data
    target = "target"
    data = pd.melt(main_data.to_pandas(), id_vars=id_cols, value_vars=date_cols,
                   var_name="d", value_name=target)
    data[target] = data[target].astype(float)
    data = dt.Frame(data)
    data_splits = [data]
    names = ["m5_train"]

    # test data for submission
    submission = dt.fread(os.path.join(tmp_dir, "sample_submission.csv"))
    for name, ranges in holdout_splits.items():
        test_cls = ["d_" + str(k) for k in ranges]
        test_data = []
        ids = submission["id"].to_list()[0]
        new_test_cols = ["d"] + id_cols
        for i in range(len(ids)):
            id = ids[i]
            splits = ids[i].split("_")
            item_id = splits[0] + "_" + splits[1] + "_" + splits[2]
            dept_id = splits[0] + "_" + splits[1]
            cat_id = splits[0]
            store_id = splits[3] + "_" + splits[4]
            state_id = splits[3]
            id_values = [id, item_id, dept_id, cat_id, store_id, state_id]
            for j in range(len(test_cls)):
                row_values = [test_cls[j]] + id_values
                test_data.append(row_values)
        test_data = pd.DataFrame(test_data, columns=new_test_cols)
        test_data = dt.Frame(test_data)
        data_splits.append(test_data)
        names.append(name)

    weather_data = dt.fread(os.path.join(tmp_dir, "calendar.csv"))
    weather_data.key = "d"
    price_data = dt.fread(os.path.join(tmp_dir, "sell_prices.csv"))
    price_data.key = ["store_id", "item_id", "wm_yr_wk"]

    ret = OrderedDict()
    for n, f in zip(names, data_splits):
        f = f[:, :, dt.join(weather_data)]
        f = f[:, :, dt.join(price_data)]
        ret[n] = f
    return ret
def create_data(
        X: dt.Frame = None
) -> Union[str, List[str], dt.Frame, List[dt.Frame], np.ndarray, List[np.ndarray],
           pd.DataFrame, List[pd.DataFrame]]:
    # import packages
    import os
    import gc
    from h2oaicore.systemutils_more import download
    from h2oaicore.systemutils import config
    import zipfile

    # define constants
    train_data_url = "https://files.slack.com/files-pri/T0329MHH6-F012UF3T2J0/download/bosch_train_full.zip?pub_secret=c59d0f381a"
    test_data_url = "https://files.slack.com/files-pri/T0329MHH6-F013ES4F6N4/download/bosch_test_full.zip?pub_secret=8726e8b7e2"

    # function for unzipping data
    def extract_zip(file, output_directory):
        with zipfile.ZipFile(file, "r") as zip_ref:
            zip_ref.extractall(output_directory)

    # download and unzip files
    temp_path = os.path.join(config.data_directory, "recipe_tmp", "bosch")
    os.makedirs(temp_path, exist_ok=True)
    for link in [train_data_url, test_data_url]:
        raw_file = download(link, dest_path=temp_path)
        extract_zip(raw_file, temp_path)

    # parse with datatable
    train_path = os.path.join(temp_path, "bosch_train_full.csv")
    test_path = os.path.join(temp_path, "bosch_test_full.csv")
    X_train = dt.fread(train_path)
    X_test = dt.fread(test_path)

    # add leak features
    train = X_train[:, ["Id", "Response"]].to_pandas()
    test = X_test[:, ["Id"]].to_pandas()
    date_features = [colname for colname in X_test.names if "D" in colname]
    train["Min_Date"] = X_train[:, date_features].to_pandas().min(axis=1).values
    test["Min_Date"] = X_test[:, date_features].to_pandas().min(axis=1).values
    ntrain = train.shape[0]
    train_test = pd.concat([train, test]).reset_index(drop=True)
    train_test.sort_values(by=["Min_Date", "Id"], ascending=True, inplace=True)
    train_test["Leak_1"] = train_test["Id"].diff()
    train_test["Leak_2"] = train_test["Id"].iloc[::-1].diff()
    train_test["Leak_3"] = train_test["Response"].shift(1)
    train_test["Leak_4"] = train_test["Response"].shift(-1)
    train_test = dt.Frame(train_test.drop("Response", axis=1))
    train_test.key = "Id"
    X_train = X_train[:, :, dt.join(train_test)]
    X_test = X_test[:, :, dt.join(train_test)]

    return {"bosch_train_leak": X_train, "bosch_test_leak": X_test}
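# Illustrative sketch, not part of the recipe above: the Bosch "leak" features are
# neighbouring-row statistics taken after sorting by (Min_Date, Id). A toy pandas
# frame with hypothetical values shows the construction in isolation.
import pandas as pd

demo = pd.DataFrame({"Id": [4, 9, 1, 7],
                     "Min_Date": [82.2, 82.2, 82.3, 82.4],
                     "Response": [0, None, 1, 0]})
demo = demo.sort_values(by=["Min_Date", "Id"]).reset_index(drop=True)
demo["Leak_1"] = demo["Id"].diff()             # Id gap to the previous row in time order
demo["Leak_2"] = demo["Id"].iloc[::-1].diff()  # Id gap to the next row in time order
demo["Leak_3"] = demo["Response"].shift(1)     # previous row's label
demo["Leak_4"] = demo["Response"].shift(-1)    # next row's label
print(demo)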
def create_data(
        X: dt.Frame = None
) -> Union[str, List[str], dt.Frame, List[dt.Frame], np.ndarray, List[np.ndarray],
           pd.DataFrame, List[pd.DataFrame]]:
    import os
    from h2oaicore.systemutils_more import download
    from h2oaicore.systemutils import config
    import bz2

    def extract_bz2(file, output_file):
        zipfile = bz2.BZ2File(file)
        data = zipfile.read()
        open(output_file, 'wb').write(data)

    temp_path = os.path.join(config.data_directory, config.contrib_relative_directory, "airlines")
    os.makedirs(temp_path, exist_ok=True)

    # specify which years are used for training and testing
    training = [2007]
    testing = [2008]

    # download and unzip files
    files = []
    for f in ["%d.csv.bz2" % year for year in training + testing]:
        link = "http://stat-computing.org/dataexpo/2009/%s" % f
        file = download(link, dest_path=temp_path)
        output_file = file.replace(".bz2", "")
        extract_bz2(file, output_file)
        files.append(output_file)

    # parse with datatable
    X = dt.rbind(*[dt.fread(x) for x in files])

    # add date
    date_col = 'Date'
    X[:, date_col] = dt.f['Year'] * 10000 + dt.f['Month'] * 100 + dt.f['DayofMonth']
    cols_to_keep = ['Date']

    # add number of flights in/out for each airport per given interval
    timeslice_mins = 60
    for name, new_col, col, group in [
        ("out", "CRSDepTime_mod", "CRSDepTime", "Origin"),
        ("in", "CRSArrTime_mod", "CRSArrTime", "Dest")
    ]:
        X[:, new_col] = X[:, dt.f[col] // timeslice_mins]
        group_cols = [date_col, group, new_col]
        new_name = 'flights_%s' % name
        flights = X[:, {new_name: dt.count()}, dt.by(*group_cols)]
        flights.key = group_cols
        cols_to_keep.append(new_name)
        X = X[:, :, dt.join(flights)]

    # Fill NaNs with 0s
    X[dt.isna(dt.f['DepDelay']), 'DepDelay'] = 0
    cols_to_keep.extend([
        'DepDelay',
        'Year', 'Month', 'DayofMonth', 'DayOfWeek', 'CRSDepTime',
        'UniqueCarrier', 'FlightNum', 'TailNum', 'CRSElapsedTime',
        'Origin', 'Dest', 'Distance',
        # Leaks for delay
        # 'DepTime',
        # 'ArrTime', 'CRSArrTime',
        # 'ActualElapsedTime',
        # 'AirTime', 'ArrDelay', 'DepDelay',
        # 'TaxiIn', 'TaxiOut', 'Cancelled', 'CancellationCode', 'Diverted', 'CarrierDelay',
        # 'WeatherDelay', 'NASDelay', 'SecurityDelay', 'LateAircraftDelay',
    ])
    X = X[:, cols_to_keep]

    # Join in some extra info
    join_files = [('UniqueCarrier', 'carriers.csv', 'Code'),
                  ('Origin', 'airports.csv', 'iata'),
                  ('Dest', 'airports.csv', 'iata'),
                  ('TailNum', 'plane-data.csv', 'tailnum')]

    for join_key, file, col in join_files:
        file = download('http://stat-computing.org/dataexpo/2009/%s' % file, dest_path=temp_path)
        X_join = dt.fread(file, fill=True)
        X_join.names = {col: join_key}
        X_join.names = [join_key] + [join_key + "_" + x for x in X_join.names if x != join_key]
        X_join.key = join_key
        X = X[:, :, dt.join(X_join)]
        del X[:, join_key]

    split = False

    if not split:
        filename = os.path.join(temp_path,
                                "flight_delays_regression_%d-%d.jay" % (min(training), max(testing)))
        X.to_jay(filename)
        return filename
    else:
        # prepare splits (by year) and create binary .jay files for import into Driverless AI
        output_files = []
        for condition, name in [
            ((min(training) <= dt.f['Year']) & (dt.f['Year'] <= max(training)), 'training'),
            ((min(testing) <= dt.f['Year']) & (dt.f['Year'] <= max(testing)), 'test'),
        ]:
            X_split = X[condition, :]
            filename = os.path.join(temp_path, "flight_delays_%s.jay" % name)
            X_split.to_jay(filename)
            output_files.append(filename)
        return output_files
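# Illustrative sketch, not part of the recipes above: how the per-timeslice flight-count
# feature used in the flight-delay recipes works on a tiny, made-up frame (all values
# hypothetical).
import datatable as dt

X_demo = dt.Frame(Date=[20070101] * 4,
                  Origin=["SFO", "SFO", "SFO", "OAK"],
                  CRSDepTime=[630, 659, 705, 640])
# bucket scheduled departure times into 60-minute slices
X_demo[:, "CRSDepTime_mod"] = X_demo[:, dt.f.CRSDepTime // 60]
# count flights per (date, airport, timeslice) and join the count back onto each row
flights_demo = X_demo[:, {"flights_out_per_60_min": dt.count()},
                      dt.by("Date", "Origin", "CRSDepTime_mod")]
flights_demo.key = ["Date", "Origin", "CRSDepTime_mod"]
X_demo = X_demo[:, :, dt.join(flights_demo)]
print(X_demo)  # the two 6:00-7:00 SFO departures get flights_out_per_60_min == 2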