def test_featurize_single():
    """Featurize one time-series file and check the shape of the result.

    Every entry of the returned mapping must expose the requested
    ``std_err`` feature under ``features_dict`` and carry a ``ts_data``
    item.
    """
    metadata_path = pjoin(DATA_PATH, "215153_metadata.dat")
    meta_feats = pred.parse_metadata_file(metadata_path)

    ts_data_path = pjoin(DATA_PATH, "dotastro_215153.dat")
    custom_script_path = pjoin(DATA_PATH, "testfeature1.py")
    res_dict = pred.featurize_single(
        ts_data_path, ["std_err"], custom_script_path, meta_feats)

    # The keys (file names) are irrelevant to these checks; only the
    # per-file result dicts are inspected.
    for entry in res_dict.values():
        assert "std_err" in entry["features_dict"]
    for entry in res_dict.values():
        assert "ts_data" in entry
def pred_featurize_reduce(iter, params):
    """Generate features as reduce step in Disco's map-reduce.

    Generator. Implementation of reduce stage in map-reduce process, for
    model prediction feature generation of time series data. This function
    is never directly called, but rather passed as a parameter to the
    Disco `Job()` object's `run()` method.

    Each incoming file name is resolved by trying, in order: the name as
    given (after dropping a leading ``file://``), the name inside
    ``params["tmp_dir_path"]``, and the name inside the ``unzipped``
    sub-directory of ``cfg.UPLOAD_FOLDER``. Names that resolve to no
    regular file are reported via ``print`` and skipped. A resolved file
    is deleted from disk once it has been featurized; a failure to delete
    is printed and otherwise ignored.

    Parameters
    ----------
    iter : iterable
        Iterable of tuples each containing the file name of a time series
        data file to be used for featurization and an unused placeholder
        string. (The name shadows the builtin ``iter``; it is kept because
        it is part of the function's signature.)
    params : dict
        Dictionary of parameters for use in map-reduce process. The keys
        read here are ``'featset_key'``, ``'custom_features_script'``,
        ``'meta_features'`` and, only when a file name is not directly a
        file, ``'tmp_dir_path'``.

    Yields
    ------
    tuple
        A two-element tuple containing the file name of the time series
        data set as its first element, and a two-element list containing
        the extracted features (dict) and the original time series data
        (list of lists) as its second element.

    """
    featset_key = params['featset_key']
    custom_features_script = params['custom_features_script']
    meta_features = params['meta_features']

    # NOTE(review): the imports are function-local in the original,
    # presumably so the function stays self-contained when Disco runs it
    # as a job stage -- confirm before moving them to module level.
    import os
    from mltsp import cfg
    from mltsp import predict_class as pred
    import ntpath
    from disco.util import kvgroup

    scheme = "file://"
    unzipped_dir = os.path.join(cfg.UPLOAD_FOLDER, "unzipped")

    for fname, _placeholder in kvgroup(sorted(iter)):
        # Drop only the leading scheme. `str.replace` would also delete
        # any later "file://" occurrence inside the path itself.
        if fname.startswith(scheme):
            fname = fname[len(scheme):]

        # Resolve the name against the known locations, first hit wins.
        # `params["tmp_dir_path"]` is looked up lazily so it is only
        # required when the bare name is not already a file.
        fpath = fname if os.path.isfile(fname) else None
        if fpath is None:
            candidate = os.path.join(params["tmp_dir_path"], fname)
            if os.path.isfile(candidate):
                fpath = candidate
        if fpath is None:
            candidate = os.path.join(unzipped_dir, fname)
            if os.path.isfile(candidate):
                fpath = candidate

        if fpath is None:
            upload_path = os.path.join(cfg.UPLOAD_FOLDER, fname)
            shown = fname if cfg.UPLOAD_FOLDER in fname else upload_path
            print(shown + " is not a file...")
            if os.path.exists(upload_path) or os.path.exists(fname):
                print("But it does exist on the disk.")
            else:
                print("and in fact it doesn't even exist.")
            continue

        # NOTE(review): this depends only on `featset_key`, so it looks
        # loop-invariant. It is left inside the loop because it cannot be
        # seen from here whether `featurize_single` mutates the returned
        # list.
        features_to_use = pred.determine_feats_used(featset_key)
        big_feats_and_tsdata_dict = pred.featurize_single(
            fpath, features_to_use, custom_features_script, meta_features)

        # Best-effort cleanup of the consumed input file: a failed delete
        # must not abort the job, so it is only reported.
        try:
            os.remove(fpath)
        except OSError as e:
            print(e)

        # The result dict is indexed by the base file name with anything
        # from the first "$" onwards removed.
        short_fname = ntpath.basename(fpath).split("$")[0]
        file_results = big_feats_and_tsdata_dict[short_fname]
        all_features = file_results["features_dict"]
        ts_data = file_results["ts_data"]
        yield short_fname, [all_features, ts_data]