def featurize_reduce(iter, params):
    """Generate features as reduce step in Disco's map-reduce.

    Generator. Implementation of reduce stage in map-reduce process,
    for model prediction feature generation of time series data.

    This function is never directly called, but rather passed as a
    parameter to the Disco `Job()` object's `run()` method.

    Parameters
    ----------
    iter : iterable
        Iterable of tuples each containing the file name of a time
        series data file to be used for featurization, and the
        associated class or type name.
    params : dict
        Dictionary of parameters for use in map-reduce process. The keys
        read here are 'tmp_dir_path', 'custom_script_path',
        'fname_class_dict_2' and 'features_to_use'.

    Yields
    ------
    tuple
        A two-element tuple containing the short file name of the time
        series data set (base name without extension, cut at the first
        "$"), and dict of the extracted features with the class name
        stored under the "class" key. The placeholder ``("", "")`` is
        yielded instead when no class name is associated with a file or
        when the file does not exist under ``params['tmp_dir_path']``.

    """
    # Every dependency is imported inside the function body, following the
    # pattern this function already used for its other imports; `os` was
    # used below but was missing from that list.
    import os
    from disco.util import kvgroup
    import ntpath
    from mltsp import featurize
    from mltsp import cfg  # noqa: F401 -- unused here; kept from the original

    url_prefix = "file://"

    for fname, class_name in kvgroup(sorted(iter)):
        # Strip only a leading URL scheme; a "file://" substring elsewhere
        # in the name is left untouched.
        if fname.startswith(url_prefix):
            fname = fname[len(url_prefix):]

        class_names = list(class_name)
        if not class_names:
            # Nothing to label this file with: emit the placeholder and
            # move on rather than featurizing with an unusable class name.
            yield "", ""
            continue
        # When several class names are grouped under one file, the first
        # one is used.
        class_name = str(class_names[0])

        short_fname = os.path.splitext(ntpath.basename(fname))[0].split("$")[0]
        path_to_csv = os.path.join(params['tmp_dir_path'], fname)
        if not os.path.exists(path_to_csv):
            print("*" * 10 + " " + path_to_csv + " doesn't exist on the disk.")
            yield "", ""
            continue

        print("Extracting features for " + fname)
        all_features = featurize.featurize_tsdata_object(
            path_to_csv, short_fname, params['custom_script_path'],
            params['fname_class_dict_2'], params['features_to_use'])
        all_features["class"] = class_name
        yield short_fname, all_features
def test_featurize_tsdata_object():
    """Check that featurizing a TS data file returns a dict of the requested features."""
    data_file = pjoin(DATA_PATH, "dotastro_215153.dat")
    base_name = featurize.shorten_fname(data_file)
    script_file = pjoin(cfg.UPLOAD_FOLDER, "testfeature1.py")
    class_lookup = {"dotastro_215153": "Mira"}
    requested = ["std_err", "freq1_harmonics_freq_0"]

    extracted = featurize.featurize_tsdata_object(
        data_file, base_name, script_file, class_lookup, requested)

    # The result must be a plain dict keyed by feature name, and every
    # requested feature must be present in it.
    assert isinstance(extracted, dict)
    for feature_name in requested:
        assert feature_name in extracted