def featurize_reduce(iter, params):
    """Generate features as reduce step in Disco's map-reduce.

    Generator. Implementation of reduce stage in map-reduce process,
    for model prediction feature generation of time series data.

    This function is never directly called, but rather passed as a
    parameter to the Disco `Job()` object's `run()` method.

    Parameters
    ----------
    iter : iterable
        Iterable of tuples each containing the file name of a time
        series data file to be used for featurization, and the
        associated class or type name. A leading "file://" on the file
        name is stripped. If a file name occurs with several class
        names, only the first one (in sorted order) is used.
    params : dict
        Dictionary of parameters for use in map-reduce process. The
        keys read here are 'tmp_dir_path', 'custom_script_path',
        'fname_class_dict_2' and 'features_to_use'.

    Yields
    ------
    tuple
        A two-element tuple containing the file name of the time
        series data set (base name without extension, truncated at the
        first "$"), and dict of the extracted features with the class
        name stored under the "class" key. The placeholder tuple
        ``("", "")`` is yielded instead when a file name has no class
        name or when the data file does not exist on disk.

    """
    # All dependencies are imported locally, including `os`, so the
    # function does not rely on names from the defining module.
    import ntpath
    import os
    from disco.util import kvgroup
    from mltsp import featurize
    # Not referenced below; kept from the original in case importing the
    # module has side effects -- TODO confirm and drop if it does not.
    from mltsp import cfg  # noqa: F401

    url_prefix = "file://"

    # `kvgroup` groups consecutive equal keys, hence the sort.
    for fname, class_name_group in kvgroup(sorted(iter)):
        # Strip only the leading scheme, not every occurrence of it.
        if fname[:len(url_prefix)] == url_prefix:
            fname = fname[len(url_prefix):]

        class_names = list(class_name_group)
        if not class_names:
            # Nothing to label this file with: emit the placeholder and
            # move on instead of featurizing with an undefined class.
            yield "", ""
            continue
        class_name = str(class_names[0])

        short_fname = os.path.splitext(ntpath.basename(fname))[0].split("$")[0]
        path_to_csv = os.path.join(params['tmp_dir_path'], fname)
        if not os.path.exists(path_to_csv):
            print("*" * 10 + " " + path_to_csv + " doesn't exist on the disk.")
            yield "", ""
            continue

        print("Extracting features for " + fname)
        all_features = featurize.featurize_tsdata_object(
            path_to_csv, short_fname, params['custom_script_path'],
            params['fname_class_dict_2'], params['features_to_use'])
        all_features["class"] = class_name
        yield short_fname, all_features
# Example #2
# 0
def test_featurize_tsdata_object():
    """Featurize one TS data file and check the result.

    The result must be a dict containing every requested feature.
    """
    expected_features = ("std_err", "freq1_harmonics_freq_0")
    data_file = pjoin(DATA_PATH, "dotastro_215153.dat")

    extracted = featurize.featurize_tsdata_object(
        data_file,
        featurize.shorten_fname(data_file),
        pjoin(cfg.UPLOAD_FOLDER, "testfeature1.py"),
        {"dotastro_215153": "Mira"},
        list(expected_features))

    assert isinstance(extracted, dict)
    for feature_name in expected_features:
        assert feature_name in extracted