Exemple #1
0
def test_determine_feats_used():
    """Test determine_feats_used.

    Copies the ``test_features.csv`` / ``test_classes.npy`` fixtures from
    ``DATA_PATH`` into ``cfg.FEATURES_FOLDER`` under the ``TEST001`` key,
    checks the feature names reported by ``pred.determine_feats_used``,
    and removes the copied fixtures again.
    """
    suffixes = ["features.csv", "classes.npy"]
    copied_paths = [pjoin(cfg.FEATURES_FOLDER, "TEST001_%s" % suffix)
                    for suffix in suffixes]
    try:
        for suffix, dest_path in zip(suffixes, copied_paths):
            shutil.copy(pjoin(DATA_PATH, "test_%s" % suffix), dest_path)
        feats_used = pred.determine_feats_used("TEST001")
        npt.assert_array_equal(feats_used, ["meta1", "meta2", "meta3",
                                            "std_err", "amplitude"])
    finally:
        # Clean up even when the copy or the assertion fails, so a failing
        # run does not leave TEST001 fixtures behind for other tests.
        for dest_path in copied_paths:
            if os.path.exists(dest_path):
                os.remove(dest_path)
def pred_featurize_reduce(iter, params):
    """Generate features as reduce step in Disco's map-reduce.

    Generator. Implementation of reduce stage in map-reduce process,
    for model prediction feature generation of time series data.

    This function is never directly called, but rather passed as a
    parameter to the Disco `Job()` object's `run()` method.

    For each file name, the file is looked up (in order) as given, inside
    ``params["tmp_dir_path"]``, and inside ``<cfg.UPLOAD_FOLDER>/unzipped``.
    File names that cannot be resolved are reported on stdout and skipped.
    After featurization the resolved file is deleted from disk; a failure
    to delete is printed and otherwise ignored.

    Parameters
    ----------
    iter : iterable
        Iterable of tuples each containing the file name of a time
        series data file to be used for featurization and an unused
        placeholder string. A leading ``file://`` prefix on the file
        name is stripped.
    params : dict
        Dictionary of parameters for use in map-reduce process. The keys
        read here are ``'featset_key'``, ``'custom_features_script'``,
        ``'meta_features'`` and (only when the file name does not resolve
        as given) ``'tmp_dir_path'``.

    Yields
    ------
    tuple
        A two-element tuple containing the file name of the
        time series data set as its first element, and a two-element
        list containing the extracted features (dict) and the original
        time series data (list of lists) as its second element.

    """
    featset_key = params['featset_key']
    custom_features_script = params['custom_features_script']
    meta_features = params['meta_features']

    # Imports are kept function-local, as in the original implementation.
    # NOTE(review): presumably required because Disco runs this function
    # on worker nodes without the defining module's globals -- confirm.
    import os

    from mltsp import cfg
    from mltsp import predict_class as pred
    import ntpath
    from disco.util import kvgroup

    unzipped_dir = os.path.join(cfg.UPLOAD_FOLDER, "unzipped")

    for fname, _ in kvgroup(sorted(iter)):
        # Strip only the leading URL scheme; `str.replace` would also
        # remove any later occurrence of "file://" inside the path.
        if fname[:7] == "file://":
            fname = fname[7:]

        # Resolve the file: as given first, then the fallback directories.
        fpath = None
        if os.path.isfile(fname):
            fpath = fname
        else:
            for base_dir in (params["tmp_dir_path"], unzipped_dir):
                candidate = os.path.join(base_dir, fname)
                if os.path.isfile(candidate):
                    fpath = candidate
                    break

        if fpath is None:
            upload_path = os.path.join(cfg.UPLOAD_FOLDER, fname)
            shown_path = fname if cfg.UPLOAD_FOLDER in fname else upload_path
            print(shown_path + " is not a file...")
            if os.path.exists(upload_path) or os.path.exists(fname):
                print("But it does exist on the disk.")
            else:
                print("and in fact it doesn't even exist.")
            continue

        features_to_use = pred.determine_feats_used(featset_key)
        big_feats_and_tsdata_dict = pred.featurize_single(
            fpath, features_to_use, custom_features_script, meta_features)

        # Best-effort removal of the input file once it has been
        # featurized; a failed delete must not abort the reduce step.
        try:
            os.remove(fpath)
        except OSError as e:
            print(e)

        # The result dict is keyed by the base file name with anything
        # from the first "$" onwards dropped.
        short_fname = ntpath.basename(fpath).split("$")[0]
        file_result = big_feats_and_tsdata_dict[short_fname]
        all_features = file_result["features_dict"]
        ts_data = file_result["ts_data"]
        yield short_fname, [all_features, ts_data]