def featurize_in_parallel(headerfile_path, zipfile_path, features_to_use=None,
                          is_test=False, custom_script_path=None,
                          meta_features=None):
    """Generate features using Disco's map-reduce framework.

    Utilizes Disco's map-reduce framework to generate features on
    multiple time series data files in parallel. The generated
    features are returned, along with the time series data, in a
    dict (with file names as keys).

    Parameters
    ----------
    headerfile_path : str
        Path to header file containing file names, class names, and
        metadata.
    zipfile_path : str
        Path to the tarball of individual time series files to be used
        for feature generation.
    features_to_use : list of str, optional
        List of feature names to be generated. Defaults to None, in which
        case all available features are used.
    is_test : bool, optional
        Boolean indicating whether to do a test run of only the first
        three time-series files. Defaults to False.
    custom_script_path : str, optional
        Path to Python script containing methods for the generation of
        any custom features.
    meta_features : dict, optional
        Dictionary of associated meta features. Defaults to None, which
        is treated as an empty dict.

    Returns
    -------
    dict
        Dictionary whose keys are the file names of the original time-
        series data and whose values are dictionaries containing a
        dictionary of the generated features and a list of the
        time-series data.
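
    Examples
    --------
    A minimal usage sketch; the paths and feature names below are purely
    illustrative:

    >>> feats_dict = featurize_in_parallel(
    ...     "/path/to/header.csv", "/path/to/ts_data.tar.gz",
    ...     features_to_use=["amplitude", "std_err"])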

    """
    session_key = str(uuid.uuid4())[:8]
    all_features_list = cfg.features_list[:] + cfg.features_list_science[:]

    if not features_to_use:
        features_to_use = all_features_list
    if meta_features is None:
        meta_features = {}

    if not os.path.exists(cfg.PROJECT_PATH_LINK):
        os.symlink(cfg.PROJECT_PATH, cfg.PROJECT_PATH_LINK)
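    # Map each time-series file name listed in the header file to its class
    # name, skipping the header line, comment lines and blank lines.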
    fname_class_dict = {}
    line_no = 0
    with open(headerfile_path) as headerfile:
        for line in headerfile:
            if len(line) > 1 and line[0] not in ["#", "\n"] and \
               line_no > 0 and not line.isspace():
                if len(line.split(',')) >= 2:
                    fname, class_name = line.strip('\n').split(',')[:2]
                    fname_class_dict[fname] = class_name
            line_no += 1
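    # Extract the tarball of individual time-series files into a fresh
    # temporary directory.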
    tmp_dir_path = os.path.join("/tmp", str(uuid.uuid4())[:10])
    os.mkdir(tmp_dir_path)
    the_tarfile = tarfile.open(zipfile_path)
    the_tarfile.extractall(tmp_dir_path)
    all_fnames = [m.name for m in the_tarfile.getmembers() if m.isfile()]
    the_tarfile.close()
    if is_test:
        all_fnames = all_fnames[:3]

    orig_fnames_dict = {}
    tags = []
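    # Tag each file with the session key plus a sanitized file name so the
    # pushed DDFS objects can be identified later, and remember the original
    # file names so the results can be re-keyed by them.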
    for i in range(len(all_fnames)):
        short_fname = ntpath.basename(all_fnames[i])
        tags.append(str(session_key +
                        short_fname.replace(".", "_")))
        orig_fnames_dict[short_fname.replace(".", "_")] = short_fname
        if not os.path.isabs(all_fnames[i]):
            all_fnames[i] = os.path.join(tmp_dir_path, all_fnames[i])
    # Push all data files to DDFS
    disco_tools.push_all_objects(all_fnames, tags)

    print("Generating science features...")

    longfname_class_list = []
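    # Pair each extracted file path with its class name from the header file.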
    for i in range(len(all_fnames)):
        short_fname = os.path.splitext(ntpath.basename(all_fnames[i]))[0]
        if short_fname in fname_class_dict:
            longfname_class_list.append([
                all_fnames[i], fname_class_dict[short_fname]])
        elif all_fnames[i] in fname_class_dict:
            longfname_class_list.append([
                all_fnames[i], fname_class_dict[all_fnames[i]]])

    params = {}
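    # Parameters made available to the Disco featurization tasks.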
    params['fname_class_dict'] = fname_class_dict
    params['features_to_use'] = features_to_use
    params['meta_features'] = meta_features
    params['custom_script_path'] = custom_script_path
    params['tmp_dir_path'] = tmp_dir_path
    params['fname_class_dict_2'] = disco_tools.headerfile_to_fname_dict(
        headerfile_path)

    try:
        disco_results = process_featurization_with_disco(
            input_list=tags, params=params)
    finally:
        # Clean up the objects pushed to DDFS, whether or not the job succeeded
        disco_tools.delete_pushed_objects(session_key)
    fname_features_dict = {}
    for k, v in disco_results:
        fname_features_dict[k] = v

    print("Done generating features.")
    # Re-key the results by the original (unsanitized) file names
    fname_features_dict = {orig_fnames_dict[key]: val
                           for key, val in fname_features_dict.items()}

    return fname_features_dict


def featurize_prediction_data_in_parallel(
        newpred_file_path, featset_key, sep=',',
        custom_features_script=None, meta_features=None,
        tmp_dir_path="/tmp"):
    """Generate features using Disco's map-reduce framework.

    Utilizes Disco's map-reduce framework to generate features on
    multiple time series data files in parallel. The generated
    features are returned, along with the time series data, in a
    dict (with file names as keys).

    Parameters
    ----------
    newpred_file_path : str
        Path to the zip file containing time series data files to be
        featurized.
    featset_key : str
        RethinkDB key of the feature set associated with the model to
        be used in prediction.
    sep : str, optional
        Delimiting character in time series data files. Defaults to ",".
    custom_features_script : str, optional
        Path to custom features script to be used in feature
        generation. Defaults to None.
    meta_features : dict, optional
        Dictionary of associated meta features. Defaults to None, which
        is treated as an empty dict.
    tmp_dir_path : str, optional
        Path to the directory in which temporary files are created.
        Defaults to "/tmp".

    Returns
    -------
    dict
        Dictionary whose keys are the file names of the original time-
        series data and whose values are dictionaries containing a
        dictionary of the generated features and a list of the
        time-series data.
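
    Examples
    --------
    A minimal usage sketch; the path and feature-set key below are purely
    illustrative:

    >>> results = featurize_prediction_data_in_parallel(
    ...     "/path/to/new_ts_data.tar.gz", "example_featset_key")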

    """
    if meta_features is None:
        meta_features = {}
    session_key = str(uuid.uuid4())[:8]
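    # Extract the tarball of time-series files to be predicted on into the
    # temporary directory.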
    the_tarfile = tarfile.open(newpred_file_path)
    the_tarfile.extractall(path=tmp_dir_path)
    all_fnames = [m.name for m in the_tarfile.getmembers() if m.isfile()]
    the_tarfile.close()

    orig_fnames_dict = {}
    tags = []
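    # Tag each file for DDFS with the session key and a sanitized file name,
    # keeping a map back to the original file names.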
    for i in range(len(all_fnames)):
        short_fname = ntpath.basename(all_fnames[i])
        tags.append(str(session_key +
                        short_fname.replace(".", "_")))
        orig_fnames_dict[short_fname.replace(".", "_")] = short_fname
        if not os.path.isabs(all_fnames[i]):
            all_fnames[i] = os.path.join(tmp_dir_path, all_fnames[i])
    # Push all data files to DDFS
    disco_tools.push_all_objects(all_fnames, tags)

    if not os.path.exists(cfg.PROJECT_PATH_LINK):
        os.symlink(cfg.PROJECT_PATH, cfg.PROJECT_PATH_LINK)
    big_features_and_tsdata_dict = {}

    params = {"featset_key": featset_key, "sep": sep,
              "custom_features_script": custom_features_script,
              "meta_features": meta_features,
              "tmp_dir_path": tmp_dir_path}

    try:
        disco_iterator = process_prediction_data_featurization_with_disco(
            input_list=tags, params=params)
    finally:
        # Clean up the objects pushed to DDFS, whether or not the job succeeded
        disco_tools.delete_pushed_objects(session_key)
    for k, v in disco_iterator:
        fname = k
        features_dict, ts_data = v
        if fname != "":
            big_features_and_tsdata_dict[fname] = {
                "features_dict": features_dict, "ts_data": ts_data}

    print("Feature generation complete.")
    # Re-key the results by the original (unsanitized) file names
    big_features_and_tsdata_dict = {
        orig_fnames_dict[key]: val
        for key, val in big_features_and_tsdata_dict.items()}
    return big_features_and_tsdata_dict