def featurize_in_parallel(headerfile_path, zipfile_path, features_to_use=None,
                          is_test=False, custom_script_path=None,
                          meta_features=None):
    """Generate features using Disco's map-reduce framework.

    Utilizes Disco's map-reduce framework to generate features on
    multiple time series data files in parallel. The generated features
    are returned, along with the time series data, in a dict (with file
    names as keys).

    Parameters
    ----------
    headerfile_path : str
        Path to header file containing file names, class names, and
        metadata.
    zipfile_path : str
        Path to the tarball of individual time series files to be used
        for feature generation.
    features_to_use : list of str, optional
        List of feature names to be generated. Defaults to None, in
        which case all available features are used.
    is_test : bool, optional
        Boolean indicating whether to do a test run on only the first
        three time-series files. Defaults to False.
    custom_script_path : str, optional
        Path to Python script containing methods for the generation of
        any custom features.
    meta_features : dict, optional
        Dictionary of associated meta features. Defaults to None, in
        which case an empty dict is used.

    Returns
    -------
    dict
        Dictionary whose keys are the file names of the original
        time-series data and whose values are dictionaries containing a
        dictionary of the features generated and a list of the
        time-series data.

    """
    if meta_features is None:
        meta_features = {}
    session_key = str(uuid.uuid4())[:8]
    all_features_list = cfg.features_list[:] + cfg.features_list_science[:]
    if not features_to_use:
        features_to_use = all_features_list
    if not os.path.exists(cfg.PROJECT_PATH_LINK):
        os.symlink(cfg.PROJECT_PATH, cfg.PROJECT_PATH_LINK)
    # Parse the header file, mapping each file name to its class name.
    # The first line is the column-header row; comment and blank lines
    # are skipped.
    fname_class_dict = {}
    with open(headerfile_path) as headerfile:
        for line_no, line in enumerate(headerfile):
            if (len(line) > 1 and line[0] not in ["#", "\n"] and
                    line_no > 0 and not line.isspace() and
                    len(line.split(',')) >= 2):
                fname, class_name = line.strip('\n').split(',')[:2]
                fname_class_dict[fname] = class_name
    tmp_dir_path = os.path.join("/tmp", str(uuid.uuid4())[:10])
    os.mkdir(tmp_dir_path)
    the_tarfile = tarfile.open(zipfile_path)
    the_tarfile.extractall(tmp_dir_path)
    # Keep only regular-file members; directory entries carry no data.
    all_fnames = [m.name for m in the_tarfile.getmembers() if m.isfile()]
    if is_test:
        all_fnames = all_fnames[:3]
    # Build sanitized DDFS tags ("." replaced with "_") and keep a
    # mapping back to the original short file names.
    orig_fnames_dict = {}
    tags = []
    for i in range(len(all_fnames)):
        short_fname = ntpath.basename(all_fnames[i])
        tags.append(session_key + short_fname.replace(".", "_"))
        orig_fnames_dict[short_fname.replace(".", "_")] = short_fname
        if not os.path.isabs(all_fnames[i]):
            all_fnames[i] = os.path.join(tmp_dir_path, all_fnames[i])
    # Push all data files to DDFS
    disco_tools.push_all_objects(all_fnames, tags)
    print("Generating science features...")
    # Pair each extracted file path with its class name from the header.
    longfname_class_list = []
    for i in range(len(all_fnames)):
        short_fname = os.path.splitext(ntpath.basename(all_fnames[i]))[0]
        if short_fname in fname_class_dict:
            longfname_class_list.append(
                [all_fnames[i], fname_class_dict[short_fname]])
        elif all_fnames[i] in fname_class_dict:
            longfname_class_list.append(
                [all_fnames[i], fname_class_dict[all_fnames[i]]])
    params = {
        'fname_class_dict': fname_class_dict,
        'features_to_use': features_to_use,
        'meta_features': meta_features,
        'custom_script_path': custom_script_path,
        'tmp_dir_path': tmp_dir_path,
        'fname_class_dict_2': disco_tools.headerfile_to_fname_dict(
            headerfile_path)}
    try:
        disco_results = process_featurization_with_disco(
            input_list=tags, params=params)
    finally:
        # Clean up DDFS objects whether or not the Disco job succeeded.
        disco_tools.delete_pushed_objects(session_key)
    fname_features_dict = {}
    for k, v in disco_results:
        fname_features_dict[k] = v
    print("Done generating features.")
    # Restore the original file names as keys (the Disco job keys use
    # "_" in place of "."); iterate over a copy, since mutating a dict
    # while iterating over it raises a RuntimeError in Python 3.
    for key, val in list(fname_features_dict.items()):
        orig_key = orig_fnames_dict[key]
        if orig_key != key:
            fname_features_dict[orig_key] = val
            del fname_features_dict[key]
    return fname_features_dict
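
# A minimal usage sketch for featurize_in_parallel. The paths and the
# feature names below are hypothetical placeholders, and a running Disco
# cluster with DDFS (plus the project's cfg settings) is assumed; this
# helper is illustrative only and is not called anywhere in the pipeline.
def _example_featurize_run():
    features = featurize_in_parallel(
        headerfile_path="/path/to/sample_header.csv",
        zipfile_path="/path/to/sample_ts_data.tar.gz",
        features_to_use=["amplitude", "std_err"],
        is_test=True)
    for fname, feats in sorted(features.items()):
        print(fname, feats)
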
def featurize_prediction_data_in_parallel(
        newpred_file_path, featset_key, sep=',', custom_features_script=None,
        meta_features=None, tmp_dir_path="/tmp"):
    """Generate features using Disco's map-reduce framework.

    Utilizes Disco's map-reduce framework to generate features on
    multiple time series data files in parallel. The generated features
    are returned, along with the time series data, in a dict (with file
    names as keys).

    Parameters
    ----------
    newpred_file_path : str
        Path to the tarball containing time series data files to be
        featurized.
    featset_key : str
        RethinkDB key of the feature set associated with the model to
        be used in prediction.
    sep : str, optional
        Delimiting character in time series data files. Defaults to ",".
    custom_features_script : str, optional
        Path to custom features script to be used in feature
        generation. Defaults to None.
    meta_features : dict, optional
        Dictionary of associated meta features. Defaults to None, in
        which case an empty dict is used.
    tmp_dir_path : str, optional
        Path to the directory in which any temporary files will be
        created. Defaults to "/tmp".

    Returns
    -------
    dict
        Dictionary whose keys are the file names of the original
        time-series data and whose values are dictionaries containing a
        dictionary of the features generated and a list of the
        time-series data.

    """
    if meta_features is None:
        meta_features = {}
    session_key = str(uuid.uuid4())[:8]
    the_tarfile = tarfile.open(newpred_file_path)
    the_tarfile.extractall(path=tmp_dir_path)
    # Keep only regular-file members; directory entries carry no data.
    all_fnames = [m.name for m in the_tarfile.getmembers() if m.isfile()]
    # Build sanitized DDFS tags ("." replaced with "_") and keep a
    # mapping back to the original short file names.
    orig_fnames_dict = {}
    tags = []
    for i in range(len(all_fnames)):
        short_fname = ntpath.basename(all_fnames[i])
        tags.append(session_key + short_fname.replace(".", "_"))
        orig_fnames_dict[short_fname.replace(".", "_")] = short_fname
        if not os.path.isabs(all_fnames[i]):
            all_fnames[i] = os.path.join(tmp_dir_path, all_fnames[i])
    # Push all data files to DDFS
    disco_tools.push_all_objects(all_fnames, tags)
    if not os.path.exists(cfg.PROJECT_PATH_LINK):
        os.symlink(cfg.PROJECT_PATH, cfg.PROJECT_PATH_LINK)
    big_features_and_tsdata_dict = {}
    params = {"featset_key": featset_key, "sep": sep,
              "custom_features_script": custom_features_script,
              "meta_features": meta_features,
              "tmp_dir_path": tmp_dir_path}
    try:
        disco_iterator = process_prediction_data_featurization_with_disco(
            input_list=tags, params=params)
    finally:
        # Clean up DDFS objects whether or not the Disco job succeeded.
        disco_tools.delete_pushed_objects(session_key)
    for fname, (features_dict, ts_data) in disco_iterator:
        if fname != "":
            big_features_and_tsdata_dict[fname] = {
                "features_dict": features_dict, "ts_data": ts_data}
    print("Feature generation complete.")
    # Restore the original file names as keys (the Disco job keys use
    # "_" in place of "."); iterate over a copy, since mutating a dict
    # while iterating over it raises a RuntimeError in Python 3.
    for key, val in list(big_features_and_tsdata_dict.items()):
        orig_key = orig_fnames_dict[key]
        if orig_key != key:
            big_features_and_tsdata_dict[orig_key] = val
            del big_features_and_tsdata_dict[key]
    return big_features_and_tsdata_dict
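
# A minimal usage sketch for featurize_prediction_data_in_parallel. The
# tarball path and featset_key below are hypothetical placeholders, and a
# running Disco cluster plus an existing RethinkDB feature-set record are
# assumed; this helper is illustrative only and is not called anywhere in
# the pipeline.
def _example_prediction_featurize_run():
    results = featurize_prediction_data_in_parallel(
        newpred_file_path="/path/to/new_ts_data.tar.gz",
        featset_key="example_featset_key")
    for fname, entry in sorted(results.items()):
        print(fname, sorted(entry["features_dict"]))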