def write_history(self, *msg):
  with open(os.path.join(self._save_path, 'history.txt'), 'a+') as f:
    f.write("[%s]" % get_formatted_datetime(only_number=False))
    for i, m in enumerate(msg):
      sep = " " if i == 0 else "\t"
      f.write("%s%s\n" % (sep, str(m)))
  return self
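# Hedged usage sketch for `write_history` above: it appends one timestamped,
# tab-separated record per call to `history.txt` under `self._save_path` and
# returns `self`, so calls can be chained. The `_HistoryDemo` class and the
# use of `datetime.now()` below are stand-ins for this example only; the real
# class and `get_formatted_datetime` come from the surrounding project.
import os
from datetime import datetime


class _HistoryDemo(object):

  def __init__(self, save_path):
    self._save_path = save_path
    os.makedirs(save_path, exist_ok=True)

  def write_history(self, *msg):
    with open(os.path.join(self._save_path, 'history.txt'), 'a+') as f:
      f.write("[%s]" % datetime.now().isoformat())
      for i, m in enumerate(msg):
        sep = " " if i == 0 else "\t"
        f.write("%s%s\n" % (sep, str(m)))
    return self


if __name__ == '__main__':
  demo = _HistoryDemo('/tmp/history_demo')
  demo.write_history('epoch 1', 'loss=0.42').write_history('epoch 2', 'loss=0.31')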
def __init__(self):
  super(ExtractorSignal, self).__init__()
  self._timestamp = get_formatted_datetime(only_number=False)
  self._extractor = None
  self._msg = ''
  self._action = 'ignore'
  self._last_input = {}
def get_config_path(self, cfg: DictConfig = None, datetime=False):
  output_path = self.get_output_path(cfg)
  if datetime:
    return os.path.join(
        output_path,
        'configs_%s.yaml' % get_formatted_datetime(only_number=False))
  return os.path.join(output_path, 'configs.yaml')
def fetch_exp_cfg(self, conditions={}, require_model=True) -> dict:
  r"""
  Arguments:
    require_model : a Boolean. If True, only return experiments with a
      saved model.

  Return:
    A dictionary mapping from experiment path to the list of config files.
  """
  conditions = _prepare_conditions(conditions)

  def get_attr(c, name):
    if '.' in name:
      for key in name.split('.'):
        c = c.get(key)
      return c
    return c[name]

  # prepare the path
  path = self._save_path
  exp_path = [
      os.path.join(path, name)
      for name in os.listdir(path)
      if 'exp_' == name[:4]
  ]
  # filter path with require_model
  if require_model:
    exp_path = list(
        filter(lambda x: os.path.isdir(os.path.join(x, 'model')), exp_path))
  ret = {}
  for path in exp_path:
    cfg = sorted(
        [
            os.path.join(path, i)
            for i in os.listdir(path)
            if 'configs_' == i[:8]
        ],
        key=lambda x: get_formatted_datetime(
            only_number=False,
            convert_text=x.split('_')[-1].split('.')[0]).timestamp())
    if len(cfg) > 0:
      if len(conditions) > 0:
        last_cfg = cfg[-1]  # latest config
        with open(last_cfg, 'r') as f:
          last_cfg = OmegaConf.load(f)
        # filter the conditions
        if all(
            get_attr(last_cfg, key) in val
            for key, val in conditions.items()):
          ret[path] = cfg
        del last_cfg
      else:
        ret[path] = cfg
  return ret
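# Standalone illustration of the dotted-key lookup that `fetch_exp_cfg`
# performs through its inner `get_attr` helper: a condition key such as
# 'model.zdim' walks nested mappings of the loaded OmegaConf config. The
# config content below is made up for the example.
from omegaconf import OmegaConf

cfg = OmegaConf.create({'model': {'zdim': 32, 'name': 'vae'}, 'lr': 1e-3})


def get_attr(c, name):
  if '.' in name:
    for key in name.split('.'):
      c = c.get(key)
    return c
  return c[name]


assert get_attr(cfg, 'model.zdim') == 32
assert get_attr(cfg, 'model.name') == 'vae'
# So a call like fetch_exp_cfg(conditions={'model.name': ['vae']}) would keep
# only experiments whose latest saved config has model.name inside that list.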
def _map_func(dat):
  try:
    ret = self.extractor.transform(dat)
  except Exception as e:  # non-handled exception
    ret = '\n========\n'
    ret += 'Time : `%s`\n' % str(get_formatted_datetime(only_number=False))
    ret += 'Error : `%s`\n' % str(e)
    ret += 'Input : `%s`\n' % str(dat)
    import traceback
    etype, value, tb = sys.exc_info()
    for line in traceback.TracebackException(
        type(value), value, tb, limit=None).format(chain=True):
      ret += line
  return ret
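# Standalone sketch of the traceback capture used by `_map_func`: when the
# extractor raises, the full chained traceback is rendered into the returned
# string so the worker reports the failure instead of crashing. Only
# standard-library modules are used; `format_failure` is a hypothetical helper
# name introduced for this example.
import sys
import traceback


def format_failure(exc):
  msg = 'Error : `%s`\n' % str(exc)
  etype, value, tb = sys.exc_info()
  for line in traceback.TracebackException(
      type(value), value, tb, limit=None).format(chain=True):
    msg += line
  return msg


try:
  1 / 0
except Exception as e:
  report = format_failure(e)
  assert 'ZeroDivisionError' in report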
def get_exp_path(system_name, args, override=False):
  """
  Return: exp_dir, model_path, log_path
  """
  exp_dir = get_exppath(tag='TIDIGITS_%s_%s_%s' %
                        (system_name, args.task, args.feat))
  if 'nmix' in args:
    exp_dir += '_%d' % args.nmix
  if 'tdim' in args:
    exp_dir += '_%d' % args.tdim
  # ====== check override ====== #
  if bool(override) and os.path.exists(exp_dir):
    shutil.rmtree(exp_dir)
  if not os.path.exists(exp_dir):
    os.mkdir(exp_dir)
  # ====== basic paths ====== #
  model_path = os.path.join(exp_dir, 'model.ai')
  log_path = os.path.join(
      exp_dir, 'log_%s.txt' % get_formatted_datetime(only_number=True))
  print("Exp dir:", ctext(exp_dir, 'cyan'))
  print("Model path:", ctext(model_path, 'cyan'))
  print("Log path:", ctext(log_path, 'cyan'))
  return exp_dir, model_path, log_path
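# Hedged usage sketch for `get_exp_path`. In the original script `args` comes
# from the project's own CLI parser and supports membership tests such as
# `'nmix' in args`; the `_Args` dict-backed stand-in and all values below are
# made up for illustration only.
class _Args(dict):
  __getattr__ = dict.__getitem__  # allow args.task as well as 'task' in args


args = _Args(task='digit', feat='mspec', nmix=256)
# exp_dir, model_path, log_path = get_exp_path('gmm', args, override=False)
# exp_dir    ~ .../TIDIGITS_gmm_digit_mspec_256
# model_path ~ <exp_dir>/model.ai
# log_path   ~ <exp_dir>/log_<datetime>.txt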
def run(self):
  njobs = len(self.jobs)
  dataset = Dataset(self.path)
  if self.n_cache <= 1:
    cache_limit = max(2, int(0.12 * njobs))
  else:
    cache_limit = int(self.n_cache)
  # ====== indices ====== #
  databases = defaultdictkey(
      lambda key: MmapDict(path=os.path.join(dataset.path, key),
                           cache_size=10000,
                           read_only=False))
  last_start = defaultdict(int)
  # ====== statistic ====== #
  # load old statistics
  stats = defaultdict(lambda: [0, 0])  # name -> (sum1, sum2)
  for key in dataset.keys():
    if 'sum1' == key[-4:]:
      stats[key[:-4]][0] = dataset[key][:]
    elif 'sum2' == key[-4:]:
      stats[key[:-4]][1] = dataset[key][:]
  # all data are cached and periodically flushed to disk
  cache = defaultdict(list)
  n_processed = [0]  # store the value as reference

  # ====== helper ====== #
  def flush_feature(feat_name, X_cached):
    if len(X_cached) > 0:
      X_cached = np.concatenate(X_cached, 0)
      # flush data
      if feat_name in dataset:
        dataset[feat_name].append(X_cached)
      else:
        dataset[(feat_name, 'memmap')] = X_cached

  # ====== repeated for each result returned ====== #
  def post_processing(result):
    # search for file name
    if self.identifier not in result:
      raise RuntimeError(
          "Cannot find identifier '%s' in returned dictionary" %
          self.identifier)
    file_name = result[self.identifier]
    # invalid file_name
    if not is_string(file_name):
      raise RuntimeError(
          "Cannot find file name in returned features "
          "list, the file name can be specified in key: 'name', 'path' "
          "and the type of the value must be string. All available "
          "keys are: %s" % str(result.keys()))
    # store all new indices
    # mapping [X.shape[0]] -> [feat_name, feat_name, ...]
    all_indices = {}
    # processing
    for feat_name, X in result.items():
      # some invalid feat_name
      if feat_name in ('config', 'pipeline', 'sum1', 'sum2'):
        raise RuntimeError(
            "Returned features' name cannot be one "
            "of the following: 'config', 'pipeline', 'sum1', 'sum2'.")
      # ignore some feat_name
      if feat_name in ('name',):
        continue
      # if numpy ndarray, save to MmapData
      if (isinstance(X, np.ndarray) or
          'sum1' == feat_name[-4:] or
          'sum2' == feat_name[-4:]):
        # save statistics instead
        if 'sum1' == feat_name[-4:]:
          stats[feat_name[:-4]][0] += X
        elif 'sum2' == feat_name[-4:]:
          stats[feat_name[:-4]][1] += X
        # save features array
        else:
          all_indices[feat_name] = X.shape[0]
          # cache data, only if we have more than 0 sample
          if X.shape[0] > 0:
            cache[feat_name].append(X)
      # else all other kinds of data are saved to MmapDict
      else:
        databases[feat_name][file_name] = X
      # remove data
      del X
    # ====== update indices ====== #
    if len(all_indices) > 0:
      for feat_name, n in all_indices.items():
        ids_name = 'indices_%s' % feat_name
        databases[ids_name][file_name] = (last_start[ids_name],
                                          last_start[ids_name] + n)
        last_start[ids_name] += n
    # ====== flush cache ====== #
    n_processed[0] += 1
    if n_processed[0] % cache_limit == 0:  # 12 + 8
      for feat_name, X_cached in cache.items():
        flush_feature(feat_name, X_cached)
      cache.clear()
    # ====== update progress ====== #
    return file_name

  # ====== mapping function ====== #
  def _map_func(dat):
    try:
      ret = self.extractor.transform(dat)
    except Exception as e:  # non-handled exception
      ret = '\n========\n'
      ret += 'Time : `%s`\n' % str(get_formatted_datetime(only_number=False))
      ret += 'Error : `%s`\n' % str(e)
      ret += 'Input : `%s`\n' % str(dat)
      import traceback
      etype, value, tb = sys.exc_info()
      for line in traceback.TracebackException(
          type(value), value, tb, limit=None).format(chain=True):
        ret += line
    return ret

  # ====== processing ====== #
  mpi = MPI(jobs=self.jobs,
            func=_map_func,
            ncpu=self.n_cpu,
            batch=1,
            hwm=self.n_cpu * 3,
            backend='python')
  # initialize
  prog = Progbar(target=njobs,
                 name=self.path,
                 interval=0.12,
                 print_report=True,
                 print_summary=True)
  start_time = time.time()
  last_time = time.time()
  last_count = 0
  with open(self._log_path, 'w') as flog:
    # writing the log head
    flog.write('============================\n')
    flog.write('Start Time : %s\n' %
               get_formatted_datetime(only_number=False))
    flog.write('Outpath : %s\n' % self.path)
    flog.write('Extractor : %s\n' % '->'.join(
        [s[-1].__class__.__name__ for s in self.extractor.steps]))
    flog.write('#Jobs : %d\n' % njobs)
    flog.write('#CPU : %d\n' % self.n_cpu)
    flog.write('#Cache : %d\n' % cache_limit)
    flog.write('============================\n')
    flog.flush()
    # start processing the file list
    for count, result in enumerate(mpi):
      # non-handled exception
      if isinstance(result, string_types):
        flog.write(result)
        flog.flush()
        self._error_log.append(result)
        if self.stop_on_failure:
          raise RuntimeError(result)
      # an error might have happened
      elif isinstance(result, ExtractorSignal):
        flog.write(str(result))
        flog.flush()
        if result.action == 'error':
          prog.add_notification(str(result))
          raise RuntimeError(
              "ExtractorSignal requests terminating processor!")
        elif result.action == 'warn':
          prog.add_notification(str(result))
        elif result.action == 'ignore':
          self._error_log.append(result)
        else:
          raise RuntimeError(
              "Unknown action from ExtractorSignal: %s" % result.action)
        prog['File'] = '%-48s' % result.message[:48]
      # otherwise, no error happened, do post-processing
      else:
        name = post_processing(result)
        prog['File'] = '%-48s' % str(name)[:48]
      # update progress
      prog.add(1)
      # manually write to external log file
      if (count + 1) % max(1, int(0.01 * njobs)) == 0:
        curr_time = time.time()
        elap = curr_time - start_time
        avg_speed = (count + 1) / elap
        cur_speed = (count + 1 - last_count) / (curr_time - last_time)
        avg_est = (njobs - count - 1) / avg_speed
        cur_est = (njobs - count - 1) / cur_speed
        flog.write(
            '[%s] Processed: %d(files) Remain: %d(files) Elap.: %.2f(secs)\n'
            ' Avg.Spd: %.2f(obj/sec) Avg.Est.: %.2f(secs)\n'
            ' Cur.Spd: %.2f(obj/sec) Cur.Est.: %.2f(secs)\n' %
            (get_formatted_datetime(only_number=False), count + 1,
             njobs - count - 1, elap, avg_speed, avg_est, cur_speed, cur_est))
        flog.flush()
        last_time = curr_time
        last_count = count + 1
  # ====== end, flush the last time ====== #
  for feat_name, X_cached in cache.items():
    flush_feature(feat_name, X_cached)
  cache.clear()
  cache = None
  dataset.flush()
  prog.add_notification("Flushed all data to disk")
  # ====== saving indices ====== #
  for name, db in databases.items():
    db.flush(save_all=True)
    db_size = len(db)
    db.close()
    prog.add_notification(
        'Flush MmapDict "%s" to disk, size: %s' %
        (ctext(name, 'yellow'), ctext(str(db_size), 'yellow')))

  # ====== save mean and std ====== #
  def save_mean_std(sum1, sum2, name):
    N = dataset[name.split('_')[0]].shape[0]
    mean = sum1 / N
    std = np.sqrt(sum2 / N - np.power(mean, 2))
    if np.any(np.isnan(mean)):
      wprint('Mean contains NaN, name: %s' % name)
    if np.any(np.isnan(std)):
      wprint('Std contains NaN, name: %s' % name)
    dataset[name + 'sum1'] = sum1
    dataset[name + 'sum2'] = sum2
    dataset[name + 'mean'] = mean
    dataset[name + 'std'] = std

  # save all stats
  if len(stats) > 0:
    for feat_name, (sum1, sum2) in stats.items():
      save_mean_std(sum1, sum2, feat_name)
      prog.add_notification(
          'Saved statistics of: %s, shape: %s' %
          (ctext(feat_name.split('_')[0], 'yellow'),
           ctext(str(sum1.shape), 'yellow')))
  # ====== dataset flush() ====== #
  dataset.flush()
  dataset.close()
  # ====== saving the extractor ====== #
  # not a good idea to save the extractor every time
  # pipeline_path = os.path.join(dataset.path, 'pipeline')
  # with open(pipeline_path, 'wb') as f:
  #   cPickle.dump(self.extractor, f, protocol=2)
  # prog.add_notification("Saved Extractor pipeline at: %s" %
  #                       ctext(pipeline_path, 'yellow'))
  # ====== saving the configuration ====== #
  config_path = os.path.join(dataset.path, 'config')
  config = MmapDict(config_path)
  config['__configuration_time__'] = time.time()
  config['__processor__'] = self.path
  for i in dir(self):
    if _default_module.match(i) is not None:
      continue
    j = getattr(self, i)
    if isinstance(j, (Number, string_types, bool)):
      config[i] = j
  config.flush(save_all=True)
  self.config = {i: j for i, j in config}
  config.close()
  prog.add_notification("Saved configuration at: %s" %
                        ctext(config_path, 'yellow'))
  # ====== final notification ====== #
  prog.add_notification("Closed all dataset.")
  prog.add_notification("Dataset at path: %s" % ctext(dataset.path, 'yellow'))
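# Standalone NumPy sketch of the running-statistics trick behind `sum1`/`sum2`
# and `save_mean_std` above: workers accumulate the sum and the sum of squares
# of all frames, and the final pass derives mean and std via
# std = sqrt(E[x^2] - E[x]^2). The random matrix below stands in for the
# concatenated features of one recipe.
import numpy as np

X = np.random.randn(1000, 40).astype('float64')  # fake feature frames
N = X.shape[0]
sum1 = X.sum(axis=0)
sum2 = np.power(X, 2).sum(axis=0)
mean = sum1 / N
std = np.sqrt(sum2 / N - np.power(mean, 2))
assert np.allclose(mean, X.mean(axis=0))
assert np.allclose(std, X.std(axis=0))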
from odin.stats import describe
from helpers import (SCORING_DATASETS, BACKEND_DATASETS, SCORE_SYSTEM_NAME,
                     SCORE_SYSTEM_ID, N_PLDA, N_LDA, PLDA_MAXIMUM_LIKELIHOOD,
                     PLDA_SHOW_LLK, PATH_ACOUSTIC_FEATURES, FEATURE_RECIPE,
                     FEATURE_NAME, get_model_path, NCPU, get_logpath,
                     prepare_dnn_feeder_recipe, sre_file_list, Config,
                     EXP_DIR, VECTORS_DIR, RESULT_DIR, filter_utterances)
# ====== scoring log ====== #
stdio(
    get_logpath(name='make_score.log',
                increasing=True,
                odin_base=False,
                root=EXP_DIR))
print('=' * 48)
print(get_formatted_datetime(only_number=False))
print("System name :", SCORE_SYSTEM_NAME)
print("System id :", SCORE_SYSTEM_ID)
print("Feature recipe :", FEATURE_RECIPE)
print("Feature name :", FEATURE_NAME)
print("Backend dataset:", ','.join(BACKEND_DATASETS.keys()))
print("Scoring dataset:", ','.join(SCORING_DATASETS.keys()))
print('=' * 48)


# ===========================================================================
# Some helpers
# ===========================================================================
def _check_running_feature_extraction(feat_dir, n_files):
  # True means the feature extraction still needs to run
  if not os.path.exists(feat_dir):