def __init__(self, path=None, text_kv=None, map_text2idx=None):
    self.key = []
    self.text = []
    self.map_text2idx = None
    if path is not None:
        with open(path) as f:
            all_texts = regex_key_val.findall(f.read())
        _key, _text = list(zip(*all_texts))
        self.key, self.text = list(_key), list(_text)
        for ii in range(len(self.text)):
            self.text[ii] = self.text[ii].split()
    else:
        assert text_kv is not None and isinstance(text_kv, list)
        for kv in text_kv:
            self.key.append(kv[0])
            self.text.append(kv[1].split())

    # additional feature #
    self.idx2key = dict([(x, y) for x, y in enumerate(self.key[:])])
    self.key2idx = dict([(y, x) for x, y in enumerate(self.key[:])])
    assert len(self.idx2key) == len(self.key2idx)

    # set map_text2idx if provided #
    if map_text2idx is not None:
        if isinstance(map_text2idx, str):
            # safe_load avoids PyYAML's unsafe default loader #
            map_text2idx = yaml.safe_load(open(map_text2idx))
        self.set_map_text2idx(map_text2idx)
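# Usage sketch, assuming this is TextIterator.__init__ (matching the call
# TextIterator(path=..., map_text2idx=...) used elsewhere in this repo) and
# that set_map_text2idx simply stores the vocab dict:
# vocab = {'<s>': 0, '</s>': 1, 'a': 2, 'b': 3}
# text_iter = TextIterator(text_kv=[('utt1', 'a b a'), ('utt2', 'b')],
#                          map_text2idx=vocab)
# text_iter.text[0]          # -> ['a', 'b', 'a']
# text_iter.key2idx['utt2']  # -> 1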
def __init__(self, feat_path=None, feat_len_path=None, feat_kv=None,
             feat_len_kv=None, in_memory=False):
    """
    feat_path : feat.scp (pair between key and feat path)
    feat_len_path : feat_len.scp (file pair between key and length)
    """
    self.in_memory = in_memory
    if feat_kv is not None:
        # case 1 : (key, value) pairs are given directly as lists #
        list_kv = feat_kv
        list_klen = feat_len_kv
    elif feat_path is not None:
        assert os.path.exists(feat_path), "feat.scp does not exist"
        if feat_len_path is None:
            # derive feat_len_path automatically from feat_path #
            feat_len_path = '{}_len{}'.format(*os.path.splitext(feat_path))
        assert os.path.exists(feat_len_path), "feat_len.scp does not exist"
        self.feat_len_path = feat_len_path
        # read feat.scp #
        list_kv = regex_key_val.findall(open(feat_path).read())
        list_klen = regex_key_val.findall(open(feat_len_path).read())
    else:
        raise ValueError('either feat_kv or feat_path must be provided')

    # create map #
    self.key, self.feat_path = zip(*list_kv)
    _tmp_key, self.feat_len = zip(*list_klen)
    self.feat_len = [int(x) for x in self.feat_len]
    assert self.key == _tmp_key, "feat.scp key != feat_len.scp key"
    self.idx2key = dict([(x, y) for x, y in enumerate(self.key)])
    self.key2idx = dict([(y, x) for x, y in enumerate(self.key)])
    assert len(self.idx2key) == len(self.key2idx)
    if self.in_memory:
        self._load_feat_to_memory()
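# Usage sketch, assuming this is DataIteratorNP.__init__ (matching the call
# DataIteratorNP(...) used elsewhere in this repo); file names are placeholders:
# feat_iter = DataIteratorNP(feat_path='feat.scp')  # also reads feat_len.scp
# feat_iter = DataIteratorNP(feat_kv=[('utt1', '/data/utt1.npz')],
#                            feat_len_kv=[('utt1', '120')])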
import os
import argparse
from multiprocessing import Pool

import numpy as np

from utilbox.regex_util import regex_key_val


def parse():
    parser = argparse.ArgumentParser(description='params')
    parser.add_argument('--data_path', type=str, required=True)
    parser.add_argument('--ncpu', type=int, default=16)
    return parser.parse_args()


def get_feat_length(path):
    # defined at module level so Pool workers can pickle it #
    return np.load(path)['feat'].shape[0]


if __name__ == '__main__':
    args = parse()
    folder = os.path.dirname(args.data_path)
    kv_list = regex_key_val.findall(open(args.data_path).read())
    k_list, v_list = zip(*kv_list)
    with Pool(args.ncpu) as executor:
        # read the frame length of every feature file in parallel #
        output_result = executor.map(get_feat_length, v_list)
    # use basename so the folder is not joined twice #
    output_file = open(
        os.path.join(folder, '{}_len{}'.format(
            *os.path.splitext(os.path.basename(args.data_path)))), 'w')
    for k, v in zip(k_list, output_result):
        output_file.write('{} {}\n'.format(k, v))
    output_file.close()
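# Example invocation (hypothetical script name and file layout):
#   python feat_len.py --data_path data/feat.scp --ncpu 8
# writes data/feat_len.scp containing one "<key> <num_frames>" line per utterance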
import os
import argparse

from utilbox.regex_util import regex_key_val


def parse():
    parser = argparse.ArgumentParser()
    parser.add_argument('--utt2spk', type=str)
    parser.add_argument('--spk', type=str)
    return parser.parse_args()


if __name__ == '__main__':
    args = parse()
    list_kv = regex_key_val.findall(open(os.path.abspath(args.utt2spk)).read())
    list_spk = set(open(args.spk).read().split('\n'))
    # keep only utterances whose speaker appears in the speaker list #
    list_subset = [x[0] for x in list_kv if x[1] in list_spk]
    list_subset = sorted(list_subset)
    for k in list_subset:
        print(k)
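# Example invocation (hypothetical file names; utt2spk holds "<utt> <spk>" lines,
# spk holds one speaker id per line):
#   python subset_utt2spk.py --utt2spk data/utt2spk --spk dev_spk.lst > dev.keys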
def _read_key_val(path):
    # parse "<key> <value>" lines into a list of (key, value) tuples #
    return regex_key_val.findall(open(path).read())
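# For reference, a hypothetical reconstruction of utilbox.regex_util.regex_key_val;
# the real pattern lives in that module, but every caller here treats it as
# "one '<key> <value>' pair per line", Kaldi .scp style:
import re

regex_key_val = re.compile(r'^(\S+)[ \t]+(.+)$', re.MULTILINE)

example = 'utt1 /data/utt1.npz\nutt2 /data/utt2.npz\n'
assert regex_key_val.findall(example) == [
    ('utt1', '/data/utt1.npz'), ('utt2', '/data/utt2.npz')]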
import argparse

from utilbox.regex_util import regex_key_val


def parse():
    parser = argparse.ArgumentParser()
    parser.add_argument('--files', type=str, nargs='+')
    return parser.parse_args()


if __name__ == '__main__':
    args = parse()
    _main_kv = dict()
    for filename in args.files:
        _key_val = regex_key_val.findall(open(filename).read())
        for k, v in _key_val:
            # first occurrence of a key wins; later duplicates are ignored #
            if k not in _main_kv:
                _main_kv[k] = v
    _main_kv = sorted(list(_main_kv.items()), key=lambda x: x[0])
    print('\n'.join([x + ' ' + y for x, y in _main_kv]))
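# Example invocation (hypothetical file names):
#   python merge_kv.py --files a.scp b.scp > merged.scp
# keys present in both a.scp and b.scp take their value from a.scp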
import sys

from utilbox.regex_util import regex_key_val

if __name__ == '__main__':
    # first pass: collect the set of keys shared by all files #
    set_global = None
    for fii in sys.argv[1:]:
        with open(fii) as f:
            list_kv = regex_key_val.findall(f.read())
        if set_global is None:
            set_global = set([x[0] for x in list_kv])
        else:
            set_global.intersection_update([x[0] for x in list_kv])

    # second pass: rewrite each file, keeping only the shared keys #
    for fii in sys.argv[1:]:
        with open(fii) as f, open(fii + '.fix', 'w') as g:
            list_kv = regex_key_val.findall(f.read())
            for k, v in list_kv:
                if k in set_global:
                    g.write('{} {}\n'.format(k, v))
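# Example invocation (hypothetical file names):
#   python intersect_kv.py text.scp feat.scp
# writes text.scp.fix and feat.scp.fix containing only the keys common to both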
CHUNKSIZE = 500

if __name__ == '__main__':
    args = parse()
    assert args.scaler_type is not None
    if os.path.exists(args.output):
        assert os.path.isdir(args.output), "output must be a folder"
    else:
        os.makedirs(args.output, mode=0o755, exist_ok=False)
        os.makedirs(os.path.join(args.output, 'meta'), mode=0o755,
                    exist_ok=False)
    kv_list = open(args.data_path).read()
    kv_list = regex_key_val.findall(kv_list)

    # filter by key set if provided #
    if args.set is not None:
        subset = regex_key.findall(open(args.set).read())
        subset = set(subset)
        total_len = len(kv_list)
        kv_list = [x for x in kv_list if x[0] in subset]
        print('[info] select {}/{} from dataset'.format(
            len(kv_list), total_len))
    k_list, v_list = zip(*kv_list)
    executor = ProcessPoolExecutor(max_workers=cpu_count())
    if args.use_scaler is None:
        if args.scaler_type == 'meanstd':
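# The fragment above breaks off before the scaler is fit; below is a minimal
# sketch of the kind of chunked mean/std fitting it appears to set up. This is
# an assumption, not the author's implementation: fit_meanstd_scaler is a
# hypothetical helper, and features are assumed stored as np.load(path)['feat']
# as elsewhere in this repo.
import numpy as np
from sklearn.preprocessing import StandardScaler


def fit_meanstd_scaler(paths, chunksize=500):
    scaler = StandardScaler()
    for ii in range(0, len(paths), chunksize):
        # stack a chunk of utterances and update the running mean & variance #
        feats = np.concatenate(
            [np.load(p)['feat'] for p in paths[ii:ii + chunksize]], axis=0)
        scaler.partial_fit(feats)
    return scaler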
    'scaler', None) is not None else None
# scaler_second = pickle.load(open(data_second_cfg['scaler'], 'rb'),
#     encoding='latin1') if data_second_cfg.get('scaler', None) is not None else None

# generate label text #
# if not a file, then read as normal text input #
list_kv = None
if opts['text'] is not None:
    if not os.path.exists(opts['text']):
        text_list = [x.strip() for x in opts['text'].split('||')]
        key_list = ['text_{}'.format(ii) for ii in range(len(text_list))]
        list_kv = list(zip(key_list, text_list))
        assert opts['mode'] == 'pred', (
            "free text input can't be used with teacher forcing mode")
    else:
        list_kv = regex_key_val.findall(open(opts['text']).read())
        key_list, text_list = list(map(list, zip(*list_kv)))
map_text2idx = json.load(open(data_cfg['text']['vocab']))
if model.TYPE == TacotronType.MULTI_SPEAKER:
    feat_spkvec_iterator = DataIteratorNP(data_cfg['misc']['spkvec'])

### generation process ###
# create folder for generated result #
if opts['path'] is None:
    tmpdir = tempfile.mkdtemp()
    os.makedirs(os.path.join(tmpdir, 'meta'), exist_ok=False)
    print('\nCreate temporary dir: {}'.format(tmpdir), file=sys.stderr)
else:
    tmpdir = opts['path']
    # if not exist, create new dir #
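# Example of the free-text input form handled above (script name hypothetical;
# '||' separates utterances, which get auto-generated keys text_0, text_1, ...):
#   python generate.py --mode pred --text "hello world || how are you"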
    pass
elif args.mode == 'pred':
    pass

if args.wav_scp is None and args.feat_scp is None:
    logger.info('fallback ... read feat_scp from data_cfg')
    args.feat_scp = data_cfg['feat']['all']

if args.mode == 'tf':
    assert args.data_cfg is not None
    text_iterator = TextIterator(path=data_cfg['text']['all'],
                                 map_text2idx=map_text2idx)

if args.wav_scp is not None:
    # list all wav files #
    list_key_wav = regex_key_val.findall(open(args.wav_scp).read())
    if args.set is not None:
        if os.path.exists(args.set):
            list_key_wav = DataLoader._subset_data(
                list_key_wav, DataLoader._read_key(args.set))
        else:
            args.set = args.set.split(' ')
            list_key_wav = DataLoader._subset_data(list_key_wav, args.set)
    list_key_wav = sorted(list_key_wav, key=lambda x: x[0])

    # lazy load -- saving memory #
    def lazy_generate_feat(path, cfg):
        _feat = generate_feat_opts(path=path, cfg=cfg)
        if scaler is not None:
            _feat = scaler.transform(_feat)
        return _feat
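# Note the two accepted forms of --set handled above (flag name from args.set):
#   --set dev.keys          # a file containing one key per line
#   --set "utt1 utt2 utt3"  # or an inline space-separated key list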
    for name in cleaner_names:
        cleaner = getattr(cleaners, name)
        if not cleaner:
            raise Exception('Unknown cleaner: %s' % name)
        text = cleaner(text)
    return text


def _symbols_to_sequence(symbols):
    return [_symbol_to_id[s] for s in symbols if _should_keep_symbol(s)]


def _arpabet_to_sequence(text):
    return _symbols_to_sequence(['@' + s for s in text.split()])


def _should_keep_symbol(s):
    # use equality, not identity, when comparing against string literals #
    return s in _symbol_to_id and s not in ('_', '~')


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--text', type=str, help='text file')
    parser.add_argument('--cleaners', type=str, nargs='+',
                        help='cleaner type (english_cleaners)',
                        default=['english_cleaners'])
    args = parser.parse_args()
    with open(args.text) as f:
        all_lines = f.read()
    list_kv = regex_key_val.findall(all_lines)
    for k, v in list_kv:
        # normalize text via a round trip through the symbol sequence #
        v_normalize = sequence_to_text(text_to_sequence(v, args.cleaners))
        print('{} {}'.format(k, v_normalize))
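# Example invocation (hypothetical script name; --text holds "<key> <sentence>" lines):
#   python clean_text.py --text data/text --cleaners english_cleaners
# prints each key with its cleaner-normalized transcription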
import argparse

import numpy as np
from fastdtw import fastdtw
from concurrent.futures import ProcessPoolExecutor
from scipy.spatial.distance import euclidean
from tqdm import tqdm

from utilbox.regex_util import regex_key_val


def parse():
    parser = argparse.ArgumentParser(
        description='calculate DTW distance between reference & hypothesis features')
    parser.add_argument('--ref', required=True,
                        help='reference feature list (with key)')
    parser.add_argument('--hyp', required=True,
                        help='hypothesis feature list (with key)')
    return parser.parse_args()


if __name__ == '__main__':
    args = parse()
    list_kv_ref = regex_key_val.findall(open(args.ref).read())
    list_kv_hyp = regex_key_val.findall(open(args.hyp).read())
    list_key_ref = [x[0] for x in list_kv_ref]
    list_key_hyp = [x[0] for x in list_kv_hyp]
    assert list_key_ref == list_key_hyp, "the keys between refs & hyps are not the same"
    dict_kv_ref = dict(list_kv_ref)
    dict_kv_hyp = dict(list_kv_hyp)
    total_dist = 0
    total_count = 0
    total_len = 0
    for kk in tqdm(list_key_hyp, ncols=50):
        v_ref = dict_kv_ref[kk]
        v_hyp = dict_kv_hyp[kk]
        _feat_ref = np.load(v_ref)['feat']
        _feat_hyp = np.load(v_hyp)['feat']
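        # (fragment ends here; a hedged sketch of the likely accumulation step,
        #  not the author's exact code:)
        # _dist, _path = fastdtw(_feat_ref, _feat_hyp, dist=euclidean)
        # total_dist += _dist
        # total_len += len(_feat_ref)
        # total_count += 1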
import json
import sys

from utilbox.regex_util import regex_key_val

if __name__ == '__main__':
    list_kv = regex_key_val.findall(sys.stdin.read())
    print(json.dumps(dict(list_kv), indent=2))
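# Example invocation (hypothetical script name):
#   cat feat.scp | python kv_to_json.py
# prints {"utt1": "/data/utt1.npz", ...}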