Ejemplo n.º 1
0
    def __init__(self, path=None, text_kv=None, map_text2idx=None):
        """Load (key, text) pairs from a Kaldi-style scp file or a list.

        path : str, optional
            File of "key value" lines, parsed with regex_key_val.
        text_kv : list of (key, text) pairs, optional
            Used when ``path`` is None; each text is split on whitespace.
        map_text2idx : dict or str, optional
            Token-to-index map; when given as a str it is treated as a
            YAML file path and loaded.
        """
        self.key = []
        self.text = []
        self.map_text2idx = None
        if path is not None:
            with open(path) as f:
                all_texts = regex_key_val.findall(f.read())
            _key, _text = zip(*all_texts)
            self.key = list(_key)
            # tokenize every utterance on whitespace
            self.text = [t.split() for t in _text]
        else:
            assert text_kv is not None and isinstance(text_kv, list)
            for kv in text_kv:
                self.key.append(kv[0])
                self.text.append(kv[1].split())

        # bidirectional key <-> index maps #
        self.idx2key = dict(enumerate(self.key))
        self.key2idx = {k: i for i, k in enumerate(self.key)}
        # equal sizes imply keys are unique
        assert len(self.idx2key) == len(self.key2idx)

        # set map_text2idx if provided (a str means a YAML file path) #
        if map_text2idx is not None:
            if isinstance(map_text2idx, str):
                # NOTE(review): yaml.load on an arbitrary file is unsafe for
                # untrusted input — consider yaml.safe_load if no custom tags
                # are needed.
                with open(map_text2idx) as f:
                    map_text2idx = yaml.load(f)
            self.set_map_text2idx(map_text2idx)
Ejemplo n.º 2
0
 def __init__(self,
              feat_path=None,
              feat_len_path=None,
              feat_kv=None,
              feat_len_kv=None,
              in_memory=False):
     """
     feat_path : feat.scp (pair between key and feat path)
     feat_len_path : feat_len.scp (file pair between key and length)
     feat_kv / feat_len_kv : pre-parsed (key, value) lists, used instead
         of reading the scp files
     in_memory : if True, load every feature into memory up front
     """
     self.in_memory = in_memory
     if feat_kv is not None:
         # case 1 : (key, value) lists given directly
         list_kv = feat_kv
         list_klen = feat_len_kv
     elif feat_path is not None:
         assert os.path.exists(feat_path), "feat.scp does not exist"
         if feat_len_path is None:
             # derive the length file automatically: foo.scp -> foo_len.scp
             feat_len_path = '{}_len{}'.format(*os.path.splitext(feat_path))
         assert os.path.exists(feat_len_path), "feat_len.scp does not exist"
         self.feat_len_path = feat_len_path
         # read feat.scp / feat_len.scp, closing the files afterwards
         with open(feat_path) as f:
             list_kv = regex_key_val.findall(f.read())
         with open(feat_len_path) as f:
             list_klen = regex_key_val.findall(f.read())
     else:
         raise ValueError("either feat_kv or feat_path must be provided")
     # create key <-> index maps
     self.key, self.feat_path = zip(*list_kv)
     _tmp_key, self.feat_len = zip(*list_klen)
     self.feat_len = [int(x) for x in self.feat_len]
     # both scp files must list the same keys in the same order
     assert self.key == _tmp_key, "feat.scp key != feat_len.scp key"
     self.idx2key = dict(enumerate(self.key))
     self.key2idx = {k: i for i, k in enumerate(self.key)}
     assert len(self.idx2key) == len(self.key2idx)
     if self.in_memory:
         self._load_feat_to_memory()
Ejemplo n.º 3
0
def parse():
    """Parse command-line arguments: --data_path (required), --ncpu."""
    parser = argparse.ArgumentParser(description='params')
    parser.add_argument('--data_path', type=str, required=True)
    parser.add_argument('--ncpu', type=int, default=16)
    return parser.parse_args()


if __name__ == '__main__':
    args = parse()

    def get_feat_length(path):
        """Return the number of frames of the 'feat' array stored at *path*."""
        return np.load(path)['feat'].shape[0]

    folder = os.path.dirname(args.data_path)
    with open(args.data_path) as f:
        kv_list = regex_key_val.findall(f.read())
    k_list, v_list = zip(*kv_list)

    with Pool(args.ncpu) as executor:
        # compute the length of every feature file in parallel
        output_result = executor.map(get_feat_length, v_list)

    # write "<key> <length>" lines next to the input scp (foo.scp -> foo_len.scp)
    out_path = os.path.join(
        folder, '{}_len{}'.format(*os.path.splitext(args.data_path)))
    with open(out_path, 'w') as output_file:
        for k, v in zip(k_list, output_result):
            output_file.write('{} {}\n'.format(k, v))
Ejemplo n.º 4
0
import os
import argparse
from utilbox.regex_util import regex_key_val


def parse():
    """Parse command-line arguments: --utt2spk and --spk file paths."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--utt2spk', type=str)
    parser.add_argument('--spk', type=str)
    return parser.parse_args()


if __name__ == '__main__':
    args = parse()
    # utterance-key -> speaker pairs
    with open(os.path.abspath(args.utt2spk)) as f:
        list_kv = regex_key_val.findall(f.read())
    # set of speaker ids to keep (one per line)
    with open(args.spk) as f:
        list_spk = set(f.read().split('\n'))
    # keep only utterances whose speaker is in the set, sorted by key
    list_subset = sorted(k for k, spk in list_kv if spk in list_spk)
    for k in list_subset:
        print(k)
Ejemplo n.º 5
0
 def _read_key_val(path):
     """Read a file of "key value" lines; return a list of (key, value)."""
     with open(path) as f:
         return regex_key_val.findall(f.read())
Ejemplo n.º 6
0
from utilbox.regex_util import regex_key_val
import argparse


def parse():
    """Build the CLI parser (--files: one or more paths) and parse argv."""
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('--files', type=str, nargs='+')
    parsed = arg_parser.parse_args()
    return parsed


if __name__ == '__main__':
    args = parse()
    # merge key/value files; the FIRST occurrence of a key wins
    _main_kv = dict()
    for filename in args.files:
        with open(filename) as f:
            _key_val = regex_key_val.findall(f.read())
        for k, v in _key_val:
            if k not in _main_kv:
                _main_kv[k] = v
    # print the merged pairs sorted by key
    _main_kv = sorted(_main_kv.items())
    print('\n'.join([x + ' ' + y for x, y in _main_kv]))
Ejemplo n.º 7
0
import sys
from utilbox.regex_util import regex_key_val

if __name__ == '__main__':
    # first pass: compute the set of keys common to every input file
    set_global = None
    for fname in sys.argv[1:]:
        with open(fname) as f:
            keys = [pair[0] for pair in regex_key_val.findall(f.read())]
        if set_global is None:
            set_global = set(keys)
        else:
            set_global.intersection_update(keys)

    # second pass: rewrite each file as <file>.fix keeping only shared keys
    for fname in sys.argv[1:]:
        with open(fname) as f, open(fname + '.fix', 'w') as g:
            for key, val in regex_key_val.findall(f.read()):
                if key in set_global:
                    g.write('{} {}\n'.format(key, val))
Ejemplo n.º 8
0
# Batch size for chunked processing — presumably the per-worker chunk size;
# TODO(review): confirm against the call sites below (truncated in this view).
CHUNKSIZE = 500

if __name__ == '__main__':
    args = parse()
    assert (args.scaler_type is not None)

    if os.path.exists(args.output):
        assert os.path.isdir(args.output), "output must be a folder"
    else:
        os.makedirs(args.output, mode=0o755, exist_ok=False)
        os.makedirs(os.path.join(args.output, 'meta'),
                    mode=0o755,
                    exist_ok=False)

    kv_list = open(args.data_path).read()
    kv_list = regex_key_val.findall(kv_list)

    # filter if key set exist #
    if args.set is not None:
        subset = regex_key.findall(open(args.set).read())
        subset = set(subset)
        total_len = len(kv_list)
        kv_list = [x for x in kv_list if x[0] in subset]
        print('[info] select {}/{} from dataset'.format(
            len(kv_list), total_len))

    k_list, v_list = zip(*kv_list)

    executor = ProcessPoolExecutor(max_workers=cpu_count())
    if args.use_scaler is None:
        if args.scaler_type == 'meanstd':
Ejemplo n.º 9
0
                                  'scaler', None) is not None else None

    # scaler_second = pickle.load(open(data_second_cfg['scaler'], 'rb'), encoding='latin1') if data_second_cfg.get('scaler',None) is not None else None

    # generate label text #
    # if not a file, then read as normal text input #
    list_kv = None
    if opts['text'] is not None:
        if not os.path.exists(opts['text']):
            text_list = [x.strip() for x in opts['text'].split('||')]
            key_list = ['text_{}'.format(ii) for ii in range(len(text_list))]
            list_kv = list(zip(key_list, text_list))
            assert opts[
                'mode'] == 'pred', "free text input can't be used with teacher forcing mode"
        else:
            list_kv = regex_key_val.findall(open(opts['text']).read())
            key_list, text_list = list(map(list, zip(*list_kv)))

    map_text2idx = json.load(open(data_cfg['text']['vocab']))
    if model.TYPE == TacotronType.MULTI_SPEAKER:
        feat_spkvec_iterator = DataIteratorNP(data_cfg['misc']['spkvec'])

    ### generation process ###
    # create folder for generated result #
    if opts['path'] is None:
        tmpdir = tempfile.mkdtemp()
        os.makedirs(os.path.join(tmpdir, 'meta'), exist_ok=False)
        print('\nCreate temporary dir: {}'.format(tmpdir), file=sys.stderr)
    else:
        tmpdir = opts['path']
        # if not exist, create new dir #
Ejemplo n.º 10
0
        pass
    elif args.mode == 'pred':
        pass

    if args.wav_scp is None and args.feat_scp is None:
        logger.info('fallback ... read feat_scp from data_cfg')
        args.feat_scp = data_cfg['feat']['all']

    if args.mode == 'tf':
        assert args.data_cfg is not None
        text_iterator = TextIterator(path=data_cfg['text']['all'],
                                     map_text2idx=map_text2idx)

    if args.wav_scp is not None:
        # list all wav files #
        list_key_wav = regex_key_val.findall(open(args.wav_scp).read())
        if args.set is not None:
            if os.path.exists(args.set):
                list_key_wav = DataLoader._subset_data(
                    list_key_wav, DataLoader._read_key(args.set))
            else:
                args.set = args.set.split(' ')
                list_key_wav = DataLoader._subset_data(list_key_wav, args.set)
        list_key_wav = sorted(list_key_wav, key=lambda x: x[0])

        # lazy load -- saving memory #
        def lazy_generate_feat(path, cfg):
            """Generate features for *path* per *cfg*, scaling when a scaler
            is set.

            NOTE(review): closure over the enclosing `scaler`; exact output
            shape depends on generate_feat_opts (defined elsewhere) — verify.
            """
            _feat = generate_feat_opts(path=path, cfg=cfg)
            if scaler is not None:
                _feat = scaler.transform(_feat)
            return _feat
Ejemplo n.º 11
0
    for name in cleaner_names:
        cleaner = getattr(cleaners, name)
        if not cleaner:
            raise Exception('Unknown cleaner: %s' % name)
        text = cleaner(text)
    return text


def _symbols_to_sequence(symbols):
    """Map each keepable symbol to its integer id."""
    kept = filter(_should_keep_symbol, symbols)
    return [_symbol_to_id[sym] for sym in kept]


def _arpabet_to_sequence(text):
    """Convert space-separated ARPAbet text to an id sequence ('@' prefix)."""
    prefixed = ['@' + phoneme for phoneme in text.split()]
    return _symbols_to_sequence(prefixed)


def _should_keep_symbol(s):
    """Keep *s* if it has an id and is not '_' or '~' (special markers)."""
    # Fixed: `s is not '_'` compares object IDENTITY against a string
    # literal — implementation-dependent and a SyntaxWarning on modern
    # CPython. Use value comparison instead.
    return s in _symbol_to_id and s not in ('_', '~')

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--text', type=str, help='text file')
    # Fixed default: 'english cleaners' (with a space) can never resolve via
    # getattr(cleaners, name); the help text names it english_cleaners.
    parser.add_argument('--cleaners', type=str, nargs='+',
                        help='cleaner type (english_cleaners)',
                        default=['english_cleaners'])
    args = parser.parse_args()
    with open(args.text) as f:
        all_lines = f.read()
        list_kv = regex_key_val.findall(all_lines)
        for k, v in list_kv:
            # round-trip through the id sequence to obtain normalized text
            v_normalize = sequence_to_text(text_to_sequence(v, args.cleaners))
            print('{} {}'.format(k, v_normalize))
Ejemplo n.º 12
0
from fastdtw import fastdtw
from concurrent.futures import ProcessPoolExecutor
from utilbox.regex_util import regex_key_val
from scipy.spatial.distance import euclidean
from tqdm import tqdm

def parse():
    """Parse --ref / --hyp transcription file paths (both required)."""
    parser = argparse.ArgumentParser(description='python wrapper for Kaldi\'s computer-wer')
    parser.add_argument('--ref', required=True, help='reference transcription  (with key)')
    parser.add_argument('--hyp', required=True, help='hypothesis transcription (with key)')
    return parser.parse_args()

if __name__ == '__main__':
    args = parse()
    list_kv_ref = regex_key_val.findall(open(args.ref).read())
    list_kv_hyp = regex_key_val.findall(open(args.hyp).read())

    list_key_ref = [x[0] for x in list_kv_ref]
    list_key_hyp = [x[0] for x in list_kv_hyp]
    assert list_key_ref == list_key_hyp, "the keys between refs & hyps are not same"
    dict_kv_ref = dict(list_kv_ref)
    dict_kv_hyp = dict(list_kv_hyp)
    total_dist = 0
    total_count = 0
    total_len = 0
    for kk in tqdm(list_key_hyp, ncols=50) :
        v_ref = dict_kv_ref[kk]
        v_hyp = dict_kv_hyp[kk]
        _feat_ref = np.load(v_ref)['feat']
        _feat_hyp = np.load(v_hyp)['feat']
Ejemplo n.º 13
0
import json
import sys
import os
from utilbox.regex_util import regex_key_val

if __name__ == '__main__':
    # read "key value" lines from stdin and emit them as a JSON object
    list_kv = regex_key_val.findall(sys.stdin.read())
    print(json.dumps(dict(list_kv), indent=2))