def __init__(self, name, vocab_file, ext='txt'):
    """Loads the vocabulary from `vocab_file` and builds a token-to-index mapping."""
    super(VocabSource, self).__init__(name, ext)

    self.vocab_file = vocab_file
    self.vocab = file_io.load_lines(self.vocab_file)
    self.vocab_map = {token: i for i, token in enumerate(self.vocab)}
    self.vocab_size = len(self.vocab)
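# Example usage (illustrative sketch; `phone_vocab.txt` is a hypothetical file with
# one token per line):
#
#     source = VocabSource('phones', 'phone_vocab.txt')
#     idx = source.vocab_map['aa']       # integer index of the token 'aa'
#     assert source.vocab[idx] == 'aa'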
def __init__(self, file_path):
    """Loads the question set and prepares regexes for querying.

    Attributes:
        file_path (str): Question set to be loaded. Can be one of the five provided question sets;
            questions-unilex_dnn_600.hed
            questions-radio_dnn_416.hed
            questions-radio_phones_48.hed
            questions-mandarin.hed
            questions-japanese.hed
    """
    if file_path in pkg_resources.resource_listdir(
            'tts_data_tools', os.path.join('resources', 'question_sets')):
        print(f'Using tts_data_tools resource from resources/question_sets for {file_path}')

        file_path = pkg_resources.resource_filename(
            'tts_data_tools', os.path.join('resources', 'question_sets', file_path))

    self.file_path = file_path
    self.lines = file_io.load_lines(self.file_path)

    # Ensure the only whitespace characters are single spaces.
    self.lines = [re.sub(r'\s+', ' ', line) for line in self.lines]

    self.binary_regexes, self.numerical_regexes = self.compile_questions(self.lines)
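# Example usage (illustrative sketch; the enclosing class is assumed here to be called
# `QuestionSet`, and the custom path is hypothetical):
#
#     qs = QuestionSet('questions-unilex_dnn_600.hed')   # bundled resource
#     qs = QuestionSet('/path/to/my_questions.hed')      # or a local question set file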
def process_dir(festival_dir, txt_dir, id_list, out_dir, custom_voice=None):
    """Create Utterance structures for all sentences in `id_list` and save them to `out_dir`.

    Args:
        festival_dir (str): Directory containing the Festival installation.
        txt_dir (str): Directory containing text transcriptions.
        id_list (str): List of file basenames to process.
        out_dir (str): Directory to save the output to.
        custom_voice (str): Name of a custom Festival voice to use, if given.
    """
    file_ids = utils.get_file_ids(id_list=id_list)

    sentences = []
    # For all file_ids, load the sentence and escape double quotes for use in the Festival command.
    for file_id in sorted(file_ids):
        sentence = file_io.load_lines(os.path.join(txt_dir, f'{file_id}.txt'))[0]
        sentence = sentence.replace('"', '\\"')
        sentences.append(sentence)

    # If the file_ids are paths (e.g. for multi-speaker data), make sure the directory structure is already in place.
    utils.make_dirs(os.path.join(out_dir, 'utts'), file_ids)

    # Create and save the Utterance structures.
    create_utterances(festival_dir, file_ids, sentences, out_dir, custom_voice=custom_voice)
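# Example usage (illustrative sketch; all paths are hypothetical):
#
#     process_dir(festival_dir='/opt/festival',
#                 txt_dir='txt',                  # contains {file_id}.txt transcriptions
#                 id_list='file_id_list.scp',     # one file basename per line
#                 out_dir='festival_out')         # Utterances are saved under festival_out/utts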
def __init__(self, file_path, state_level=True, states_per_phone=STATES_PER_PHONE):
    """Loads the label from `file_path` and processes basic information, preparing it for querying.

    Args:
        file_path (str): Label file to be loaded.
        state_level (bool): If True, the labels are duplicated `self.states_per_phone` times per phone.
        states_per_phone (int): Number of states in a phone. If `state_level` is False, this is set to 1.
    """
    self.file_path = file_path
    self.base_name = os.path.splitext(os.path.basename(self.file_path))[0]
    self.state_level = state_level
    self.states_per_phone = states_per_phone if state_level else 1

    self.lines = file_io.load_lines(self.file_path)
    # Ensure all whitespace characters are single spaces.
    self.lines = [re.sub(r'\s+', ' ', line) for line in self.lines]

    # Extracted labels are not duplicated for each state in a phone.
    self.labels = self.trim_labels(self.state_level)
    self.phones = self.extract_phone_identities()

    # If `self.state_level` is False, each item in `self.state_in_phone_durations` is a singleton list.
    self.state_in_phone_durations, self.phone_durations = self.extract_durations()
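# Example usage (illustrative sketch; the class name `Label` matches its usage
# elsewhere in this package, and the label path is hypothetical):
#
#     label = Label('herald_001.lab', state_level=True)
#     print(label.phones)            # e.g. ['sil', 'hh', 'ax', ...]
#     print(label.phone_durations)   # number of frames per phone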
def _full_to_mono(self, full_file_name, mono_file_name,
                  current_phone_regex=re.compile(r'-(.+?)\+')):
    # Extract the current phone identity from each full-context label line.
    phones = []
    label = file_io.load_lines(full_file_name)
    for line in label:
        phone = current_phone_regex.search(line).group(1)
        phones.append(phone)

    file_io.save_lines(phones, mono_file_name)
    return phones
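# The default regex extracts the current phone, i.e. the segment between '-' and '+'
# in an HTK-style full-context label. Illustrative sketch with a made-up context:
#
#     >>> re.compile(r'-(.+?)\+').search('x^sil-dh+ax=r').group(1)
#     'dh'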
def load_file(self, base_name, data_dir):
    r"""Loads lines of text.

    Parameters
    ----------
    base_name : str
        The name (without extensions) of the file to be loaded.
    data_dir : str
        The directory containing all feature types for this dataset.

    Returns
    -------
    list<str>
    """
    file_path = self.file_path(base_name, data_dir)
    return file_io.load_lines(file_path)
def _add_alignments_to_lab(self, mlf, lab_align_dir, lab_dir, file_ids):
    make_dirs(lab_align_dir, file_ids)

    with open(mlf, 'r') as f:
        # Consume the MLF header line ('#!MLF!#').
        _ = f.readline()

        for file_id in file_ids:
            # Consume the file name line.
            line = f.readline()

            mlf_base_name = os.path.splitext(os.path.basename(line))[0]
            id_base_name = os.path.basename(file_id)

            if mlf_base_name != id_base_name:
                raise ValueError(f'The file order in the mlf ({mlf}) does not match file_ids\n'
                                 f'{mlf_base_name} {id_base_name}')

            label_no_align = file_io.load_lines(os.path.join(lab_dir, f'{file_id}.lab'))

            label_state_align = []
            for label_tag in label_no_align:
                label_tag = label_tag.strip()

                for i in range(STATES_PER_PHONE):
                    # Consume a state alignment line.
                    line = f.readline().strip()

                    # Get the alignments for this state.
                    start_time, end_time, *_ = line.split()
                    label_state_align.append(f'{start_time} {end_time} {label_tag}[{i + 2}]')

            file_io.save_lines(label_state_align, os.path.join(lab_align_dir, f'{file_id}.lab'))

            # Consume the end-of-utterance marker line (a single '.' character).
            line = f.readline().strip()
            if line != '.':
                raise ValueError(f'Expected end-of-utterance marker "." in {mlf}, got "{line}"')
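# For reference, the MLF consumed above is expected to follow HTK's layout, which
# this sketch illustrates with made-up names and times: a '#!MLF!#' header, then per
# utterance a file name line, STATES_PER_PHONE alignment lines per phone, and a '.'
# terminator.
#
#     #!MLF!#
#     "*/herald_001.rec"
#     0 250000 s2 ...
#     250000 400000 s3 ...
#     ...
#     .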
def sanitise_labs(lab_dir, file_ids, label_out_dir, include_times=False,
                  state_level=False, is_mono=False):
    utils.make_dirs(label_out_dir, file_ids)

    for file_id in file_ids:
        label = file_io.load_lines(os.path.join(lab_dir, f'{file_id}.lab'))
        n_phones = len(label)

        start_times, end_times, label = map(list, zip(*map(str.split, label)))
        start_times, end_times, label = sanitise_silences(
            start_times, end_times, label, is_mono=is_mono)

        if state_level:
            if include_times:
                # Linearly interpolate the phone boundary times to get state boundary times.
                n_states = n_phones * STATES_PER_PHONE
                times = np.interp(range(0, n_states + 1, 1),
                                  range(0, n_states + 1, STATES_PER_PHONE),
                                  start_times + end_times[-1:])

                start_times = times[:-1]
                end_times = times[1:]

            label = np.repeat(label, STATES_PER_PHONE).tolist()
            for i in range(len(label)):
                state_idx = i % STATES_PER_PHONE
                label[i] += f'[{state_idx + 2}]'

        if include_times:
            start_times = list(map(_round_dur, start_times))
            end_times = list(map(_round_dur, end_times))

            label = list(map(' '.join, zip(start_times, end_times, label)))

        file_io.save_lines(label, os.path.join(label_out_dir, f'{file_id}.lab'))
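# Worked sketch of the boundary interpolation above, assuming STATES_PER_PHONE == 5:
# with 2 phones, phone start times [0, 10] and final end time 30, np.interp places
# evenly spaced state boundaries within each phone:
#
#     >>> np.interp(range(11), range(0, 11, 5), [0, 10, 30])
#     array([ 0.,  2.,  4.,  6.,  8., 10., 14., 18., 22., 26., 30.])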
def process_file(festival_dir, txt_file, out_dir, custom_voice=None):
    """Create Utterance structures for all sentences in `txt_file` and save them to `out_dir`.

    Args:
        festival_dir (str): Directory containing the Festival installation.
        txt_file (str): File containing all transcriptions, with the following schema,
            (file_id, "sentence transcription")*
        out_dir (str): Directory to save the output to.
        custom_voice (str): Name of a custom Festival voice to use, if given.
    """
    line_regex = re.compile(r'\(\s*'
                            r'(?P<file_id>.+)'
                            r'\s+'
                            r'"(?P<sentence>.+)"'
                            r'\s*\)')

    file_ids = []
    sentences = []

    # For all lines in txt_file, extract the file_id and sentence, escaping double quotes for the Festival command.
    for line in file_io.load_lines(txt_file):
        match = line_regex.match(line)
        if match is None:
            print(f'Match not found for the following line,\n{line}')
            continue

        file_id = match.group('file_id')
        file_ids.append(file_id)

        sentence = match.group('sentence')
        sentence = sentence.replace('"', '\\"')
        sentences.append(sentence)

    # Save the file_ids.
    file_io.save_lines(file_ids, os.path.join(out_dir, 'file_id_list.scp'))

    # Create and save the Utterance structures.
    create_utterances(festival_dir, file_ids, sentences, out_dir, custom_voice=custom_voice)
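# For reference, txt_file is expected to follow Festival's utts.data convention,
# e.g. (made-up content):
#
#     ( herald_001 "And they lived happily ever after." )
#     ( herald_002 "The end." )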
def process(lab_dir, id_list, out_dir, state_level, lab_dir_with_pos, wav_dir):
    """Processes label files in id_list, saves the phone identities (as a string) to text files.

    Args:
        lab_dir (str): Directory containing the label files.
        id_list (str): List of file basenames to process.
        out_dir (str): Directory to save the output to.
        state_level (bool): Indicates that the label files are state level if True, otherwise they are frame level.
        lab_dir_with_pos (str): Directory containing label files that include part-of-speech annotations.
        wav_dir (str): Directory containing the waveform files.
    """
    file_ids = utils.get_file_ids(id_list=id_list)

    utils.make_dirs(os.path.join(out_dir, 'segment_n_phones'), file_ids)
    utils.make_dirs(os.path.join(out_dir, 'segment_n_frames'), file_ids)
    utils.make_dirs(os.path.join(out_dir, 'n_segments'), file_ids)

    for file_id in file_ids:
        lab_path_with_pos = os.path.join(lab_dir_with_pos, f'{file_id}.lab')
        label_with_pos = file_io.load_lines(lab_path_with_pos)

        word_start_idxs, _ = get_word_idxs(
            label_with_pos, word_idx_sep=(r'@', r'\+'), phrase_idx_sep=(r'@', r'='))
        pos_tags = get_pos_tags(label_with_pos, word_start_idxs)

        lab_path = os.path.join(lab_dir, f'{file_id}.lab')
        label = lab_to_feat.Label(lab_path, state_level)

        durations = label.phone_durations
        n_frames = np.sum(durations).item()
        n_phones = len(label.phones)

        word_start_idxs, word_end_idxs = get_word_idxs(
            label.labels, word_idx_sep=(r':', r'\+'), phrase_idx_sep=(r':', r'='))
        try:
            segment_start_idxs, segment_end_idxs = segment_words(
                word_start_idxs, word_end_idxs, pos_tags)
        except (ValueError, IndexError) as e:
            print(f'{e}\n{file_id}')
        else:
            wav_path = os.path.join(wav_dir, f'{file_id}.wav')
            wav, sample_rate = file_io.load_wav(wav_path)
            f0, _, _, _ = world_with_reaper_f0.analysis(wav, sample_rate)

            # Match the number of frames between label forced-alignment and vocoder analysis.
            # Often the durations from forced alignment are a few frames longer than the vocoder features.
            diff = n_frames - f0.shape[0]
            if diff > n_phones:
                raise ValueError(f'Number of label frames and vocoder frames is too different for {file_id}\n'
                                 f'\tlabel frames {n_frames}\n'
                                 f'\tvocoder frames {f0.shape[0]}\n'
                                 f'\tnumber of phones {n_phones}')

            # Remove excess durations if there is a shape mismatch.
            if diff > 0:
                # Remove 1 frame from each phone's duration, starting at the end of the sequence.
                durations[-diff:] -= 1
                n_frames = f0.shape[0]
                print(f'Cropped {diff} frames from durations for utterance {file_id}')

            assert n_frames == np.sum(durations).item()

            segment_phone_lens = []
            segment_frame_lens = []
            for segment_start_idx, segment_end_idx in zip(segment_start_idxs, segment_end_idxs):
                segment_phone_lens.append(segment_end_idx - segment_start_idx)
                segment_frame_lens.append(sum(durations[segment_start_idx:segment_end_idx]))

            file_io.save_txt(segment_phone_lens,
                             os.path.join(out_dir, 'segment_n_phones', f'{file_id}.txt'))
            file_io.save_txt(segment_frame_lens,
                             os.path.join(out_dir, 'segment_n_frames', f'{file_id}.txt'))
            file_io.save_txt(len(segment_phone_lens),
                             os.path.join(out_dir, 'n_segments', f'{file_id}.txt'))
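# Example usage (illustrative sketch; all paths are hypothetical):
#
#     process(lab_dir='lab', id_list='file_id_list.scp', out_dir='segments',
#             state_level=True, lab_dir_with_pos='lab_pos', wav_dir='wav')
#
# This writes segment_n_phones/{file_id}.txt, segment_n_frames/{file_id}.txt and
# n_segments/{file_id}.txt under out_dir.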
def train_hmm(self, niter, num_mix, num_splits=1):
    """Performs one or more rounds of HERest estimation, doubling the number of mixture
    components until `num_mix` is reached."""
    print('---training HMM models')

    if num_splits != 1:
        # Call HERest in multiple chunks: split the scp into num_splits chunks and save them.
        print(f'----num_splits set to {num_splits}')

        train_scp_chunks = []

        mfc_files = file_io.load_lines(self.train_scp)
        random.shuffle(mfc_files)

        # Ceiling division, so that at most num_splits chunks are created.
        n = (len(mfc_files) + num_splits - 1) // num_splits
        mfc_chunks = [mfc_files[j:j + n] for j in range(0, len(mfc_files), n)]

        for i, mfc_chunk in enumerate(mfc_chunks):
            train_scp_chunk = os.path.join(self.cfg_dir, f'train_{i}.scp')
            train_scp_chunks.append(train_scp_chunk)

            file_io.save_lines(mfc_chunk, train_scp_chunk)

    done = 0
    mix = 1
    while mix <= num_mix and done == 0:
        for i in range(niter):
            next_dir = os.path.join(self.model_dir, f'hmm_mix_{mix}_iter_{i + 1}')
            os.makedirs(next_dir, exist_ok=True)

            if num_splits == 1:
                subprocess.run(
                    [self.HERest, '-C', self.cfg, '-S', self.train_scp,
                     '-I', self.phoneme_mlf, '-M', next_dir,
                     '-H', os.path.join(self.cur_dir, MACROS),
                     '-H', os.path.join(self.cur_dir, HMMDEFS),
                     '-t', *PRUNING, self.phonemes],
                    stdout=subprocess.PIPE, check=True)
            else:
                procs = []

                # Run estimation in parallel, one HERest process per chunk.
                for chunk_num in range(len(train_scp_chunks)):
                    procs.append(subprocess.Popen(
                        [self.HERest, '-C', self.cfg, '-S', train_scp_chunks[chunk_num],
                         '-I', self.phoneme_mlf, '-M', next_dir,
                         '-H', os.path.join(self.cur_dir, MACROS),
                         '-H', os.path.join(self.cur_dir, HMMDEFS),
                         '-t', *PRUNING, '-p', str(chunk_num + 1), self.phonemes],
                        stdout=subprocess.PIPE))

                # Wait until all HERest calls are finished.
                for p in procs:
                    p.wait()

                # Now accumulate the partial statistics from all chunks.
                subprocess.run(
                    [self.HERest, '-C', self.cfg, '-M', next_dir,
                     '-H', os.path.join(self.cur_dir, MACROS),
                     '-H', os.path.join(self.cur_dir, HMMDEFS),
                     '-t', *PRUNING, '-p', '0', self.phonemes,
                     *glob.glob(next_dir + os.sep + '*.acc')],
                    stdout=subprocess.PIPE, check=True)

            self.cur_dir = next_dir

        if mix * 2 <= num_mix:
            # Increase the number of mixture components.
            hed_file = os.path.join(self.cfg_dir, f'mix_{mix * 2}.hed')
            with open(hed_file, 'w') as f:
                f.write(f'MU {mix * 2} {{*.state[2-{STATES_PER_PHONE + 2}].mix}}\n')

            next_dir = os.path.join(self.model_dir, f'hmm_mix_{mix * 2}_iter_0')
            os.makedirs(next_dir, exist_ok=True)

            subprocess.run(
                [self.HHEd, '-A',
                 '-H', os.path.join(self.cur_dir, MACROS),
                 '-H', os.path.join(self.cur_dir, HMMDEFS),
                 '-M', next_dir, hed_file, self.phonemes],
                check=True)

            self.cur_dir = next_dir
            mix *= 2
        else:
            done = 1
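# Example usage (illustrative sketch; `aligner` is a hypothetical instance of the
# enclosing class): three HERest iterations per mixture level, doubling up to 4
# mixture components, with estimation parallelised over 4 scp chunks.
#
#     aligner.train_hmm(niter=3, num_mix=4, num_splits=4)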
def make_proto(self):
    # Make proto.
    means = ' '.join(['0.0' for _ in range(39)])
    variances = ' '.join(['1.0' for _ in range(39)])

    with open(self.proto, 'w') as f:
        f.write('~o <VECSIZE> 39 <USER>\n'
                '~h "proto"\n'
                '<BEGINHMM>\n'
                '<NUMSTATES> 7\n')

        for i in range(2, STATES_PER_PHONE + 2):
            f.write(f'<STATE> {i}\n<MEAN> 39\n{means}\n')
            f.write(f'<VARIANCE> 39\n{variances}\n')

        f.write('<TRANSP> 7\n'
                ' 0.0 1.0 0.0 0.0 0.0 0.0 0.0\n'
                ' 0.0 0.6 0.4 0.0 0.0 0.0 0.0\n'
                ' 0.0 0.0 0.6 0.4 0.0 0.0 0.0\n'
                ' 0.0 0.0 0.0 0.6 0.4 0.0 0.0\n'
                ' 0.0 0.0 0.0 0.0 0.6 0.4 0.0\n'
                ' 0.0 0.0 0.0 0.0 0.0 0.7 0.3\n'
                ' 0.0 0.0 0.0 0.0 0.0 0.0 0.0\n'
                '<ENDHMM>\n')

    # Make vFloors.
    subprocess.run(
        [self.HCompV, '-f', F, '-C', self.cfg, '-S', self.train_scp,
         '-M', self.cur_dir, self.proto],
        check=True)

    # Make local macro.
    with open(os.path.join(self.cur_dir, MACROS), 'w') as f:
        # Get first three lines from local proto.
        with open(os.path.join(self.cur_dir, 'proto'), 'r') as source:
            for _ in range(3):
                f.write(source.readline())

        # Get remaining lines from vFloors.
        with open(os.path.join(self.cur_dir, VFLOORS), 'r') as source:
            f.writelines(source.readlines())

    # Make hmmdefs.
    with open(os.path.join(self.cur_dir, HMMDEFS), 'w') as f:
        with open(self.proto, 'r') as source:
            # Ignore first two lines.
            source.readline()
            source.readline()
            source_lines = source.readlines()

        phone_set = file_io.load_lines(self.phonemes)
        for phone in phone_set:
            # The header.
            f.write(f'~h "{phone}"\n')
            # The rest.
            f.writelines(source_lines)
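# Note: the prototype above is a standard HTK left-to-right topology with 7 states,
# of which states 2-6 are emitting (STATES_PER_PHONE == 5), each modelling a
# 39-dimensional observation vector (e.g. 13 cepstral coefficients plus deltas and
# delta-deltas, an assumption about the feature layout). HCompV then computes the
# global mean/variance and the vFloors variance-floor macro used to flat-start
# every phone model.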