def sp_to_mcep(m_sp, n_coeffs=60, alpha=0.77, in_type=3, fft_len=0):
    """Convert a spectrum matrix to mel-cepstral coefficients via SPTK ``mcep``.

    The input is written to a temporary binary file, the SPTK ``mcep`` binary
    (``_sptk_mcep_bin``) is run on it through the shell, and its output file is
    read back with ``n_coeffs`` columns.

    Args:
        m_sp: Spectrum data written with ``lu.write_binfile``. Its second
            dimension is used to derive the FFT length when ``fft_len`` is 0
            (assumes one row per frame -- TODO confirm).
        n_coeffs: Number of coefficients to read back; ``n_coeffs - 1`` is
            passed to ``mcep`` as ``-m``.
        alpha: Value passed to ``mcep`` as ``-a``.
        in_type: Value passed to ``mcep`` as ``-q``.
        fft_len: Value passed to ``mcep`` as ``-l``. ``0`` means automatic:
            ``2 * (np.size(m_sp, 1) - 1)``.

    Returns:
        The data read from the ``mcep`` output by ``lu.read_binfile``.
    """
    temp_sp = lu.ins_pid('temp.sp')
    temp_mgc = lu.ins_pid('temp.mgc')

    try:
        # Writing input data:
        lu.write_binfile(m_sp, temp_sp)

        # Value comparison, not identity: ``is 0`` only works by accident of
        # small-int caching and raises a SyntaxWarning on Python >= 3.8.
        if fft_len == 0:  # case fft automatic
            fft_len = 2 * (np.size(m_sp, 1) - 1)

        # MCEP (equivalent shell call):
        # $sptk/mcep -a $alpha -m $mcsize -l $nFFT -e 1.0E-8 -j 0 -f 0.0 -q 3 in.sp > out.mgc
        curr_cmd = _sptk_mcep_bin + " -a %1.2f -m %d -l %d -e 1.0E-8 -j 0 -f 0.0 -q %d %s > %s" % (
            alpha, n_coeffs - 1, fft_len, in_type, temp_sp, temp_mgc)
        # The shell is needed for the ``>`` redirection in the command.
        call(curr_cmd, shell=True)

        # Read MGC File:
        m_mgc = lu.read_binfile(temp_mgc, n_coeffs)
    finally:
        # Always delete the temp files, even when a step above raised.
        for temp_file in (temp_sp, temp_mgc):
            if os.path.exists(temp_file):
                os.remove(temp_file)

    return m_mgc
def sp_to_mcep(m_sp, n_coeffs=60, alpha=0.77, in_type=3, fft_len=0):
    """Compute mel-cepstral coefficients from a spectrum using SPTK ``mcep``.

    Writes ``m_sp`` to a temporary binary file, runs the ``mcep`` binary
    (``_sptk_mcep_bin``) through the shell with its output redirected to a
    second temporary file, and reads that file back with ``n_coeffs`` columns.

    Args:
        m_sp: Spectrum data handed to ``lu.write_binfile``. When ``fft_len`` is
            0 its second dimension determines the FFT length (presumably one
            row per frame -- verify against callers).
        n_coeffs: Number of coefficients to read back (``-m`` receives
            ``n_coeffs - 1``).
        alpha: Passed to ``mcep`` as ``-a``.
        in_type: Passed to ``mcep`` as ``-q``.
        fft_len: Passed to ``mcep`` as ``-l``; ``0`` selects
            ``2 * (np.size(m_sp, 1) - 1)`` automatically.

    Returns:
        Whatever ``lu.read_binfile`` reads from the ``mcep`` output file.
    """
    temp_sp = lu.ins_pid('temp.sp')
    temp_mgc = lu.ins_pid('temp.mgc')

    try:
        # Writing input data:
        lu.write_binfile(m_sp, temp_sp)

        # ``==`` rather than ``is``: identity on an int literal is
        # implementation-dependent and warns on Python >= 3.8.
        if fft_len == 0:  # case fft automatic
            fft_len = 2 * (np.size(m_sp, 1) - 1)

        # MCEP. Shell equivalent:
        # $sptk/mcep -a $alpha -m $mcsize -l $nFFT -e 1.0E-8 -j 0 -f 0.0 -q 3 in.sp > out.mgc
        mcep_args = " -a %1.2f -m %d -l %d -e 1.0E-8 -j 0 -f 0.0 -q %d %s > %s" % (
            alpha, n_coeffs - 1, fft_len, in_type, temp_sp, temp_mgc)
        # shell=True is required because the command uses ``>`` redirection.
        call(_sptk_mcep_bin + mcep_args, shell=True)

        # Read MGC File:
        m_mgc = lu.read_binfile(temp_mgc, n_coeffs)
    finally:
        # Clean up the temp files whether or not the steps above succeeded.
        for temp_file in (temp_sp, temp_mgc):
            if os.path.exists(temp_file):
                os.remove(temp_file)

    return m_mgc
def analysis(wav_file, fft_len):
    """Analyse a wav file into magnitude, real, imaginary and f0 features.

    Runs ``la.reaper`` on ``wav_file`` to create a temporary estimation file,
    feeds both files to
    ``mp.analysis_with_del_comp__ph_enc__f0_norm__from_files_raw`` and converts
    the resulting shifts to f0 with ``mp.shift_to_f0`` (smoothing enabled).

    Args:
        wav_file: Path of the wav file to analyse.
        fft_len: FFT length forwarded to the ``mp`` analysis routine.

    Returns:
        Tuple ``(m_mag, m_real, m_imag, v_f0)``: the first three items as
        returned by the ``mp`` analysis routine, plus the f0 vector.
    """
    est_file = lu.ins_pid('temp.est')
    try:
        # Use the ``wav_file`` argument: the previous code referenced an
        # undefined name (``wav_file_orig``) here.
        la.reaper(wav_file, est_file)
        m_mag, m_real, m_imag, v_shift, v_voi, _m_frm, fs = \
            mp.analysis_with_del_comp__ph_enc__f0_norm__from_files_raw(wav_file, est_file, fft_len)
        v_f0 = mp.shift_to_f0(v_shift, v_voi, fs, out='f0', b_smooth=True)
    finally:
        # Remove the temp file even if REAPER or the analysis failed.
        if os.path.exists(est_file):
            os.remove(est_file)
    return m_mag, m_real, m_imag, v_f0
def convert(file_id_list, in_lab_dir, in_feats_dir, fs, out_lab_dir, b_prevent_zeros=False):
    '''
    Convert state-aligned label files to variable-frame-rate label files.

    For every file name listed in file_id_list, reads the '.shift' file from
    in_feats_dir, computes the number of frames per state (one state per phone)
    from it and the '.lab' file in in_lab_dir, and writes the converted label
    file to out_lab_dir. Files that fail are not fatal: their names are
    appended to a crash list file.

    file_id_list:    Text file with one file name (without extension) per line; '#' starts a comment.
    in_lab_dir:      Directory with the input '.lab' files.
    in_feats_dir:    Directory with the input '.shift' files.
    fs:              Sampling rate, forwarded to mp.get_num_of_frms_per_state.
    out_lab_dir:     Output directory for the converted '.lab' files (created via lu.mkdir).
    b_prevent_zeros: True if you want to ensure that all the phonemes have one frame at least.
                     (not recommended, only useful when there are too many utterances crashed)
    '''

    # Conversion:
    lu.mkdir(out_lab_dir)
    v_filenames = lu.read_text_file2(file_id_list, dtype='string', comments='#')

    crashlist_file = lu.ins_pid('crash_file_list.scp')
    for filename in v_filenames:

        # Display:
        print('\nConverting lab file: ' + filename + '................................')

        # Current i/o files:
        in_lab_file = os.path.join(in_lab_dir, filename + '.lab')
        out_lab_file = os.path.join(out_lab_dir, filename + '.lab')
        in_shift_file = os.path.join(in_feats_dir, filename + '.shift')

        try:
            v_shift = lu.read_binfile(in_shift_file, dim=1)
            v_n_frms = mp.get_num_of_frms_per_state(
                v_shift, in_lab_file, fs,
                b_prevent_zeros=b_prevent_zeros, n_states_x_phone=1)

            la.convert_label_state_align_to_var_frame_rate(
                in_lab_file, v_n_frms, out_lab_file)

        # Best-effort per file: ordinary errors are logged and skipped.
        # ``except Exception`` (instead of a bare ``except``) lets
        # KeyboardInterrupt, SystemExit and other BaseExceptions propagate.
        except Exception:
            print("crashlist")
            with open(crashlist_file, "a") as crashlistlog:
                crashlistlog.write(filename + '\n')

    print('Done!')
def analysis(wav_file, fft_len, mvf, nbins_mel=60, nbins_phase=45):
    """Analyse a wav file into mel-scaled magnitude/phase features and lf0.

    Runs ``la.reaper`` on ``wav_file`` to create a temporary estimation file
    and passes both files to
    ``mp.analysis_with_del_comp__ph_enc__f0_norm__from_files2`` with
    ``f0_type='lf0'``.

    Args:
        wav_file: Path of the wav file to analyse.
        fft_len: FFT length forwarded to the ``mp`` analysis routine.
        mvf: Forwarded unchanged to the ``mp`` analysis routine
            (presumably the maximum voiced frequency -- TODO confirm).
        nbins_mel: Forwarded as ``mag_mel_nbins``.
        nbins_phase: Forwarded as ``cmplx_ph_mel_nbins``.

    Returns:
        Tuple ``(m_mag_mel_log, m_real_mel, m_imag_mel, v_lf0)`` taken from the
        ``mp`` analysis result; the shift vector and sampling rate it also
        returns are discarded.
    """
    est_file = lu.ins_pid('temp.est')
    try:
        la.reaper(wav_file, est_file)
        m_mag_mel_log, m_real_mel, m_imag_mel, _v_shift, v_lf0, _fs = \
            mp.analysis_with_del_comp__ph_enc__f0_norm__from_files2(
                wav_file, est_file, fft_len, mvf,
                f0_type='lf0',
                mag_mel_nbins=nbins_mel,
                cmplx_ph_mel_nbins=nbins_phase)
    finally:
        # Remove the temp file even if REAPER or the analysis failed.
        if os.path.exists(est_file):
            os.remove(est_file)
    return m_mag_mel_log, m_real_mel, m_imag_mel, v_lf0
def get_pitch_marks(v_sig, fs):
    """Estimate pitch marks of a signal by running REAPER on a temp wav file.

    The signal is written to a temporary wav file, ``reaper`` is run on it and
    the first column of its text output (after 7 header rows) is returned,
    after two sanity filters.

    Args:
        v_sig: Signal samples, written with ``sf.write``.
        fs: Sampling rate used to write the wav file and to compare the last
            pitch mark against the signal length.

    Returns:
        1-D array of pitch marks (values are multiplied by ``fs`` to compare
        with sample indices, so presumably in seconds -- TODO confirm).
        Empty if REAPER produced no data rows.
    """
    temp_wav = lu.ins_pid('temp.wav')
    temp_pm = lu.ins_pid('temp.pm')

    try:
        sf.write(temp_wav, v_sig, fs)
        reaper(temp_wav, temp_pm)
        # ndmin=2 keeps a 2-D shape when the file has a single data row, so
        # the column selection below does not fail.
        m_pm = np.loadtxt(temp_pm, skiprows=7, ndmin=2)
    finally:
        # Removing temp files, even when a step above raised:
        for temp_file in (temp_wav, temp_pm):
            if os.path.exists(temp_file):
                os.remove(temp_file)

    # No pitch marks at all: nothing to filter.
    if m_pm.size == 0:
        return np.zeros(0)

    v_pm = m_pm[:, 0]

    # Protection against REAPER bugs 1: drop marks that do not increase.
    vb_correct = np.hstack((True, np.diff(v_pm) > 0))
    v_pm = v_pm[vb_correct]

    # Protection against REAPER bugs 2 (maybe I need a better protection):
    # drop a last mark that falls on or beyond the final sample.
    if (v_pm[-1] * fs) >= (np.size(v_sig) - 1):
        v_pm = v_pm[:-1]

    return v_pm
def get_pitch_marks(v_sig, fs):
    """Return the pitch marks REAPER finds in ``v_sig``.

    Writes the signal to a temporary wav file, runs ``reaper`` on it, loads the
    first column of the resulting text file (skipping 7 header rows) and
    applies two protections against known REAPER output problems.

    Args:
        v_sig: Signal samples, written with ``sf.write``.
        fs: Sampling rate; used for the wav file and to convert the last pitch
            mark for comparison with the signal length.

    Returns:
        1-D array of pitch marks (presumably times in seconds, since they are
        scaled by ``fs`` before being compared to sample counts -- verify).
        Empty if REAPER produced no data rows.
    """
    temp_wav = lu.ins_pid('temp.wav')
    temp_pm = lu.ins_pid('temp.pm')

    try:
        sf.write(temp_wav, v_sig, fs)
        reaper(temp_wav, temp_pm)
        # Force a 2-D result: with a single data row ``loadtxt`` would
        # otherwise return a 1-D array and ``[:, 0]`` would raise.
        m_pm = np.loadtxt(temp_pm, skiprows=7, ndmin=2)
    finally:
        # Removing temp files regardless of success:
        for temp_file in (temp_wav, temp_pm):
            if os.path.exists(temp_file):
                os.remove(temp_file)

    # REAPER found nothing: return an empty vector instead of failing below.
    if m_pm.size == 0:
        return np.zeros(0)

    v_pm = m_pm[:, 0]

    # Protection against REAPER bugs 1: keep only strictly increasing marks.
    vb_correct = np.hstack((True, np.diff(v_pm) > 0))
    v_pm = v_pm[vb_correct]

    # Protection against REAPER bugs 2 (maybe I need a better protection):
    # discard a final mark located at or past the last sample.
    if (v_pm[-1] * fs) >= (np.size(v_sig) - 1):
        v_pm = v_pm[:-1]

    return v_pm
def convert(file_id_list, in_lab_dir, in_feats_dir, fs, out_lab_dir, b_prevent_zeros=False):
    '''
    Convert state-aligned label files to variable-frame-rate label files.

    For every file name listed in file_id_list, reads the '.shift' file from
    in_feats_dir, computes the number of frames per state from it and the
    '.lab' file in in_lab_dir, and writes the converted label file to
    out_lab_dir. Files that fail are not fatal: their names are appended to a
    crash list file.

    file_id_list:    Text file with one file name (without extension) per line; '#' starts a comment.
    in_lab_dir:      Directory with the input '.lab' files.
    in_feats_dir:    Directory with the input '.shift' files.
    fs:              Sampling rate, forwarded to mp.get_num_of_frms_per_state.
    out_lab_dir:     Output directory for the converted '.lab' files (created via lu.mkdir).
    b_prevent_zeros: True if you want to ensure that all the phonemes have one frame at least.
                     (not recommended, only useful when there are too many utterances crashed)
    '''

    # Conversion:
    lu.mkdir(out_lab_dir)
    v_filenames = lu.read_text_file2(file_id_list, dtype='string', comments='#')

    crashlist_file = lu.ins_pid('crash_file_list.scp')
    for filename in v_filenames:

        # Display:
        print('\nConverting lab file: ' + filename + '................................')

        # Current i/o files:
        in_lab_file = os.path.join(in_lab_dir, filename + '.lab')
        out_lab_file = os.path.join(out_lab_dir, filename + '.lab')
        in_shift_file = os.path.join(in_feats_dir, filename + '.shift')

        try:
            v_shift = lu.read_binfile(in_shift_file, dim=1)
            v_n_frms = mp.get_num_of_frms_per_state(
                v_shift, in_lab_file, fs, b_prevent_zeros=b_prevent_zeros)
            la.convert_label_state_align_to_var_frame_rate(in_lab_file, v_n_frms, out_lab_file)

        # Best-effort per file: ordinary errors are recorded and skipped.
        # ``except Exception`` (instead of a bare ``except``) lets
        # KeyboardInterrupt, SystemExit and other BaseExceptions propagate.
        except Exception:
            with open(crashlist_file, "a") as crashlistlog:
                crashlistlog.write(filename + '\n')

    print('Done!')
# CONSTANTS: So far, the vocoder has been tested only with the following constants:=== fs = 48000 # INPUT:============================================================================== files_scp = '../data/file_id.scp' # List of file names (tokens). Format used by Merlin. in_lab_st_dir = '../data/labs' # Original state aligned label files directory (in the format used by Merlin). in_shift_dir = '../data/params' # Directory containing .shift files (You need to run feature extraction before running this script.) out_lab_st_dir = '../data/labs_var_rate' # Directory that will contain the converted "variable frame rate" state aligned label files. b_prevent_zeros = False # True if you want to ensure that all the phonemes have one frame at least. (not recommended, only usful when there are too many utterances crashed) # PROCESSING:========================================================================= lu.mkdir(out_lab_st_dir) v_fileTokns = lu.read_text_file2(files_scp, dtype='string', comments='#') n_files = len(v_fileTokns) crashlist_file = lu.ins_pid('crash_file_list.scp') for ftkn in v_fileTokns: # Display: print('\nAnalysing file: ' + ftkn + '................................') # Input files: in_lab_st_file = in_lab_st_dir + '/' + ftkn + '.lab' out_lab_st_file = out_lab_st_dir + '/' + ftkn + '.lab' in_shift_file = in_shift_dir + '/' + ftkn + '.shift' try: v_shift = lu.read_binfile(in_shift_file, dim=1) v_n_frms = mp.get_num_of_frms_per_state( v_shift, in_lab_st_file,
fs = 48000 # INPUT:============================================================================== files_scp = '../data_48k/file_id.scp' # List of file names (tokens). Format used by Merlin. in_lab_st_dir = '../data_48k/labs' # Original state aligned label files directory (in the format used by Merlin). in_shift_dir = '../data_48k/params' # Directory containing .shift files (You need to run feature extraction before running this script.) out_lab_st_dir = '../data_48k/labs_var_rate' # Directory that will contain the converted "variable frame rate" state aligned label files. b_prevent_zeros = False # True if you want to ensure that all the phonemes have one frame at least. (not recommended, only usful when there are too many utterances crashed) # PROCESSING:========================================================================= lu.mkdir(out_lab_st_dir) v_fileTokns = lu.read_text_file2(files_scp, dtype='string', comments='#') n_files = len(v_fileTokns) crashlist_file = lu.ins_pid('crash_file_list.scp') for ftkn in v_fileTokns: # Display: print('\nAnalysing file: ' + ftkn + '................................') # Input files: in_lab_st_file = in_lab_st_dir + '/' + ftkn + '.lab' out_lab_st_file = out_lab_st_dir + '/' + ftkn + '.lab' in_shift_file = in_shift_dir + '/' + ftkn + '.shift' try: v_shift = lu.read_binfile(in_shift_file, dim=1) v_n_frms = mp.get_num_of_frms_per_state(v_shift, in_lab_st_file, fs, b_prevent_zeros=b_prevent_zeros, n_states_x_phone=5, nfrms_tolerance=6) # Extraction: