def extractFeatures(fname, delta=True, mfcc=False, scmc=False, scfc=False, norm=True):
    _, sig = wavfile.read(fname)
    sphinx_mfcc_class = MFCC(
        nfilt=40,
        # ncep=20,
        lowerf=100,
        upperf=8000,
        wlen=0.02)
    if mfcc:
        feats = sphinx_mfcc_class.sig2s2mfc(sig)
    if scmc or scfc:
        feats = sphinx_mfcc_class.sig2sc(sig, mag_feats=scmc)
        feats = np.log(feats)
        feats = dct(feats, n=40, norm='ortho')
    if norm:
        feat_mu = np.mean(feats, axis=0)
        feats -= feat_mu
    if delta:
        delta = calcDelta(feats, 2)
        ddelta = calcDelta(delta, 2)
        feats = np.concatenate((feats, delta), axis=1)
        feats = np.concatenate((feats, ddelta), axis=1)
    return feats
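# --- Usage sketch (not part of the original source) ---------------------------
# A minimal, hedged example of calling extractFeatures() above. The file path
# and flag choices are illustrative assumptions; the function itself relies on
# wavfile, MFCC, dct and calcDelta being importable as in its own module.
mfcc_feats = extractFeatures("speech.wav", delta=True, mfcc=True)    # cepstra + deltas + delta-deltas
scmc_feats = extractFeatures("speech.wav", delta=False, scmc=True)   # spectral-centroid magnitude features
print(mfcc_feats.shape)  # roughly (n_frames, 3 * n_base_coefficients) when delta=True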
def analysis(stf_files):
    stf = STF()

    targets = ['f0', 'f0_delta', 'ap_fc', 'ap_alpha']
    variables = locals()
    for target in targets:
        variables[target] = [numpy.array([]) for i in xrange(3)]
    mfcc_data = None

    for stf_file in stf_files:
        stf.loadfile(stf_file)
        voice = (stf.F0 != 0)
        mfcc = MFCC(stf.SPEC.shape[1] * 2, stf.frequency)

        intervals = []
        past = False
        for i in xrange(len(voice)):
            if past and not voice[i]:
                intervals[-1] = (intervals[-1][0], i)
                past = False
            elif not past and voice[i]:
                intervals.append((i, -1))
                past = True
        if intervals[-1][1] == -1:
            intervals[-1] = (intervals[-1][0], len(voice))

        for interval in intervals:
            if interval[1] - interval[0] < 5:
                continue

            f0_data = stf.F0[interval[0]: interval[1]]
            f0_delta_data = delta(f0_data)
            ap_fc_data = stf.APSG[interval[0]: interval[1], 0] * stf.APSG[interval[0]: interval[1], 1] * -1
            ap_alpha_data = stf.APSG[interval[0]: interval[1], 0]

            variables = locals()
            for name in targets:
                variables[name][0] = numpy.append(variables[name][0], variables[name + '_data'][:5])
                variables[name][1] = numpy.append(variables[name][1], variables[name + '_data'])
                variables[name][2] = numpy.append(variables[name][2], variables[name + '_data'][-5:])

            mfcc_data_interval = numpy.array([mfcc.mfcc(spec) for spec in stf.SPEC[interval[0]: interval[1]]])
            mfcc_data_interval = numpy.hstack([mfcc_data_interval, mfcc.delta(mfcc_data_interval)])
            if mfcc_data is None:
                mfcc_data = [mfcc_data_interval, mfcc_data_interval[:5], mfcc_data_interval[-5:]]
            else:
                mfcc_data[0] = numpy.vstack((mfcc_data[0], mfcc_data_interval))
                mfcc_data[1] = numpy.vstack((mfcc_data[1], mfcc_data_interval[:5]))
                mfcc_data[2] = numpy.vstack((mfcc_data[2], mfcc_data_interval[-5:]))

    variables = locals()
    return [[x.mean() for x in variables[target]] for target in targets], numpy.array(mfcc_data)
def save_spec(source_file, output_file, mfcc_vals, mfcc_dim, pitch):
    stf = STF()
    stf.loadfile(source_file)
    mfcc = MFCC(stf.SPEC.shape[1] * 2, stf.frequency, dimension=mfcc_dim, channels=mfcc_dim)
    for i in range(mfcc_vals.shape[1]):
        stf.SPEC[i] = mfcc.imfcc(mfcc_vals[:, i])
    stf.F0 = pitch
    stf.savefile(output_file)
def calc_mfcc(filename, mfcc_dim):
    stf = STF()
    stf.loadfile(filename)
    mfcc_vals = np.empty((mfcc_dim, 0), "float")
    mfcc = MFCC(stf.SPEC.shape[1] * 2, stf.frequency, dimension=mfcc_dim, channels=mfcc_dim)
    for j in range(stf.SPEC.shape[0]):
        res = mfcc.mfcc(stf.SPEC[j])
        mfcc_vals = np.hstack([mfcc_vals, res.reshape(mfcc_dim, 1)])
    return mfcc_vals
def __init__(self):
    super(self.__class__, self).__init__()
    self.setupUi(self)
    self.init_databases()
    self.mfcc = MFCC()
    self.speaker_only_acc = 0.0
    self.speaker_word_acc = 0.0
    self.test = 1
    self.test_phase = 1
    self.audio_files = []
    self.progressLbl.setVisible(False)
    self.openAudioBtn.clicked.connect(self.show_open_dialog)
    self.startTestBtn.clicked.connect(self.start_testing)
    self.exportCSVBtn.clicked.connect(self.handleSave)
    self.clrFilesBtn.clicked.connect(self.clear_all_files)
    self.startTestBtn.setDisabled(True)
    self.frameSizeVal.setDisabled(True)
    self.clip = QtGui.QApplication.clipboard()
class FeatureExtractor(object):
    def __init__(self, n_frames, n_blocks, n_cepstrum=40, n_filters=26):
        """
        n_frames: The number of frames contained in one block.
            n_frames consecutive frames form one block.
        n_blocks: The number of blocks.
        """
        self.mfcc = MFCC(n_cepstrum, n_filters)
        self.n_frames = n_frames
        self.n_blocks = n_blocks

    def extract(self, filepath):
        # TODO explain the extraction algorithm
        """
        filepath: The path to an input file.
        """
        signal = SignalServer(filepath)
        last = signal.get_last_start_point(self.n_frames)
        start_points = np.random.randint(low=0, high=last, size=self.n_frames)
        features = []
        for start_point in start_points:
            frame = signal.get_frame(start_point)
            mfcc = self.mfcc.calc(frame)
            features.append(mfcc)
        features = np.array(features)
        return features
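# --- Usage sketch (not part of the original source) ---------------------------
# Hedged illustration of driving the FeatureExtractor class above. The wav path
# and the frame/block counts are assumptions; SignalServer and MFCC come from
# the same project as the class itself.
extractor = FeatureExtractor(n_frames=10, n_blocks=5)
block_features = extractor.extract("recording.wav")   # hypothetical input file
print(block_features.shape)                           # (n_frames, n_cepstrum), per the extract() loop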
def __init__(self):
    print 'Start Building Indexing'
    self.test = SpeechConvertor()
    self.featureLoader = MFCC()
    self.program = "../cmake-build-release/Benchmark__release__"
    # print program
    self.cpp_proc = subprocess.Popen([self.program], stdin=subprocess.PIPE, stdout=subprocess.PIPE)
    self.flag = 1
def main(args):
    # Read command line args and invoke mic streaming if no file path supplied
    audio_file = args.audio_file_path
    if args.audio_file_path:
        streaming_enabled = False
    else:
        streaming_enabled = True

    # Create the ArmNN inference runner
    network = ArmnnNetworkExecutor(args.model_file_path, args.preferred_backends)

    # Specify model specific audio data requirements
    # Overlap value specifies the number of samples to rewind between each data window
    audio_capture_params = AudioCaptureParams(dtype=np.float32, overlap=2000, min_samples=16000,
                                              sampling_freq=16000, mono=True)

    # Create the preprocessor
    mfcc_params = MFCCParams(sampling_freq=16000, num_fbank_bins=40, mel_lo_freq=20, mel_hi_freq=4000,
                             num_mfcc_feats=10, frame_len=640, use_htk_method=True, n_fft=1024)
    mfcc = MFCC(mfcc_params)
    preprocessor = AudioPreprocessor(mfcc, model_input_size=49, stride=320)

    # Set threshold for displaying classification and commence stream or file processing
    threshold = .90

    if streaming_enabled:
        # Initialise audio stream
        record_stream = CaptureAudioStream(audio_capture_params)
        record_stream.set_stream_defaults()
        record_stream.set_recording_duration(args.duration)
        record_stream.countdown()

        with sd.InputStream(callback=record_stream.callback):
            print("Recording audio. Please speak.")
            while record_stream.is_active:
                audio_data = record_stream.capture_data()
                recognise_speech(audio_data, network, preprocessor, threshold)
                record_stream.is_first_window = False
            print("\nFinished recording.")

    # If a file path has been supplied, read it in and run inference
    else:
        print("Processing Audio Frames...")
        buffer = capture_audio(audio_file, audio_capture_params)
        for audio_data in buffer:
            recognise_speech(audio_data, network, preprocessor, threshold)
class TestingThread(QtCore.QThread):
    def __init__(self, parent, audio_files):
        QtCore.QThread.__init__(self, parent)
        self.audio_files = audio_files
        self.mfcc = MFCC()
        self.par = parent

    def run(self):
        self.emit(QtCore.SIGNAL("update()"))
        self.mfcc.frame_size = int(self.par.frameSizeVal.currentText())
        self.mfcc.overlap = self.mfcc.frame_size / 2
        speaker_correct = 0
        speaker_word_correct = 0
        for index, file_audio in enumerate(self.audio_files):
            file_audio = str(file_audio)
            self.audio_signal, self.audio_fs = FileReader.read_audio(file_audio)
            self.silenced_signal, self.audio_fs = self.mfcc.remove_silence(file_audio)
            self.num_frames, self.framed_signal = self.mfcc.frame_blocking(self.silenced_signal)
            self.windowed_signal = self.mfcc.hamm_window(self.framed_signal)
            self.fft_signal = self.mfcc.calc_fft(self.windowed_signal)
            self.log_energy, self.fbank = self.mfcc.fbank(self.fft_signal, self.audio_fs)
            self.features = self.mfcc.features(self.log_energy)

            self.lvq = LVQ(str(self.par.databaseSelect.currentText()))
            # result = self.lvq.test_data(self.features[:, 1:14])
            # [31, 28, 29, 30, 27, 26, 25, 24, 23, 22, 20, 21, 19]
            result = self.lvq.test_data(
                self.features[:, [1, 2, 3, 4, 5, 7, 6, 9, 8, 10, 11, 12, 13]])
            print "vote for file " + str(index) + " : " + str(result)

            # full = str(result[1][0]) if len(result) >= 2 else str(result[0][0])
            full = str(result[0][0])
            speaker = full[:full.rfind('-')] if full.rfind('-') != -1 else full
            word = full[full.rfind('-') + 1:] if full.rfind('-') != -1 else "-"
            self.par.featuresTbl.setItem(index, 2, QtGui.QTableWidgetItem(speaker))
            self.par.featuresTbl.setItem(index, 3, QtGui.QTableWidgetItem(word))
            if speaker == self.par.featuresTbl.item(index, 0).text():
                speaker_correct += 1
            if speaker == self.par.featuresTbl.item(index, 0).text() and word == self.par.featuresTbl.item(index, 1).text():
                speaker_word_correct += 1

        self.par.speaker_word_acc = (speaker_word_correct / float(len(self.audio_files))) * 100
        self.par.speaker_only_acc = (speaker_correct / float(len(self.audio_files))) * 100
        self.emit(QtCore.SIGNAL("update()"))
        self.emit(QtCore.SIGNAL("finish()"))
def extract_mfccs(filename):
    """ Extracts MFCCs of one audio file. """
    print "Extracting MFCCs of", filename
    data = wavfile.read(filename)
    srate = data[0]
    samples = data[1]
    # set up the MFCC class (as in Slaney's toolbox)
    mfcc_extr = MFCC(nfilt=40, ncep=13,
                     lowerf=133.3333, upperf=5055.4976, alpha=0.97,
                     samprate=srate, frate=100, winsize=256, nfft=512)
    # extract features
    features = mfcc_extr.sig2s2mfc(samples)
    return features
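# --- Usage sketch (not part of the original source) ---------------------------
# With frate=100 (a 10 ms hop) and ncep=13 above, a call like the following
# would be expected to yield roughly one 13-dimensional cepstral vector per
# 10 ms of audio; the filename is an illustrative assumption.
utterance_feats = extract_mfccs("utterance.wav")
print(utterance_feats.shape)   # about (duration_in_seconds * 100, 13)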
def main():
    parser = argparse.ArgumentParser(description="Splitting parameters")
    parser.add_argument('voice_list_dir', type=str, metavar='v', help="recording list dir")
    parser.add_argument('save_dir', type=str, metavar='s', help="file save directory")
    args = parser.parse_args()

    print('Loading Voice Sample...')
    print("performing filters/mfcc")
    vad_obj = webrtcvad.Vad(2)
    mfc_obj = MFCC(nfilt=64, lowerf=20., upperf=7200., samprate=16000, nfft=1024, wlen=0.025)

    print('Initializing networks...')
    e_net, e_optimizer = get_network('e', NETWORKS_PARAMETERS, train=False)

    with open(args.voice_list_dir) as file_list:
        line = file_list.readline()
        line = line.rstrip()
        i = 1
        while line:
            print('making embedding for {}'.format(line))
            embedding = get_embedding(e_net, line, vad_obj, mfc_obj)
            i += 1
            stuff = line.split("/")
            output_name = "{}_{}".format(stuff[6], stuff[7])
            output_name = output_name.replace(".wav", "")
            output_name = output_name + ".npy"
            save_dir = os.path.join(args.save_dir, output_name)
            print('Saving embedding to: {}'.format(save_dir))
            np.save(save_dir, embedding.cpu().detach().numpy(), allow_pickle=True)
            line = file_list.readline()
            line = line.rstrip()
    print("complete")
def getfeatures(filepath, figpath="tmp.png", cached=False):
    wave_data = readprocessedwave(filepath)
    frames = [windowing(frame) for frame in getframes(wave_data)]
    if not cached:
        (s1, e1, s2, e2, s3, e3, amps, zcrs) = enddection(frames)
        plotends(figpath, wave_data, s1, e1, s2, e2, s3, e3, amps, zcrs)
    else:
        filename = filepath.split('/')[-1][:-4]
        (student, w, k) = filename.split('-')
        with open("epd/%s.json" % student, 'r') as fin:
            endpoints = json.loads(fin.read())
        (s1, e1, s2, e2, s3, e3) = endpoints[filename]
    C = MFCC(frames[s3:e3 + 1], cnt=20)
    D1 = delta(C, 2)
    D2 = delta(D1, 1)
    A = LPC(frames[s3:e3 + 1], p=10)
    features = np.concatenate((C, D1, D2, A), axis=1)[3:C.shape[0] - 3]
    # amps[s3:e3+1][:, np.newaxis]
    return features
def get_dataset_files(data_dir, data_ext):
    data_list = []
    headers = ['filepath', 'name']
    vad_obj = webrtcvad.Vad(2)
    mfc_obj = MFCC(nfilt=64, lowerf=20., upperf=7200., samprate=16000, nfft=1024, wlen=0.025)

    # walk the data directory (root directory, sub-directories, file names)
    print("1")
    for root, dirs, filenames in os.walk(data_dir):
        for filename in filenames:
            if filename.endswith(data_ext):  # check the file extension
                filepath = os.path.join(root, filename)
                print(filepath)
                # so hacky, be careful!
                folder = filepath[len(data_dir):].split('/')[1]
                print(folder)
                new_name = filename.split('.')[0] + ".npy"
                print(new_name)
                new_path = '/media/fenger/DATA/1 datasets/data/' + folder + "/"
                new_filepath = new_path + new_name
                mkdir(new_path)
                vad_voice = rm_sil(filepath, vad_obj)
                fbank = get_fbank(vad_voice, mfc_obj)
                # fbank = fbank.T[np.newaxis, ...]
                # fbank = torch.from_numpy(fbank.astype('float32'))
                np.save(new_filepath, fbank)
                print(fbank.shape)
                data_list.append({'filepath': new_filepath, 'name': folder})
                print('filepath', new_filepath, 'name', folder)

    with open('test.csv', 'w') as f:
        f_csv = csv.DictWriter(f, headers)
        f_csv.writeheader()
        f_csv.writerows(data_list)
    sys.exit()

source_list = open(sys.argv[1]).read().strip().split('\n')
target_list = open(sys.argv[2]).read().strip().split('\n')
assert len(source_list) == len(target_list)

learn_data = None
square_mean = numpy.zeros(DIMENSION)
mean = numpy.zeros(DIMENSION)

for i in xrange(len(source_list)):
    target = STF()
    target.loadfile(target_list[i])
    mfcc = MFCC(target.SPEC.shape[1] * 2, target.frequency, dimension=DIMENSION)
    target_mfcc = numpy.array([mfcc.mfcc(target.SPEC[frame]) for frame in xrange(target.SPEC.shape[0])])
    target_data = numpy.hstack([target_mfcc, mfcc.delta(target_mfcc)])

    source = STF()
    source.loadfile(source_list[i])
    mfcc = MFCC(source.SPEC.shape[1] * 2, source.frequency)
    source_mfcc = numpy.array([mfcc.mfcc(source.SPEC[frame]) for frame in xrange(source.SPEC.shape[0])])

    dtw = DTW(source_mfcc, target_mfcc, window=abs(source.SPEC.shape[0] - target.SPEC.shape[0]) * 2)
    warp_mfcc = dtw.align(source_mfcc)
    warp_data = numpy.hstack([warp_mfcc, mfcc.delta(warp_mfcc)])

    data = numpy.hstack([warp_data, target_data])
    if learn_data is None:
class MainWindow(QtGui.QMainWindow, trainingWindow.Ui_MainWdw):
    def __init__(self):
        super(self.__class__, self).__init__()
        self.setupUi(self)
        self.player = audioPlayer.AudioPlayer(self.volumeSlider, self.seekSlider, self.lcdNumber,
                                              self.audioPlayBtn, self.audioPauseBtn, self.audioStopBtn)
        self.mfcc = MFCC()
        self.init_ui()
        self.init_databases()
        self.actionExit.triggered.connect(self.close)
        self.actionTraining_Data.setDisabled(True)
        self.actionTest_Data.triggered.connect(self.open_test_wdw)
        self.actionBatch_Feature_Extraction.triggered.connect(self.open_batch_wdw)
        self.actionAbout_Qt.triggered.connect(QtGui.qApp.aboutQt)
        self.actionAbout.triggered.connect(self.about)
        self.openAudioBtn.clicked.connect(self.show_open_dialog)
        self.extractSaveBtn.clicked.connect(self.extract_and_save)
        self.trainDataBtn.clicked.connect(self.train_data)
        self.reloadDatabaseBtn.clicked.connect(self.init_databases)

    def init_databases(self):
        self.databaseSelect.clear()
        self.database_list = [f[:len(f) - 3] for f in listdir('database/')
                              if isfile(join('database/', f))]
        self.databaseSelect.addItems(QtCore.QStringList(self.database_list))

    def open_test_wdw(self):
        # self.hide()
        self.testWdw = twc.TestingWindow()
        self.testWdw.show()

    def open_batch_wdw(self):
        self.batch_wdw = batch.BatchWindow()
        self.batch_wdw.show()

    def show_open_dialog(self):
        self.audioFile = QtGui.QFileDialog.getOpenFileName(self, 'Open audio file', '', "Audio Files (*.wav)",
                                                           None, QtGui.QFileDialog.DontUseNativeDialog)
        if self.audioFile != "":
            self.featuresTbl.setRowCount(0)
            self.featuresTbl.setColumnCount(0)
            self.audioClassInput.setText("")
            fileName = str(self.audioFile)
            self.audio_signal, self.audio_fs = FileReader.read_audio(fileName)
            self.silenced_signal, self.audio_fs = self.mfcc.remove_silence(fileName)
            self.fsValLbl.setText(": " + str(self.audio_fs) + " Hz")
            self.sampleValLbl.setText(": " + str(len(self.audio_signal)) + " | " +
                                      str(len(self.silenced_signal)) + " (silenced)")
            self.audioFilenameLbl.setText(": " + fileName[fileName.rfind('/') + 1:len(fileName)])
            self.audioClassInput.setText(FileReader.get_output_class(fileName))
            self.audioPlayBtn.setDisabled(False)
            self.extractSaveBtn.setDisabled(False)
            self.player.set_audio_source(self.audioFile)

    def finish_thread(self):
        QtGui.QMessageBox.information(None, "Success!", "Save features to database success!")

    def update_progress(self):
        self.n += 1
        self.trainProgress.setValue(self.n)

    def train_data(self):
        self.lvq = LVQ(str(self.databaseSelect.currentText()))
        self.trainDataBtn.setDisabled(True)
        self.iterVal.setDisabled(True)
        self.learningRDecrVal.setDisabled(True)
        self.learningRVal.setDisabled(True)
        self.minAlpha.setDisabled(True)
        self.n = 0
        # self.trainProgress.setRange(0, 0)
        trainingThread = LVQTrainThread(self, self.lvq, self.iterVal, self.learningRVal,
                                        self.learningRDecrVal, self.minAlpha,
                                        str(self.databaseSelect.currentText()))
        trainingThread.start()
        trainingThread.taskFinished.connect(self.finish_training)
        QtCore.QObject.connect(trainingThread, QtCore.SIGNAL("update()"), self.update_progress)

    def finish_training(self, final_weight):
        self.newWeightTbl.setRowCount(final_weight.shape[0])
        self.newWeightTbl.setColumnCount(final_weight.shape[1])
        self.newWeightTbl.setColumnWidth(0, 500)
        for i in xrange(final_weight.shape[0]):
            for j in xrange(final_weight.shape[1]):
                weight = QtGui.QTableWidgetItem(str(final_weight[i, j]))
                # print "i: " + str(i) + " j: " + str(j) + " isi: " + str(isi_feature)
                self.newWeightTbl.setItem(i, j, weight)
        # self.trainProgress.setRange(0, 1)
        # self.trainProgress.setValue(1)
        self.trainDataBtn.setDisabled(False)
        self.iterVal.setDisabled(False)
        self.learningRDecrVal.setDisabled(False)
        self.learningRVal.setDisabled(False)
        self.minAlpha.setDisabled(False)
        QtGui.QMessageBox.information(None, "Success!", "Training data complete!")

    def extract_and_save(self):
        if self.audioClassInput.text() == "":
            QtGui.QMessageBox.critical(None, "Training Data", "You must provide audio output class!",
                                       QtGui.QMessageBox.Ok | QtGui.QMessageBox.Default,
                                       QtGui.QMessageBox.NoButton)
            return False
        self.num_frames, self.framed_signal = self.mfcc.frame_blocking(self.silenced_signal)
        self.windowed_signal = self.mfcc.hamm_window(self.framed_signal)
        self.fft_signal = self.mfcc.calc_fft(self.windowed_signal)
        self.log_energy, self.fbank = self.mfcc.fbank(self.fft_signal, self.audio_fs)
        self.features = self.mfcc.features(self.log_energy)
        self.n = 0
        self.trainProgress.setMaximum(self.features.shape[0])
        self.trainProgress.setValue(0)
        insert_feature = DBThread(self, self.audioFile, self.audioClassInput, self.features)
        insert_feature.start()
        QtCore.QObject.connect(insert_feature, QtCore.SIGNAL("finish()"), self.finish_thread)
        QtCore.QObject.connect(insert_feature, QtCore.SIGNAL("update()"), self.update_progress)
        self.featuresTbl.setRowCount(self.features.shape[0])
        self.featuresTbl.setColumnCount(13)
        for i in xrange(self.features.shape[0]):
            for j in xrange(1, 14):
                isi_feature = QtGui.QTableWidgetItem(str(self.features[i, j]))
                # print "i: " + str(i) + " j: " + str(j) + " isi: " + str(isi_feature)
                self.featuresTbl.setItem(i, j - 1, isi_feature)

    def init_ui(self):
        palette = QtGui.QPalette()
        palette.setBrush(QtGui.QPalette.Light, QtCore.Qt.darkGray)
        self.audioPlayBtn.setDisabled(True)
        self.audioPauseBtn.setDisabled(True)
        self.audioStopBtn.setDisabled(True)
        self.extractSaveBtn.setDisabled(True)
        self.trainProgress.setValue(0)
        self.lcdNumber.display("00:00")
        self.lcdNumber.setPalette(palette)

    def about(self):
        QtGui.QMessageBox.information(
            self, "Text Dependent Speaker Verification",
            "Text Dependent Speaker Verification - the "
            "Final project software to identify and verify Speaker based on their speech.\n\n"
            "\xa9 Sukoreno Mukti - 1112051 \n Informatics Engineering Dept. ITHB")

    def closeEvent(self, event):
        reply = QtGui.QMessageBox.question(self, 'Message', "Are you sure to quit?",
                                           QtGui.QMessageBox.Yes | QtGui.QMessageBox.No,
                                           QtGui.QMessageBox.No)
        if reply == QtGui.QMessageBox.Yes:
            event.accept()
        else:
            event.ignore()
class ExtractThread(QtCore.QThread):
    def __init__(self, parent, audio_files, database_name):
        QtCore.QThread.__init__(self, parent)
        self.audio_files = audio_files
        self.mfcc = MFCC()
        self.par = parent
        self.variances = []
        self.all_features = []
        self.db = DatabaseConnector(database_name)

    def write_excel(self, rata2):
        self.all_features = np.asarray(self.all_features).T
        wbk = xlwt.Workbook("hasil-ekstraksi.xlsx")
        sheet = wbk.add_worksheet()
        row = 0
        col = 0
        for x in xrange(32):
            sheet.write(0, x, "data-" + str(x))
        for index, item in enumerate(self.all_features):
            for x in xrange(len(item)):
                try:
                    teext = str(item[x])
                    sheet.write(row + 1, col, teext)
                    row += 1
                except AttributeError:
                    row += 1
            row = 0
            col += 1
        for i in xrange(32):
            sheet.write(5250, i, str(rata2[i]))
        wbk.close()

    def run(self):
        self.emit(QtCore.SIGNAL("update()"))
        self.mfcc.frame_size = int(self.par.frameSizeVal.currentText())
        self.mfcc.overlap = self.mfcc.frame_size / 2
        for index, file_audio in enumerate(self.audio_files):
            file_audio = str(file_audio)
            self.audio_signal, self.audio_fs = FileReader.read_audio(file_audio)
            self.silenced_signal, self.audio_fs = self.mfcc.remove_silence(file_audio)
            self.num_frames, self.framed_signal = self.mfcc.frame_blocking(self.silenced_signal)
            self.windowed_signal = self.mfcc.hamm_window(self.framed_signal)
            self.fft_signal = self.mfcc.calc_fft(self.windowed_signal)
            self.log_energy, self.fbank = self.mfcc.fbank(self.fft_signal, self.audio_fs)
            self.features = self.mfcc.features(self.log_energy)
            # var = [st.variance(self.features[:, i]) for i in xrange(self.mfcc.num_filter)]
            # [self.all_features.append(self.features[i, :]) for i in xrange(self.features.shape[0])]
            # self.variances.append(var)

            features = []
            if TYPE == 1:
                file_id = self.db.insert("files", {"file_path": file_audio})
                for i in xrange(self.features.shape[0]):
                    # [31, 28, 29, 30, 27, 26, 25, 24, 23, 22, 20, 21, 19]
                    # features.append([file_id, i, self.features[i, 1:14], str(self.par.featuresTbl.item(index, 1).text())])
                    features.append([file_id, i,
                                     self.features[i, [1, 2, 3, 4, 5, 7, 6, 9, 8, 10, 11, 12, 13]],
                                     str(self.par.featuresTbl.item(index, 1).text())])
                self.db.insert_features(features)
            else:
                output_class_id = self.db.insert("output_classes",
                                                 {"file_path": file_audio,
                                                  "class": str(FileReader.get_output_class(file_audio))})
                for i in xrange(self.features.shape[0]):
                    features.append([output_class_id, i, self.features[i, 1:14]])
                self.db.insert_features(features)
            self.emit(QtCore.SIGNAL("update()"))

        # self.variances = np.asarray(self.variances)
        # rata2 = [st.mean(self.variances[:, i]) for i in xrange(self.mfcc.num_filter)]
        # self.write_excel(rata2)
        # print str(np.sort(rata2))
        # print str(np.argsort(rata2))
        self.emit(QtCore.SIGNAL("finish()"))
class TestingWindow(QtGui.QMainWindow, testingWindow.Ui_TestWdw):
    def __init__(self):
        super(self.__class__, self).__init__()
        self.setupUi(self)
        self.mfcc = MFCC()
        self.player = audioPlayer.AudioPlayer(self.volumeSlider, self.seekSlider, self.lcdNumber,
                                              self.audioPlayBtn, self.audioPauseBtn, self.audioStopBtn)
        self.init_ui()
        self.init_databases()
        self.canvas = None
        self.actionTest_Data.setDisabled(True)
        self.actionExit.triggered.connect(self.close)
        self.actionTraining_Data.triggered.connect(self.open_train_wdw)
        self.actionBatch_Testing.triggered.connect(self.open_batch_wdw)
        self.actionAbout_Qt.triggered.connect(QtGui.qApp.aboutQt)
        self.actionAbout.triggered.connect(self.about)
        self.openAudioBtn.clicked.connect(self.show_open_dialog)
        self.extractSaveBtn.clicked.connect(self.extract_features)
        self.identifyBtn.clicked.connect(self.identify_speaker)

    def init_ui(self):
        palette = QtGui.QPalette()
        palette.setBrush(QtGui.QPalette.Light, QtCore.Qt.darkGray)
        self.audioPlayBtn.setDisabled(True)
        self.audioPauseBtn.setDisabled(True)
        self.audioStopBtn.setDisabled(True)
        self.extractSaveBtn.setDisabled(True)
        self.lcdNumber.display("00:00")
        self.lcdNumber.setPalette(palette)

    def init_databases(self):
        self.database_list = [f[:len(f) - 3] for f in listdir('database/')
                              if isfile(join('database/', f))]
        self.databaseSelect.addItems(QtCore.QStringList(self.database_list))

    def open_batch_wdw(self):
        self.batch_wdw = batch_test.BatchTestWindow()
        self.batch_wdw.show()

    def identify_speaker(self):
        self.lvq = LVQ(str(self.databaseSelect.currentText()))
        # result = self.lvq.test_data(self.features[:, 1:14])
        # [31, 28, 29, 30, 27, 26, 25, 24, 23, 22, 20, 21, 19]
        result = self.lvq.test_data(self.features[:, [1, 2, 3, 4, 5, 7, 6, 9, 8, 10, 11, 12, 13]])
        print "vote : " + str(result)
        if result[0][0].find('-') != -1:
            self.speakerVal.setText(": " + str(result[0][0][:result[0][0].find('-')]))
            self.wordVal.setText(": " + str(result[0][0][result[0][0].find('-') + 1:]))
        else:
            self.speakerVal.setText(": " + str(result[0][0]))
            self.wordVal.setVisible(False)
            self.wordLbl.setVisible(False)

    def extract_features(self):
        self.mfcc.frame_size = int(self.frameSizeVal.currentText())
        self.mfcc.overlap = self.mfcc.frame_size / 2

        # frame blocking
        self.num_frames, self.framed_signal = self.mfcc.frame_blocking(self.silenced_signal)
        fig = Figure()
        self.framedSignalPlot = fig.add_subplot(111)
        self.framedSignalPlot.plot(self.framed_signal.ravel(1))
        self.add_figure(fig, self.framedPlotLyt)

        # windowing
        self.windowed_signal = self.mfcc.hamm_window(self.framed_signal)
        fig = Figure()
        self.windowedSignalPlot = fig.add_subplot(111)
        self.windowedSignalPlot.plot(self.windowed_signal.ravel(1))
        self.add_figure(fig, self.windowedPlotLyt)

        # compute the FFT
        self.fft_signal = self.mfcc.calc_fft(self.windowed_signal)
        fig = Figure()
        self.fftSignalPlot = fig.add_subplot(111)
        self.fftSignalPlot.plot(self.fft_signal[:, :self.mfcc.frame_size / 2].ravel(1))
        self.add_figure(fig, self.fftPloyLyt)

        # compute the mel filter bank
        self.log_energy, self.fbank = self.mfcc.fbank(self.fft_signal, self.audio_fs)
        fig = Figure()
        self.melwrapPlot = fig.add_subplot(111)
        for i in xrange(self.mfcc.num_filter):
            self.melwrapPlot.plot(self.fbank[i, :])
        self.add_figure(fig, self.melPlotLyt)

        # cepstral features
        self.features = self.mfcc.features(self.log_energy)
        fig = Figure()
        self.mfccPlot = fig.add_subplot(111)
        for i in xrange(self.features.shape[0]):
            self.mfccPlot.plot(self.features[i, :])
        self.add_figure(fig, self.mfccPlotLyt)

        # write features to the table
        self.testDataTab.setCurrentIndex(len(self.testDataTab) - 1)
        self.featuresTbl.setRowCount(self.features.shape[0])
        for i in xrange(self.features.shape[0]):
            for j in xrange(1, 14):
                isi_feature = QtGui.QTableWidgetItem(str(self.features[i, j]))
                # print "i: " + str(i) + " j: " + str(j) + " isi: " + str(isi_feature)
                self.featuresTbl.setItem(i, j - 1, isi_feature)

    def add_figure(self, fig, container):
        # if self.canvas is not None:
        #     container.removeWidget(self.canvas)
        self.clearLayout(container)
        self.canvas = FigureCanvas(fig)
        container.addWidget(self.canvas)
        self.canvas.draw()

    def open_train_wdw(self):
        self.hide()
        self.mainWdw = twc.MainWindow()
        self.mainWdw.show()

    def show_open_dialog(self):
        self.audioFile = QtGui.QFileDialog.getOpenFileName(self, 'Open audio file', '', "Audio Files (*.wav)",
                                                           None, QtGui.QFileDialog.DontUseNativeDialog)
        if self.audioFile != "":
            fileName = str(self.audioFile)
            self.audio_signal, self.audio_fs = FileReader.read_audio(fileName)
            self.silenced_signal, self.audio_fs = self.mfcc.remove_silence(fileName)
            self.fsValLbl.setText(": " + str(self.audio_fs) + " Hz")
            self.sampleValLbl.setText(": " + str(len(self.audio_signal)) + " | " +
                                      str(len(self.silenced_signal)) + " (silenced)")
            self.audioFilenameLbl.setText(": " + fileName[fileName.rfind('/') + 1:len(fileName)])
            self.audioPlayBtn.setDisabled(False)
            self.clear_all_layout()
            fig = Figure()
            self.origSignalPlot = fig.add_subplot(111)
            self.origSignalPlot.plot(self.audio_signal)
            self.add_figure(fig, self.originalPlotLyt)
            self.extractSaveBtn.setDisabled(False)
            self.player.set_audio_source(self.audioFile)
            self.testDataTab.setCurrentIndex(0)

    def about(self):
        QtGui.QMessageBox.information(
            self, "Text Dependent Speaker Verification",
            "Text Dependent Speaker Verification - the "
            "Final project software to identify and verify Speaker based on their speech.\n\n"
            "\xa9 Sukoreno Mukti - 1112051 \n Informatics Engineering Dept. ITHB")

    def clear_all_layout(self):
        [self.clearLayout(layout) for layout in [self.fftPloyLyt, self.framedPlotLyt, self.melPlotLyt,
                                                 self.mfccPlotLyt, self.originalPlotLyt, self.windowedPlotLyt]]

    def clearLayout(self, layout):
        while layout.count():
            child = layout.takeAt(0)
            if child.widget() is not None:
                child.widget().deleteLater()
            elif child.layout() is not None:
                self.clearLayout(child.layout())

    def closeEvent(self, event):
        reply = QtGui.QMessageBox.question(self, 'Message', "Are you sure to quit?",
                                           QtGui.QMessageBox.Yes | QtGui.QMessageBox.No,
                                           QtGui.QMessageBox.No)
        if reply == QtGui.QMessageBox.Yes:
            event.accept()
        else:
            event.ignore()
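# --- Reference sketch (not part of the original source) -----------------------
# The extract_features() method above walks the project's MFCC helper through
# frame blocking, Hamming windowing, FFT, mel filter bank and cepstral features.
# Below is a minimal, self-contained numpy/scipy sketch of that same pipeline,
# offered only as a point of reference: it is not the project's MFCC class, and
# every parameter value here is an illustrative assumption.
import numpy as np
from scipy.fftpack import dct


def simple_mfcc(signal, fs, frame_size=256, overlap=128, n_filters=20, n_ceps=13):
    """Toy MFCC pipeline: frame -> window -> FFT -> mel filter bank -> DCT.
    Assumes len(signal) >= frame_size; all parameters are illustrative."""
    step = frame_size - overlap
    n_frames = (len(signal) - overlap) // step
    # frame blocking with 50% overlap, then a Hamming window per frame
    frames = np.stack([signal[i * step:i * step + frame_size] for i in range(n_frames)])
    frames = frames * np.hamming(frame_size)
    # magnitude spectrum of each frame
    spectrum = np.abs(np.fft.rfft(frames, frame_size))
    # triangular mel filter bank between 0 Hz and fs / 2
    mel = lambda f: 2595.0 * np.log10(1.0 + f / 700.0)
    imel = lambda m: 700.0 * (10.0 ** (m / 2595.0) - 1.0)
    mel_points = np.linspace(mel(0.0), mel(fs / 2.0), n_filters + 2)
    bins = np.floor((frame_size + 1) * imel(mel_points) / fs).astype(int)
    fbank = np.zeros((n_filters, frame_size // 2 + 1))
    for i in range(1, n_filters + 1):
        left, centre, right = bins[i - 1], bins[i], bins[i + 1]
        fbank[i - 1, left:centre] = (np.arange(left, centre) - left) / float(max(centre - left, 1))
        fbank[i - 1, centre:right] = (right - np.arange(centre, right)) / float(max(right - centre, 1))
    # log filter bank energies, then DCT down to the first n_ceps cepstra
    energies = np.log(np.dot(spectrum ** 2, fbank.T) + 1e-10)
    return dct(energies, norm='ortho')[:, :n_ceps]


# e.g. simple_mfcc(np.random.randn(16000), 16000) -> an array of about (124, 13)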
gmm_file.close()

f0_file = open(sys.argv[3], 'rb')
f0 = pickle.load(f0_file)
f0_file.close()

source = STF()
source.loadfile(sys.argv[4])

f0_data = []
for i in source.F0:
    if i == 0:
        f0_data.append(i)
    else:
        f0_data.append(math.e ** ((math.log(i) - math.log(f0[0][0])) * math.log(f0[1][1]) / math.log(f0[1][0]) + math.log(f0[0][1])))
source.F0 = numpy.array(f0_data)

mfcc = MFCC(source.SPEC.shape[1] * 2, source.frequency)
source_mfcc = numpy.array([mfcc.mfcc(source.SPEC[frame]) for frame in xrange(source.SPEC.shape[0])])
source_data = numpy.hstack([source_mfcc, mfcc.delta(source_mfcc)])

middle_mfcc = gmm_first.convert(source_data)
middle_data = numpy.hstack([middle_mfcc, mfcc.delta(middle_mfcc)])
output_mfcc = gmm_second.convert(middle_data)

output_spec = numpy.array([mfcc.imfcc(output_mfcc[frame]) for frame in xrange(output_mfcc.shape[0])])
source.SPEC = output_spec
source.savefile(sys.argv[5])
import os
import glob

import torch
import torchvision.utils as vutils
import webrtcvad

from mfcc import MFCC
from config import DATASET_PARAMETERS, NETWORKS_PARAMETERS
from network import get_network
from utils import voice2face

# initialization
vad_obj = webrtcvad.Vad(2)
mfc_obj = MFCC(nfilt=64, lowerf=20., upperf=7200., samprate=16000, nfft=1024, wlen=0.025)
e_net, _ = get_network('e', NETWORKS_PARAMETERS, train=False)
g_net, _ = get_network('g', NETWORKS_PARAMETERS, train=False)

# test
voice_path = os.path.join(DATASET_PARAMETERS['test_data'], '*.wav')
voice_list = glob.glob(voice_path)

for filename in voice_list:
    face_image = voice2face(e_net, g_net, filename, vad_obj, mfc_obj, NETWORKS_PARAMETERS['GPU'])
    vutils.save_image(face_image.detach().clamp(-1, 1),
                      filename.replace('.wav', '.png'), normalize=True)
class XYZMFCC(Standard):
    def __init__(self, fragmentation):
        Standard.__init__(self, fragmentation)
        self._mfcc = MFCC(fragmentation)

    def setup(self):
        self._setupLayeredInformation()
        self._setupActiveFragmentsInformation()
        # self._validateMultiLayerInformation()
        if self._do_pymol:
            self._dump_pymol()
        if self._do_jmol:
            self._dump_jmol()

    def _setupLayeredInformation(self):
        self._fragment_layers = self._getFragmentLayersFromFragment()

    def _getFragmentLayersFromFragment(self):
        fragments = self._fragmentation.getFragments()
        return array([1 for i in fragments])

    def _setupActiveFragmentsInformation(self):
        self._active_atoms = []

    def _dump_pymol(self):
        from pymol import PymolTemplate
        pt = PymolTemplate(self._input_filename, self._output_filename)
        self._setTemplateData(pt)
        self._writeTemplateFile(pt)

    def _dump_jmol(self):
        from jmol import JmolTemplate
        pt = JmolTemplate(self._input_filename, self._output_filename)
        self._setTemplateData(pt)
        self._writeTemplateFile(pt)

    def _setTemplateData(self, template):
        template.setFragmentsData(self._fragmentation.getFragments())
        template.setBufferData(self._fragment_layers)
        template.setActiveData(self._active_atoms)
        template.setBackboneData(self._fragmentation.getBackboneAtoms())
        template.setPairData(self._fragmentation.getExplicitlyBreakAtomPairs())

    def _writeTemplateFile(self, template):
        template.override()
        template.write()

    def _build_single_fragment(self, fragment, caps):
        atomnames = ["" for i in fragment]
        if -1 in fragment:
            atoms = [None for i in fragment]
            nucz = [0 for a in atoms]
            neighbours = [-1 for a in atoms]
            ids = [-1 for a in atoms]
        else:
            atoms = [self._fragmentation.getOBAtom(i) for i in fragment]
            if self._fragmentation.hasAtomNames():
                names = self._fragmentation.getAtomNames()
                atomnames = []
                for i in fragment:
                    try:
                        atomnames.append(names[i - 1])
                    except IndexError:
                        print("Warning: FragIt could not correctly name atom {0:d}.".format(i))
                        print("         The problem could be with your PDB file.")
                        atomnames.append("X")
            nucz = [a.GetAtomicNum() for a in atoms]
            neighbours = [-1 for a in atoms]
            ids = [i for i in fragment]

        if caps is not None:
            for icap, cap in enumerate(caps):
                if shares_elements(fragment, cap.getAtomIDs()):
                    for id, atom, atomname, z, nbr in zip(cap.getAtomIDs(), cap.getAtoms(),
                                                          cap.getAtomNames(), cap.getNuclearCharges(),
                                                          cap.getNeighbourList()):
                        if id not in fragment:
                            atoms.append(atom)
                            atomnames.append(atomname)
                            nucz.append(z)
                            neighbours.append(nbr)
                            ids.append(id)

        return Cap(atoms, atomnames, ids, nucz, neighbours)

    def getCaps(self):
        return self._mfcc.getCaps()

    def BuildCappedFragment(self, fragment):
        return self._build_single_fragment(fragment, self.getCaps())

    def BuildFragment(self, fragment):
        return self._build_single_fragment(fragment, None)

    def _fragment_xyz(self, fragment):
        """Generates the XYZ file format based on the atoms, types, ids
        and neighbours of each fragment.
        """
        # NB! The word fragment here is actually of type Cap. Just to be sure
        # nobody is doing something utterly wrong, check that here.
        if not type(fragment) == Cap:
            raise ValueError("_fragment_xyz expected an object of type Cap.")

        atoms = fragment.getAtoms()
        nuczs = fragment.getNuclearCharges()
        nbrls = fragment.getNeighbourList()

        n = len(atoms)
        s = "%i\n%s\n" % (n, "")
        for id, (atom, nucz, neighbour) in enumerate(zip(atoms, nuczs, nbrls)):
            (x, y, z) = (atom.GetX(), atom.GetY(), atom.GetZ())
            if atom.GetAtomicNum() != nucz:
                # atom is the light (cap) atom and it is connected to the nbrs[id] atom
                heavy_atom = self._fragmentation.getOBAtom(neighbour)
                (x, y, z) = calculate_hydrogen_position(heavy_atom, atom)
            s += "%s %20.12f %20.12f %20.12f\n" % (self._elements.GetSymbol(nucz), x, y, z)
        return s

    def writeFile(self, filename):
        """Dumps all caps and capped fragments to individual files."""
        ff, ext = getFilenameAndExtension(filename)
        filename_template = "{0}_{1}_{2:03d}{3}"

        # these are the capped fragments
        for ifg, fragment in enumerate(self._fragmentation.getFragments()):
            capped_fragment = self.BuildCappedFragment(fragment)
            ss = self._fragment_xyz(capped_fragment)
            with open(filename_template.format(ff, "fragment", ifg + 1, ext), 'w') as f:
                f.write(ss)

        # these are the caps
        for icap, cap in enumerate(self.getCaps()):
            ss = self._fragment_xyz(cap)
            with open(filename_template.format(ff, "cap", icap + 1, ext), 'w') as f:
                f.write(ss)