import tqdm
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

# AudioReader, permutation_sdr and permute_SI_SNR are project-local helpers.


def main(est_spk1, est_spk2, egs_spk1, egs_spk2):
    est_spk1 = AudioReader(est_spk1)
    est_spk2 = AudioReader(est_spk2)
    egs_spk1 = AudioReader(egs_spk1)
    egs_spk2 = AudioReader(egs_spk2)
    length = len(est_spk1)
    x = list(range(length))
    sdr = []
    snr = []
    for idx in tqdm.tqdm(range(length)):
        ests = [est_spk1[idx], est_spk2[idx]]
        egs = [egs_spk1[idx], egs_spk2[idx]]
        sdr.append(float(permutation_sdr(ests, egs)))
        snr.append(float(permute_SI_SNR(ests, egs)))
    # Scatter the per-sample scores and save the figure.
    plt.title('Sample SNR and SDR Results')
    ax = plt.subplot()
    tick_spacing = 10
    ax.xaxis.set_major_locator(ticker.MultipleLocator(tick_spacing))
    plt.scatter(x, sdr, marker='x', color='m', label='sdr', s=5)
    plt.scatter(x, snr, marker='o', color='c', label='snr', s=5)
    plt.legend()
    plt.ylabel('value')
    plt.xlabel('sample index')
    plt.savefig('convtasnet_results.png')
    print('Average SNR: {:.5f}'.format(sum(snr) / length))
    print('Average SDR: {:.5f}'.format(sum(sdr) / length))
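For reference, `permute_SI_SNR` above is project-local and not shown; a minimal NumPy sketch of the underlying SI-SNR metric it presumably computes (my assumption of the standard definition, not the repo's code) looks like this:

import numpy as np


def si_snr(est, ref, eps=1e-8):
    # Scale-invariant SNR: project the zero-mean estimate onto the
    # reference, then compare the projection's energy to the residual's.
    est = est - est.mean()
    ref = ref - ref.mean()
    s_target = np.dot(est, ref) * ref / (np.dot(ref, ref) + eps)
    e_noise = est - s_target
    return 10 * np.log10(np.dot(s_target, s_target) / (np.dot(e_noise, e_noise) + eps))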
def main(est_spk1, est_spk2, egs_spk1, egs_spk2):
    est_spk1 = AudioReader(est_spk1)
    est_spk2 = AudioReader(est_spk2)
    egs_spk1 = AudioReader(egs_spk1)
    egs_spk2 = AudioReader(egs_spk2)
    length = len(est_spk1)
    sdr = []
    snr = []
    for idx in range(length):
        ests = [est_spk1[idx], est_spk2[idx]]
        egs = [egs_spk1[idx], egs_spk2[idx]]
        mix = egs_spk1[idx] + egs_spk2[idx]
        # permute_SI_SNR returns the improvement and the best speaker
        # permutation; the same permutation is reused for the SDR improvement.
        _snr, per = permute_SI_SNR(ests, egs, mix)
        _sdr = permutation_sdr(ests, egs, mix, per)
        sdr.append(float(_sdr))
        snr.append(float(_snr))
        print('\r{} / {}, SI-SNRi: {:.5f}, SDRi: {:.5f}'.format(
            idx + 1, length, _snr, _sdr), end='')
    print('\nAverage SI-SNRi: {:.5f}'.format(sum(snr) / len(snr)))
    print('Average SDRi: {:.5f}'.format(sum(sdr) / len(sdr)))
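A hedged command-line driver for either `main` above; the flag names and `.scp` paths are placeholders of my own, not from the source:

import argparse

if __name__ == '__main__':
    # Hypothetical entry point: four scp lists, estimates then references.
    parser = argparse.ArgumentParser(
        description='Score separated speech against references')
    parser.add_argument('--est_spk1', default='est_spk1.scp')
    parser.add_argument('--est_spk2', default='est_spk2.scp')
    parser.add_argument('--egs_spk1', default='ref_spk1.scp')
    parser.add_argument('--egs_spk2', default='ref_spk2.scp')
    args = parser.parse_args()
    main(args.est_spk1, args.est_spk2, args.egs_spk1, args.egs_spk2)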
def loadFile(self, file):
    # Read the chosen file and remember it; surface any reader error
    # in a window-modal dialog instead of crashing the GUI.
    try:
        ar = AudioReader()
        self.workingAudio = ar.read(file)
        self.curFile = file
    except Exception as e:
        msg = QMessageBox()
        msg.setWindowModality(Qt.WindowModal)
        msg.setText(str(e))
        msg.exec_()
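For context, a hypothetical menu handler that feeds `loadFile`; `QFileDialog.getOpenFileName` is standard PyQt5, but the method name and file filter here are assumptions:

def openFile(self):
    # Hypothetical caller of loadFile above; not in the source.
    path, _ = QFileDialog.getOpenFileName(self, 'Open audio file', '',
                                          'Audio files (*.wav)')
    if path:
        self.loadFile(path)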
def __init__(self, mix_path, yaml_path, model, gpuid):
    super(Separation, self).__init__()
    self.mix = AudioReader(mix_path, sample_rate=8000)
    opt = parse(yaml_path, is_tain=False)  # keyword name as spelled in the project's parse()
    net = ConvTasNet(**opt['net_conf'])
    dicts = torch.load(model, map_location='cpu')
    net.load_state_dict(dicts["model_state_dict"])
    self.logger = get_logger(__name__)
    self.logger.info('Load checkpoint from {}, epoch {:d}'.format(
        model, dicts["epoch"]))
    self.gpuid = tuple(gpuid)
    self.device = torch.device(
        'cuda:{}'.format(gpuid[0]) if len(gpuid) > 0 else 'cpu')
    # Move the network to the chosen device instead of unconditionally
    # calling .cuda(), which fails on CPU-only machines when gpuid is empty.
    self.net = net.to(self.device)
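A usage sketch for this constructor; the paths and gpuid value below are hypothetical, and only the `__init__` shown above is exercised:

# Hypothetical paths, not from the source.
separator = Separation(mix_path='mix.scp',
                       yaml_path='train.yml',
                       model='best.pt',
                       gpuid=[0])  # pass [] to run on CPU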
def __init__(self, mix_scp=None, ref_scp=None, sr=8000):
    super(Datasets, self).__init__()
    self.mix_audio = AudioReader(mix_scp, sample_rate=sr)
    # Guard against the None default so a missing ref_scp does not
    # raise a TypeError when iterated.
    self.ref_audio = [AudioReader(r, sample_rate=sr) for r in (ref_scp or [])]
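Constructing this dataset for a two-speaker setup might look like the following; the scp file names are assumptions, and batching/iteration depends on the rest of the class, which is not shown:

# Hypothetical scp lists: one mixture list and one reference list per speaker.
dataset = Datasets(mix_scp='tr_mix.scp',
                   ref_scp=['tr_s1.scp', 'tr_s2.scp'],
                   sr=8000)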
class Estimator:
    def __init__(self, model_name='baseline', model=None, training_steps_list=[5],
                 eval_save_every_step=1, data_path='Train', tmp_dir='tmp',
                 model_path='model', background_frequency=0.8,
                 background_volume=0.1, background_noise='',
                 learning_rates_list=[0.01], batch_size=64, dropout=0.4,
                 model_settings=None, mode='train', with_ctc=False,
                 random_samples_mini_batch=False, silence_label='silence',
                 unknown_label='unknown', classes=[], augmentation_ops=[],
                 augmentation_percentage=0, validation_percentage=10,
                 testing_percentage=10, unknown_percentage=10,
                 silence_percentage=10, fingerprint_type='mfcc',
                 testing_list={}, validation_list={}):
        if len(training_steps_list) != len(learning_rates_list):
            raise Exception(
                '--how_many_training_steps and --learning_rate must be equal length '
                'lists, but are %d and %d long instead' %
                (len(training_steps_list), len(learning_rates_list)))
        tf.logging.set_verbosity(tf.logging.INFO)
        # Reset the default graph: an InteractiveSession installs itself as
        # the default session on construction, so start from a clean graph.
        ops.reset_default_graph()
        self.data_path = data_path
        self.tmp_dir = tmp_dir
        self.model_path = model_path
        self.background_frequency = background_frequency
        self.background_volume = background_volume
        self.with_ctc = with_ctc
        self.random_samples_mini_batch = random_samples_mini_batch
        self.model_settings = model_settings
        self.model_name = model_name
        self.mode = mode
        self.audio_processor = AudioReader(
            audio_dir=self.data_path, model_settings=self.model_settings,
            silence_label=silence_label, unknown_label=unknown_label,
            classes=classes, background_noise=background_noise,
            augmentation_ops=augmentation_ops,
            augmentation_percentage=augmentation_percentage,
            fingerprint_type=fingerprint_type, mode=self.mode,
            validation_percentage=validation_percentage,
            testing_percentage=testing_percentage,
            unknown_percentage=unknown_percentage,
            silence_percentage=silence_percentage,
            testing_list=testing_list, validation_list=validation_list)
        self.silence_label = silence_label
        self.unknown_label = unknown_label
        self.classes = classes
        self.fingerprint_type = fingerprint_type
        self.batch_size = batch_size
        self.dropout = dropout
        self.sess = tf.InteractiveSession()
        self.training_steps_list = training_steps_list
        self.training_steps_max = np.sum(self.training_steps_list)
        self.learning_rates_list = learning_rates_list
        self.time_shift_samples = self.model_settings['time_shift_samples']
        self.eval_every_n_steps = eval_save_every_step
        self.features_2d = model_name == 'vggnet' or model_name == 'lstm'
        print('Total Number of steps {} and eval every {} steps'.format(
            self.training_steps_max, self.eval_every_n_steps))
        print('Total Number of Audio wavs {}'.format(
            self.audio_processor.set_size('training')))
        # Build the graph: inputs, logits, loss, optimizer and metrics.
        self.fingerprint_input, self.ground_truth_input, self.seq_len = \
            model.get_in_ground_truth()
        self.logits, self.dropout_prob = model.get_logits_dropout(
            self.fingerprint_input, self.seq_len)
        with tf.name_scope('Loss'):
            self.loss_mean = model.get_loss(
                self.logits, self.ground_truth_input, self.seq_len)
            tf.summary.scalar('Loss', self.loss_mean)
        with tf.name_scope('train'):
            self.learning_rate_input = tf.placeholder(
                tf.float32, [], name='learning_rate_input')
            self.train_step = model.get_optimizer(
                self.learning_rate_input, self.loss_mean)
        self.predicted_indices, self.correct_prediction, self.confusion_matrix = \
            model.get_confusion_matrix_correct_labels(
                self.ground_truth_input, self.logits, self.seq_len,
                self.audio_processor)
        self.evaluation_step = tf.reduce_mean(
            tf.cast(self.correct_prediction, tf.float32))
        tf.summary.scalar('accuracy', self.evaluation_step)
        self.global_step = tf.train.get_or_create_global_step()
        self.increment_global_step = tf.assign(self.global_step,
                                               self.global_step + 1)
        self.saver = tf.train.Saver(tf.global_variables())
        # Merge all the summaries and write them out to the tmp dir.
        self.merged_summaries = tf.summary.merge_all()
        self.train_writer = tf.summary.FileWriter(self.tmp_dir + '/train',
                                                  self.sess.graph)
        self.validation_writer = tf.summary.FileWriter(
            self.tmp_dir + '/validation')
        tf.global_variables_initializer().run()
        self.start_step = 1
        tf.logging.info('Training from step: %d ', self.start_step)
        tf.train.write_graph(self.sess.graph_def, self.model_path,
                             model_name + '.pbtxt')
        self.model = model

    def save_weights(self, weights_file_path):
        # Dump the four layers' weights and biases to a timestamped .mat file.
        weight1 = self.model.weights['h1'].eval(self.sess)
        weight2 = self.model.weights['h2'].eval(self.sess)
        weight3 = self.model.weights['h3'].eval(self.sess)
        weight4 = self.model.weights['output'].eval(self.sess)
        bias1 = self.model.biases['h1'].eval(self.sess)
        bias2 = self.model.biases['h2'].eval(self.sess)
        bias3 = self.model.biases['h3'].eval(self.sess)
        bias4 = self.model.biases['output'].eval(self.sess)
        current_date_string = "{}_{}".format(
            str(date.today()).replace("-", ""),
            datetime.now().strftime("%H_%M_%S"))
        spio.savemat(weights_file_path.format("dnn128", current_date_string),
                     {'w1': weight1, 'w2': weight2, 'w3': weight3,
                      'w4': weight4, 'b1': bias1, 'b2': bias2, 'b3': bias3,
                      'b4': bias4})

    def load(self, checkpoint_path):
        self.saver.restore(self.sess, checkpoint_path)

    def train(self):
        for training_step in range(self.start_step, self.training_steps_max + 1):
            # Pick the learning rate for the current training phase.
            training_steps_sum = 0
            for i in range(len(self.training_steps_list)):
                training_steps_sum += self.training_steps_list[i]
                if training_step <= training_steps_sum:
                    learning_rate_value = self.learning_rates_list[i]
                    break
            # Get a batch of training data.
            if self.random_samples_mini_batch:
                train_fingerprints, train_ground_truth, f_names = \
                    self.audio_processor.get_data_random(
                        self.batch_size, 0, self.background_frequency,
                        self.background_volume, self.time_shift_samples,
                        'training', self.sess, self.features_2d)
            else:
                train_fingerprints, train_ground_truth, f_names = \
                    self.audio_processor.get_data(
                        self.batch_size, 0, self.background_frequency,
                        self.background_volume, self.time_shift_samples,
                        'training', self.sess, self.batch_size, training_step,
                        features_2d=self.features_2d)
            if self.with_ctc:
                train_inputs, train_targets, train_seq_len = \
                    self.model.convert_batch_to_ctc_format(
                        train_fingerprints, f_names, self.audio_processor)
                train_accuracy = 0
                loss_value = 0
                for itm_index, train_itm in enumerate(train_inputs):
                    train_summary, accuracy, current_loss, _, _ = self.sess.run(
                        [self.merged_summaries, self.evaluation_step,
                         self.loss_mean, self.train_step,
                         self.increment_global_step],
                        feed_dict={
                            self.fingerprint_input: train_itm,
                            self.ground_truth_input: train_targets[itm_index],
                            self.learning_rate_input: learning_rate_value,
                            self.seq_len: train_seq_len[itm_index]
                        })
                    train_accuracy += accuracy * self.batch_size
                    loss_value += current_loss * self.batch_size
                    self.train_writer.add_summary(train_summary, training_step)
                train_accuracy /= self.audio_processor.set_size('training')
                loss_value /= self.audio_processor.set_size('training')
            else:
                # Run the graph with this batch of training data.
                train_summary, train_accuracy, loss_value, _, _ = self.sess.run(
                    [self.merged_summaries, self.evaluation_step,
                     self.loss_mean, self.train_step,
                     self.increment_global_step],
                    feed_dict={
                        self.fingerprint_input: train_fingerprints,
                        self.ground_truth_input: train_ground_truth,
                        self.learning_rate_input: learning_rate_value,
                        self.dropout_prob: self.dropout
                    })
                self.train_writer.add_summary(train_summary, training_step)
            tf.logging.info('Step #%d: rate %f, accuracy %.1f%%, Loss %f' %
                            (training_step, learning_rate_value,
                             train_accuracy * 100, loss_value))
            is_last_step = (training_step == self.training_steps_max)
            if (training_step % self.eval_every_n_steps) == 0 or is_last_step:
                # Evaluate on the validation set and checkpoint the model.
                set_size = self.audio_processor.set_size('validation')
                total_accuracy = 0
                total_conf_matrix = None
                for i in range(0, set_size, self.batch_size):
                    validation_fingerprints, validation_ground_truth, f_names = \
                        self.audio_processor.get_data_random(
                            self.batch_size, i, 0.0, 0.0, 0, 'validation',
                            self.sess, self.features_2d)
                    batch_size_local = min(self.batch_size, set_size - i)
                    validation_summary, validation_accuracy, conf_matrix = \
                        self.eval(validation_fingerprints,
                                  validation_ground_truth, f_names,
                                  batch_size_local, set_size)
                    self.validation_writer.add_summary(validation_summary,
                                                       training_step)
                    total_accuracy += (validation_accuracy *
                                       batch_size_local) / set_size
                    if total_conf_matrix is None:
                        total_conf_matrix = conf_matrix
                    else:
                        total_conf_matrix += conf_matrix
                tf.logging.info('Confusion Matrix:\n %s' % total_conf_matrix)
                tf.logging.info('Step %d: Validation accuracy = %.1f%% (N=%d)' %
                                (training_step, total_accuracy * 100, set_size))
                checkpoint_path = os.path.join(self.model_path,
                                               self.model_name + '.ckpt')
                tf.logging.info('Saving to "%s-%d"', checkpoint_path,
                                training_step)
                self.saver.save(self.sess, checkpoint_path,
                                global_step=training_step)
        if self.mode == 'test':
            set_size = self.audio_processor.set_size('testing')
            tf.logging.info('set_size=%d', set_size)
            total_accuracy = 0
            total_conf_matrix = None
            for i in range(0, set_size, self.batch_size):
                test_fingerprints, test_ground_truth, f_names = \
                    self.audio_processor.get_data_random(
                        self.batch_size, i, 0.0, 0.0, 0, 'testing',
                        self.sess, self.features_2d)
                batch_size_local = min(self.batch_size, set_size - i)
                _, test_accuracy, conf_matrix = self.eval(
                    test_fingerprints, test_ground_truth, f_names,
                    batch_size_local, set_size)
                if conf_matrix is not None:
                    total_accuracy += (test_accuracy *
                                       batch_size_local) / set_size
                    if total_conf_matrix is None:
                        total_conf_matrix = conf_matrix
                    else:
                        total_conf_matrix += conf_matrix
            tf.logging.info('Confusion Matrix:\n %s' % total_conf_matrix)
            tf.logging.info('Final test accuracy = %.1f%% (N=%d)' %
                            (total_accuracy * 100, set_size))
            return total_conf_matrix, total_accuracy * 100, set_size

    def predict(self, test_data_path):
        # Build a reader over the test directory and run inference
        # batch by batch, collecting (fname, label) pairs.
        test_data_reader = AudioReader(
            audio_dir=test_data_path, model_settings=self.model_settings,
            background_noise='', train=False,
            silence_label=self.silence_label,
            unknown_label=self.unknown_label, classes=self.classes,
            fingerprint_type=self.fingerprint_type)
        set_size = test_data_reader.set_size('testing')
        results = []
        index = []
        for i in range(0, set_size, self.batch_size):
            test_fingerprints, test_ground_truth, fnames = \
                test_data_reader.get_data_random(
                    self.batch_size, i, 0.0, 0.0, 0, 'testing', self.sess,
                    self.features_2d)
            if self.with_ctc:
                test_fingerprints, test_ground_truth, test_seq_len = \
                    self.model.convert_batch_to_ctc_format(
                        test_fingerprints, fnames, self.audio_processor)
                test_preds = []
                for itm_index, eval_itm in enumerate(test_fingerprints):
                    pred = self.sess.run(
                        [self.predicted_indices],
                        feed_dict={
                            self.fingerprint_input: eval_itm,
                            self.ground_truth_input:
                                test_ground_truth[itm_index],
                            self.seq_len: test_seq_len[itm_index]
                        })
                    test_preds.append(pred[0])
                test_preds = np.array(test_preds)
            else:
                test_preds = self.sess.run(
                    [self.predicted_indices],
                    feed_dict={
                        self.fingerprint_input: test_fingerprints,
                        self.ground_truth_input: test_ground_truth,
                        self.dropout_prob: 1.0
                    })
            test_preds = test_data_reader.label_to_names(test_preds)
            index.extend([os.path.basename(fname) for fname in fnames])
            results.extend(test_preds)
        df = pd.DataFrame(columns=['fname', 'label'])
        df['fname'] = index
        df['label'] = results
        return df

    def eval(self, eval_fingerprints, eval_ground_truth, f_names,
             batch_size_local, set_size):
        conf_matrix = None
        if self.with_ctc:
            eval_inputs, eval_targets, eval_seq_len = \
                self.model.convert_batch_to_ctc_format(
                    eval_fingerprints, f_names, self.audio_processor)
            eval_accuracy = 0
            for itm_index, eval_itm in enumerate(eval_inputs):
                eval_summary, accuracy, val_conf_matrix = self.sess.run(
                    [self.merged_summaries, self.evaluation_step,
                     self.confusion_matrix],
                    feed_dict={
                        self.fingerprint_input: eval_itm,
                        self.ground_truth_input: eval_targets[itm_index],
                        self.seq_len: eval_seq_len[itm_index]
                    })
                eval_accuracy += accuracy * batch_size_local
                if conf_matrix is None:
                    conf_matrix = val_conf_matrix
                else:
                    conf_matrix += val_conf_matrix
            eval_accuracy /= set_size
        else:
            eval_summary, eval_accuracy, conf_matrix = self.sess.run(
                [self.merged_summaries, self.evaluation_step,
                 self.confusion_matrix],
                feed_dict={
                    self.fingerprint_input: eval_fingerprints,
                    self.ground_truth_input: eval_ground_truth,
                    self.dropout_prob: 1.0
                })
        return eval_summary, eval_accuracy, conf_matrix
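A rough driver for the Estimator class; every concrete value below is a placeholder, and `BaselineModel` is a stand-in for a model object that must provide the graph-building methods used in `__init__` (`get_in_ground_truth`, `get_logits_dropout`, `get_loss`, `get_optimizer`, `get_confusion_matrix_correct_labels`):

# Hypothetical wiring; BaselineModel and the settings dict are stand-ins,
# not part of the source.
model_settings = {'time_shift_samples': 1600}
model = BaselineModel(model_settings)
estimator = Estimator(model_name='baseline',
                      model=model,
                      training_steps_list=[100, 50],
                      learning_rates_list=[0.01, 0.001],
                      data_path='Train',
                      model_settings=model_settings,
                      classes=['yes', 'no'])
estimator.train()
predictions = estimator.predict('Test')  # DataFrame with fname/label columns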