Example #1
def main(est_spk1, est_spk2, egs_spk1, egs_spk2):
    est_spk1 = AudioReader(est_spk1)
    est_spk2 = AudioReader(est_spk2)
    egs_spk1 = AudioReader(egs_spk1)
    egs_spk2 = AudioReader(egs_spk2)
    length = len(est_spk1)
    sdr = []
    snr = []
    for idx in range(length):
        ests = [est_spk1[idx], est_spk2[idx]]
        egs = [egs_spk1[idx], egs_spk2[idx]]
        mix = egs_spk1[idx] + egs_spk2[idx]

        # permute_SI_SNR returns the best permutation, which is reused for SDRi.
        _snr, per = permute_SI_SNR(ests, egs, mix)
        _sdr = permutation_sdr(ests, egs, mix, per)

        sdr.append(float(_sdr))
        snr.append(float(_snr))
        print('\r{} / {}, SI-SNRi: {:.5f}, SDRi: {:.5f}'.format(
            idx + 1, length, _snr, _sdr), end='')

    print('\nAverage SNRi: {:.5f}'.format(sum(snr) / len(snr)))
    print('Average SDRi: {:.5f}'.format(sum(sdr) / len(sdr)))
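Examples #1 and #2 call permute_SI_SNR and permutation_sdr, which are not reproduced on this page. For reference, here is a minimal sketch of a permutation-invariant SI-SNR improvement computation on 1-D NumPy arrays; the function and variable names below are illustrative and are not the repository's actual implementation:

import itertools
import numpy as np

def si_snr(est, ref, eps=1e-8):
    # Scale-invariant SNR (dB) between one estimate and one reference signal.
    est = est - np.mean(est)
    ref = ref - np.mean(ref)
    target = np.dot(est, ref) / (np.dot(ref, ref) + eps) * ref
    noise = est - target
    return 10 * np.log10((np.dot(target, target) + eps) / (np.dot(noise, noise) + eps))

def permute_si_snr_sketch(ests, refs, mix):
    # Evaluate every speaker permutation, keep the best mean SI-SNR, and
    # report the improvement (SI-SNRi) over using the raw mixture as estimate.
    best_score, best_per = -np.inf, None
    for per in itertools.permutations(range(len(refs))):
        score = np.mean([si_snr(ests[p], refs[i]) for i, p in enumerate(per)])
        if score > best_score:
            best_score, best_per = score, per
    mix_score = np.mean([si_snr(mix, ref) for ref in refs])
    return best_score - mix_score, best_per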
Example #2
def main(est_spk1, est_spk2, egs_spk1, egs_spk2):
    est_spk1 = AudioReader(est_spk1)
    est_spk2 = AudioReader(est_spk2)
    egs_spk1 = AudioReader(egs_spk1)
    egs_spk2 = AudioReader(egs_spk2)
    length = len(est_spk1)
    x = [i for i in range(length)]
    sdr = []
    snr = []
    for idx in tqdm.tqdm(range(length)):
        ests = [est_spk1[idx], est_spk2[idx]]
        egs = [egs_spk1[idx], egs_spk2[idx]]
        sdr.append(float(permutation_sdr(ests, egs)))
        snr.append(float(permute_SI_SNR(ests, egs)))

    plt.title('Sample SNR and SDR Results')
    ax = plt.subplot()
    tick_spacing = 10
    ax.xaxis.set_major_locator(ticker.MultipleLocator(tick_spacing))

    plt.scatter(x, sdr, marker='x', color='m', label='sdr', s=5)
    plt.scatter(x, snr, marker='o', color='c', label='snr', s=5)
    plt.legend()
    plt.ylabel('value')
    plt.xlabel('sample index')
    plt.savefig('convtasnet_results.png')
    print('Average SNR: {:.5f}'.format(sum(snr) / length))
    print('Average SDR: {:.5f}'.format(sum(sdr) / length))
Example #3
    def loadFile(self, file):
        try:
            # Read the audio and remember it as the current working file.
            ar = AudioReader()
            self.workingAudio = ar.read(file)
            self.curFile = file
        except Exception as e:
            # Show any read error to the user in a modal message box.
            msg = QMessageBox()
            msg.setWindowModality(Qt.WindowModal)
            msg.setText(str(e))
            msg.exec_()
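A hedged usage sketch for loadFile: wiring it to a PyQt file-open dialog. The method name openFileDialog and the dialog caption are illustrative, not part of the source.

    def openFileDialog(self):
        # Hypothetical caller: let the user pick a file, then load it.
        fname, _ = QFileDialog.getOpenFileName(self, 'Open audio file')
        if fname:
            self.loadFile(fname)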
Example #4
 def __init__(self, mix_path, yaml_path, model, gpuid):
     super(Separation, self).__init__()
     self.mix = AudioReader(mix_path, sample_rate=8000)
     opt = parse(yaml_path, is_tain=False)
     net = ConvTasNet(**opt['net_conf'])
     dicts = torch.load(model, map_location='cpu')
     net.load_state_dict(dicts["model_state_dict"])
     self.logger = get_logger(__name__)
     self.logger.info('Load checkpoint from {}, epoch {:d}'.format(
         model, dicts["epoch"]))
     self.device = torch.device(
         'cuda:{}'.format(gpuid[0]) if len(gpuid) > 0 else 'cpu')
     # Move the network to the selected device so CPU-only runs also work.
     self.net = net.to(self.device)
     self.gpuid = tuple(gpuid)
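A hedged usage sketch for the constructor above. How AudioReader is iterated and what the network returns are assumptions (noted in the comments), not the repository's documented API.

import os
import torch
import soundfile as sf

def separate_to_dir(separation, out_dir='separated'):
    # Hypothetical driver. Assumes separation.mix yields (utterance_id, waveform)
    # pairs and that the network returns one tensor per estimated speaker.
    separation.net.eval()
    os.makedirs(out_dir, exist_ok=True)
    with torch.no_grad():
        for key, wav in separation.mix:
            x = torch.from_numpy(wav).float().to(separation.device)
            est_spks = separation.net(x.unsqueeze(0))
            for i, est in enumerate(est_spks):
                out_path = os.path.join(out_dir, 'spk{}_{}'.format(i + 1, key))
                sf.write(out_path, est.squeeze(0).detach().cpu().numpy(), 8000)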
Example #5
 def predict(self, test_data_path):
     test_data_reader = AudioReader(audio_dir=test_data_path,
                                    model_settings=self.model_settings,
                                    background_noise='',
                                    train=False,
                                    silence_label=self.silence_label,
                                    unknown_label=self.unknown_label,
                                    classes=self.classes,
                                    fingerprint_type=self.fingerprint_type)
     set_size = test_data_reader.set_size('testing')
     results = []
     index = []
     for i in range(0, set_size, self.batch_size):
         test_fingerprints, test_ground_truth, fnames = test_data_reader.get_data_random(
             self.batch_size, i, 0.0, 0.0, 0, 'testing', self.sess,
             self.features_2d)
         # CTC models are decoded one utterance at a time.
         if self.with_ctc:
             test_fingerprints, test_ground_truth, test_seq_len = self.model.convert_batch_to_ctc_format(
                 test_fingerprints, fnames, self.audio_processor)
             test_preds = []
             for itm_index, eval_itm in enumerate(test_fingerprints):
                 pred = self.sess.run(
                     [self.predicted_indices],
                     feed_dict={
                         self.fingerprint_input: eval_itm,
                         self.ground_truth_input:
                         test_ground_truth[itm_index],
                         self.seq_len: test_seq_len[itm_index]
                     })
                 test_preds.append(pred[0])
             test_preds = np.array(test_preds)
         else:
             # Non-CTC models classify the whole mini-batch in a single run.
             test_preds = self.sess.run(
                 [self.predicted_indices],
                 feed_dict={
                     self.fingerprint_input: test_fingerprints,
                     self.ground_truth_input: test_ground_truth,
                     self.dropout_prob: 1.0
                 })
         test_preds = test_data_reader.label_to_names(test_preds)
         index.extend([os.path.basename(fname) for fname in fnames])
         results.extend(test_preds)
     df = pd.DataFrame(columns=['fname', 'label'])
     df['fname'] = index
     df['label'] = results
     return df
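A hedged usage sketch for predict; trainer stands for an instance of the surrounding class, and the directory and CSV names are illustrative, not from the source.

# Illustrative only: 'test_audio' and 'submission.csv' are assumptions.
submission = trainer.predict('test_audio')
submission.to_csv('submission.csv', index=False)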
Example #6
 def __init__(self, mix_scp=None, ref_scp=None, sr=8000):
     super(Datasets, self).__init__()
     # One reader for the mixture scp and one reader per reference-source scp.
     self.mix_audio = AudioReader(mix_scp, sample_rate=sr)
     self.ref_audio = [AudioReader(r, sample_rate=sr) for r in ref_scp]
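A plausible companion to the constructor above: __len__ and __getitem__ methods assuming AudioReader supports len() and integer indexing as in Examples #1 and #2. This is a sketch, not the repository's code.

 def __len__(self):
     # Number of mixture utterances in the mix scp.
     return len(self.mix_audio)

 def __getitem__(self, index):
     # Return the mixture waveform and the list of reference waveforms.
     mix = self.mix_audio[index]
     refs = [ref[index] for ref in self.ref_audio]
     return mix, refs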
Example #7
 def __init__(self, model_name='baseline', model=None,
              training_steps_list=[5], eval_save_every_step=1,
              data_path='Train',
              tmp_dir='tmp',
              model_path='model',
              background_frequency=0.8,
              background_volume=0.1,
              background_noise='',
              learning_rates_list=[0.01],
              batch_size=64,
              dropout=0.4,
              model_settings=None,
              mode='train',
              with_ctc=False,
              random_samples_mini_batch=False,
              silence_label='silence',
              unknown_label='unknown',
              classes=[],
              augmentation_ops=[],
              augmentation_percentage=0,
              validation_percentage=10,
              testing_percentage=10,
              unknown_percentage=10,
              silence_percentage=10,
              fingerprint_type='mfcc',
              testing_list={},
              validation_list={}):
     if len(training_steps_list) != len(learning_rates_list):
         raise Exception(
             '--how_many_training_steps and --learning_rate must be equal length '
             'lists, but are %d and %d long instead' % (len(training_steps_list),
                                                        len(learning_rates_list)))
     tf.logging.set_verbosity(tf.logging.INFO)
     ops.reset_default_graph()  # reset the default graph; InteractiveSession installs itself as the default session on construction
     self.data_path = data_path
     self.tmp_dir = tmp_dir
     self.model_path = model_path
     self.background_frequency = background_frequency
     self.background_volume = background_volume
     self.with_ctc = with_ctc
     self.random_samples_mini_batch = random_samples_mini_batch
     self.model_settings = model_settings
     self.model_name = model_name
     self.mode = mode
     self.audio_processor = AudioReader(audio_dir=self.data_path,
                                        model_settings=self.model_settings,
                                        silence_label=silence_label,
                                        unknown_label=unknown_label,
                                        classes=classes,
                                        background_noise=background_noise,
                                        augmentation_ops=augmentation_ops,
                                        augmentation_percentage=augmentation_percentage,
                                        fingerprint_type=fingerprint_type,
                                        mode=self.mode,
                                        validation_percentage=validation_percentage,
                                        testing_percentage=testing_percentage,
                                        unknown_percentage=unknown_percentage,
                                        silence_percentage=silence_percentage,
                                        testing_list=testing_list,
                                        validation_list=validation_list)
     self.silence_label = silence_label
     self.unknown_label = unknown_label
     self.classes = classes
     self.fingerprint_type = fingerprint_type
     self.batch_size = batch_size
     self.dropout = dropout
     self.sess = tf.InteractiveSession()
     # total training steps = sum of the steps over all learning-rate phases
     self.training_steps_list = training_steps_list
     self.training_steps_max = np.sum(self.training_steps_list)
     self.learning_rates_list = learning_rates_list
     self.time_shift_samples = self.model_settings['time_shift_samples']
     self.eval_every_n_steps = eval_save_every_step
     self.features_2d = model_name in ('vggnet', 'lstm')
     print(
         'Total Number of steps {} and eval every {} steps'.format(self.training_steps_max, self.eval_every_n_steps))
     print(
         'Total Number of Audio wavs {}'.format(self.audio_processor.set_size('training')))
     self.fingerprint_input, self.ground_truth_input, self.seq_len = model.get_in_ground_truth()
     self.logits, self.dropout_prob = model.get_logits_dropout(self.fingerprint_input, self.seq_len)
     with tf.name_scope('Loss'):
         self.loss_mean = model.get_loss(self.logits, self.ground_truth_input, self.seq_len)
     tf.summary.scalar('Loss', self.loss_mean)
     with tf.name_scope('train'):
         self.learning_rate_input = tf.placeholder(tf.float32, [], name='learning_rate_input')
         self.train_step = model.get_optimizer(self.learning_rate_input, self.loss_mean)
     self.predicted_indices, self.correct_prediction, self.confusion_matrix = \
         model.get_confusion_matrix_correct_labels(
             self.ground_truth_input, self.logits, self.seq_len, self.audio_processor)
     self.evaluation_step = tf.reduce_mean(tf.cast(self.correct_prediction, tf.float32))
     tf.summary.scalar('accuracy', self.evaluation_step)
     self.global_step = tf.train.get_or_create_global_step()
     self.increment_global_step = tf.assign(self.global_step, self.global_step + 1)
     self.saver = tf.train.Saver(tf.global_variables())
     # Merge all the summaries and write them out to /tmp/retrain_logs (by default)
     self.merged_summaries = tf.summary.merge_all()
     self.train_writer = tf.summary.FileWriter(self.tmp_dir + '/train', self.sess.graph)
     self.validation_writer = tf.summary.FileWriter(self.tmp_dir + '/validation')
     tf.global_variables_initializer().run()
     self.start_step = 1
     tf.logging.info('Training from step: %d ', self.start_step)
     tf.train.write_graph(self.sess.graph_def, self.model_path, model_name + '.pbtxt')
     self.model = model
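A hedged sketch of how a single training step might drive the graph built above. The get_data_random argument order is copied from Example #5; the method name train_one_step and the logging format are assumptions, not the repository's code.

 def train_one_step(self, step, learning_rate):
     # Fetch one (optionally augmented, time-shifted) training mini-batch.
     fingerprints, ground_truth, _ = self.audio_processor.get_data_random(
         self.batch_size, 0, self.background_frequency, self.background_volume,
         self.time_shift_samples, 'training', self.sess, self.features_2d)
     # Run one optimizer step and collect summaries, accuracy and loss.
     summary, accuracy, loss, _, _ = self.sess.run(
         [self.merged_summaries, self.evaluation_step, self.loss_mean,
          self.train_step, self.increment_global_step],
         feed_dict={
             self.fingerprint_input: fingerprints,
             self.ground_truth_input: ground_truth,
             self.learning_rate_input: learning_rate,
             self.dropout_prob: self.dropout
         })
     self.train_writer.add_summary(summary, step)
     tf.logging.info('Step #%d: rate %f, accuracy %.1f%%, loss %f',
                     step, learning_rate, accuracy * 100, loss)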