Example #1
def main(args):
    # MODEL
    num_features = [args.features*i for i in range(1, args.levels+1)] if args.feature_growth == "add" else \
                   [args.features*2**i for i in range(0, args.levels)]
    target_outputs = int(args.output_size * args.sr)
    model = Waveunet(args.channels,
                     num_features,
                     args.channels,
                     args.instruments,
                     kernel_size=args.kernel_size,
                     target_output_size=target_outputs,
                     depth=args.depth,
                     strides=args.strides,
                     conv_type=args.conv_type,
                     res=args.res,
                     separate=args.separate)

    if args.cuda:
        model = utils.DataParallel(model)
        print("move model to gpu")
        model.cuda()

    print("Loading model from checkpoint " + str(args.load_model))
    state = utils.load_model(model, None, args.load_model)
    print('Step', state['step'])

    preds = predict_song(args, args.input, model)

    output_folder = os.path.dirname(
        args.input) if args.output is None else args.output
    for inst in preds.keys():
        utils.write_wav(
            os.path.join(output_folder,
                         os.path.basename(args.input) + "_" + inst + ".wav"),
            preds[inst], args.sr)
Example #2
    def write_wav(self, file_path, track_order=None, bit_depth=32):
        """Writes impulse responses to a WAV file

        Args:
            file_path: Path to output WAV file
            track_order: List of speaker-side names for the order of impulse responses in the output file
            bit_depth: Number of bits per sample. 16, 24 or 32

        Returns:
            None
        """
        # Fall back to the default hexadecagonal track order if none is given
        if track_order is None:
            track_order = HEXADECAGONAL_TRACK_ORDER

        # Add all impulse responses to a list and save channel names
        irs = []
        ir_order = []
        for speaker, pair in self.irs.items():
            for side, ir in pair.items():
                irs.append(ir.data)
                ir_order.append(f'{speaker}-{side}')

        # Add silent tracks
        for ch in track_order:
            if ch not in ir_order:
                irs.append(np.zeros(len(irs[0])))
                ir_order.append(ch)
        irs = np.vstack(irs)

        # Sort to output order
        irs = irs[[ir_order.index(ch) for ch in track_order], :]

        # Write to file
        write_wav(file_path, self.fs, irs, bit_depth=bit_depth)
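The reordering at the end of this method is easy to misread; a minimal standalone sketch of the same logic, using made-up channel names and toy arrays instead of the project's HRIR data:

import numpy as np

ir_order = ['FL-left', 'FL-right']                  # channels that were actually measured
irs = [np.ones(4), np.full(4, 0.5)]                 # their (toy) impulse response data
track_order = ['FL-left', 'FL-right', 'FR-left']    # desired channel order in the output file

for ch in track_order:                              # pad missing channels with silence
    if ch not in ir_order:
        irs.append(np.zeros(len(irs[0])))
        ir_order.append(ch)

irs = np.vstack(irs)                                # shape (n_channels, n_samples)
irs = irs[[ir_order.index(ch) for ch in track_order], :]   # sort rows to track_order
print(irs.shape)                                    # (3, 4)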
Example #3
def record_target(file_path, length, fs, channels=2, append=False):
    """Records audio and writes it to a file.

    Args:
        file_path: Path to output file
        length: Audio recording length in samples
        fs: Sampling rate
        channels: Number of channels in the recording
        append: Add track(s) to an existing file? Silence will be added to end of each track to make all equal in
                length

    Returns:
        None
    """
    recording = sd.rec(length, samplerate=fs, channels=channels, blocking=True)
    recording = np.transpose(recording)
    max_gain = 20 * np.log10(np.max(np.abs(recording)))
    if append and os.path.isfile(file_path):
        # Adding to existing file, read the file
        _fs, data = read_wav(file_path, expand=True)
        # Zero pad shorter to the length of the longer
        if recording.shape[1] > data.shape[1]:
            n = recording.shape[1] - data.shape[1]
            data = np.pad(data, [(0, 0), (0, n)])
        elif data.shape[1] > recording.shape[1]:
            recording = np.pad(recording, [(0, 0),
                                           (0, data.shape[1] - recording.shape[1])])
        # Add recording to the end of the existing data
        recording = np.vstack([data, recording])
    write_wav(file_path, fs, recording)
    print(f'Headroom: {-1.0*max_gain:.1f} dB')
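A hedged sketch of the append-and-pad behaviour above, with synthetic arrays standing in for real recordings (nothing here uses sounddevice or the project's read_wav/write_wav):

import numpy as np

recording = np.random.uniform(-0.5, 0.5, (2, 800))    # freshly recorded 2-channel take
data = np.random.uniform(-0.5, 0.5, (2, 1000))        # contents of the existing file

# Zero pad the shorter of the two to the length of the longer
if recording.shape[1] > data.shape[1]:
    data = np.pad(data, [(0, 0), (0, recording.shape[1] - data.shape[1])])
elif data.shape[1] > recording.shape[1]:
    recording = np.pad(recording, [(0, 0), (0, data.shape[1] - recording.shape[1])])

combined = np.vstack([data, recording])               # append the new take as extra tracks
headroom = -20 * np.log10(np.max(np.abs(combined)))   # dB of headroom below full scale
print(combined.shape, f'{headroom:.1f} dB')           # (4, 1000) and roughly 6 dB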
Example #4
    def stop_enroll_record(self):
        self.stop_record()
        print(self.recordData[:300])
        signal = np.array(self.recordData, dtype=NPDtype)
        self.enrollWav = (Main.FS, signal)

        # TODO To Delete
        write_wav('enroll.wav', *self.enrollWav)
Example #6
    def reco_do_predict(self, fs, signal):
        label = self.backend.predict(fs, signal)
        if not label:
            label = "Nobody"
        print(label)
        self.recoUsername.setText(label)
        self.Alading.setPixmap(QPixmap(u"image/a_result.png"))
        self.recoUserImage.setPixmap(self.get_avatar(label))

        # TODO To Delete
        write_wav('reco.wav', fs, signal)
Example #8
def speaker_diarization(fs, signal, mt_size=2.0, mt_step=0.2, st_win=0.05):
    """
    unsupervised speaker count
    """
    st_step = st_win

    [mid_term_features, short_term_features] = mt_feature_extraction(signal, fs, mt_size * fs,
                                                                     mt_step * fs,
                                                                     round(fs * st_win))
    [mid_term_features_norm, _, _] = normalize_features([mid_term_features.T])
    mid_term_features_norm = mid_term_features_norm[0].T
    num_of_windows = mid_term_features.shape[1]

    # VAD:
    reserved_time = 1
    segment_limits = vad(short_term_features, st_step, smooth_window=0.5, weight=0.3)
    i_vad = ivad(segment_limits, mt_step, reserved_time, num_of_windows)
    mid_term_features_norm = mid_term_features_norm[:, i_vad]

    # remove outliers:
    distances_all = numpy.sum(distance.squareform(distance.pdist(mid_term_features_norm.T)), axis=0)
    m_distances_all = numpy.mean(distances_all)
    i_non_outliers = numpy.nonzero(distances_all < 1.2 * m_distances_all)[0]

    mid_term_features_norm = mid_term_features_norm[:, i_non_outliers]
    i_features_select = [8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 41, 42, 43, 44, 45,
                         46, 47, 48, 49, 50, 51, 52, 53]
    mid_term_features_norm = mid_term_features_norm[i_features_select, :]

    num_range = range(2, 10)  # candidate number of speakers in [2, 10)
    [n_speakers_final, imax, num_speaker_cls] = \
        kmeans_silhouette(mid_term_features_norm, num_range)

    cls = numpy.zeros((num_of_windows,))-1
    valid_pos = i_vad[i_non_outliers]
    for i in range(num_of_windows):
        if i in valid_pos:
            j = numpy.argwhere(valid_pos == i)[0][0]
            cls[i] = num_speaker_cls[imax][j]

    # median filtering:
    cls = scipy.signal.medfilt(cls, 11)
    start = 0
    end = 0
    for i in range(1, len(cls)):
        if cls[i] == cls[i-1]:
            end = i
        else:
            write_wav(os.path.join(os.path.pardir, "result", "result_wav",
                                   str(cls[i-1]) + "-" + str(start*mt_step) + "-" +
                                   str(end*mt_step) + ".wav"),
                      fs, signal[int(start * mt_step * fs):int(end * mt_step * fs)])
            start = i
    return n_speakers_final, cls
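A toy illustration of the post-processing at the end of speaker_diarization: median filter the per-window labels, then walk them to find contiguous same-speaker segments. The label array and mt_step value are invented for the example:

import numpy as np
import scipy.signal

mt_step = 0.2                                             # assumed mid-term step in seconds
cls = np.array([0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1], dtype=float)
cls = scipy.signal.medfilt(cls, 3)                        # suppress single-window label flips

start = 0
for i in range(1, len(cls)):
    if cls[i] != cls[i - 1]:                              # label change -> close the segment
        print(f'speaker {cls[i - 1]:.0f}: {start * mt_step:.1f}s - {i * mt_step:.1f}s')
        start = i
print(f'speaker {cls[-1]:.0f}: {start * mt_step:.1f}s - {len(cls) * mt_step:.1f}s')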
Example #9
def run(args):
    mix_input = WaveReader(args.input, sample_rate=args.fs)
    computer = NnetComputer(args.checkpoint, args.gpu)
    for key, mix_samps in mix_input:
        logger.info("Compute on utterance {}...".format(key))
        spks = computer.compute(mix_samps)
        norm = np.linalg.norm(mix_samps, np.inf)
        for idx, samps in enumerate(spks):
            samps = samps[:mix_samps.size]
            # norm
            samps = samps * norm / np.max(np.abs(samps))
            write_wav(os.path.join(args.dump_dir,
                                   "spk{}/{}.wav".format(idx + 1, key)),
                      samps,
                      fs=args.fs)
    logger.info("Compute over {:d} utterances".format(len(mix_input)))
Example #10
def main():
    # Open HRIR
    estimator = ImpulseResponseEstimator.from_pickle(TEST_SIGNAL)
    hrir = HRIR(estimator)
    hrir.open_recording(os.path.join(DIR_PATH, 'FL,FR.wav'), speakers=['FL', 'FR'])
    hrir.crop_heads()
    hrir.crop_tails()
    
    # Create test signal sequence
    speakers = ['FL', 'FR']
    seq_data = estimator.sweep_sequence(speakers, 'stereo')

    fig, ax = plot_stereo_track(seq_data, estimator.fs)
    fig.suptitle('Sweep sequence')

    left = np.vstack([
        hrir.irs['FL']['left'].convolve(seq_data[0]),
        hrir.irs['FL']['right'].convolve(seq_data[0])
    ])
    right = np.vstack([
        hrir.irs['FR']['left'].convolve(seq_data[1]),
        hrir.irs['FR']['right'].convolve(seq_data[1])
    ])
    virtualized = left + right

    fig, ax = plot_stereo_track(virtualized, estimator.fs)
    fig.suptitle('Sweep sequence convolved with HRIR')
    plt.show()

    # Virtualize sine sweep sequence with HRIR
    # virtualized = []
    # for i, speaker in enumerate(speakers):
    #     track = seq_data[i, :]
    #     virtualized.append(np.sum([
    #         hrir.irs[speaker]['left'].convolve(track),
    #         hrir.irs[speaker]['right'].convolve(track)
    #     ], axis=0))

    virtualized = np.vstack(virtualized)

    # Normalized to 0 dB
    virtualized /= np.max(np.abs(virtualized))

    # Write virtualized sequence to disk
    file_path = os.path.join(DIR_PATH, f'headphones-sweep-seq-{",".join(speakers)}-stereo-{estimator.file_name(32)}.wav')
    write_wav(file_path, estimator.fs, virtualized, bit_depth=32)
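A reduced sketch of the virtualisation step above: each speaker channel is convolved with its left-ear and right-ear impulse responses and the results are summed into a binaural signal. The two-tap "HRIRs" and random signal are placeholders, not measured data, and the helper stands in for the project's ImpulseResponse.convolve:

import numpy as np

def convolve(x, h):
    # Truncated convolution, standing in for ImpulseResponse.convolve
    return np.convolve(x, h)[:len(x)]

fl = np.random.randn(100)                         # left channel of a toy stereo signal
fr = np.random.randn(100)                         # right channel
hrir = {'FL': {'left': np.array([1.0, 0.5]), 'right': np.array([0.3, 0.1])},
        'FR': {'left': np.array([0.3, 0.1]), 'right': np.array([1.0, 0.5])}}

left = np.vstack([convolve(fl, hrir['FL']['left']), convolve(fl, hrir['FL']['right'])])
right = np.vstack([convolve(fr, hrir['FR']['left']), convolve(fr, hrir['FR']['right'])])
virtualized = left + right                        # binaural (2, N) signal
virtualized /= np.max(np.abs(virtualized))        # normalise to 0 dB peak
print(virtualized.shape)                          # (2, 100)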
Example #11
def run(args):
    voice_spliter = VoiceSpliter(args.voiced_threshold, args.tolerated_size)
    wave_reader = WaveReader(args.wav_scp)
    L, S = args.frame_length, args.frame_shift
    samp_rate = args.sample_rate
    for key, wave in wave_reader:
        voice_spliter.reset()
        num_frames = (wave.size - L) // S + 1
        for idx in range(num_frames):
            voice_spliter.run(wave[idx * S:idx * S + L])
        segments = voice_spliter.segments
        if len(segments) % 2:
            segments.append(num_frames)
        logger.info("{} segments: {}".format(key, segments))
        for idx in range(len(segments) // 2):
            beg, end = segments[idx * 2:idx * 2 + 2]
            if (end - beg) * S / samp_rate < args.min_dur:
                continue
            voiced_segment = wave[beg * S:end * S]
            write_wav(
                os.path.join(args.dump_dir,
                             "{}-{:d}-{:d}.wav".format(key, beg, end)),
                voiced_segment, samp_rate)
Example #12
def getMUSDBHQ(database_path):
    subsets = list()

    for subset in ["train", "test"]:
        print("Loading " + subset + " set...")
        tracks = glob.glob(os.path.join(database_path, subset, "*"))
        samples = list()

        # Go through tracks
        for track_folder in sorted(tracks):
            # Collect the stem paths for this track
            example = dict()
            for stem in ["mix", "bass", "drums", "other", "vocals"]:
                filename = stem if stem != "mix" else "mixture"
                audio_path = os.path.join(track_folder, filename + ".wav")
                example[stem] = audio_path

            # Add other instruments to form accompaniment
            acc_path = os.path.join(track_folder, "accompaniment.wav")

            if not os.path.exists(acc_path):
                print("Writing accompaniment to " + track_folder)
                stem_audio = []
                for stem in ["bass", "drums", "other"]:
                    audio, sr = load(example[stem], sr=None, mono=False)
                    stem_audio.append(audio)
                acc_audio = np.clip(sum(stem_audio), -1.0, 1.0)
                write_wav(acc_path, acc_audio, sr)

            example["accompaniment"] = acc_path

            samples.append(example)

        subsets.append(samples)

    return subsets
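The accompaniment written above is simply the clipped sum of the non-vocal stems; a two-sample numeric illustration (synthetic values, no MUSDB-HQ files involved):

import numpy as np

bass = np.array([0.5, -0.4])
drums = np.array([0.6, -0.5])
other = np.array([0.2, 0.0])
acc = np.clip(bass + drums + other, -1.0, 1.0)   # sum of non-vocal stems, clipped
print(acc)                                        # [ 1.  -0.9]  (1.3 clipped to 1.0)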
Example #13
def evaluate_for_enhanced(args, dataset, model):
    dB_list_pesq = dict()
    dB_list_name_pesq = dict()
    dB_list_stoi = dict()
    dB_list_name_stoi = dict()
    if args.outside_test:
        for i in ['-7.5', '-2.5', '2.5', '7.5']:
            dB_list_pesq[i] = list()
            dB_list_name_pesq[i] = list()

            dB_list_stoi[i] = list()
            dB_list_name_stoi[i] = list()
            test_noise_file = "outside_test/noise"
    else:
        for i in ['-10', '-5', '0', '5', '10']:
            dB_list_pesq[i] = list()
            dB_list_name_pesq[i] = list()

            dB_list_stoi[i] = list()
            dB_list_name_stoi[i] = list()
            test_noise_file = "test/noise"
    noise_dir = os.path.join(args.dataset_dir, test_noise_file)
    noise_file = os.listdir(noise_dir)
    dB_noise_pesq = {}
    for i in noise_file:
        dB_noise_pesq[os.path.splitext(i)[0]] = list()

    model.eval()
    with torch.no_grad():
        with tqdm(total=len(dataset)) as pbar:
            for example in dataset:
                # Load source references in their original sr and channel number
                target_sources = utils.load(example['target'],
                                            sr=16000,
                                            mono=True)[0].flatten()
                # Predict using mixture
                pred_sources = predict_song(args, example["input"],
                                            model).flatten()
                # write wav
                file_name = os.path.basename(example['input'])
                if args.write_to_wav:
                    utils.write_wav(
                        os.path.join(args.output, 'enhance_' + file_name),
                        pred_sources.T, args.sr)
                fname, ext = os.path.splitext(file_name)
                text = fname.split("_", 4)
                # Evaluate pesq
                enhance_pesq = pesq(target_sources, pred_sources, 16000)
                # Evaluate stoi
                enhance_stoi = stoi(target_sources,
                                    pred_sources,
                                    16000,
                                    extended=False)

                filename = os.path.basename(example['input'])
                noise_name = filename.split("_")[0]
                dB_noise_pesq[noise_name].append([enhance_pesq])

                dB_list_pesq[text[4]].append(enhance_pesq)
                dB_list_name_pesq[text[4]].append([enhance_pesq, filename])

                dB_list_stoi[text[4]].append(enhance_stoi)
                dB_list_name_stoi[text[4]].append([enhance_stoi, filename])
                pbar.update(1)

        dB_list_name_pesq['avg'] = 0
        dB_list_name_stoi['avg'] = 0
        num = len(dB_list_pesq)
        for key, value in dB_list_pesq.items():
            avg_pesq = np.mean(value, 0)
            dB_list_name_pesq[key].append([avg_pesq, "avg_pesq"])
            dB_list_name_pesq['avg'] += avg_pesq / num

        for key, value in dB_list_stoi.items():
            avg_stoi = np.mean(value, 0)
            dB_list_name_stoi[key].append([avg_stoi, "avg_stoi"])
            dB_list_name_stoi['avg'] += avg_stoi / num

        noise_avg = list()
        for key, value in dB_noise_pesq.items():
            avg_pesq = np.mean(value, 0)
            noise_avg.append([key, avg_pesq])

    print(noise_avg)
    pesq_avg = dB_list_name_pesq['avg']
    stoi_avg = dB_list_name_stoi['avg']
    print(f'pesq_avg:{pesq_avg} stoi_avg:{stoi_avg} ')
    return {
        'pesq': dB_list_name_pesq,
        'stoi': dB_list_name_stoi,
        'noise': noise_avg
    }
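A hedged note on the bookkeeping above: the noise type and SNR bucket are parsed out of the file name, with the fifth underscore-separated field used as the dB key. A made-up name showing the assumed layout:

fname = 'babble_p232_001_clean_5'     # hypothetical file name, not from the dataset
text = fname.split('_', 4)
noise_name = fname.split('_')[0]
print(noise_name, text[4])            # 'babble' '5'  -> keyed into dB_list_pesq['5']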
Example #14
def train_fn(args):
    device = torch.device("cuda" if args.use_cuda else "cpu")
    upsample_factor = int(args.frame_shift_ms / 1000 * args.sample_rate)

    model = create_model(args)
    model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=args.learning_rate)
    for state in optimizer.state.values():
        for key, value in state.items():
            if torch.is_tensor(value):
                state[key] = value.to(device)

    if args.resume is not None:
        print("Resume checkpoint from: {}:".format(args.resume))
        checkpoint = torch.load(args.resume,
                                map_location=lambda storage, loc: storage)
        if torch.cuda.device_count() > 1:
            model.module.load_state_dict(checkpoint['model'])
        else:
            model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint["optimizer"])
        global_step = checkpoint['steps']
    else:
        global_step = 0

    print("receptive field: {0} ({1:.2f}ms)".format(
        model.receptive_field,
        model.receptive_field / args.sample_rate * 1000))

    if args.feature_type == "mcc":
        # mfccs have already been scaled for Ryan
        # scaler = StandardScaler()
        # scaler.mean_ = np.load(os.path.join(args.data_dir, 'mean.npy'))
        # scaler.scale_ = np.load(os.path.join(args.data_dir, 'scale.npy'))
        # feat_transform = transforms.Compose([lambda x: scaler.transform(x)])
        feat_transform = None
    else:
        feat_transform = None

    dataset = FilterbankDataset(
        data_dir=args.data_dir,
        receptive_field=model.receptive_field,
        sample_size=args.sample_size,
        upsample_factor=upsample_factor,
        quantization_channels=args.quantization_channels,
        use_local_condition=args.use_local_condition,
        noise_injecting=args.noise_injecting,
        feat_transform=feat_transform)

    dataloader = DataLoader(dataset,
                            batch_size=args.batch_size,
                            shuffle=True,
                            num_workers=args.num_workers,
                            pin_memory=True)

    if torch.cuda.device_count() > 1:
        model = nn.DataParallel(model)

    criterion = nn.CrossEntropyLoss()

    ema = ExponentialMovingAverage(args.ema_decay)
    for name, param in model.named_parameters():
        if param.requires_grad:
            ema.register(name, param.data)

    while global_step < args.training_steps:
        for i, data in enumerate(dataloader, 0):
            audio, target, local_condition = data
            target = target.squeeze(-1)
            local_condition = local_condition.transpose(1, 2)
            audio, target, h = audio.to(device), target.to(
                device), local_condition.to(device)

            optimizer.zero_grad()
            output = model(audio[:, :-1, :], h[:, :, 1:])
            loss = criterion(output, target)
            print('step [%3d]: loss: %.3f' % (global_step, loss.item()))

            loss.backward()
            optimizer.step()

            # update moving average
            if ema is not None:
                apply_moving_average(model, ema)

            global_step += 1

            if global_step % args.checkpoint_interval == 0:
                save_checkpoint(device, args, model, optimizer, global_step,
                                args.checkpoint_dir, ema)
                out = output[1, :, :]
                samples = out.argmax(0)
                waveform = mu_law_decode(
                    np.asarray(samples[model.receptive_field:]),
                    args.quantization_channels)
                write_wav(
                    waveform, args.sample_rate,
                    os.path.join(args.checkpoint_dir,
                                 "train_eval_{}.wav".format(global_step)))
Example #15
def getMUSDB(database_path):
    mus = musdb.DB(root=database_path, is_wav=False)

    subsets = list()

    for subset in ["train", "test"]:
        tracks = mus.load_mus_tracks(subset)
        samples = list()

        # Go through tracks
        for track in sorted(tracks):
            # Skip track if mixture is already written, assuming this track is done already
            track_path = track.path[:-4]
            mix_path = track_path + "_mix.wav"
            acc_path = track_path + "_accompaniment.wav"
            if os.path.exists(mix_path):
                print("WARNING: Skipping track " + mix_path +
                      " since it exists already")

                # Add paths and then skip
                paths = {"mix": mix_path, "accompaniment": acc_path}
                paths.update({
                    key: track_path + "_" + key + ".wav"
                    for key in ["bass", "drums", "other", "vocals"]
                })

                samples.append(paths)

                continue

            rate = track.rate

            # Go through each instrument
            paths = dict()
            stem_audio = dict()
            for stem in ["bass", "drums", "other", "vocals"]:
                path = track_path + "_" + stem + ".wav"
                audio = track.targets[stem].audio
                write_wav(path, audio, rate)
                stem_audio[stem] = audio
                paths[stem] = path

            # Add other instruments to form accompaniment
            acc_audio = np.clip(
                sum([
                    stem_audio[key] for key in list(stem_audio.keys())
                    if key != "vocals"
                ]), -1.0, 1.0)
            write_wav(acc_path, acc_audio, rate)
            paths["accompaniment"] = acc_path

            # Create mixture
            mix_audio = track.audio
            write_wav(mix_path, mix_audio, rate)
            paths["mix"] = mix_path

            diff_signal = np.abs(mix_audio - acc_audio - stem_audio["vocals"])
            print(
                "Maximum absolute deviation from source additivity constraint: "
                + str(np.max(diff_signal)))  # Check if acc+vocals=mix
            print(
                "Mean absolute deviation from source additivity constraint:    "
                + str(np.mean(diff_signal)))

            samples.append(paths)

        subsets.append(samples)

    print("DONE preparing dataset!")
    return subsets
Example #16
def reverberate_and_mix(out_folder, sources_folder, rir_folder,
                        mix_info, scale_rirs=10.0,
                        part=0, nparts=8, num_mics=1, chat=True,
                        output_align='causal'):
  """Reverberate and mix sources.

  Args:
    out_folder: Output folder to write reverberated sources and mixtures.
    sources_folder: Sources folder to read sources from.
    rir_folder: RIR folder to read rirs from.
    mix_info: A dictionary : mix_file_name -> (sources, rirs)
      where sources and rirs are paired lists of relative paths to source
      and rir signal wav files used in reverberate and mix operation to be
      performed.
    scale_rirs: A value to scale the RIR signals (float).
    part: Integer value indicating which part of parallel jobs to run (int).
    nparts: Number of parts considered for parallel runs (int).
    num_mics: Number of mics to use at the output (int).
    chat: If True, display more messages (bool).
    output_align: Output signal alignment type.
      'causal': Uses causal RIR filtering with no additional shift.
      'align_sources': Find the average peak index of the RIR(s) corresponding
        to each source and advance each source by that index. This has the
        effect of aligning each source with its non-reverberated version.
  Returns:
    None, but writes reverberated sources and mixtures into files.
  """
  list_mix = sorted(mix_info.keys())
  list_len = len(list_mix)
  partsize = list_len // nparts
  assert part < nparts
  start = part * partsize
  end = list_len if part == nparts-1 else (part + 1) * partsize
  if start == end:
    raise ValueError('Not enough mixtures to generate. Part {} of {} to '
                     'generate a total of {} mixtures.'.format(
                         part, nparts, list_len))
  print('Reverberating and mixing from {} to {} '
        'out of {}.'.format(start, end, list_len))
  for mix in list_mix[start:end]:
    sources, rirs = mix_info[mix]
    mix_to_data = []
    rir_peak_delays = []
    max_src_len = -1
    if chat:
      print('--\n{} ='.format(mix))
    for source, rir in zip(sources, rirs):
      source_path = os.path.join(sources_folder, source)
      src_data, samplerate_src = read_wav(source_path, always_2d=True)
      rir_path = os.path.join(rir_folder, rir)
      rir_data, samplerate_rir = read_wav(rir_path, always_2d=True)
      assert samplerate_src == samplerate_rir
      # Pick channel 0 of src_data.
      src_data = src_data[:, 0]
      # Pick num_mics channels of rirs and scale them.
      if len(rir_data.shape) == 2:
        rir_mics = np.shape(rir_data)[1]
        if rir_mics < num_mics:
          raise ValueError(f'The rir {rir_path} has only {rir_mics} channel '
                           f'data where the specified num_mics={num_mics}')
        rir_data = rir_data[:, :num_mics]
      else:
        if num_mics > 1:
          raise ValueError(f'The rir {rir_path} has only single channel data '
                           f'but specified num_mics={num_mics}')
        rir_data = np.reshape(rir_data, [-1, 1])
      rir_data = scale_rirs * rir_data
      rir_len = len(rir_data[:, 0])
      src_len = len(src_data)
      rir_max = np.max(np.abs(rir_data))
      rir_peaks = np.argmax(np.abs(rir_data), axis=0)
      src_max = np.max(np.abs(src_data))
      max_src_len = np.maximum(src_len, max_src_len)
      if chat:
        print('+ {} [{}, {:1.2f}] * {} [{}, {:1.2f}, {}]'.format(
            source, src_len, src_max, rir, rir_len, rir_max, rir_peaks))
      mix_to_data.append([src_data, rir_data, source, rir, rir_peaks])
    mix_rev_sources = []
    rir_paths_used = []
    for data in mix_to_data:
      src_data, rir_data, source_relpath, rir_relpath, rir_peaks = data
      rir_paths_used.append(rir_relpath)
      src_len = len(src_data)
      if src_len < max_src_len:
        print('WARNING: original source data has {} samples '
              'for source file {}, zero padding '
              'to size {}.'.format(src_len, source_relpath, max_src_len))
        src_data = np.concatenate((src_data, np.zeros(
            max_src_len - src_len)), axis=0)
      if output_align == 'align_sources':
        output_advance = np.round(np.mean(np.asarray(
            rir_peaks))).astype(np.int32)
      elif output_align == 'causal':
        output_advance = 0
      else:
        raise ValueError(f'Unknown output_align={output_align}')
      if chat and output_advance != 0:
        print(f'Source {source_relpath} advanced by {output_advance} samples.')
      rev_src_data = multimic_convolve(src_data, rir_data,
                                       output_advance=output_advance)
      # Write reverberated source data.
      rev_src_path = os.path.join(out_folder, source_relpath)
      os.makedirs(os.path.dirname(rev_src_path), exist_ok=True)
      write_wav(rev_src_path, rev_src_data, samplerate_src)
      mix_rev_sources.append(rev_src_data)
    mixed_rev_data = np.sum(np.stack(mix_rev_sources, axis=0), axis=0)
    mix_wav_path = os.path.join(out_folder, mix)
    mix_wav_base = os.path.splitext(mix_wav_path)[0]
    write_wav(mix_wav_path, mixed_rev_data, samplerate_src)
    in_wav_path = os.path.join(sources_folder, mix)
    in_wav_base = os.path.splitext(in_wav_path)[0]
    if os.path.exists(in_wav_base + '.jams'):
      shutil.copyfile(in_wav_base + '.jams', mix_wav_base + '.jams')
    if os.path.exists(in_wav_base + '.txt'):
      with open(in_wav_base + '.txt', 'r') as f:
        lines = f.readlines()
      with open(mix_wav_base + '.txt', 'w') as f:
        f.write(''.join(lines))
        f.write('\nroom impulse responses used:\n{}'.format(
            '\n'.join(rir_paths_used)))
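A single-channel sketch of the 'align_sources' idea: after causal filtering, the source is advanced by the RIR peak index so it lines up with its dry version. Plain numpy convolution stands in for multimic_convolve, which is not shown here:

import numpy as np

src = np.array([1.0, 0.0, 0.0, 0.0, 0.0, 0.0])   # dry source: an impulse at sample 0
rir = np.array([0.0, 0.0, 1.0, 0.5])             # RIR whose direct path arrives 2 samples late

rev = np.convolve(src, rir)[:len(src)]            # causal filtering: peak lands at sample 2
advance = int(np.argmax(np.abs(rir)))             # peak index of the RIR
aligned = np.concatenate([rev[advance:], np.zeros(advance)])
print(rev, aligned)                               # aligned peak is back at sample 0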
Example #17
def run(args):
    target_reader = WaveReader(args.target_spk)
    others_reader = [WaveReader(spk_scp) for spk_scp in args.disturb_spks]

    bg_noise_scp, fg_noise_scp = args.bg_noise, args.fg_noise
    bg_noise_reader = WaveReader(bg_noise_scp) if bg_noise_scp else None
    fg_noise_reader = WaveReader(fg_noise_scp) if fg_noise_scp else None

    # for each iteration
    for it in tqdm(range(args.iters)):
        # for each target utts
        for key, target in target_reader:
            noise = np.zeros_like(target)
            # add noise if exists
            for index, noise_reader in enumerate(
                [bg_noise_reader, fg_noise_reader]):
                if noise_reader:
                    # sample noise
                    # randint: [a, b]
                    noise_index = random.randint(0, len(noise_reader) - 1)
                    bg_or_fg_noise = noise_reader[noise_index]
                    # sample snr
                    snr = random.uniform(args.min_snr, args.max_snr)
                    # add noise
                    noise_seg = add_noise(target,
                                          bg_or_fg_noise,
                                          snr,
                                          period=(index == 0))
                    # accumulate noise
                    noise = noise + noise_seg

            if len(others_reader):
                # sample speaker
                num_samp_spk = random.randint(args.min_spk, args.max_spk)
                samp_reader = random.sample(others_reader, num_samp_spk)
                # for each interference speaker
                for spk_noise_reader in samp_reader:
                    # sample interference
                    utt_index = random.randint(0, len(spk_noise_reader) - 1)
                    spk_noise = spk_noise_reader[utt_index]
                    # sample sdr
                    sdr = random.uniform(args.min_sdr, args.max_sdr)
                    # add interference
                    noise_seg = add_noise(target, spk_noise, sdr)
                    # accumulate noise
                    noise = noise + noise_seg
            # sample norm
            sample_norm = random.uniform(0.6, 0.9)
            coef = sample_norm / np.maximum(np.linalg.norm(noise, np.inf),
                                            np.linalg.norm(target, np.inf))
            write_wav(
                os.path.join(args.target_dump_dir,
                             '{}_{:d}.wav'.format(key, it)), target * coef)
            write_wav(
                os.path.join(args.noise_dump_dir,
                             '{}_{:d}.wav'.format(key, it)), noise * coef)
            mixture = (target + noise) * coef
            mixture = sample_norm * mixture / np.linalg.norm(mixture, np.inf)
            write_wav(
                os.path.join(args.noisy_dump_dir,
                             '{}_{:d}.wav'.format(key, it)), mixture)
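The shared scaling coefficient above guarantees that neither the clean target nor the accumulated noise exceeds the sampled peak level; a quick numeric check with synthetic two-sample signals:

import numpy as np

target = np.array([0.9, -0.3])
noise = np.array([0.2, -0.5])
sample_norm = 0.6
coef = sample_norm / np.maximum(np.linalg.norm(noise, np.inf),
                                np.linalg.norm(target, np.inf))
print(np.max(np.abs(target * coef)), np.max(np.abs(noise * coef)))   # ~0.6 and ~0.33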
Example #18
start = 0
end = 0

#fig = plt.figure(figsize=(15,4))
#imageCoordinate = 100 + 10*n_speakers_final + 1
#i = 0
#times = numpy.arange(len(cls))/float(fs)

for i in range(1, len(cls)):
    if cls[i] == cls[i - 1]:
        end = i
    else:
        newpath = "result_wav/" + audioname
        if not os.path.exists(newpath):
            os.makedirs(newpath)
        audiofile = str(cls[i - 1]) + ":" + str(start * mt_step) + "-" + str(
            end * mt_step) + ".wav"
        write_wav(os.path.join(newpath, audiofile), fs,
                  signal[int(start * mt_step * fs):int(end * mt_step * fs)])

#

#next steps

#check whether speaker is known
#determine gmm for audiofile
#compare it with previously available gmm models
#produce result
#c[i] = speaker number and name matched with gmm model
print(n_speakers_final, cls)
Example #19
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--file',
                        type=str,
                        required=True,
                        help='Path to HRIR or HeSuVi file.')
    parser.add_argument(
        '--track_order',
        type=str,
        required=True,
        help='Track order in HRIR file. "hesuvi" or "hexadecagonal"')
    parser.add_argument(
        '--reverb',
        type=str,
        default=argparse.SUPPRESS,
        help=
        'Reverberation times for different channels in milliseconds. During this time the '
        'reverberation tail will be reduced by 100 dB. A comma separated list of channel name and '
        'reverberation time pairs, separated by colon. If only a single numeric value is given, '
        'it is used for all channels. When some channel names are give but not all, the missing '
        'channels are not affected. Must be at least 3 ms smaller than the HRIR length. '
        'For example "--reverb=300" or '
        '"--reverb=FL:500,FC:100,FR:500,SR:700,BR:700,BL:700,SL:700" or '
        '"--reverb=FC:100".')
    args = parser.parse_args()
    file_path = args.file
    track_order = args.track_order
    reverb = dict()
    try:
        # Single float value
        reverb = {ch: float(args.reverb) / 1000 for ch in SPEAKER_NAMES}
    except ValueError:
        # Channels separated
        for ch_t in args.reverb.split(','):
            reverb[ch_t.split(':')[0].upper()] = float(
                ch_t.split(':')[1]) / 1000

    fs, data = read_wav(file_path)

    for ch, t in reverb.items():
        print(f'{ch}: {t*1000:.0f}ms')
        n_ones = int(fs * 0.003)
        n_win = int(fs * t)
        win = np.concatenate([
            np.ones(n_ones),
            signal.windows.hann(n_win * 2)[n_win:],
            np.zeros(data.shape[1] - n_ones - n_win)
        ]) - 1.0
        win *= 100  # 100 dB
        win = 10**(win / 20)  # Linear scale
        if track_order == 'hesuvi':
            tracks = [
                i for i in range(len(HESUVI_TRACK_ORDER))
                if ch in HESUVI_TRACK_ORDER[i]
            ]
        elif track_order == 'hexadecagonal':
            tracks = [
                i for i in range(len(HEXADECAGONAL_TRACK_ORDER))
                if ch in HEXADECAGONAL_TRACK_ORDER[i]
            ]
        else:
            raise ValueError(
                f'Invalid track_order "{track_order}", allowed values are "hesuvi" and "hexadecagonal"'
            )
        for i in tracks:
            data[i, :] *= win

    # Write WAV
    write_wav(os.path.join(DIR_PATH, 'cropped.wav'), fs, data)
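For reference, the window built in the loop above is flat for the first 3 ms and then attenuates the tail by up to 100 dB over the requested reverberation time; the same construction at a toy sampling rate (values chosen only to keep the printout small):

import numpy as np
from scipy import signal

fs, t, n_total = 1000, 0.005, 10                  # toy values: 10-sample IR, 5 ms reverb time
n_ones, n_win = int(fs * 0.003), int(fs * t)
win = np.concatenate([np.ones(n_ones),                         # flat (0 dB) for the first 3 ms
                      signal.windows.hann(n_win * 2)[n_win:],  # decaying half-Hann
                      np.zeros(n_total - n_ones - n_win)]) - 1.0
win = 10 ** (win * 100 / 20)                      # dB offsets -> linear gains
print(win.round(3))                               # starts at 1.0, decays towards 1e-5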
Example #20
num_features = [args.features*i for i in range(1, args.levels+1)] if args.feature_growth == "add" else \
               [args.features*2**i for i in range(0, args.levels)]
target_outputs = int(args.output_size * args.sr)
model = Waveunet(args.channels,
                 num_features,
                 args.channels,
                 INSTRUMENTS,
                 kernel_size=args.kernel_size,
                 target_output_size=target_outputs,
                 depth=args.depth,
                 strides=args.strides,
                 conv_type=args.conv_type,
                 res=args.res,
                 separate=args.separate)

if args.cuda:
    model = utils.DataParallel(model)
    print("move model to gpu")
    model.cuda()

print("Loading model from checkpoint " + str(args.load_model))
state = utils.load_model(model, None, args.load_model)

preds = predict_song(args, args.input, model)

output_folder = os.path.dirname(
    args.input) if args.output is None else args.output
for inst in preds.keys():
    utils.write_wav(
        os.path.join(output_folder,
                     os.path.basename(args.input) + "_" + inst + ".wav"),
        preds[inst], args.sr)
Example #21
def run(args):
    min_sdr, max_sdr = list(map(float, args.sdr.split(",")))
    wav_reader = WaveReader(args.wav_scp, sample_rate=args.fs)

    logger.info(
        "Simulating {:d} utterances from {} with sdr = {} ...".format(
            args.num_utts, args.wav_scp, args.sdr))
    statsf = open(args.simu_stats, "w") if args.simu_stats else None
    # 640 = 0.04 * 16000
    frame_shift = int(args.fs * args.shift)
    for _ in tqdm.trange(args.num_utts):
        # list of dict object
        min_dur, spks = sample_spks(wav_reader, args.num_spks, args.min_dur)

        mixture = np.zeros(min_dur)
        # treat first speaker as target
        ref_pow = spks[0]["pow"]
        ref_dur = spks[0]["dur"]
        ref_spk = spks[0]["wav"]

        stats = []
        # shift for target video
        shift = random.randint(0, (ref_dur - min_dur) // frame_shift)
        stats.append((spks[0]["key"], shift))
        # target segment
        segment = ref_spk[shift * frame_shift:shift * frame_shift + min_dur]
        mixture += segment
        # interference speakers
        sdrs = []
        infs = []
        for spk in spks[1:]:
            sdr_db = random.uniform(min_sdr, max_sdr)
            scaler = np.sqrt(ref_pow / spk["pow"] * 10**(-sdr_db / 10))
            # video shift
            shift = random.randint(0, (spk["dur"] - min_dur) // frame_shift)
            stats.append((spk["key"], shift))
            # mixture
            spkseg = spk["wav"][shift * frame_shift:shift * frame_shift +
                                min_dur]
            mixture += scaler * spkseg
            infs.append(scaler * spkseg)
            sdrs.append("{:+.2f}".format(sdr_db))

        uttid = "{0}_{1}".format("_".join([d["key"] for d in spks]),
                                 "_".join(sdrs))
        scaler = random.uniform(0.6, 0.9) / np.linalg.norm(mixture, np.inf)

        write_wav(os.path.join(args.dump_dir, "mix/{}.wav".format(uttid)),
                  mixture * scaler,
                  fs=args.fs)
        write_wav(os.path.join(args.dump_dir, "spk1/{}.wav".format(uttid)),
                  segment * scaler,
                  fs=args.fs)

        if not args.target_only:
            for idx, spk in enumerate(infs):
                write_wav(os.path.join(args.dump_dir,
                                       "spk{}/{}.wav".format(idx + 2, uttid)),
                          spk * scaler,
                          fs=args.fs)

        if statsf:
            record = uttid
            for pair in stats:
                record += " {0} {1}".format(pair[0], pair[1])
            statsf.write("{}\n".format(record))

    if statsf:
        statsf.close()
    logger.info(
        "Done simulating {:d} utterances from {} with sdr = {}".format(
            args.num_utts, args.wav_scp, args.sdr))
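The interference scaling above enforces the sampled SDR between the target and each added speaker, since scaler**2 * spk_pow equals ref_pow * 10**(-sdr/10); a quick check with invented powers:

import numpy as np

ref_pow, spk_pow, sdr_db = 0.04, 0.01, 6.0        # invented powers and target SDR
scaler = np.sqrt(ref_pow / spk_pow * 10 ** (-sdr_db / 10))
resulting_sdr = 10 * np.log10(ref_pow / (scaler ** 2 * spk_pow))
print(round(resulting_sdr, 1))                    # 6.0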
Example #22
def reverberate_and_mix(out_folder,
                        sources_folder,
                        rir_folder,
                        mix_info,
                        scale_rirs=10.0,
                        part=0,
                        nparts=8,
                        num_mics=1,
                        chat=True):
    """Reverberate and mix sources."""
    list_mix = sorted(mix_info.keys())
    list_len = len(list_mix)
    partsize = list_len // nparts
    assert part < nparts
    start = part * partsize
    end = list_len if part == nparts - 1 else (part + 1) * partsize
    if start == end:
        raise ValueError('Not enough mixtures to generate. Part {} of {} to '
                         'generate a total of {} mixtures.'.format(
                             part, nparts, list_len))
    print('Reverberating and mixing from {} to {} '
          'out of {}.'.format(start, end, list_len))
    for mix in list_mix[start:end]:
        sources, rirs = mix_info[mix]
        mix_to_data = []
        max_src_len = -1
        if chat:
            print('--\n{} ='.format(mix))
        for source, rir in zip(sources, rirs):
            source_path = os.path.join(sources_folder, source)
            src_data, samplerate_src = read_wav(source_path, always_2d=True)
            rir_path = os.path.join(rir_folder, rir)
            rir_data, samplerate_rir = read_wav(rir_path, always_2d=True)
            assert samplerate_src == samplerate_rir
            # Pick channel 0 of src_data.
            src_data = src_data[:, 0]
            # Pick num_mics channels of rirs and scale them.
            if len(rir_data.shape) == 2:
                rir_mics = np.shape(rir_data)[1]
                if rir_mics < num_mics:
                    raise ValueError(
                        f'The rir {rir_path} has only {rir_mics} channel '
                        f'data but specified num_mics={num_mics}')
                rir_data = rir_data[:, :num_mics]
            else:
                if num_mics > 1:
                    raise ValueError(
                        f'The rir {rir_path} has only single channel data '
                        f'but specified num_mics={num_mics}')
                rir_data = np.reshape(rir_data, [-1, 1])
            rir_data = scale_rirs * rir_data
            rir_len = len(rir_data[:, 0])
            src_len = len(src_data)
            rir_max = np.max(np.abs(rir_data))
            src_max = np.max(np.abs(src_data))
            max_src_len = np.maximum(src_len, max_src_len)
            if chat:
                print('+ {} [{}, {:1.2f}] * {} [{}, {:1.2f}]'.format(
                    source, src_len, src_max, rir, rir_len, rir_max))
            mix_to_data.append([src_data, rir_data, source, rir])
        mix_rev_sources = []
        rir_paths_used = []
        for data in mix_to_data:
            src_data, rir_data, source_relpath, rir_relpath = data
            rir_paths_used.append(rir_relpath)
            src_len = len(src_data)
            if src_len < max_src_len:
                print('WARNING: original source data has {} samples '
                      'for source file {}, zero padding '
                      'to size {}.'.format(src_len, source_relpath,
                                           max_src_len))
                src_data = np.concatenate(
                    (src_data, np.zeros(max_src_len - src_len)), axis=0)
            rev_src_data = multimic_convolve(src_data, rir_data, 'same')
            # Write reverberated source data.
            rev_src_path = os.path.join(out_folder, source_relpath)
            os.makedirs(os.path.dirname(rev_src_path), exist_ok=True)
            write_wav(rev_src_path, rev_src_data, samplerate_src)
            mix_rev_sources.append(rev_src_data)
        mixed_rev_data = np.sum(np.stack(mix_rev_sources, axis=0), axis=0)
        mix_wav_path = os.path.join(out_folder, mix)
        mix_wav_base = os.path.splitext(mix_wav_path)[0]
        write_wav(mix_wav_path, mixed_rev_data, samplerate_src)
        in_wav_path = os.path.join(sources_folder, mix)
        in_wav_base = os.path.splitext(in_wav_path)[0]
        if os.path.exists(in_wav_base + '.jams'):
            shutil.copyfile(in_wav_base + '.jams', mix_wav_base + '.jams')
        if os.path.exists(in_wav_base + '.txt'):
            with open(in_wav_base + '.txt', 'r') as f:
                lines = f.readlines()
            with open(mix_wav_base + '.txt', 'w') as f:
                f.write(''.join(lines))
                f.write('\nroom impulse responses used:\n{}'.format(
                    '\n'.join(rir_paths_used)))
Example #23
def wirteSignal(signal, filename):
    write_wav(signal, filename, sr=SAMPLE_RATE)
Example #24
def wav_to_vad(wav_file, vad_file, sr=8000):
    audio, rate = librosa.load(wav_file, sr=sr)
    v = VoiceActivityDetector()
    write_wav(vad_file, v.get_speech(audio), rate)
Example #25
def evaluate(args, dataset, model):
    dB_list_pesq = dict()
    dB_list_name_pesq = dict()
    dB_list_stoi = dict()
    dB_list_name_stoi = dict()
    dB_list_SISDR = dict()
    dB_list_name_SISDR = dict()
    if args.outside_test:
        for i in ['-7.5', '-2.5', '2.5', '7.5']:
            dB_list_pesq[i] = list()
            dB_list_name_pesq[i] = list()

            dB_list_stoi[i] = list()
            dB_list_name_stoi[i] = list()

            dB_list_SISDR[i] = list()
            dB_list_name_SISDR[i] = list()
            test_noise_file = "outside_test/noise"
    else:
        for i in ['-7.5', '-2.5', '2.5', '7.5']:
            dB_list_pesq[i] = list()
            dB_list_name_pesq[i] = list()

            dB_list_stoi[i] = list()
            dB_list_name_stoi[i] = list()

            dB_list_SISDR[i] = list()
            dB_list_name_SISDR[i] = list()
            test_noise_file = "test/noise"
    noise_dir = os.path.join(args.dataset_dir, test_noise_file)
    noise_file = os.listdir(noise_dir)
    dB_noise_pesq = {}
    for i in noise_file:
        dB_noise_pesq[os.path.splitext(i)[0]] = list()

    model.eval()
    with torch.no_grad():
        with tqdm(total=len(dataset)) as pbar:
            for example in dataset:
                # Load source references in their original sr and channel number
                input_data = nussl.AudioSignal(example['input'])
                target_data = nussl.AudioSignal(example['target'])

                # Predict using mixture
                pred_sources = predict_song(args, example["input"],
                                            model).flatten()

                file_name = os.path.basename(example['input'])

                utils.write_wav(
                    os.path.join(args.output, 'enhance_' + file_name),
                    pred_sources.T, args.sr)
                fname, ext = os.path.splitext(file_name)
                text = fname.split("_", 4)
                # Evaluate pesq
                input_sources = input_data.audio_data.flatten()
                target_sources = target_data.audio_data.flatten()

                input_pesq = pesq(target_sources, input_sources, 16000)
                enhance_pesq = pesq(target_sources, pred_sources, 16000)
                # Evaluate stoi
                input_stoi = stoi(target_sources,
                                  input_sources,
                                  16000,
                                  extended=False)
                enhance_stoi = stoi(target_sources,
                                    pred_sources,
                                    16000,
                                    extended=False)
                # scores[target_sources.path_to_input_file]['SI-SDR'][0]
                enhance_data = nussl.AudioSignal(audio_data_array=pred_sources,
                                                 sample_rate=16000)
                evaluator = nussl.evaluation.BSSEvalScale(
                    target_data, input_data)
                scores = evaluator.evaluate()
                input_SISDR = scores[
                    target_data.path_to_input_file]['SI-SDR'][0]
                evaluator = nussl.evaluation.BSSEvalScale(
                    target_data, enhance_data)
                scores = evaluator.evaluate()
                enhance_SISDR = scores[
                    target_data.path_to_input_file]['SI-SDR'][0]

                filename = os.path.basename(example['input'])
                noise_name = filename.split("_")[0]
                dB_noise_pesq[noise_name].append([
                    input_pesq, enhance_pesq, enhance_pesq - input_pesq,
                    enhance_SISDR, enhance_SISDR - input_SISDR
                ])

                dB_list_pesq[text[4]].append(
                    [input_pesq, enhance_pesq, enhance_pesq - input_pesq])
                dB_list_name_pesq[text[4]].append(
                    [[input_pesq, enhance_pesq, enhance_pesq - input_pesq],
                     file_name])

                dB_list_stoi[text[4]].append(
                    [input_stoi, enhance_stoi, enhance_stoi - input_stoi])
                dB_list_name_stoi[text[4]].append(
                    [[input_stoi, enhance_stoi, enhance_stoi - input_stoi],
                     file_name])

                dB_list_SISDR[text[4]].append(
                    [input_SISDR, enhance_SISDR, enhance_SISDR - input_SISDR])
                dB_list_name_SISDR[text[4]].append(
                    [[input_SISDR, enhance_SISDR, enhance_SISDR - input_SISDR],
                     file_name])
                pbar.update(1)
        num = len(dB_list_pesq)
        dB_list_name_pesq['avg'] = 0
        dB_list_name_stoi['avg'] = 0
        dB_list_name_SISDR['avg'] = 0
        improve_pesq = 0
        for key, value in dB_list_pesq.items():
            avg_pesq = np.mean(value, 0)
            pesq_list = [[avg_pesq[0], avg_pesq[1], avg_pesq[2]], "avg_pesq"]
            dB_list_name_pesq[key].append([pesq_list])
            dB_list_name_pesq['avg'] += avg_pesq[1] / num
            improve_pesq += avg_pesq[2] / num
        for key, value in dB_list_stoi.items():
            avg_stoi = np.mean(value, 0)
            stoi_list = [[avg_stoi[0], avg_stoi[1], avg_stoi[2]], "avg_stoi"]
            dB_list_name_stoi[key].append([stoi_list])
            dB_list_name_stoi['avg'] += avg_stoi[1] / num
        for key, value in dB_list_SISDR.items():
            avg_SISDR = np.mean(value, 0)
            SISDR_list = [[avg_SISDR[0], avg_SISDR[1], avg_SISDR[2]],
                          "avg_SISDR"]
            dB_list_name_SISDR[key].append([SISDR_list])
            dB_list_name_SISDR['avg'] += avg_SISDR[1] / num

        noise_avg = list()
        for key, value in dB_noise_pesq.items():
            avg_pesq = np.mean(value, 0)
            noise_avg.append([key, np.round(avg_pesq, decimals=3)])
            # if key==dB_noise_pesq.keys[-1]:
            #     print(noise_avg)
    print(noise_avg)
    dB_list_name_pesq['avg'] = round(dB_list_name_pesq['avg'], 3)
    dB_list_name_stoi['avg'] = round(dB_list_name_stoi['avg'], 3)
    dB_list_name_SISDR['avg'] = round(dB_list_name_SISDR['avg'], 3)
    pesq_avg = dB_list_name_pesq['avg']
    stoi_avg = dB_list_name_stoi['avg']
    SISDR_avg = dB_list_name_SISDR['avg']
    print(
        f'pesq_avg:{pesq_avg} stoi_avg:{stoi_avg} improve_pesq:{round(improve_pesq,3)} SISDR:{SISDR_avg} '
    )
    return {
        'pesq': dB_list_name_pesq,
        'stoi': dB_list_name_stoi,
        'SISDR': dB_list_name_SISDR,
        'noise': noise_avg
    }
Example #26
def seg_ditail(fid, trained_model, mt_size, mt_step, st_win):
    """segment ditail"""
    st_step = st_win
    results = {} ###
    fs, signal = read_wav(fid)

    [_, st_features] = mt_feature_extraction(signal, fs, mt_size * fs, mt_step * fs,
                                             round(fs * st_win))

    # VAD:
    segments = vad(st_features, st_step, smooth_window=0.5, weight=0)
    i = 0
    delta_t = 0.4
    for seg in segments:
        if seg[1] - seg[0] > 2*delta_t:
            start_seg = seg[0]
            end_seg = seg[0] + delta_t
            while start_seg < end_seg:
                label = trained_model.predict(fs, signal[int(start_seg * fs):int(end_seg * fs)])
                print(fid, '--', [start_seg, end_seg], '->', label)
                # # ***********
                # write_wav(os.path.join(os.path.pardir, "result", "result_wav",
                #                        os.path.basename(fid)[:-3] + "-" + str(start_seg) + "-" +
                #                        str(end_seg) + "-" + label + ".wav"),
                #           fs, signal[int(start_seg * fs):int(end_seg * fs)])
                results[i] = {"label": label, "start": start_seg, "end": end_seg}
                i = i + 1
                start_seg = end_seg
                end_seg = start_seg + delta_t if start_seg + 2*delta_t < seg[1] else seg[1]
        else:
            label = trained_model.predict(fs, signal[int(seg[0] * fs):int(seg[1] * fs)])
            print(fid, '--', seg, '->', label)
            results[i] = {"label": label, "start": seg[0], "end": seg[1]}
            i = i + 1
            # # ***********
            # write_wav(os.path.join(os.path.pardir, "result", "result_wav",
            #                        os.path.basename(fid)[:-3] + "-" + str(last) + "-" +
            #                        str(seg[0]) + "-静音.wav"),
            #           fs, signal[int(last * fs):int(seg[0] * fs)])
            # write_wav(os.path.join(os.path.pardir, "result", "result_wav",
            #                        os.path.basename(fid)[:-3] + "-" + str(seg[0]) + "-" +
            #                        str(seg[1]) + "-" + label + ".wav"),
            #           fs, signal[int(seg[0] * fs):int(seg[1] * fs)])
            # last = seg[1]

    data = {"video_info": {}, "results": []}
    min_duration = 0.5
    start_seg = results[0]["start"]
    end_seg = results[0]["end"]
    label = results[0]["label"]
    # k = 0 ###
    # test = {}###
    # last = 0 ###
    for j in range(1, i-1):
        if results[j]["start"] - end_seg < min_duration \
                and results[j]["label"] == label:
            end_seg = results[j]["end"]
        else:
            if end_seg - start_seg >= 2*min_duration:
                data["results"].append({"start": start_seg, "end": end_seg, "speaker_id": label})
                # write_wav(os.path.join(os.path.pardir, "result", "result_wav",
                #                        str(int(k/10))+str(k%10)+os.path.basename(fid)[:-3] + "-" + str(last) + "-" +
                #                        str(start_seg) + "-静音.wav"),
                #           fs, signal[int(last * fs):int(start_seg *
                # write_wav(os.path.join(os.path.pardir, "result", "result_wav",
                #                        str(int(k / 10)) + str(k % 10) +os.path.basename(fid)[:-3] +
                #                        "-" + str(start_seg) + "-" +
                #                        str(end_seg) + "-" + label + ".wav"),
                #           fs, signal[int(start_seg * fs):int(end_seg * fs)])

            # if start_seg - last > 0.5:
            #     test[k] = {"label": "无人声", "start": last, "end": start_seg}  ###
            #     k = k + 1
            # if end_seg - start_seg > 0.5:
            #     test[k] = {"label": label, "start": start_seg, "end": end_seg}  ###
            #     k = k + 1
            #     last = end_seg
            start_seg = results[j]["start"]
            end_seg = results[j]["end"]
            label = results[j]["label"]

    # test[k] = {"start": start_seg, "end": end_seg, "label": label}
    # with open("D:\\pro_file\\untitled\\Amber_SpeechSeparation\\test\\result_wav\\example.json",
    #           'w', encoding='utf-8') as fid_exam:
    #     json.dump(test, fid_exam, ensure_ascii=False)
    data["results"].append({"start": start_seg, "end": end_seg, "speaker_id": label})
    write_wav(os.path.join(os.path.pardir, "result", "result_wav",
                           os.path.basename(fid)[:-4] + "-" + str(start_seg) + "-" +
                           str(end_seg) + "-" + label + ".wav"),
              fs, signal[int(start_seg * fs):int(end_seg * fs)])

    with open(os.path.join(os.path.pardir, "result", "test_json",
                           os.path.basename(fid)[:-3] + "json"),
              'w', encoding='utf-8') as json_file:
        print("..\\result\\test_json\\" + os.path.basename(fid)[:-3] + "json->Generated")
        json.dump(data, json_file, ensure_ascii=False)