def __eq__(self, other):
    """Return whether two speech segments are equal.

    The pair is equal only when ``AudioSegment.__eq__`` accepts it and the
    two transcripts do not differ. The transcript of ``other`` is read
    only after the audio-level comparison has succeeded.

    :param other: Object to compare against.
    :return: True if the audio comparison passes and the transcripts match.
    :rtype: bool
    """
    audio_matches = AudioSegment.__eq__(self, other)
    if audio_matches and not (self._transcript != other._transcript):
        return True
    return False
    def transform_audio(self, audio_segment):
        """Convolve the segment with a randomly drawn impulse response.

        The segment is modified in place; nothing is returned.

        :param audio_segment: Audio segment to add effects to.
        :type audio_segment: AudioSegment|SpeechSegment
        """
        # Draw a single manifest entry through the instance's ``_rng``.
        drawn = self._rng.sample(self._impulse_manifest, 1)
        impulse_path = drawn[0]['audio_filepath']
        impulse = AudioSegment.from_file(impulse_path)
        audio_segment.convolve(impulse, allow_resample=True)
 def from_file(cls, filepath, transcript):
     """Build a speech segment from an audio file and its transcript.

     The audio is loaded through ``AudioSegment.from_file`` and its samples
     and sample rate are passed to ``cls`` together with the transcript.

     :param filepath: Filepath or file object to audio file.
     :type filepath: basestring|file
     :param transcript: Transcript text for the speech.
     :type transcript: basestring
     :return: Speech segment instance.
     :rtype: SpeechSegment
     """
     loaded = AudioSegment.from_file(filepath)
     samples, sample_rate = loaded.samples, loaded.sample_rate
     return cls(samples, sample_rate, transcript)
    def from_file(cls, filepath, transcript):
        """Create a speech segment from an audio file and its transcript.

        :param filepath: Path of the audio file, or a file object.
        :type filepath: str|file
        :param transcript: Transcript text that corresponds to the audio.
        :type transcript: str
        :return: Speech segment instance.
        :rtype: SpeechSegment
        """
        source_audio = AudioSegment.from_file(filepath)
        return cls(
            source_audio.samples,
            source_audio.sample_rate,
            transcript)
    def make_silence(cls, duration, sample_rate):
        """Create a silent speech segment whose transcript is an empty string.

        The silent audio itself comes from ``AudioSegment.make_silence``.

        :param duration: Length of the silence in seconds.
        :type duration: float
        :param sample_rate: Audio sample rate.
        :type sample_rate: float
        :return: Silent speech segment instance.
        :rtype: SpeechSegment
        """
        silent_audio = AudioSegment.make_silence(duration, sample_rate)
        samples, rate = silent_audio.samples, silent_audio.sample_rate
        return cls(samples, rate, "")
Esempio n. 6
0
 def _compute_mean_std(self, manifest_path, featurize_func, num_samples):
     """Compute mean and std from randomly sampled manifest instances.

     ``num_samples`` entries are drawn from the manifest with ``self._rng``.
     Each entry's audio file is featurized, and the feature matrices are
     stacked horizontally. The statistics are taken along axis 1, reshaped
     to column vectors, and stored on ``self._mean`` and ``self._std``.

     :param manifest_path: Path handed to ``read_manifest``.
     :param featurize_func: Callable applied to each loaded audio segment.
     :param num_samples: Number of manifest entries to sample.
     """
     entries = read_manifest(manifest_path)
     chosen = self._rng.sample(entries, num_samples)
     # tqdm only wraps the iteration to display progress.
     per_instance = [
         featurize_func(AudioSegment.from_file(entry["audio_filepath"]))
         for entry in tqdm(chosen)
     ]
     stacked = np.hstack(per_instance)
     column_shape = [-1, 1]
     self._mean = np.mean(stacked, axis=1).reshape(column_shape)
     self._std = np.std(stacked, axis=1).reshape(column_shape)
Esempio n. 7
0
 def _compute_mean_std(self, manifest_path, featurize_func, num_samples):
     """Estimate feature mean and std from a random manifest sample.

     The sampled instances are featurized one by one and their features are
     concatenated with ``np.hstack``. Mean and standard deviation are then
     computed along axis 1 and saved as column vectors in ``self._mean``
     and ``self._std``.

     :param manifest_path: Path handed to ``read_manifest``.
     :param featurize_func: Callable applied to each loaded audio segment.
     :param num_samples: Number of manifest entries to sample.
     """
     all_instances = read_manifest(manifest_path)
     picked = self._rng.sample(all_instances, num_samples)
     feature_blocks = [
         featurize_func(AudioSegment.from_file(item["audio_filepath"]))
         for item in picked
     ]
     joined = np.hstack(feature_blocks)
     self._mean = np.reshape(np.mean(joined, axis=1), [-1, 1])
     self._std = np.reshape(np.std(joined, axis=1), [-1, 1])
    def from_bytes(cls, bytes, transcript):
        """Create a speech segment from a byte string and its transcript.

        :param bytes: Byte string containing audio samples.
        :type bytes: str
        :param transcript: Transcript text that corresponds to the audio.
        :type transcript: str
        :return: Speech segment instance.
        :rtype: SpeechSegment
        """
        decoded = AudioSegment.from_bytes(bytes)
        samples, rate = decoded.samples, decoded.sample_rate
        return cls(samples, rate, transcript)
    def make_silence(cls, duration, sample_rate):
        """Build a silent speech segment with an empty-string transcript.

        :param duration: Length of silence in seconds.
        :type duration: float
        :param sample_rate: Sample rate.
        :type sample_rate: float
        :return: Silence of the given duration.
        :rtype: SpeechSegment
        """
        quiet = AudioSegment.make_silence(duration, sample_rate)
        return cls(
            quiet.samples,
            quiet.sample_rate,
            "")
 def from_bytes(cls, bytes, transcript):
     """Build a speech segment from raw audio bytes and a transcript.

     The bytes are decoded by ``AudioSegment.from_bytes``; the resulting
     samples and sample rate are passed to ``cls`` with the transcript.

     :param bytes: Byte string containing audio samples.
     :type bytes: str
     :param transcript: Transcript text for the speech.
     :type transcript: basestring
     :return: Speech segment instance.
     :rtype: SpeechSegment
     """
     parsed_audio = AudioSegment.from_bytes(bytes)
     return cls(
         parsed_audio.samples,
         parsed_audio.sample_rate,
         transcript)
Esempio n. 11
0
 def from_bytes(cls, bytes, transcript, **soundfile_options):
     """Build a speech segment from raw audio bytes and a transcript.

     :param bytes: Byte string containing audio samples.
     :type bytes: str
     :param transcript: Transcript text for the speech.
     :type transcript: basestring
     :param soundfile_options: Extra keyword options, forwarded unchanged
                               to ``AudioSegment.from_bytes``.
     :type soundfile_options: **kwargs
     :return: Speech segment instance.
     :rtype: SpeechSegment
     """
     decoded = AudioSegment.from_bytes(bytes, **soundfile_options)
     samples, rate = decoded.samples, decoded.sample_rate
     return cls(samples, rate, transcript)
Esempio n. 12
0
    def slice_from_file(cls, filepath, transcript, start=None, end=None):
        """Load only a section of a speech file as a speech segment.

        The slicing is delegated to ``AudioSegment.slice_from_file``, so
        the whole file does not have to be loaded into memory.

        :param filepath: Filepath or file object to the audio file.
        :type filepath: str|file
        :param transcript: Transcript text that corresponds to the audio.
        :type transcript: str
        :param start: Start time in seconds. A negative value is counted
                      from the end. If not provided, reading starts at the
                      very beginning.
        :type start: float
        :param end: End time in seconds. A negative value is counted from
                    the end. If not provided, reading continues to the end
                    of the file.
        :type end: float
        :return: SpeechSegment instance.
        :rtype: SpeechSegment
        """
        piece = AudioSegment.slice_from_file(filepath, start, end)
        samples, rate = piece.samples, piece.sample_rate
        return cls(samples, rate, transcript)
    def transform_audio(self, audio_segment):
        """Mix a randomly chosen background noise clip into the segment.

        The segment is modified in place. When the sampled noise recording
        is not at least as long as the segment, the segment is left
        untouched.

        :param audio_segment: Audio segment to add effects to.
        :type audio_segment: AudioSegment|SpeechSegment
        """
        noise_entry = self._rng.sample(self._noise_manifest, 1)[0]
        noise_duration = noise_entry['duration']
        if not noise_duration >= audio_segment.duration:
            return

        # Choose a random window of the noise recording that is exactly as
        # long as the segment.
        slack = noise_duration - audio_segment.duration
        offset = self._rng.uniform(0, slack)
        window_end = offset + audio_segment.duration
        noise_clip = AudioSegment.slice_from_file(
            noise_entry['audio_filepath'], start=offset, end=window_end)

        # The SNR is drawn after the window so the RNG call order is kept.
        target_snr = self._rng.uniform(self._min_snr_dB, self._max_snr_dB)
        audio_segment.add_noise(
            noise_clip, target_snr, allow_downsampling=True, rng=self._rng)
    def slice_from_file(cls, filepath, transcript, start=None, end=None):
        """Load a small section of a speech file as a speech segment.

        Slicing is delegated to ``AudioSegment.slice_from_file``, which
        avoids loading the entire file into memory.

        :param filepath: Filepath or file object to audio file.
        :type filepath: basestring|file
        :param transcript: Transcript text for the speech.
        :type transcript: basestring
        :param start: Start time in seconds. If start is negative, it wraps
                      around from the end. If not provided, this function
                      reads from the very beginning.
        :type start: float
        :param end: End time in seconds. If end is negative, it wraps around
                    from the end. If not provided, the default behavior is
                    to read to the end of the file.
        :type end: float
        :return: SpeechSegment instance of the specified slice of the input
                 speech file.
        :rtype: SpeechSegment
        """
        sliced_audio = AudioSegment.slice_from_file(filepath, start, end)
        return cls(
            sliced_audio.samples,
            sliced_audio.sample_rate,
            transcript)
 def __getitem__(self, idx):
     """Return the audio feature of the sampled manifest entry at ``idx``.

     :param idx: Index into ``self.sampled_manifest``.
     :return: Tuple of the featurized audio and the constant ``0``.
     :rtype: tuple
     """
     audio_path = self.sampled_manifest[idx]["audio_filepath"]
     # Load the audio and extract its features.
     segment = AudioSegment.from_file(audio_path)
     return self.audio_featurizer.featurize(segment), 0
Esempio n. 16
0
 # Fragment of a larger routine: `path`, `name` and `w` are defined outside
 # the visible code. It builds an output file name, reads the audio at `w`,
 # speed-perturbs it and then starts placing it at a random offset inside a
 # zero-filled buffer.

 # randint(12, 12) can only return 12, so `speed` is always 1.2.
 speed = random.randint(12, 12) / 10.
 # Output name: <path>/<text of `name` before its first '.'>-<speed>-work.wav
 outputfile = path + '/' + name.split('.')[0] + "-" + str(
     speed) + "-" + 'work.wav'
 # Earlier experiments, kept disabled:
 # audio = AudioSegment.from_file(w)
 # audio.change_speed(speed)
 # audio.to_wav_file(outputfile)
 #
 # outputfile1 = path + '/' + name.split('.')[0] + "-" + str(speed) + "-" + 'work1.wav'
 # audio, sr1 = sf.read(w, dtype='float32')
 # result_a = librosa.effects.time_stretch(audio, speed)
 # librosa.output.write_wav(outputfile1, result_a, sr1)
 # With noise_percent == 1 both `random.random() < noise_percent` checks
 # below always pass, because random.random() returns values in [0.0, 1.0).
 noise_percent = 1
 # Target length; the arithmetic below treats one unit as sr1 / 100 samples
 # (presumably 10 ms frames -- confirm against the caller).
 seq_length = 600
 audio, sr1 = sf.read(w, dtype='float32')
 if random.random() < noise_percent and seq_length > 0:
     temp_audio = AudioSegment(audio, sr1)
     audio_length = audio.shape[0]
     # Both ratios are speed rates in percent. Assuming change_speed(rate)
     # divides the duration by `rate` (TODO confirm), the lower bound yields
     # about `seq_length` units and the upper bound about `seq_length - 100`.
     max_length_ratio = int(
         (float(audio_length) / (seq_length - 100) / sr1) * 10000)
     min_length_ratio = int(
         np.math.ceil((float(audio_length) / seq_length / sr1) * 10000))
     # NOTE(review): the lower bound is rounded up and the upper bound is
     # truncated, so for very short audio min may exceed max and randint
     # would raise ValueError -- confirm inputs are long enough.
     temp_audio.change_speed(
         random.randint(min_length_ratio, max_length_ratio) / 100.)
     audio = temp_audio.samples
 if random.random() < noise_percent:
     if seq_length != -1:
         # NOTE(review): under Python 3 `/` yields a float here, which
         # np.zeros and random.randint below may reject; under Python 2 with
         # an integer sr1 this is integer division -- confirm the target
         # interpreter.
         max_length = seq_length * sr1 / 100
         if audio.shape[0] < max_length:
             # Zero buffer of the target length with the audio copied in at a
             # random start offset. `bg` is not read again within the visible
             # fragment.
             bg = np.zeros((max_length, ))
             rand_start = random.randint(0, max_length - audio.shape[0])
             bg[rand_start:rand_start + audio.shape[0]] = audio
 def __init__(self, samples, sample_rate, transcript):
     """Create a speech segment from audio samples and a transcript.

     :param samples: Audio samples, forwarded to ``AudioSegment.__init__``.
     :param sample_rate: Sample rate, forwarded to ``AudioSegment.__init__``.
     :param transcript: Transcript stored on the instance as ``_transcript``.
     """
     AudioSegment.__init__(self, samples, sample_rate)
     self._transcript = transcript