def __eq__(self, other):
    """Return whether two speech segments are equal.

    The pair is equal only when ``AudioSegment.__eq__`` accepts it and
    both objects hold the same transcript.

    :param other: Object to compare against.
    :return: True if the audio comparison succeeds and the transcripts
             do not differ, False otherwise.
    :rtype: bool
    """
    audio_matches = AudioSegment.__eq__(self, other)
    # The transcript is only inspected once the audio part has matched.
    if audio_matches and not (self._transcript != other._transcript):
        return True
    return False
def transform_audio(self, audio_segment):
    """Add an impulse response effect to the audio.

    One entry is drawn from the impulse manifest with the augmentor's
    random generator, loaded as an audio segment, and handed to the
    ``convolve`` method of ``audio_segment``. Nothing is returned; the
    original documentation describes this as an in-place transformation.

    :param audio_segment: Audio segment to add effects to.
    :type audio_segment: AudioSegment|SpeechSegment
    """
    chosen_entries = self._rng.sample(self._impulse_manifest, 1)
    impulse_path = chosen_entries[0]['audio_filepath']
    impulse = AudioSegment.from_file(impulse_path)
    audio_segment.convolve(impulse, allow_resample=True)
def from_file(cls, filepath, transcript):
    """Create a speech segment from an audio file and its transcript.

    The audio is loaded with ``AudioSegment.from_file``; its samples and
    sample rate are then passed to ``cls`` together with the transcript.

    :param filepath: Filepath or file object to audio file.
    :type filepath: basestring|file
    :param transcript: Transcript text for the speech.
    :type transcript: basestring
    :return: Speech segment instance.
    :rtype: SpeechSegment
    """
    loaded = AudioSegment.from_file(filepath)
    samples, sample_rate = loaded.samples, loaded.sample_rate
    return cls(samples, sample_rate, transcript)
def from_file(cls, filepath, transcript):
    """Build a speech segment from an audio file and the matching text.

    :param filepath: Path of the audio file, or a file object.
    :type filepath: str|file
    :param transcript: Text corresponding to the audio file.
    :type transcript: str
    :return: Speech segment instance.
    :rtype: SpeechSegment
    """
    # Decoding is delegated to AudioSegment; only samples and rate are kept.
    segment = AudioSegment.from_file(filepath)
    ctor_args = (segment.samples, segment.sample_rate, transcript)
    return cls(*ctor_args)
def make_silence(cls, duration, sample_rate):
    """Create a silent speech segment with an empty transcript.

    The silence itself is produced by ``AudioSegment.make_silence``.

    :param duration: Length of the silence, in seconds.
    :type duration: float
    :param sample_rate: Audio sample rate.
    :type sample_rate: float
    :return: Silent SpeechSegment instance whose transcript is ``""``.
    :rtype: SpeechSegment
    """
    silent = AudioSegment.make_silence(duration, sample_rate)
    samples, rate = silent.samples, silent.sample_rate
    return cls(samples, rate, "")
def _compute_mean_std(self, manifest_path, featurize_func, num_samples):
    """Compute mean and standard deviation from randomly sampled instances.

    ``num_samples`` manifest entries are drawn with ``self._rng``, each
    referenced audio file is featurized, and the results are stacked
    horizontally. Statistics are taken along axis 1 and stored as column
    vectors in ``self._mean`` and ``self._std``.

    :param manifest_path: Path handed to ``read_manifest``.
    :param featurize_func: Callable applied to each loaded AudioSegment.
                           Presumably returns a 2-D array with one row per
                           feature dimension -- confirm against callers.
    :param num_samples: Number of manifest entries to sample.
    """
    entries = read_manifest(manifest_path)
    chosen = self._rng.sample(entries, num_samples)
    # tqdm only wraps the iteration to display progress.
    per_utterance = [
        featurize_func(AudioSegment.from_file(entry["audio_filepath"]))
        for entry in tqdm(chosen)
    ]
    stacked = np.hstack(per_utterance)
    column_shape = [-1, 1]
    self._mean = np.mean(stacked, axis=1).reshape(column_shape)
    self._std = np.std(stacked, axis=1).reshape(column_shape)
def _compute_mean_std(self, manifest_path, featurize_func, num_samples):
    """Estimate feature mean and std from a random subset of the manifest.

    The manifest is read, ``num_samples`` entries are sampled with
    ``self._rng``, and every sampled audio file is loaded and featurized.
    The features are concatenated with ``np.hstack``; the mean and the
    standard deviation along axis 1 are stored as column vectors in
    ``self._mean`` and ``self._std``.

    :param manifest_path: Path handed to ``read_manifest``.
    :param featurize_func: Callable applied to each loaded AudioSegment.
    :param num_samples: Number of manifest entries to sample.
    """

    def _featurize_entry(entry):
        # Load the audio referenced by one manifest entry and featurize it.
        return featurize_func(
            AudioSegment.from_file(entry["audio_filepath"]))

    subset = self._rng.sample(read_manifest(manifest_path), num_samples)
    all_features = np.hstack(list(map(_featurize_entry, subset)))
    mean_per_row = np.mean(all_features, axis=1)
    std_per_row = np.std(all_features, axis=1)
    self._mean = mean_per_row.reshape([-1, 1])
    self._std = std_per_row.reshape([-1, 1])
def from_bytes(cls, bytes, transcript):
    """Create a speech segment from a byte string and the matching text.

    :param bytes: Byte string containing the audio samples.
    :type bytes: str
    :param transcript: Text corresponding to the audio.
    :type transcript: str
    :return: Speech segment instance.
    :rtype: SpeechSegment
    """
    # Decoding of the raw bytes is delegated to AudioSegment.
    decoded = AudioSegment.from_bytes(bytes)
    samples, rate = decoded.samples, decoded.sample_rate
    return cls(samples, rate, transcript)
def make_silence(cls, duration, sample_rate):
    """Create a silent speech segment of the given duration and sample rate.

    The transcript of the returned segment is an empty string.

    :param duration: Length of silence in seconds.
    :type duration: float
    :param sample_rate: Sample rate.
    :type sample_rate: float
    :return: Silence of the given duration.
    :rtype: SpeechSegment
    """
    quiet_audio = AudioSegment.make_silence(duration, sample_rate)
    ctor_args = (quiet_audio.samples, quiet_audio.sample_rate, "")
    return cls(*ctor_args)
def from_bytes(cls, bytes, transcript):
    """Create a speech segment from a byte string and its transcript.

    The bytes are decoded by ``AudioSegment.from_bytes``; the resulting
    samples and sample rate are passed to ``cls`` with the transcript.

    :param bytes: Byte string containing audio samples.
    :type bytes: str
    :param transcript: Transcript text for the speech.
    :type transcript: basestring
    :return: Speech segment instance.
    :rtype: SpeechSegment
    """
    parsed = AudioSegment.from_bytes(bytes)
    return cls(
        parsed.samples,
        parsed.sample_rate,
        transcript,
    )
def from_bytes(cls, bytes, transcript, **soundfile_options):
    """Create a speech segment from a byte string and its transcript.

    :param bytes: Byte string containing audio samples.
    :type bytes: str
    :param transcript: Transcript text for the speech.
    :type transcript: basestring
    :param soundfile_options: Options for opening with the soundfile
                              library; forwarded unchanged to
                              ``AudioSegment.from_bytes``.
    :type soundfile_options: **kwargs
    :return: Speech segment instance.
    :rtype: SpeechSegment
    """
    decoded_audio = AudioSegment.from_bytes(bytes, **soundfile_options)
    samples = decoded_audio.samples
    sample_rate = decoded_audio.sample_rate
    return cls(samples, sample_rate, transcript)
def slice_from_file(cls, filepath, transcript, start=None, end=None):
    """Load only a portion of a speech file as a SpeechSegment.

    This avoids reading the whole file into memory. The slicing itself
    is performed by ``AudioSegment.slice_from_file``, which receives
    ``start`` and ``end`` unchanged.

    :param filepath: Filepath or file object to the audio file.
    :type filepath: str|file
    :param transcript: Text corresponding to the audio. This argument is
                       required by the signature.
    :type transcript: str
    :param start: Start time in seconds. A negative value counts from
                  the end; None reads from the very beginning.
    :type start: float
    :param end: End time in seconds. A negative value counts from the
                end; None reads up to the end of the file.
    :type end: float
    :return: SpeechSegment instance for the requested slice.
    :rtype: SpeechSegment
    """
    piece = AudioSegment.slice_from_file(filepath, start, end)
    samples, rate = piece.samples, piece.sample_rate
    return cls(samples, rate, transcript)
def transform_audio(self, audio_segment):
    """Add background noise audio.

    One noise entry is drawn from the noise manifest. If it is at least
    as long as ``audio_segment``, a random window of matching duration is
    sliced from the noise file and mixed in through ``add_noise`` at a
    signal-to-noise ratio drawn uniformly from
    ``[self._min_snr_dB, self._max_snr_dB]``. If the sampled noise is
    shorter than the audio, the segment is left untouched.

    Nothing is returned; the original documentation describes this as an
    in-place transformation.

    :param audio_segment: Audio segment to add effects to.
    :type audio_segment: AudioSegment|SpeechSegment
    """
    noise_entry = self._rng.sample(self._noise_manifest, 1)[0]
    noise_duration = noise_entry['duration']
    target_duration = audio_segment.duration
    if not (noise_duration >= target_duration):
        # Sampled noise is too short to cover the audio: skip augmentation.
        return

    # Pick a random window of the noise file that matches the audio length.
    window_start = self._rng.uniform(0, noise_duration - target_duration)
    window_end = window_start + target_duration
    noise_segment = AudioSegment.slice_from_file(
        noise_entry['audio_filepath'], start=window_start, end=window_end)

    snr_dB = self._rng.uniform(self._min_snr_dB, self._max_snr_dB)
    audio_segment.add_noise(
        noise_segment, snr_dB, allow_downsampling=True, rng=self._rng)
def slice_from_file(cls, filepath, transcript, start=None, end=None):
    """Load a small section of a speech file without reading all of it.

    Loading the entire file into memory can be wasteful, so the slice is
    read by ``AudioSegment.slice_from_file``, which receives ``start``
    and ``end`` unchanged.

    :param filepath: Filepath or file object to audio file.
    :type filepath: basestring|file
    :param transcript: Transcript text for the speech. This argument is
                       required by the signature.
    :type transcript: basestring
    :param start: Start time in seconds. If start is negative, it wraps
                  around from the end. If not provided, reading starts
                  from the very beginning.
    :type start: float
    :param end: End time in seconds. If end is negative, it wraps around
                from the end. If not provided, the default behavior is
                to read to the end of the file.
    :type end: float
    :return: SpeechSegment instance of the specified slice of the input
             speech file.
    :rtype: SpeechSegment
    """
    sliced = AudioSegment.slice_from_file(filepath, start, end)
    return cls(
        sliced.samples,
        sliced.sample_rate,
        transcript,
    )
def __getitem__(self, idx):
    """Return the features of the ``idx``-th sampled manifest entry.

    :param idx: Index into ``self.sampled_manifest``.
    :return: Tuple ``(feature, 0)``; the second element is always the
             constant 0.
    """
    audio_path = self.sampled_manifest[idx]["audio_filepath"]
    # Extract the audio features.
    loaded_audio = AudioSegment.from_file(audio_path)
    return self.audio_featurizer.featurize(loaded_audio), 0
# randint(12, 12) can only return 12, so the speed factor is a fixed 1.2.
speed = random.randint(12, 12) / 10.
outputfile = (path + '/' + name.split('.')[0] + "-" + str(speed) + "-" +
              'work.wav')
# Earlier experiments, left disabled:
# audio = AudioSegment.from_file(w)
# audio.change_speed(speed)
# audio.to_wav_file(outputfile)
#
# outputfile1 = path + '/' + name.split('.')[0] + "-" + str(speed) + "-" + 'work1.wav'
# audio, sr1 = sf.read(w, dtype='float32')
# result_a = librosa.effects.time_stretch(audio, speed)
# librosa.output.write_wav(outputfile1, result_a, sr1)
noise_percent = 1
# NOTE(review): seq_length looks like a length in 10 ms frames (it is
# multiplied by sr1 / 100 below) -- confirm against the caller.
seq_length = 600
audio, sr1 = sf.read(w, dtype='float32')
if random.random() < noise_percent and seq_length > 0:
    temp_audio = AudioSegment(audio, sr1)
    audio_length = audio.shape[0]
    # Bounds, in percent, for the random speed rate passed to change_speed.
    max_length_ratio = int(
        (float(audio_length) / (seq_length - 100) / sr1) * 10000)
    # np.ceil is used instead of np.math.ceil: the np.math alias is
    # deprecated and absent from recent NumPy releases.
    min_length_ratio = int(
        np.ceil((float(audio_length) / seq_length / sr1) * 10000))
    temp_audio.change_speed(
        random.randint(min_length_ratio, max_length_ratio) / 100.)
    audio = temp_audio.samples
if random.random() < noise_percent:
    if seq_length != -1:
        # Floor division keeps max_length an integer on Python 3; both
        # np.zeros and random.randint reject float arguments.
        max_length = seq_length * sr1 // 100
        if audio.shape[0] < max_length:
            # Place the audio at a random offset inside a zero buffer.
            bg = np.zeros((max_length, ))
            rand_start = random.randint(0, max_length - audio.shape[0])
            bg[rand_start:rand_start + audio.shape[0]] = audio
def __init__(self, samples, sample_rate, transcript):
    """Initialize the audio part, then attach the transcript.

    :param samples: Audio samples, forwarded to ``AudioSegment.__init__``.
    :param sample_rate: Sample rate, forwarded to
                        ``AudioSegment.__init__``.
    :param transcript: Transcript text, stored as ``self._transcript``.
    """
    # Base-class setup runs first so the audio state exists before the
    # transcript is attached.
    AudioSegment.__init__(
        self,
        samples,
        sample_rate,
    )
    self._transcript = transcript