def downmix(src, dst):
    # Downmix a 3-channel (L/C/R) file to stereo: extract each channel,
    # attenuate the center by 10 dB, mix it into each side, then merge the
    # two sides into a stereo file.
    rand = randstr(20)
    tfm = sox.Transformer()
    tfm.remix({1: [1]})  # channel 1 only (left)
    l1 = "/tmp/%s_l1.ogg" % rand
    tfm.build(src, l1)

    tfm = sox.Transformer()
    tfm.remix({1: [2]})  # channel 2 only (center)
    c1 = "/tmp/%s_c1.ogg" % rand
    tfm.gain(-10.0)
    tfm.build(src, c1)

    tfm = sox.Transformer()
    tfm.remix({1: [3]})  # channel 3 only (right)
    r1 = "/tmp/%s_r1.ogg" % rand
    tfm.build(src, r1)

    cbn = sox.Combiner()
    l2 = "/tmp/%s_l2.ogg" % rand
    cbn.build([l1, c1], l2, 'mix')

    cbn = sox.Combiner()
    r2 = "/tmp/%s_r2.ogg" % rand
    cbn.build([r1, c1], r2, 'mix')

    cbn = sox.Combiner()
    cbn.build([l2, r2], dst, 'merge')

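# A minimal sketch contrasting the two combine types used above: 'mix' sums
# its inputs into a single signal, while 'merge' stacks them as separate
# channels. File names here are hypothetical placeholders.
import sox

cbn = sox.Combiner()
cbn.build(['left.wav', 'right.wav'], 'summed.wav', 'mix')    # mono sum
cbn = sox.Combiner()
cbn.build(['left.wav', 'right.wav'], 'stereo.wav', 'merge')  # two channels
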
def _trim_tracks_as_wav(extracted_tracks, trimlist, framerate, framenum,
                        offset_time, silent):
    try:
        import sox
    except ModuleNotFoundError:
        raise ModuleNotFoundError(
            'AudioProcessor.VideoSource: missing sox dependency for trimming.')
    framerate = Fraction(framerate)
    SPF = float(1.0 / framerate)  # seconds per frame
    trimfiles = []
    temp_outfiles = []
    for track in extracted_tracks:
        out_path_prefix = os.path.splitext(track)[0]
        outfile = f"{out_path_prefix}_cut.wav"
        trimfiles.append(outfile)
        if isinstance(trimlist[0], list) and len(trimlist) > 1:
            # Multiple trim ranges: cut each range to a temp file, then
            # concatenate the pieces into the final output.
            for index, trim in enumerate(trimlist, start=1):
                temp_outfile = f"{out_path_prefix}_temp{index}.wav"
                temp_outfiles.append(temp_outfile)
                _sox_trim(track, temp_outfile, trim, framenum, offset_time,
                          SPF, silent)
            cbn = sox.Combiner()
            if silent:
                cbn.set_globals(verbosity=0)
            formats = ['wav' for _ in temp_outfiles]
            cbn.set_input_format(file_type=formats)
            cbn.build(temp_outfiles, outfile, 'concatenate')
        elif isinstance(trimlist[0], (int, type(None))):
            _sox_trim(track, outfile, trimlist, framenum, offset_time, SPF,
                      silent)
    return trimfiles, temp_outfiles

def checker_track(self, output_file_name, gap=1.0, repeat_count=5,
                  mute_first=False, mute_last=False):
    """Repeat the sample on alternating tracks so the fade in and fade out
    can overlap."""
    track_a_file = self.temp_folder + 'track-a.wav'
    track_b_file = self.temp_folder + 'track-b.wav'
    half, remainder = divmod(repeat_count, 2)
    track_a_repeat_count = half + remainder - 1
    track_b_repeat_count = half - 1
    if mute_last:
        if remainder:
            # There are an odd number of repeats, so the muted last
            # repetition is in track A.
            self.make_track(track_a_file, gap, track_a_repeat_count,
                            mute_last=mute_last)
            self.make_track(track_b_file, gap, track_b_repeat_count,
                            has_initial_rest=True)
        else:
            # There are an even number of repeats, so the muted last
            # repetition is in track B.
            self.make_track(track_a_file, gap, track_a_repeat_count)
            self.make_track(track_b_file, gap, track_b_repeat_count,
                            has_initial_rest=True, mute_last=mute_last)
    else:
        self.make_track(track_a_file, gap, track_a_repeat_count,
                        mute_first=mute_first)
        self.make_track(track_b_file, gap, track_b_repeat_count,
                        has_initial_rest=True)
    cbn = sox.Combiner()
    cbn.build([track_a_file, track_b_file], output_file_name, 'mix-power')

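# The split arithmetic above, in isolation: with repeat_count = 5,
# divmod(5, 2) -> (2, 1), so track A carries 2 + 1 = 3 occurrences and
# track B the remaining 2. The trailing "- 1" presumably reflects that
# make_track counts repeats after an initial statement; that reading of
# make_track is an assumption here.
half, remainder = divmod(5, 2)
assert (half + remainder) + half == 5
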
def find_music(audio_file):
    modelName = "pyAA/data/svmSM"
    Fs, x = aIO.readAudioFile(audio_file)
    duration = x.shape[0] / float(Fs)
    t1 = time.perf_counter()  # time.clock() was removed in Python 3.8
    flagsInd, classNames, acc, CMt = aS.mtFileClassification(
        audio_file, modelName, "svm", False, '')
    [Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep,
     computeBEAT] = aT.loadSVModel(modelName)
    t2 = time.perf_counter()
    perTime1 = duration / (t2 - t1)
    flags = [classNames[int(f)] for f in flagsInd]
    (segs, classes) = aS.flags2segs(flags, mtStep)
    i = 0
    file_parts = []
    cbn = sox.Combiner()  # note: unused in this function
    if len(classes) > 1:
        for c in classes:
            if c == 'music':
                # Pad segment boundaries slightly, except at the ends.
                start = segs[i][0]
                if i != 0:
                    start -= 0.5
                end = segs[i][1]
                if i != len(classes) - 1:
                    end += 2.5
                file_parts.append((int(start * 1000), int(end * 1000)))
            i += 1
    return file_parts

def convert_audios(settings, path, pathto, vadfactor=[], trimmfactor=[]):
    try:
        if len(path) > 1:
            tools.printer(2, 'combining', '')
            cbn = sox.Combiner()
            if settings[0] == 'mp3':
                cbn.set_output_format(file_type=settings[0], rate=settings[1],
                                      channels=settings[2])
            else:
                cbn.set_output_format(file_type=settings[0], rate=settings[1],
                                      channels=settings[2], bits=settings[3],
                                      encoding=settings[4])
            cbn.convert()
            cbn.build(path, pathto, 'concatenate')
        else:
            tfm = sox.Transformer()
            if settings[0] == 'mp3':
                tfm.set_output_format(file_type=settings[0], rate=settings[1],
                                      channels=settings[2])
            else:
                tfm.set_output_format(file_type=settings[0], rate=settings[1],
                                      channels=settings[2], bits=settings[3],
                                      encoding=settings[4])
            if len(trimmfactor) > 1:
                tfm.trim(trimmfactor[0], trimmfactor[1])
            if len(vadfactor) > 1:
                tfm.vad(initial_pad=vadfactor[0])
                tfm.vad(location=-1, initial_pad=vadfactor[1])
            tfm.convert()
            tfm.build(path[0], pathto)
        return True
    except Exception:
        return False

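# A hedged usage sketch for convert_audios() above: `settings` is read
# positionally as (file_type, rate, channels, bits, encoding). The paths and
# values below are hypothetical.
ok = convert_audios(('wav', 16000, 1, 16, 'signed-integer'),
                    ['a.flac', 'b.flac'], 'joined.wav')
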
def generate_full_mp3(solo_parts: List[Part]) -> None:
    # docs: https://pysox.readthedocs.io/en/latest/api.html
    combiner = sox.Combiner()
    input_files = [part.mp3_filepath() for part in solo_parts]
    output_file_path = "{}/all.mp3".format(args.output)
    combiner.build(input_files, output_file_path, 'mix-power')

def compose(input_files, output_path, samplerate=22000, n_channels=2,
            file_extension=".mp3"):
    cbn = sox.Combiner()
    cbn.convert(samplerate=samplerate, n_channels=n_channels)
    cbn.build(input_files, output_path, 'concatenate')
    return output_path

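# Example usage of compose() above, with hypothetical file names: the inputs
# are resampled to 22 kHz stereo and joined end to end.
combined = compose(['intro.mp3', 'verse.mp3', 'outro.mp3'], 'song.mp3')
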
def addToPlaylist(self, f):
    self.totalSong = len(f[0])
    for i in range(len(f[0])):
        # Split the song into four parts: the body with a fade-in, then
        # three tail sections with progressively heavier bass cuts and a
        # fade-out.
        tfm1 = sox.Transformer()
        sox_duration = sox.file_info.duration(f[0][i])
        tfm1.fade(fade_in_len=5)
        tfm1.trim(0, sox_duration - 17)
        tfm1.build_file(f[0][i], 'C:\\Users\\Fadli\\Downloads\\Part1.wav')
        tfm2 = sox.Transformer()
        tfm2.trim(sox_duration - 17, sox_duration - 16.5)
        tfm2.bass(-5)
        tfm2.build_file(f[0][i], 'C:\\Users\\Fadli\\Downloads\\Part2.wav')
        tfm3 = sox.Transformer()
        tfm3.trim(sox_duration - 16.5, sox_duration - 16)
        tfm3.bass(-15)
        tfm3.build_file(f[0][i], 'C:\\Users\\Fadli\\Downloads\\Part3.wav')
        tfm4 = sox.Transformer()
        tfm4.trim(sox_duration - 16, sox_duration)
        tfm4.fade(fade_out_len=16)
        tfm4.bass(-35)
        tfm4.build_file(f[0][i], 'C:\\Users\\Fadli\\Downloads\\Part4.wav')
        sox_output, _ = f[0][i].rsplit('.', 1)
        sox_output += '.wav'
        cbn = sox.Combiner()
        cbn.build(['C:\\Users\\Fadli\\Downloads\\Part1.wav',
                   'C:\\Users\\Fadli\\Downloads\\Part2.wav',
                   'C:\\Users\\Fadli\\Downloads\\Part3.wav',
                   'C:\\Users\\Fadli\\Downloads\\Part4.wav'],
                  sox_output, 'concatenate')
        tempSongname = QUrl.fromLocalFile(f[0][i])
        tempSongname, _ = tempSongname.fileName().rsplit('.', 1)
        # Estimate tempo via RNN/DBN beat tracking over the processed file.
        proc = madmom.features.beats.DBNBeatTrackingProcessor(fps=100)
        act = madmom.features.beats.RNNBeatProcessor(
            online=True, nn_files=[BEATS_LSTM[0]])(sox_output)
        beatTimes = proc(act)
        beatAvg = 0
        for j in range(len(beatTimes) - 1):
            beatAvg += 60 / (beatTimes[j + 1] - beatTimes[j])
        tempo = round(beatAvg / len(beatTimes))
        self.ui.songList.insertItem(self.index, tempSongname)
        self.ui.songList.setCurrentRow(0)
        self.ui.songList2.insertItem(self.index, tempSongname)
        self.ui.songList2.setCurrentRow(0)
        self.playlist[self.index].append(sox_output)
        self.playlist[self.index].append(tempo)
        self.playlist[self.index].append(beatTimes)
        self.playlist.append([])
        print(self.playlist)
        self.index += 1
        self.totalSong -= 1

def get_array(self) -> AudioArray:
    combiner = sox.Combiner()
    input_files = [inp.get_temp_file() for inp in self.__inputs]
    combiner.build(input_filepath_list=input_files,
                   output_filepath=self._temp_filepath,
                   combine_type=self.__combine_type)
    return AudioArray(
        array=sox.Transformer().build_array(
            input_filepath=self._temp_filepath),
        sample_rate=sox.file_info.sample_rate(
            input_filepath=self._temp_filepath))

def write_chords(paths, annotations, write=False, strategy='powers'):
    """Building chords, triads, septachords: notes -> chords."""
    print('Generating chords of class "{}"'.format(strategy))
    strategy_config = yaml.safe_load(
        open(paths.get('strategies').get(strategy), 'r'))
    annotations_df = pd.read_csv(annotations, index_col=0)
    annotations_df['pitch'] = pd.to_numeric(annotations_df['pitch'])
    records = annotations_df.set_index(
        ['guitarModel', 'pitch'])['audioFileName'].to_dict()
    directories = []
    file_ticker = 0
    history = {}
    for ii in annotations_df[['guitarModel', 'pitch',
                              'audioFileName']].itertuples():
        pitch = int(ii.pitch)
        model = ii.guitarModel
        if model not in history:
            history[model] = 0
        # os.path.splitext, not str.strip, to drop the extension safely
        audioname = os.path.splitext(ii.audioFileName.split('/')[-1])[0]
        subdirectory = os.path.join(paths.get('interim').trace, strategy,
                                    model, '')
        if model not in directories:
            os.makedirs(subdirectory, exist_ok=True)
            directories.append(model)
        for chord, segment in strategy_config.items():
            for s, pitch_components in segment.items():
                [*bindings], [*components] = zip(
                    *[(records.get((model, pitch + x)), str(pitch + x))
                      for x in pitch_components])
                if not any(fn is None for fn in bindings):
                    history[model] += 1
                    rename = '_'.join([audioname, chord, s] +
                                      components) + '.wav'
                    rename = os.path.join(subdirectory, rename)
                    if write is True:
                        combiner = sox.Combiner()
                        # bindings = the other tracks being mixed together
                        combiner.build(bindings, rename, 'mix')
                        file_ticker += 1
                else:
                    break
    print('----- {} GENERATED BY MODEL -----'.format(strategy.upper()))
    pprint(history)

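# The strategy YAML consumed above evidently maps chord name -> segment name
# -> a list of semitone offsets added to the root pitch. Hypothetical
# contents consistent with that shape:
#
#   powers:
#     fifth: [0, 7]
#     fifth_octave: [0, 7, 12]
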
def mix_mono(outputname, *inputname) -> None:
    outputname = "{0}.wav".format(outputname)
    inputname = tuple("{0}.wav".format(n) for n in inputname)
    size = len(inputname)
    if size > 1:
        # 'merge' stacks the mono inputs as separate channels.
        cbn = sox.Combiner()
        cbn.build(list(inputname), outputname, "merge")
    elif size == 1:
        shutil.copyfile(inputname[0], outputname)
    else:
        # No inputs: leave an empty placeholder file.
        with open(outputname, "w") as f:
            f.write("")

def phase(self, output_file_name=None, n_tracks=9, gap=.03, repeat_count=20,
          end_align=False):
    if output_file_name is None:
        output_file_name = self.output_file_name
    track_file_names = []
    for i in range(1, n_tracks + 1):
        track_file_name = self.temp_folder + 'track-{}.wav'.format(i)
        track_file_names.append(track_file_name)
        mute_first = False
        if not end_align and i != 1:
            mute_first = True
        mute_last = False
        if end_align and i != n_tracks:
            mute_last = True
        self.checker_track(track_file_name, gap=gap * i,
                           repeat_count=repeat_count, mute_first=mute_first,
                           mute_last=mute_last)
    if end_align:
        # Pad the start of each track so all tracks end together.
        track_durations = [sox.file_info.duration(f)
                           for f in track_file_names]
        longest_track_duration = max(track_durations)
        track_duration_diffs = [longest_track_duration - d
                                for d in track_durations]
        new_track_file_names = []
        for i, diff, track_file_name in zip(range(1, n_tracks + 1),
                                            track_duration_diffs,
                                            track_file_names):
            new_track_file_name = track_file_name[:-4] + '-start-offset.wav'
            new_track_file_names.append(new_track_file_name)
            tfm = sox.Transformer()
            tfm.pad(start_duration=diff + (gap * i))
            tfm.build(track_file_name, new_track_file_name)
        track_file_names = new_track_file_names
    cbn = sox.Combiner()
    cbn.silence(location=1)   # Remove silence from the beginning
    cbn.silence(location=-1)  # Remove silence from the end
    cbn.build(track_file_names, output_file_name, 'mix-power')

def before_request() -> None:
    g.request_name = request.path
    g.params = '()'
    g.error = False
    g.path_to_files = []
    g.transformer = sox.Transformer()
    g.combiner = sox.Combiner()
    total_size = 0
    try:
        total_size = sum(
            os.path.getsize(INPUT_DIRECTORY + file)
            if os.path.exists(INPUT_DIRECTORY + file) else 0
            for file in os.listdir(INPUT_DIRECTORY)) + sum(
            os.path.getsize(OUTPUT_DIRECTORY + file)
            if os.path.exists(OUTPUT_DIRECTORY + file) else 0
            for file in os.listdir(OUTPUT_DIRECTORY))
    except Exception as e:
        print(e)
    g.access = total_size < STORAGE_LIMIT

def mix_multitrack(mtrack, output_path, stem_indices=None,
                   alternate_weights=None, alternate_files=None,
                   additional_files=None):
    """Mix the stems of a multitrack to create a new mix.
    Can optionally adjust the volume of stems and replace, remove, or add
    stems.

    Parameters
    ----------
    mtrack : Multitrack
        Multitrack object
    output_path : str
        Path to save output file.
    stem_indices : list or None, default=None
        Stem indices to include in mix. If None, mixes all stems.
    alternate_weights : dict or None, default=None
        Dictionary with stem indices as keys and mixing coefficients as
        values. Stem indices present that are not in this dictionary will
        use the default estimated mixing coefficient.
    alternate_files : dict or None, default=None
        Dictionary with stem indices as keys and filepaths as values. Audio
        file to use in place of original stem. Stem indices present that are
        not in this dictionary will use the original stems.
    additional_files : list of tuple or None, default=None
        List of (filepath, mixing_coefficient) pairs to additionally add to
        the final mix.

    Returns
    -------
    filepaths : list
        List of filepaths used in the mix
    weights : list
        List of weights used to mix filepaths
    """
    filepaths, weights = _build_mix_args(mtrack, stem_indices,
                                         alternate_weights, alternate_files,
                                         additional_files)
    if len(filepaths) == 1:
        shutil.copyfile(filepaths[0], output_path)
    else:
        cbn = sox.Combiner()
        cbn.build(filepaths, output_path, 'mix', input_volumes=weights)
    return filepaths, weights

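# A hedged usage sketch for mix_multitrack() above, assuming a medleydb-style
# Multitrack object. The track name, stem indices, and weights below are
# hypothetical.
import medleydb

mtrack = medleydb.MultiTrack('LizNelson_Rainfall')
files, weights = mix_multitrack(
    mtrack, 'remix.wav',
    stem_indices=[1, 2, 4],        # keep only these stems
    alternate_weights={2: 0.5})    # turn stem 2 down
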
def get_integrated_lufs(filepath, min_duration=0.5):
    """Return the integrated LUFS for an audio file.

    For files shorter than 400 ms, ffmpeg returns a constant integrated LUFS
    value of -70.0. To avoid this, files shorter than min_duration (by
    default 500 ms) are self-concatenated until min_duration is reached, and
    the LUFS value is computed for the concatenated file.

    Parameters
    ----------
    filepath : str
        Path to audio file for computing LUFS.
    min_duration : float
        Minimum required duration for computing the LUFS value. Files
        shorter than this are self-concatenated until their duration reaches
        this value for the purpose of computing the integrated LUFS.
        Caution: if you set min_duration < 0.4, a constant LUFS value of
        -70.0 will be returned for all files shorter than 400 ms.

    Returns
    -------
    loudness : float
        Integrated LUFS of the file.
    """
    try:
        duration = sox.file_info.duration(filepath)
    except Exception as e:
        raise ScaperError(
            'Unable to obtain LUFS for {:s}, error message:\n{:s}'.format(
                filepath, e.__str__()))
    if duration < min_duration:
        # Compute how many concatenations we require
        n_tiles = int(np.ceil(min_duration / duration))
        # Concatenate audio to itself, save to a temp file, and get LUFS
        tmpfiles = []
        with _close_temp_files(tmpfiles):
            concat_file = tempfile.NamedTemporaryFile(suffix='.wav',
                                                      delete=False)
            tmpfiles.append(concat_file)
            cbn = sox.Combiner()
            cbn.build([filepath] * n_tiles, concat_file.name, 'concatenate')
            loudness_stats = r128stats(concat_file.name)
    else:
        loudness_stats = r128stats(filepath)
    return loudness_stats['I']

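# The tiling arithmetic above, in isolation: a hypothetical 0.12 s file needs
# ceil(0.5 / 0.12) = 5 self-concatenations to clear the 0.5 s minimum, after
# which its integrated loudness can be measured reliably.
import numpy as np

n_tiles = int(np.ceil(0.5 / 0.12))
assert n_tiles == 5 and n_tiles * 0.12 >= 0.5
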
def align_mixed(file1, file2, tir):
    f1name = file1.split('/')[-1]
    f2name = file2.split('/')[-1]
    f1speaker = file1.split('/')[-2]
    f2speaker = file2.split('/')[-2]
    mix_dir = os.path.join(bin_path, 'mixed',
                           f1speaker + '_' + f2speaker + '_' + str(tir))
    mix_fname = os.path.join(mix_dir, f1name[:-4] + '_' + f2name)
    tfn = sox.Transformer()
    tfn.silence(location=-1)
    cbn = sox.Combiner()
    cbn.set_input_format(file_type=['wav', 'wav'])
    len1 = float(tfn.stat(file1)['Length (seconds)'])
    len2 = float(tfn.stat(file2)['Length (seconds)'])
    rms1 = sox.file_info.stat(file1)['RMS amplitude']
    rms2 = sox.file_info.stat(file2)['RMS amplitude']
    factor = tir_factor(tir, rms1, rms2)
    os.makedirs(mix_dir, exist_ok=True)
    # Trim the longer file to the shorter one's length, then mix at the
    # target-to-interferer ratio.
    if len1 < len2:
        tfn.trim(0, len1)
        tfn.build(file2, os.path.join(bin_path, 'mixed', f2name))
        cbn.build([file1, os.path.join(bin_path, 'mixed', f2name)],
                  mix_fname, 'mix', [1, 1 / factor])
        shutil.copy(file1[:-4] + '.PHN', mix_fname[:-4] + '.PHN')
        os.remove(os.path.join(bin_path, 'mixed', f2name))
    else:
        tfn.trim(0, len2)
        tfn.build(file1, os.path.join(bin_path, 'mixed', f1name))
        cbn.build([os.path.join(bin_path, 'mixed', f1name), file2],
                  mix_fname, 'mix', [1, 1 / factor])
        shutil.copy(file2[:-4] + '.PHN', mix_fname[:-4] + '.PHN')
        os.remove(os.path.join(bin_path, 'mixed', f1name))
    generate_dict(mix_fname[:-4] + '.PHN')
    with open(mix_fname[:-4] + '.lab', 'w+') as f:
        f.write('WORD')
    os.makedirs(os.path.join(bin_path, 'mixed', 'aligned'), exist_ok=True)
    subprocess.run([os.path.join('./', bin_path, 'mfa_align'),
                    mix_dir,
                    mix_fname[:-4] + '.dict', 'english',
                    os.path.join(bin_path, 'mixed', 'aligned',
                                 f1speaker + '_' + f2speaker + '_' +
                                 str(tir))])

def mix_phonemes(phn1, phn2):
    phn1_list = [os.path.join(test_set, phn1, file)
                 for file in os.listdir(os.path.join(test_set, phn1))]
    phn2_list = [os.path.join(test_set, phn2, file)
                 for file in os.listdir(os.path.join(test_set, phn2))]
    file1 = phn1_list[random.randint(0, phn_occurrence[phn1] - 1)]
    file2 = phn2_list[random.randint(0, phn_occurrence[phn2] - 1)]
    rms1 = sox.file_info.stat(file1)['RMS amplitude']
    rms2 = sox.file_info.stat(file2)['RMS amplitude']
    factor = tir_factor(0, rms1, rms2)
    cbn = sox.Combiner()
    cbn.set_input_format(file_type=['wav', 'wav'])
    cbn.build([file1, file2], 'test/new.wav', 'mix', [1, 1 / factor])
    pred = test_mixed('test/new.wav')
    os.remove('test/new.wav')
    return pred

def Deep():
    try:
        if tkMessageBox.askyesno("Confirmation",
                                 "Would you like to proceed?"):
            BEAM_WIDTH = 500
            LM_WEIGHT = 1.75
            WORD_COUNT_WEIGHT = 1.00
            VALID_WORD_COUNT_WEIGHT = 1.00
            N_FEATURES = 26
            N_CONTEXT = 9
            ds = Model('models/models.pb', N_FEATURES, N_CONTEXT,
                       'models/alphabet.txt', BEAM_WIDTH)
            fs, audio = wav.read(audiofile.get())
            if fs != 16000:
                # Resample to 16 kHz mono before decoding.
                cbn = sox.Combiner()
                cbn.convert(samplerate=16000, n_channels=1)
                cbn.build([str(audiofile.get())], './', 'concatenate')
                fs, audio = wav.read('./')
            audio_length = len(audio) * (1 / 16000)
            resultpage = Toplevel(parent)
            resultpage.title("Result")
            result_border = ttk.Frame(resultpage, padding=(12, 12, 12, 12))
            result_border.pack()
            result_page = Frame(result_border, bg="white")
            result_page.pack()
            Tkinter.Label(result_page, text="What I've heard from you:",
                          font=14, bg="white").grid(row=1, column=1,
                                                    sticky=E)
            Tkinter.Label(result_page, textvariable=word, font=12,
                          bg="white").grid(row=2, column=2, sticky=E)
            word.set(ds.stt(audio, fs))
    except ValueError:
        tkMessageBox.showerror("Error!",
                               "Only 16000Hz WAV files supported!")
    except IOError:
        tkMessageBox.showerror("Error!", "No file uploaded!")

def combine_audio_files(params):
    cmb = sox.Combiner()
    cmb.convert(samplerate=22050)
    cmb.build(
        [os.path.join(params['audio_folder'], params['filenames'][0]),
         os.path.join(params['audio_folder'], params['filenames'][1]),
         os.path.join(params['audio_folder'], params['filenames'][2]),
         os.path.join(params['audio_folder'], params['filenames'][3])],
        os.path.join(config.audio_save_folder, params['output_fname']),
        'mix')  # alternatively: 'mix', input_volumes=[0.6, 0.3, 0.3, 0.3]
    # If the reverb option is active, create the reverb audio files using an
    # impulse response from Isophonics.
    if params['reverb']:
        y_ir, sr_ir = librosa.load('./ir/IR_greathall.wav', sr=params['sr'])
        y_sig, sr_sig = librosa.load(
            os.path.join(config.audio_save_folder, params['output_fname']),
            sr=params['sr'])
        y_rev = scipy.signal.convolve(y_sig, y_ir, mode="full")
        soundfile.write(
            os.path.join(config.audio_save_folder, 'reverb',
                         params['output_fname']),
            y_rev, samplerate=params['sr'])

def _mpls_audio(mplsdict, nocleanup, silent):
    cliplist = mplsdict['clip']
    infiles = []
    for clip in cliplist:
        if clip:
            infile = str(clip, 'utf-8')
            infile = os.path.normpath(infile)
            infiles.append(infile)
    out_path_prefix = os.path.splitext(str(cliplist[0], 'utf-8'))[0]
    outfiles = []
    concat_files = []
    if len(infiles) > 1:
        try:
            import sox
        except ModuleNotFoundError:
            raise ModuleNotFoundError(
                'AudioProcessor.VideoSource: missing sox dependency for '
                'concatenating.')
        for infile in infiles:
            extracted_tracks, framerate, framenum = _extract_tracks_as_wav(
                infile, silent)
            concat_files.append(extracted_tracks)
        for i in range(len(concat_files[0])):
            # Concatenate track i of every clip into one file.
            combine_files = [concat_files[j][i]
                             for j in range(len(concat_files))]
            cbn = sox.Combiner()
            if silent:
                cbn.set_globals(verbosity=0)
            # One format entry per input file being combined (pysox expects
            # the file_type list to match the input list's length).
            formats = ['wav' for _ in combine_files]
            cbn.set_input_format(file_type=formats)
            outfile = f"{out_path_prefix}_{i+2}_concat.wav"
            outfiles.append(outfile)
            cbn.build(combine_files, outfile, 'concatenate')
        if not nocleanup:
            for item in concat_files:
                _cleanup_temp_files(item)
        return outfiles, framerate, framenum
    else:
        outfile = infiles
        return outfile, None, None

def phonemes_to_audio(syllables: Tuple[str, str], voice_features: Dict,
                      name: str = 'tom_nook') -> None:
    """Convert phonemes to audio."""
    phonemes = []
    for initial, final in syllables:
        if initial:
            phonemes.append(initial)
        if final:
            phonemes.append(final)
    phoneme_sounds = [f'pinyin_phonemes/{phoneme}.wav'
                      for phoneme in phonemes]
    synthesized = sox.Combiner()
    vf = voice_features[name]
    pitch_shift_random_range, tempo = (vf['pitch_shift_random_range'],
                                       vf['tempo'])
    synthesized.pitch(random.uniform(*pitch_shift_random_range))
    synthesized.tempo(tempo)
    synthesized.build(phoneme_sounds, 'synthesized.wav', 'concatenate')

def maybe_convert_one_to_wav(entry):
    root, _, files = entry
    transformer = sox.Transformer()
    transformer.convert(samplerate=SAMPLE_RATE, n_channels=CHANNELS)
    combiner = sox.Combiner()
    combiner.convert(samplerate=SAMPLE_RATE, n_channels=CHANNELS)
    output_wav = os.path.join(root, WAV_NAME)
    if os.path.isfile(output_wav):
        return
    files = sorted(glob(os.path.join(root, AUDIO_PATTERN)))
    try:
        if len(files) == 1:
            transformer.build(files[0], output_wav)
        elif len(files) > 1:
            # Convert each file to wav, then concatenate the results.
            wav_files = []
            for i, file in enumerate(files):
                wav_path = os.path.join(root, "audio{}.wav".format(i))
                transformer.build(file, wav_path)
                wav_files.append(wav_path)
            combiner.set_input_format(file_type=["wav"] * len(wav_files))
            combiner.build(wav_files, output_wav, "concatenate")
    except sox.core.SoxError:
        return

def generate_accompaniment(own_part, solo_parts) -> None:
    combiner = sox.Combiner()
    accompaniment_volume_ratio = 0.33
    instrumental_volume_ratio = (accompaniment_volume_ratio *
                                 args.instrumental_volume)
    input_volumes = []
    input_files = []
    for part in solo_parts:
        is_own_part = part.name == own_part.name
        is_instrumental = part.name == 'accompaniment'
        input_files.append(part.mp3_filepath())
        if is_own_part:
            input_volumes.append(1.0)
        elif is_instrumental:
            input_volumes.append(instrumental_volume_ratio)
        else:
            input_volumes.append(accompaniment_volume_ratio)
    output_file_path = "{}/{} with accompaniment.mp3".format(
        args.output, own_part.name)
    combiner.build(input_files, output_file_path, 'mix-power', input_volumes)

def gen_audio_file(trs, langin, langout, pattern, fileout):
    # Set up a unique temp directory.
    temp = '/tmp'
    temp += '/mvoad-' + str(time.time())
    os.mkdir(temp)
    tmpfn = temp + '/mvoad.tmp.wav'
    tmpold = temp + '/mvoad.tmp.dest1.mp3'
    tmpdest = temp + '/mvoad.tmp.dest.mp3'
    # enumerate instead of .index() so repeated entries don't confuse the
    # first-item checks below.
    for tr_idx, tr in enumerate(trs):
        untrsname = temp + '/mvoad.src.word.mp3'
        trsname = temp + '/mvoad.dest.word.mp3'
        Speech(tr.origin, langin).save(untrsname)
        Speech(tr.text, langout).save(trsname)
        for pat_idx, item in enumerate(pattern):
            # Alternate the two destination files so each sox pass appends
            # to the previous result.
            tmpdest, tmpold = tmpold, tmpdest
            if item[0]:
                filename = trsname
            else:
                filename = untrsname
            t = sox.Transformer()
            t.tempo(item[1], 's')
            t.pad(0, item[2])
            if pat_idx != 0 or tr_idx != 0:
                t.build(filename, tmpfn)
                cbn = sox.Combiner()
                cbn.build([tmpdest, tmpfn], tmpold, 'concatenate')
            else:
                t.build(filename, tmpold)
    shutil.move(tmpold, fileout)
    shutil.rmtree(temp)

import os
import shutil
import glob
import sox

# Pad each source file with 3 seconds of trailing silence, then concatenate
# the padded copies into a single 44.1 kHz stereo mp3.
os.mkdir('making_temp')
tfm = sox.Transformer()
tfm.pad(0.0, 3.0)
files = glob.glob('./jllepd/b*.mp3')
for file in files:
    tfm.build(file, './making_temp/' + os.path.basename(file))
files = sorted(glob.glob('./making_temp/*.*'))
cbn = sox.Combiner()
cbn.convert(samplerate=44100, n_channels=2)
cbn.build(files, './Bb.mp3', 'concatenate')
shutil.rmtree('making_temp')

def alignment_helper(file_list, target_path):
    """Downsample and perform cross-correlation on files relative to a
    target file to test if they are correctly aligned.

    Parameters
    ----------
    file_list : list
        List of files (i.e. stem_files, raw_files)
    target_path : str
        Filepath to compare files in file_list to.

    Returns
    -------
    status : bool
        True if the cross-correlation values are within a threshold,
        demonstrating that the files are correctly aligned.
    """
    sr = 1000
    output_handle = tempfile.NamedTemporaryFile(suffix='.wav')
    output_path = output_handle.name
    if len(file_list) > 1:
        file_sum = sox.Combiner()
        # The rate effect must be added before build() for it to apply.
        file_sum.rate(sr, 'm')
        file_sum.build(file_list, output_path, 'mix')
    else:
        file_sum = sox.Transformer()
        file_sum.rate(sr, 'm')
        file_sum.build(file_list[0], output_path)
    target_handle = tempfile.NamedTemporaryFile(suffix='.wav')
    target_handle_path = target_handle.name
    target_sum = sox.Transformer()
    target_sum.rate(sr, 'm')
    target_sum.build(target_path, target_handle_path)
    dur = get_length(target_path)
    offset = (dur / 44100.0) / 2.0
    y_files, sr = librosa.load(output_path, sr=sr, offset=offset,
                               duration=30.0)
    y_target, sr = librosa.load(target_handle_path, sr=sr, offset=offset,
                                duration=30.0)
    correlation = np.correlate(y_files, y_target, 'full')
    # Normalize by the triangular overlap count so edge lags are comparable.
    N = len(y_target)
    a = np.arange(1, N + 1)
    a_rev = np.arange(1, N)
    b = a_rev[::-1]
    c = np.concatenate((a, b))
    c = c.astype(float)
    correlation = np.abs(correlation) / c
    center = N
    corr_index = np.argmax(correlation)
    return np.abs(corr_index - center) <= 5

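# The core of the test above, in isolation: for aligned signals, the peak of
# the full cross-correlation sits at the zero-lag index (N - 1 in numpy's
# 'full' mode). A synthetic self-correlation check:
import numpy as np

y = np.random.randn(1000)
corr = np.abs(np.correlate(y, y, 'full'))
assert np.argmax(corr) == len(y) - 1  # peak at zero lag
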
def generate_stream_and_labels(
    dest_dir,
    wav_intermediates,
    wav_data,
    target_word,
    target_lang,
    cv_clipsdir,
):
    ###############################################
    ## GENERATE LONG WAVFILE AND LABELS
    ###############################################

    # step 1: convert to wavs (to avoid slop in mp3 timings)
    # food for thought: sox mp3s may have a gap when concatenating
    # https://stackoverflow.com/questions/25280958/sox-concatenate-multiple-audio-files-without-a-gap-in-between
    assert os.path.isdir(cv_clipsdir), "cv data not found"
    assert os.path.isdir(dest_dir), "no dest dir available"
    assert os.path.isdir(
        wav_intermediates), "no destination intermediate wav dir available"
    assert os.listdir(
        wav_intermediates) == [], "intermediate wav dir not empty"
    label_file = dest_dir / "streaming_labels.txt"
    wav_stream_file = str(dest_dir / "streaming_test.wav")
    assert not os.path.isfile(label_file), "label file exists already"
    assert not os.path.isfile(wav_stream_file), "wav stream exists already"

    wavs = []
    total_duration_mp3s_s = 0
    for ix, stream_component in enumerate(wav_data):
        mp3name_no_ext = stream_component["mp3name_no_ext"]
        if ix % 250 == 0:
            print("mp3 to wav", ix)
        mp3path = cv_clipsdir / (mp3name_no_ext + ".mp3")
        if not os.path.exists(mp3path):
            raise ValueError("could not find", mp3path)
        duration_s = sox.file_info.duration(mp3path)
        total_duration_mp3s_s += duration_s
        wav = str(wav_intermediates / (mp3name_no_ext + ".wav"))
        transformer = sox.Transformer()
        transformer.convert(samplerate=16000)  # from 48K mp3s
        transformer.build(str(mp3path), wav)
        wavs.append(wav)
    print(total_duration_mp3s_s, "sec = ", total_duration_mp3s_s / 60, "min")
    print(len(wavs))

    # step 2: how long is the sum of each wav according to sox?
    total_duration_wavs_s = 0
    for w in wavs:
        duration_s = sox.file_info.duration(w)
        total_duration_wavs_s += duration_s
    print(
        "individual wavs:",
        total_duration_wavs_s,
        "sec = ",
        total_duration_wavs_s / 60,
        "min",
    )

    # step 3: combine the wavs. godspeed.
    combiner = sox.Combiner()
    combiner.convert(samplerate=16000, n_channels=1)
    # https://github.com/rabitt/pysox/blob/master/sox/combine.py#L46
    combiner.build(wavs, wav_stream_file, "concatenate")

    # step 4: how long is the total wavfile? should be the sum of the
    # individual wavs
    duration_s = sox.file_info.duration(wav_stream_file)
    print("concatenated wav:", duration_s, "sec = ", duration_s / 60, "min")

    # step 5: generate labels using the wav file durations, not the sloppy
    # mp3 file durations
    target_times_s = []
    current_sentence_start_s = 0
    for ix, stream_component in enumerate(wav_data):
        mp3name_no_ext = stream_component["mp3name_no_ext"]
        wavpath = wav_intermediates / (mp3name_no_ext + ".wav")
        sentence_duration_s = sox.file_info.duration(wavpath)
        if not stream_component["is_target"]:
            # add full duration of non-target sentence to current offset
            current_sentence_start_s += sentence_duration_s
            continue
        start_s = stream_component["start_s"]
        end_s = stream_component["end_s"]
        target_utterance_start_s = current_sentence_start_s + float(start_s)
        target_utterance_end_s = current_sentence_start_s + float(end_s)
        target_times_s.append(
            (target_utterance_start_s, target_utterance_end_s))
        current_sentence_start_s += sentence_duration_s

    # step 6: write labels out
    # the label timings should indicate the start of each target utterance
    # in ms
    with open(label_file, "w") as fh:
        for start_s, _ in target_times_s:
            start_ms = start_s * 1000
            fh.write(f"{target_word}, {start_ms}\n")
    return target_times_s

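# Reading back the label file written in step 6 above: each line has the
# form "<target word>, <start in ms>". The path is hypothetical.
with open("streaming_labels.txt") as fh:
    labels = [(word, float(ms)) for word, ms in
              (line.strip().split(", ") for line in fh)]
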
import numpy as np
import tensorflow as tf
from data.train_data import process_save_wav_sample
from data.train_data import process_save_wav_sample_x
from data.train_data import AudPreEmphasize
from data.baidu_emotion import get_wav_emotion
import wave
import os
import scipy.io.wavfile as wav
from data.example import myvad
import socket
import sox
import threading
import time

cbn = sox.Combiner()  # combines audio files
# myser.get_emotion(test.wav)
s = socket.socket()  # Create a socket object
bsize = s.getsockopt(socket.SOL_SOCKET, socket.SO_SNDBUF)
print(bsize)
host = socket.gethostname()  # Get the local machine name
port = 12397  # Reserve a port for your service
s.bind(("", port))  # Bind to the port
s.listen(5)  # Wait for the client connection
data = []
'''
Results:
[[0.02977182 0.70166206 0.23576824 0.03279793]]
[[2.7817843e-02 8.8926041e-01 8.2342632e-02 5.7907921e-04]]
        wav=target_wav,
        transcript=target_transcript,
        duration_s=durations_s[0],
    )
    nw_info = dict(
        ix=2 * ix + 1,
        wav=non_targets[nontarget_ix][0],
        transcript=non_targets[nontarget_ix][1],
        duration_s=durations_s[1],
    )
    stream_info.extend([tw_info, nw_info])

assert len(stream_wavs) == n_stream_wavs * 2, "not enough stream data"

stream_wavfile = str(workdir / "covid_stream.wav")
combiner = sox.Combiner()
combiner.convert(samplerate=16000, n_channels=1)
# https://github.com/rabitt/pysox/blob/master/sox/combine.py#L46
combiner.build(stream_wavs, stream_wavfile, "concatenate")

dur_info = sum([d["duration_s"] for d in stream_info])
print(sox.file_info.duration(stream_wavfile), "seconds in length", dur_info)

with open(stream_info_file, "wb") as fh:
    pickle.dump(stream_info, fh)

# %%
# load embedding model
traindir = Path(f"/home/mark/tinyspeech_harvard/multilang_embedding")

def process(parameters):
    tid, n_samples = parameters
    output_list = output_dir + "dev-other.{}.lst".format(tid)
    with open(output_list, "w") as fout:
        for i in range(tid * n_samples,
                       min(len(lines), n_samples * (tid + 1))):
            line = lines[i]
            sp = line.split("\t")
            filename = sp[0]
            # duration = sox.file_info.duration(filename)
            alignments = sp[1].strip().split("\\n")

            # Parse the alignments into silence-delimited chunks.
            chunk_starts = [0]
            chunk_ends = []
            words = []
            cur_words = []
            cur_end = 0
            for j, alignment in enumerate(alignments):
                sp = alignment.split()
                begin = float(sp[2])
                length = float(sp[3])
                word = sp[4]
                cur_end = begin + length
                if j == 0:
                    continue
                if word == "$":
                    if length > MIN_SIL_LENGTH:
                        chunk_ends.append(cur_end - TOLERANCE)
                        chunk_starts.append(cur_end - TOLERANCE)
                        words.append(" ".join(cur_words))
                        cur_words = []
                    continue
                cur_words.append(word)
            if len(cur_words) > 0:
                chunk_ends.append(cur_end)
                words.append(" ".join(cur_words))
            else:
                chunk_starts.pop()

            # Split the audio into chunks in shuffled order.
            order = list(range(len(chunk_starts)))
            random.shuffle(order)
            new_target = " ".join([words[k] for k in order])
            new_audio_path = output_dir + filename.split("/")[-1]
            fout.write("{}\t{}\t{}\t{}\n".format(
                new_audio_path, new_audio_path, chunk_ends[-1] * 1000,
                new_target))
            if len(chunk_starts) == 1:
                os.system("cp {} {}".format(filename, output_dir))
                continue
            paths = []
            for k in order:
                sox_tfm = sox.Transformer()
                sox_tfm.set_output_format(file_type="flac",
                                          encoding="signed-integer",
                                          bits=16, rate=16000)
                sox_tfm.trim(chunk_starts[k], chunk_ends[k])
                new_path = "/tmp/{}_{}.flac".format(tid, k)
                sox_tfm.build(filename, new_path)
                paths.append(new_path)

            # Combine the shuffled chunks into the new audio file.
            sox_comb = sox.Combiner()
            sox_comb.build(list(paths), new_audio_path, "concatenate")