def predictOne(self, path):
    """
    Returns the structure and labels from the specified algorithm.
    Removes the first and last boundaries, which are the start and the end
    of the track.
    """
    import msaf
    if self.parameters["feature"].value is None:
        boundaries, labels = msaf.process(
            path, boundaries_id=self.parameters["algorithm"].value)
    else:
        boundaries, labels = msaf.process(
            path,
            boundaries_id=self.parameters["algorithm"].value,
            feature=self.parameters["feature"].value)
    return (SparseSignal(labels[1:], boundaries[1:-1]), )
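# For context: msaf.process returns boundary times in seconds (the first
# entry is 0.0 and the last is the track's end) plus one label per segment,
# which is why predictOne above trims the outer boundaries. A minimal sketch
# of reading that output, assuming a hypothetical local file "track.wav":
import msaf

boundaries, labels = msaf.process("track.wav", boundaries_id="foote")
for seg_start, seg_end, seg_label in zip(boundaries[:-1], boundaries[1:],
                                         labels):
    print("%.2f-%.2f s -> segment %s" % (seg_start, seg_end, seg_label))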
def parts(audio_file, videos_folder):
    # Extract downbeat times with madmom's DBNDownBeatTracker CLI.
    os.system(
        f'DBNDownBeatTracker --downbeats single "{audio_file}" >> beats.txt')
    with open('beats.txt', 'r') as f:
        downbeat_times = list(map(float, f.readlines()))

    boundaries, labels = msaf.process(audio_file, boundaries_id="foote",
                                      labels_id="fmc2d")
    print("BOUNDARIES", boundaries)
    print("LABELS", labels)

    # Single-character, non-empty folders name the available video parts.
    parts_names = [
        folder for folder in os.listdir(videos_folder)
        if len(folder) == 1 and not_empty(os.path.join(videos_folder, folder))
    ]
    # Map each distinct MSAF label to a part name, cycling if needed.
    labels2ids = {
        list(set(labels))[i]: parts_names[i % len(parts_names)]
        for i in range(len(set(labels)))
    }
    boundaries_info = {k: [] for k in labels2ids.values()}
    l_index = 0
    for v_min, v_max in zip(boundaries[:-1], boundaries[1:]):
        boundaries_info[labels2ids[labels[l_index]]].append((v_min, v_max))
        l_index += 1

    start = 0.0
    end = boundaries[-1]
    downbeat_times = [start] + list(downbeat_times) + [end]
    clean()  # external helper: cleans up temporary files
    return boundaries_info, downbeat_times
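# The label-to-part pairing in parts() is easier to see on toy data; a
# minimal sketch of the same logic with made-up boundaries and labels:
toy_boundaries = [0.0, 12.5, 30.0, 45.0]
toy_labels = [0, 1, 0]
toy_parts = ["A", "B"]

toy_map = {
    list(set(toy_labels))[i]: toy_parts[i % len(toy_parts)]
    for i in range(len(set(toy_labels)))
}
toy_info = {k: [] for k in toy_map.values()}
for label, (v_min, v_max) in zip(toy_labels,
                                 zip(toy_boundaries[:-1], toy_boundaries[1:])):
    toy_info[toy_map[label]].append((v_min, v_max))
print(toy_info)  # e.g. {'A': [(0.0, 12.5), (30.0, 45.0)], 'B': [(12.5, 30.0)]}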
import os.path as op

import numpy as np
from scipy.io import wavfile

import msaf

# MIN_SEG_LEN (in seconds) is a module-level constant defined elsewhere.


def main(wavlist, outdir):
    for f in wavlist:
        if f.endswith('.wav'):
            print("processing {}".format(f))
            boundaries, labels = msaf.process(f, boundaries_id="sf",
                                              labels_id="vmo", feature="mfcc")
            labels = [int(l) for l in labels]

            # Conglomerate boundaries by label: drop a boundary whenever the
            # label does not change across it.
            new_boundaries = [boundaries[0]]
            new_labels = [labels[0]]
            for i in range(1, len(labels)):
                if labels[i] == labels[i - 1]:
                    continue
                new_boundaries.append(boundaries[i])
                new_labels.append(labels[i])
            boundaries = new_boundaries
            labels = new_labels

            # Read the wav file and slice out the segments.
            rate, data = wavfile.read(f)
            segments = []
            for b_ind in range(1, len(boundaries)):
                seg_start = int(np.round(rate * boundaries[b_ind - 1]))
                seg_end = int(np.round(rate * boundaries[b_ind]))
                segments.append(data[seg_start:seg_end])
            assert len(segments) + 1 == len(boundaries) == len(labels)

            # Merge segments shorter than MIN_SEG_LEN into the previous one.
            new_segments = [segments[0]]
            new_boundaries = [boundaries[0]]
            new_labels = [labels[0]]
            for i in range(1, len(segments)):
                seg = segments[i]
                if len(seg) < rate * MIN_SEG_LEN:
                    # Too short: append onto the most recently kept segment.
                    new_segments[-1] = np.concatenate((new_segments[-1], seg),
                                                      axis=0)
                else:
                    new_segments.append(seg)
                    new_labels.append(labels[i])
                    new_boundaries.append(boundaries[i])
            if len(new_segments[0]) < rate * MIN_SEG_LEN:
                # The first segment itself is too short: fold it forward.
                new_segments[1] = np.concatenate((new_segments[0],
                                                  new_segments[1]), axis=0)
                new_boundaries[1] = new_boundaries[0]
                new_segments = new_segments[1:]
                new_boundaries = new_boundaries[1:]
                new_labels = new_labels[1:]
            segments = new_segments
            boundaries = new_boundaries
            labels = new_labels

            for i in range(len(segments)):
                outfilename = "{}-clip-{}-label-{}.wav".format(
                    op.splitext(op.basename(f))[0], i, labels[i])
                outpath = op.join(outdir, outfilename)
                wavfile.write(outpath, rate, segments[i])
                print("{} created.".format(outfilename))
# Assumes module-level imports: string, msaf, and pypianoroll's Multitrack;
# MIN_SEGMENT_LEN (seconds) and the get_*_path helpers are project-specific.
def segment_song(msd_id):
    boundaries, labels = msaf.process(get_synthesized_path(msd_id),
                                      boundaries_id="olda",
                                      labels_id="scluster",
                                      feature="mfcc")

    # Merge short segments: keep a boundary only if the segment it closes
    # is longer than MIN_SEGMENT_LEN seconds.
    new_boundaries = [boundaries[0]]
    new_labels = [labels[0]]
    for i in range(1, len(boundaries)):
        if (boundaries[i] - boundaries[i - 1]) > MIN_SEGMENT_LEN:
            new_boundaries.append(boundaries[i])
            new_labels.append(labels[i])
    boundaries = new_boundaries
    labels = new_labels

    # Calculate tick values for segments.
    mt_midi = Multitrack(get_npz_path(msd_id))
    tempo = mt_midi.tempo[0]
    beat_resolution = mt_midi.beat_resolution
    # beats/min * ticks/beat * min/sec = ticks/sec
    ticks_per_second = (tempo * mt_midi.beat_resolution) // 60

    def get_nearest(num, nearest="downbeat"):
        # Snap a tick count to the nearest beat or downbeat (4 beats).
        if nearest == "beat":
            multiple = beat_resolution
        elif nearest == "downbeat":
            multiple = beat_resolution * 4
        else:
            raise ValueError(
                "Argument to get_nearest should be either 'beat' or 'downbeat'")
        factor = num // multiple
        remainder = num % multiple
        if remainder < (multiple // 2):
            ndb = multiple * factor
        else:
            ndb = multiple * (factor + 1)
        return ndb

    # Snap boundaries that start a new section to downbeats, others to beats.
    prev_l = -1
    tick_boundaries = []
    for b, l in zip(boundaries, labels):
        if l != prev_l:
            tick_boundaries.append(get_nearest(b * ticks_per_second,
                                               "downbeat"))
        else:
            tick_boundaries.append(get_nearest(b * ticks_per_second, "beat"))
        prev_l = l
    second_boundaries = [tb / ticks_per_second for tb in tick_boundaries]

    # Rename labels to letters in order of first appearance (A, B, C, ...).
    letter_labels = []
    label_map = dict()
    for label in labels:
        if label not in label_map:
            label_map[label] = string.ascii_uppercase[len(label_map)]
        letter_labels.append(label_map[label])

    return second_boundaries, tick_boundaries, letter_labels
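# The get_nearest helper above is plain snap-to-grid arithmetic; a minimal
# standalone sketch of the same rounding rule (the round_to_multiple name is
# mine, not from the original code):
def round_to_multiple(num, multiple):
    # Same arithmetic as get_nearest: snap down if the remainder is in the
    # lower half of the interval, otherwise snap up.
    factor, remainder = divmod(num, multiple)
    return multiple * factor if remainder < multiple // 2 else multiple * (factor + 1)

# With beat_resolution = 24, a downbeat spans 24 * 4 = 96 ticks:
assert round_to_multiple(130, 96) == 96    # remainder 34 < 48, snap down
assert round_to_multiple(150, 96) == 192   # remainder 54 >= 48, snap up
assert round_to_multiple(130, 24) == 120   # nearest beat instead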
def process_boundaries(path_to_read):
    try:
        boundaries, labels = msaf.process(path_to_read, n_jobs=1,
                                          boundaries_id="scluster",
                                          labels_id="scluster")
    except ValueError:
        boundaries = []
        labels = []
        print('Perhaps path error: %s' % path_to_read)
    print('boundary and label: done: %s' % path_to_read)
    return (boundaries, labels)
def parts(audio_file):
    boundaries, labels = msaf.process(audio_file, boundaries_id="foote",
                                      labels_id="fmc2d")
    parts_names = ["A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L"]
    labels2ids = {
        list(set(labels))[i]: parts_names[i % len(parts_names)]
        for i in range(len(set(labels)))
    }
    boundaries_info = {k: [] for k in labels2ids.values()}
    l_index = 0
    for v_min, v_max in zip(boundaries[:-1], boundaries[1:]):
        boundaries_info[labels2ids[labels[l_index]]].append((v_min, v_max))
        l_index += 1
    clean()
    return boundaries_info
def getPerformanceData(audioFile):
    # Load the audio file with a sampling rate of 44100 Hz
    x, fs = librosa.load(audioFile, sr=44100)
    print("File '" + audioFile + "' loaded.")

    # Calculate the duration of the audio file (in deciseconds)
    duration = int(10 * librosa.get_duration(y=x, sr=fs))

    # Calculate the emotion of the audio file and get the associated colors
    print("Calculating emotion data...")
    colors = groupColor(getVApair(audioFile))

    # Get the percussive elements of the audio file
    print("Extracting percussive elements...")
    xPercussive = librosa.effects.percussive(x, margin=3.0)
    # xPercussive = x

    # Get the beats of the audio file
    print("Detecting beats...")
    tempo, beats = librosa.beat.beat_track(y=xPercussive, sr=44100)
    beatSampleTimes = librosa.frames_to_time(beats, sr=fs)
    beatDsTimes = [int(10 * round(b, 1)) for b in beatSampleTimes]

    # Get the onsets of the audio file
    print("Detecting onsets...")
    onsets = librosa.onset.onset_detect(y=xPercussive, sr=44100)
    onsetSampleTimes = librosa.frames_to_time(onsets, sr=fs)
    onsetDsTimes = [int(10 * round(o, 1)) for o in onsetSampleTimes]

    # Get the segment boundaries
    print("Segmenting audio file...")
    boundaries, labels = msaf.process(audioFile, boundaries_id="sf")

    # Clean the boundaries to remove redundant segments
    boundariesDs = cleanSegments([int(10 * round(f, 1)) for f in boundaries],
                                 duration)

    # Store the results of the segmentation in segments.txt for faster
    # retrieval during later segmentations
    outFile = 'segments.txt'
    print('Saving output to %s' % outFile)
    msaf.io.write_mirex(boundaries, labels, outFile)

    # Return a dictionary of the relevant performance data
    performanceData = {
        "colors": colors,
        "waveValues": x,
        "duration": duration,
        "beats": beatDsTimes,
        "onsets": onsetDsTimes,
        "boundaries": boundariesDs
    }
    return performanceData
def get_boundaries(audio_file: str,
                   sample_rate: int,
                   boundary_detection_id: str = "olda",
                   label_detection_id: str = "scluster"):
    '''
    Parses labels and sections to generate a mapping of labels to actual
    sections and a measure of how percussive each section is.

    Parameters
    ----------
    audio_file:
        Filename of the desired audio file.
    sample_rate:
        Sample rate of the audio file.
    boundary_detection_id:
        Algorithm for boundary detection. Other algorithms are documented
        here: https://github.com/urinieto/msaf/blob/master/examples/Run%20MSAF.ipynb
    label_detection_id:
        Algorithm for label detection. Other algorithms are documented
        here: https://github.com/urinieto/msaf/blob/master/examples/Run%20MSAF.ipynb

    Returns
    -------
    sections:
        A list of tuples designating the start/end of each section, in
        samples.
    dirty_labels:
        A list of integers corresponding to the label each section has been
        assigned (e.g. a song structure of ABCAB could have a list of
        [0, 1, 2, 0, 1]).
    '''
    bounds, labels = msaf.process(audio_file,
                                  boundaries_id=boundary_detection_id,
                                  labels_id=label_detection_id,
                                  out_sr=sample_rate)
    # Convert boundary times (seconds) to (start, end) pairs in samples.
    sections = np.array([(int(bounds[i] * sample_rate),
                          int(bounds[i + 1] * sample_rate))
                         for i in range(len(bounds) - 1)])
    # Keep only sections longer than one second.
    boolarr = [end - start > sample_rate for start, end in sections]
    sections = sections[boolarr]
    labels = np.array(labels)[boolarr]
    dirty_labels = [int(i) for i in labels]
    return sections, dirty_labels
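# A usage sketch for get_boundaries above, assuming a hypothetical 44.1 kHz
# file "song.wav"; sections come back in samples, so divide by the sample
# rate for seconds:
sections, dirty_labels = get_boundaries("song.wav", sample_rate=44100)
for (start, end), label in zip(sections, dirty_labels):
    print("label %d: %.2f s - %.2f s" % (label, start / 44100, end / 44100))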
def main(argv):
    inputfile = ''
    outputfile = ''
    try:
        opts, args = getopt.getopt(argv, "hi:o:", ["ifile=", "ofile="])
    except getopt.GetoptError:
        print('run_msaf.py -i <inputfile> -o <outputfile>')
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print('run_msaf.py -i <inputfile> -o <outputfile>')
            sys.exit()
        elif opt in ("-i", "--ifile"):
            inputfile = arg
        elif opt in ("-o", "--ofile"):
            outputfile = arg
    print('Input file is "%s"' % inputfile)
    print('Output file is "%s"' % outputfile)

    boundaries, labels = msaf.process(inputfile, labels_id='scluster')
    print('Estimated boundaries:', boundaries)
    print('Estimated labels:', labels)
    msaf.io.write_mirex(boundaries, labels, outputfile)
labels_path = root + "labels.txt"
input_audio = audio_path + "testb.wav"

# Algorithm-specific parameters forwarded to MSAF.
config = {
    "dirichlet": True,
    "xmeans": True,
    "k": 6,
    "M_gaussian": 16,
    "m_embedded": 3,
    "k_nearest": 0.06,
    "Mp_adaptive": 24,
    "offset_thres": 0.04
}

est_times, est_labels = msaf.process(input_audio, feature="hpcp",
                                     boundaries_id="sf", labels_id="fmc2d",
                                     config=config)

print('writing data')
# Write one "time name" line per boundary, naming labels with random words.
labels_file = open(labels_path, "w")
data = zip(est_times, est_labels)
names = [random.choice(words) for i in range(len(est_labels))]
for i in data:
    labels_file.write("%s %s\n" % (i[0], names[int(i[-1])]))
labels_file.close()
def main(args, callback=log_progress()):
    if isinstance(callback, types.GeneratorType):
        next(callback)
    start = time.time()
    print('Analyzing music %s...' % args.input)
    if not os.path.exists(args.input):
        raise FileNotFoundError
    elif not os.path.isdir(args.data) and not args.data[-4:] == '.csv':
        if not os.path.exists(args.data):
            raise FileNotFoundError
        else:
            raise Exception('The data path must either be a .csv file or a folder')

    # 1. Get major changes in music
    callback.send('(1/3) Identifying significant rhythm changes in music...\n'
                  ' This will take about a minute.')
    filterwarnings('ignore')
    boundaries, labels = msaf.process(args.input, boundaries_id='olda')
    if boundaries[-1] < 60 or boundaries[-1] > 400:
        callback.send('Error : Please choose a music lasting between 60 and '
                      '400 seconds for getting a quality MV.')
        return -1
    callback.send('Key changes found at \n(%s) seconds\n'
                  % ' , '.join(map('{:.2f}'.format, boundaries)))

    if args.data[-4:] == '.csv':
        # 2. Find music genre and style (music video style = larger category of genre)
        musicGenre = args.genre
        musicStyle = ''
        if musicGenre == '':
            # No genre given, must find it
            title, artist, musicGenre, musicStyle = get_music_infos(args.input)
            if musicStyle == '':
                callback.send('GenreError : The algorithm did not manage to recognize the music genre.\n'
                              'Please try with another music, or manually add genre with the argument --genre <name of genre> \n'
                              'with genre in (' + ','.join(AUTHORIZED_GENRES) + ').')
                return -1
            callback.send('Music genre identified : %s.' % musicGenre)
        else:
            musicStyle = convert_genre_to_style(musicGenre)
            if musicStyle == '':
                callback.send('GenreError : This genre is not authorized. Please input one of the following (' +
                              ','.join(AUTHORIZED_GENRES) + ') or let the algorithm find the genre.')
                return -1

        # 3. With the music genre, find appropriate videos in database
        callback.send('(2/3) Fetching matching videos in database...\n')
        # Use the k-means clustering result on scenes extracted from music
        # videos with the same genre, and choose one resolution.
        resolution = random.random()
        if resolution < RESOLUTION_PROBABILITY:
            resolution = '40'
        else:
            resolution = '16'
        clusterResult = pd.read_csv('/home/sarah/YoutubeMVGenerator/statistics/kmeans_' + resolution + '_' + musicStyle + '.csv')
    else:
        # Use k-means clustering on scenes extracted from the given database.
        listFiles = list_scenes(args.data, 'json')
        callback.send('(2/3) Generating K-Means for the database...')
        clusterResult = compute_kmeans(listFiles)

    # 4. Join music scenes while respecting the clustering and the input music rhythm
    callback.send('(3/3) Building the music video around these boundaries...\n'
                  ' This won\'t take long.\n')

    # Select and order videos for the music clip
    tempDir = tempfile.mkdtemp('_music_video_build') + '/'
    print("Building the video file in folder %s" % tempDir)
    assemble_videos(clusterResult, boundaries, tempDir)

    # Concatenate videos
    subprocess.call(['ffmpeg', '-y', '-loglevel', 'error', '-f', 'concat',
                     '-safe', '0', '-i', 'video_structure.txt', '-c', 'copy',
                     '-an', tempDir + 'temp_video.MTS'])

    # Put the input music on top of the resulting video
    extension = os.path.splitext(args.output)[1]
    if extension != '.avi' and extension != '.mkv':
        args.output = os.path.splitext(args.output)[0] + '.mp4'
        if extension != '.mp4':
            print('No format within (avi,mkv,mp4) given.\nUsing default mp4 ...')
    # Copies the video stream and replaces the audio of arg 0 by arg 1
    subprocess.call(['ffmpeg', '-y', '-loglevel', 'error',
                     '-i', tempDir + 'temp_video.MTS', '-i', args.input,
                     '-c:v', 'copy', '-map', '0:v:0', '-map', '1:a:0',
                     args.output])
    print('Video file %s written.\n' % args.output)
    callback.send('--- Finished building the music video in %f seconds. ---'
                  % (time.time() - start))

    # Delete temp files
    shutil.rmtree(tempDir)

    # Copy the video to the generated folder
    if os.path.exists('generatedmvs'):
        shutil.copyfile(args.output,
                        'generatedmvs/' + time.strftime('%Y-%m-%d_%H-%M-%S',
                                                        time.gmtime()) + '.mp4')

    if callback is not None and isinstance(callback, types.GeneratorType):
        # Close the generator
        callback.close()
def line_align(songs, dump_dir, boundary_algorithm='olda',
               label_algorithm='fmc2d', do_twinnet=False):
    """
    Aligns given audio with lyrics by line. If dump_dir is None, no
    timestamp yml is created.

    :param songs: Song metadata in dict with keys 'song', 'artist', 'path' and \
        'genre'. Key 'path' is audio file path. Key 'genre' optional.
    :type songs: list[dict{}] | dict{}
    :param dump_dir: Directory to store timestamp ymls.
    :type dump_dir: file-like | None
    :param boundary_algorithm: Segmentation algorithm for MSAF.
    :type boundary_algorithm: str
    :param label_algorithm: Labelling algorithm for MSAF.
    :type label_algorithm: str
    :param do_twinnet: Flag for performing vocal isolation.
    :type do_twinnet: bool
    :return align_data: List of alignment data. See below for formatting.
    :rtype: list[dict{}]
    """

    logging.info('Beginning alignment...')

    if isinstance(songs, dict):
        songs = [songs]

    # Module initializations
    snd = SND(silencedb=-15)
    sc = SyllableCounter()

    # Perform MaD TwinNet in one batch
    if do_twinnet:
        paths = [song['path'] for song in songs]
        twinnet.twinnet_process(paths)
    else:
        # logging.info('Skipping MaD TwinNet')
        print('Performing source separation using spleeter..')
        audio_path = songs[0]['path']
        destination = os.path.splitext(audio_path)[0]
        if not os.path.exists(destination):
            separator = Separator('spleeter:2stems')
            separator.separate_to_file(audio_descriptor=audio_path,
                                       destination=destination)

    total_align_data = []

    for song in songs:
        logging.info('Processing {} by {}'.format(song['song'],
                                                  song['artist']))
        start_time = time.time()

        # Get file names
        mixed_path = song['path']
        voice_path = os.path.splitext(song['path'])[0] + '_voice.wav'
        if not do_twinnet:
            voice_path = os.path.join(destination, 'vocals.wav')

        # Get lyrics from Genius
        lyrics = get_lyrics(song['song'], song['artist'])

        # Get syllable count from lyrics
        formatted_lyrics = sc.build_lyrics(lyrics)
        syl_lyrics = sc.get_syllable_count_lyrics(formatted_lyrics)
        sc_syllables = sc.get_syllable_count_per_section(syl_lyrics)

        # Get syllable count from SND
        snd_syllables = snd.run(voice_path)

        # Structural segmentation analysis on original audio
        sections, labels = msaf.process(mixed_path,
                                        boundaries_id=boundary_algorithm,
                                        labels_id=label_algorithm)

        # Save instrumental section indices
        instrumentals = []

        # Get SND counts, densities per label
        max_count = 0
        labels_density = {}
        i_s = 0
        for i, section in enumerate(zip(labels, sections[:-1], sections[1:])):
            count = 0
            while i_s < len(snd_syllables) and snd_syllables[i_s] < section[2]:
                count += 1
                i_s += 1
            max_count = max(max_count, count)

            duration = section[2] - section[1]
            density = count / duration

            # TODO: Improve instrumental categorization
            if density < 0.4:
                instrumentals.append(i)
            else:
                if section[0] not in labels_density:
                    labels_density[section[0]] = [[], []]
                labels_density[section[0]][0].append(count)
                labels_density[section[0]][1].append(density)

            # if section[0] not in labels_density:
            #     labels_density[section[0]] = [[], []]
            # labels_density[section[0]][0].append(count)
            # labels_density[section[0]][1].append(density)

        # Normalize SND syllable counts
        for label in labels_density:
            labels_density[label][0] = [
                count / max_count for count in labels_density[label][0]
            ]

        # Normalize SSA syllable counts
        gt_max_syl = max(section[1] for section in sc_syllables)
        gt_chorus_syl = mean(section[1] / gt_max_syl
                             for section in sc_syllables
                             if section[0] == 'chorus')

        # Find label most similar to chorus
        min_label = labels[0]
        min_distance = float('inf')
        for label in labels_density:
            if len(labels_density[label][0]) < 2:
                continue
            # TODO: Fix distance scales
            mean_syl = mean(labels_density[label][0])
            std_den = stdev(labels_density[label][1])
            distance = sqrt(((mean_syl - gt_chorus_syl) / gt_chorus_syl)**2 +
                            std_den**2)
            if distance < min_distance:
                min_distance = distance
                min_label = label

        # Relabel
        relabels = [''] * len(labels)
        temp = defaultdict(list)
        for i, label in enumerate(labels):
            temp[label].append(i)
        for label in temp:
            for i in temp[label]:
                if i in instrumentals:
                    continue
                elif label == min_label:
                    relabels[i] = 'chorus'
                elif len(temp[label]) > 1:
                    relabels[i] = 'verse'
                else:
                    relabels[i] = 'other'
        del temp
        relabels = [label for label in relabels if label]

        if not relabels:
            logging.error('Whole song tagged as instrumental! Skipping...')
            continue

        # Calculate accumulated error matrix
        dp = [[-1 for j in range(len(relabels))]
              for i in range(len(sc_syllables))]
        for i in range(len(sc_syllables)):
            for j in range(len(relabels)):
                dp[i][j] = dp_err_matrix[sc_syllables[i][0]][relabels[j]]
                if i == 0 and j == 0:
                    pass
                elif i == 0:
                    dp[i][j] += dp[i][j - 1]
                elif j == 0:
                    dp[i][j] += dp[i - 1][j]
                else:
                    dp[i][j] += min(dp[i - 1][j], dp[i][j - 1],
                                    dp[i - 1][j - 1])

        # Backtrack
        i, j = len(sc_syllables) - 1, len(relabels) - 1
        path = []
        while True:
            path.append((i, j))
            if (i, j) == (0, 0):
                break
            elif i == 0:
                j -= 1
            elif j == 0:
                i -= 1
            else:
                min_dir = min(dp[i - 1][j], dp[i][j - 1], dp[i - 1][j - 1])
                if dp[i - 1][j] == min_dir:
                    i -= 1
                elif dp[i][j - 1] == min_dir:
                    j -= 1
                else:
                    i -= 1
                    j -= 1
        path.reverse()

        # Process alignment and write to file
        alignment = [[] for i in range(len(labels))]
        for i in instrumentals:
            alignment[i].append('instrumental')

        section_id = 0
        j_prev = 0
        for (i, j) in path:
            if j != j_prev:
                section_id += 1
                j_prev = j
            while 'instrumental' in alignment[section_id]:
                section_id += 1
            alignment[section_id].append(i)

        end_time = time.time()

        align_data = {
            'song': song['song'],
            'artist': song['artist'],
            'process time': end_time - start_time,
            'duration': round((sections[-1] - sections[0]).item(), 2),
            'align': []
        }
        if 'genre' in song:
            align_data['genre'] = song['genre']

        cur_lyric_section = -1
        for i, section in enumerate(alignment):
            for n, lyric_section in enumerate(section):
                if lyric_section != cur_lyric_section:
                    break_point = round(
                        (sections[i] + n * (sections[i + 1] - sections[i]) /
                         len(section)).item(), 2)
                    if cur_lyric_section != 'instrumental' and align_data['align']:
                        align_data['align'][-1]['end'] = break_point
                    if lyric_section != 'instrumental':
                        align_data['align'].append({
                            'label': sc_syllables[lyric_section][0],
                            'syllables': sc_syllables[lyric_section][1],
                            'start': break_point,
                            'lines': []
                        })
                    cur_lyric_section = lyric_section
        if 'end' not in align_data['align'][-1]:
            align_data['align'][-1]['end'] = break_point

        for i, section in enumerate(align_data['align']):
            duration = section['end'] - section['start']
            line_start = section['start']
            for j, line in enumerate(formatted_lyrics[i][1]):
                line_text = ' '.join(line)
                line_syls = sum(syl_lyrics[i][1][j])
                line_duration = (line_syls /
                                 align_data['align'][i]['syllables'] * duration)
                align_data['align'][i]['lines'].append({
                    'end': line_start + line_duration,
                    'text': line_text
                })
                line_start += line_duration

        if dump_dir is not None:
            file_name = '{}_{}.yml'.format(song['artist'],
                                           song['song']).replace(' ', '')
            file_path = os.path.join(dump_dir, file_name)
            with open(file_path, 'w') as f:
                yaml.dump(align_data, f, default_flow_style=False)

        total_align_data.append(align_data)

    return total_align_data
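# The accumulated-error matrix and backtrack in line_align are a standard
# monotonic alignment; a toy sketch of the same scheme on a made-up cost
# table (the names and values here are illustrative, not from the original):
cost = [
    [0, 2, 3],
    [2, 0, 1],
    [3, 1, 0],
]
n, m = len(cost), len(cost[0])

# Accumulate: each cell adds the cheapest of its top/left/diagonal neighbors.
dp = [[0] * m for _ in range(n)]
for i in range(n):
    for j in range(m):
        dp[i][j] = cost[i][j]
        if i == 0 and j == 0:
            pass
        elif i == 0:
            dp[i][j] += dp[i][j - 1]
        elif j == 0:
            dp[i][j] += dp[i - 1][j]
        else:
            dp[i][j] += min(dp[i - 1][j], dp[i][j - 1], dp[i - 1][j - 1])

# Backtrack from the bottom-right corner along the cheapest neighbors.
i, j, path = n - 1, m - 1, []
while True:
    path.append((i, j))
    if (i, j) == (0, 0):
        break
    if i == 0:
        j -= 1
    elif j == 0:
        i -= 1
    else:
        best = min(dp[i - 1][j], dp[i][j - 1], dp[i - 1][j - 1])
        if dp[i - 1][j] == best:
            i -= 1
        elif dp[i][j - 1] == best:
            j -= 1
        else:
            i, j = i - 1, j - 1
path.reverse()
print(path)  # [(0, 0), (1, 1), (2, 2)] for this diagonal-friendly cost table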
# chorus_start_sec = find_and_output_chorus("audio/foo.wav", "chorus.wav", 15)
num_samples = chroma.shape[1]
time_time_similarity = TimeTimeSimilarityMatrix(chroma, sr)
time_lag_similarity = TimeLagSimilarityMatrix(chroma, sr)
# time_time_similarity.display()

print(msaf.get_all_label_algorithms())
print(msaf.get_all_boundary_algorithms())

# Novelty-based segmentation: uses the Foote (checkerboard kernel) method
# of segmenting songs.
# plot = True
boundaries, labels = msaf.process("audio/" + file_name,
                                  feature="mfcc",
                                  boundaries_id="foote",
                                  labels_id="fmc2d",
                                  out_sr=sr)
# audio = librosa.load(sonified_file, sr=sr)[0]

new_boundaries = []
new_labels = []
segment_nums = []
mfccs = []
idx = 0
for x in range(len(boundaries) - 1):
    if boundaries[x + 1] - boundaries[x] >= 3:
        print("segment found at {0:g} min {1:.2f} sec".format(
            boundaries[x] // 60, boundaries[x] % 60))
        segment_wav_data = song_wav_data[int(boundaries[x] *
#
# Use MSAF to analyze the structure of a song.
#
import msaf
msaf.config.dataset.audio_dir = "."
import librosa

# Project-specific beat tracker with track metadata.
beat = tracker.Beat()
beat.load_metadata('track_info')
trackids = beat.track_info.trackid

# Process a whole collection of wav files in one call.
collection = data_dir + "/audio/tmp_wav_set/"
results = msaf.process(collection, n_jobs=1, boundaries_id="foote",
                       feature='pcp')

# Process a single track with default parameters.
trackid = 6
track_audiopath = (beat.audio_orig_dir +
                   beat.track_info.loc[beat.track_info.trackid == trackid]
                   ["filename_track"].iloc[0] + ".wav")
boundaries, labels = msaf.process(track_audiopath)

# Re-run with the boundaries sonified into a wav file.
sonified_file = "my_boundaries.wav"
sr = 44100
boundaries, labels = msaf.process(track_audiopath, sonify_bounds=True,
                                  out_bounds=sonified_file, out_sr=sr)

melids = np.unique(beat.sections.melid.loc[beat.sections.trackid == trackid])
def extract_segments(song_name):
    file_format = ".wav"
    audio_file = song_name + file_format

    # 2. Segment the file using the default MSAF parameters
    #    (this might take a few seconds)
    boundaries, labels = msaf.process(audio_file, boundaries_id="olda",
                                      labels_id="scluster")
    labels = [int(i) for i in labels]

    # Grow the first segment until it is at least 5 seconds long, recording
    # which labels it consumes.
    segCount = 0
    currIndex = 0
    firstSegLen = 0
    usedLabels = [0] * 5
    while firstSegLen < 5:
        lbound = boundaries[currIndex]
        rbound = boundaries[currIndex + 1]
        segLen = rbound - lbound
        firstSegLen += segLen
        firstRt = rbound
        currIndex += 1
        usedLabels[labels[currIndex]] = 1

    segmentList = []
    segmentList.append((0, firstRt))

    # Pick one segment (longer than 15 seconds) per label not yet used.
    while currIndex < len(labels):
        if usedLabels[labels[currIndex]] == 0:
            lbound = boundaries[currIndex]
            rbound = boundaries[currIndex + 1]
            segLen = rbound - lbound
            if segLen > 15:
                segmentList.append((lbound, rbound))
            currIndex += 1
        else:
            if currIndex != len(labels) - 1:
                if labels[currIndex] == labels[currIndex + 1]:
                    rbound = boundaries[currIndex + 2]
                    segLen = rbound - lbound
                    if segLen > 15:
                        segmentList.append((lbound, rbound))
                    currIndex += 2
            currIndex += 1

    # Track beats and keep every fourth beat (approximate downbeats).
    y, sr = librosa.load(audio_file)
    tempo, beats = librosa.beat.beat_track(y=y, sr=sr)
    beatsList = librosa.frames_to_time(beats, sr=sr)
    fourBeatsList = beatsList[3::4]
    beatDist = fourBeatsList[1] - fourBeatsList[0]
    # Extrapolate the four-beat grid past the end of the tracked beats.
    for i in range(100):
        fourBeatsList = np.append(fourBeatsList, fourBeatsList[-1] + beatDist)

    sound = AudioSegment.from_file(audio_file)
    totalLen = 0
    for i in range(len(segmentList)):
        totalLen += segmentList[i][1] - segmentList[i][0]
    songLen = len(sound) / 1000.0
    if totalLen <= 35:
        segmentList.append((songLen - 23, songLen - 3))

    def closest(lst, K):
        # Return the element of lst nearest to K.
        return lst[min(range(len(lst)), key=lambda i: abs(lst[i] - K))]

    # Snap every segment edge to the four-beat grid.
    newSegmentList = []
    for i in range(len(segmentList)):
        start = segmentList[i][0]
        end = segmentList[i][1]
        newStart = closest(fourBeatsList, start)
        newEnd = closest(fourBeatsList, end)
        if i == 0:
            newStart = 0
        newSegmentList.append((newStart, newEnd))

    # Crossfade segments together until about 55 seconds of audio is built.
    mixedFile = sound[newSegmentList[0][0] * 1000:newSegmentList[0][1] * 1000]
    finalSegmentList = []
    totalTime = newSegmentList[0][1] - newSegmentList[0][0]
    finalSegmentList.append(newSegmentList[0])
    currIter = 1
    while totalTime <= 55:
        if currIter == len(newSegmentList):
            break
        totalTime += newSegmentList[currIter][1] - newSegmentList[currIter][0]
        finalSegmentList.append(newSegmentList[currIter])
        currSound = sound[newSegmentList[currIter][0] * 1000:
                          newSegmentList[currIter][1] * 1000]
        mixedFile = mixedFile.append(currSound, crossfade=beatDist * 1000)
        currIter += 1

    # Always close with the last 5 seconds of the song.
    lastStart = len(sound) / 1000.0 - 5
    lastEnd = len(sound) / 1000.0
    lastStart = closest(fourBeatsList, lastStart)
    finalSegmentList.append((lastStart, lastEnd))
    lastEnd = closest(fourBeatsList, lastEnd)
    lastSound = sound[lastStart * 1000:]
    mixedFile = mixedFile.append(lastSound, crossfade=beatDist * 1000)

    mixedFile.export(song_name + '_segmented.mp3', format="mp3")
    print("Final Audio Segments Obtained: ")
    print(finalSegmentList)
    return (mixedFile, finalSegmentList, beatDist)
def segment(fileLocation, toLocation, songLocation, songToLocation):
    buff = 2500  # padding around each boundary, in milliseconds

    print("Processing instrumental list...")
    for path, dirs, files in os.walk(fileLocation):
        for filename in files:
            if filename.endswith(".mp3"):
                # Convert mp3 to wav first.
                sound = AudioSegment.from_mp3(os.path.join(path, filename))
                filename = os.path.splitext(filename)[0]
                filename = filename + ".wav"
                sound.export(os.path.join(toLocation, filename), format="wav")
            if not filename.endswith(".wav"):
                print("Please check your audio file type: " + filename)
                continue
            audio_file = os.path.join(path, filename)
            song = AudioSegment.from_wav(audio_file)
            print('Segment ' + audio_file)

            # Segment the file using default MSAF parameters
            boundaries, labels = msaf.process(audio_file)
            print(boundaries)
            # Remember the boundaries so the matching vocal track can be
            # split at the same points.
            songBoundary[filename[:filename.rfind('(inst')]] = boundaries

            '''
            Using unit in milliseconds (ten_seconds = 10 * 1000)
            first_10_seconds = song[:ten_seconds]
            last_5_seconds = song[-5000:]
            '''
            segments = list()
            boundaries *= 1000  # seconds -> milliseconds (in place)
            for index in range(1, len(boundaries)):
                if index == 1 or index == len(boundaries) - 2:
                    continue
                elif index == 2 or index == len(boundaries) - 1:
                    segments.append(song[max(0, boundaries[index - 2] - buff):
                                         min(boundaries[len(boundaries) - 1],
                                             boundaries[index] + buff)])
                else:
                    segments.append(song[boundaries[index - 1] - buff:
                                         boundaries[index] + buff])
            for index in range(len(segments)):
                output = (filename[:filename.rfind('.')] + '_' +
                          str(index + 1) + filename[filename.rfind('.'):])
                out_format = filename.split('.')[-1]
                segments[index].export(os.path.join(toLocation, output),
                                       format=out_format)

    print("Processing vocal list...")
    for path, dirs, files in os.walk(songLocation):
        for filename in files:
            if filename.endswith(".mp3"):
                sound = AudioSegment.from_mp3(os.path.join(path, filename))
                filename = os.path.splitext(filename)[0]
                filename = filename + ".wav"
                sound.export(os.path.join(songToLocation, filename),
                             format="wav")
            if not filename.endswith(".wav"):
                print("Please check your audio file type: " + filename)
                continue
            audio_file = os.path.join(path, filename)
            song = AudioSegment.from_wav(audio_file)
            print('Segment ' + audio_file)

            segments = list()
            # Reuse the boundaries computed for the matching instrumental.
            if filename.rfind('(vocal') != -1:
                boundaries = songBoundary[filename[:filename.rfind('(vocal')]]
            else:
                boundaries = songBoundary[filename[:filename.rfind('.')]]
            print(boundaries)
            for index in range(1, len(boundaries)):
                if index == 1 or index == len(boundaries) - 2:
                    continue
                elif index == 2 or index == len(boundaries) - 1:
                    segments.append(song[max(0, boundaries[index - 2] - buff):
                                         min(boundaries[len(boundaries) - 1],
                                             boundaries[index] + buff)])
                else:
                    segments.append(song[boundaries[index - 1] - buff:
                                         boundaries[index] + buff])
            for index in range(len(segments)):
                output = (filename[:filename.rfind('.')] + '_' +
                          str(index + 1) + filename[filename.rfind('.'):])
                out_format = filename.split('.')[-1]
                segments[index].export(os.path.join(songToLocation, output),
                                       format=out_format)