def multiChorusSections(intvs, dur):
    """avoid intersection of tuned intervals"""

    def key(x):
        # timestamp with precision of 0.01s
        return int(x * 100)

    # boundary value: +1 = chorus begin, -1 = chorus end
    boundaries = defaultdict(int)
    for intv in intvs:
        boundaries[key(intv[0])] += 1
        boundaries[key(intv[1])] -= 1
    intervals, labels = [[0, 0]], [CLF_NON_TARGET_LABEL]
    state = 0  # 0: others, >0: chorus
    for bdr in sorted(boundaries.keys()):
        t = bdr / 100.0
        intervals[-1][1] = t
        intervals.append([t, 0])
        state += boundaries[bdr]
        if state == 0:
            labels.append(CLF_NON_TARGET_LABEL)
        elif state > 0:
            labels.append(CLF_TARGET_LABEL)
        else:
            logger.error(f"invalid state, boundaries={boundaries}")
    intervals[-1][1] = dur
    mirexFmt = (np.array(intervals), np.array(labels, dtype="U16"))
    logger.debug(f"multi chorus sections, output=\n{mirexLines(mirexFmt)}")
    return mergeIntervals(mirexFmt)
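
# A minimal usage sketch (hypothetical interval values): overlapping tuned
# intervals are resolved by the +1/-1 boundary sweep above, so two candidates
# that intersect collapse into a single labeled chorus section.
def _demoMultiChorusSections():
    intvs = [(10.0, 40.0), (35.0, 70.0)]  # two overlapping chorus candidates
    intervals, labels = multiChorusSections(intvs, dur=180.0)
    # expected sections: [0, 10] other, [10, 70] chorus, [70, 180] other
    print(intervals, labels)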
def buildRecurrence(cliques, times):
    logger.debug("build recurrence")
    cliques = deepcopy(cliques)
    size = len(times) - 1
    mergedCliquesList = [
        smoothCliques(
            mergeAdjacentCliques(cliques, dis=dis, dblock=dblock),
            size,
            kernel_size=kernelSize,
        )
        for dis in DELTA_DIS_RANGE
        for kernelSize in SMOOTH_KERNEL_SIZE_RANGE
        for dblock in [0, 1, 2]
    ]
    # mclen = len(mergedCliquesList)
    # for i in range(1):
    #     mergedCliquesList.extend(
    #         [
    #             smoothCliques(mergeAdjacentCliques(cs), size)
    #             for cs in mergedCliquesList[-mclen:]
    #         ]
    #     )
    errors = [error(cliques, ncs, size, times) for ncs in mergedCliquesList]
    indices = np.argsort(errors)
    for i in indices:
        newCliques = mergedCliquesList[i]
        if len(newCliques) >= MIN_STRUCTURE_COUNT:
            return newCliques
    logger.warning(
        f"seqrecur failed, cliqueLengths={[len(x) for x in mergedCliquesList]}"
    )
    return mergedCliquesList[0]
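
# Self-contained sketch of the selection pattern above: generate candidates
# over a parameter grid, score each, then return the lowest-error candidate
# that passes a structural predicate (toy values, hypothetical threshold).
def _demoCandidateSelection():
    candidates = ["a", "bb", "ccc", "dddd"]
    errors = [0.4, 0.1, 0.3, 0.2]
    minCount = 2  # stands in for MIN_STRUCTURE_COUNT
    for i in np.argsort(errors):
        if len(candidates[i]) >= minCount:
            print(candidates[i])  # "bb": lowest error among valid candidates
            break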
def SSL(self, wavPath, output, sr=SAMPLE_RATE):
    """<Semi-supervised learning using teacher-student models for vocal melody extraction>"""
    # the extractor writes "pitch_<wav basename>.txt" into the output directory,
    # so only the directory part of `output` is used
    dirname = os.path.dirname(output) + "/"
    output = f"pitch_{os.path.basename(wavPath)}.txt"
    output = os.path.join(dirname, output)
    commands = ("python", "./melodyExtraction_NS.py", "-p", wavPath, "-o", dirname)
    logger.debug(f"SSL commands={commands}")
    ret = subprocess.call(commands, cwd=ALGO_BASE_DIRS["SSL"])
    assert ret == 0, f"return value: {ret} != 0"
    times, pitches = load_time_series(output, delimiter=r"\s+|,")
    return {"times": times, "pitches": pitches}
def tuneIntervals(mirexFmt, mels_f, chorusDur, window):
    mirexFmt = removeNumber(mirexFmt)
    mirexFmt = mergeIntervals(mirexFmt)
    logger.debug(f"tune interval=\n{mirexLines(mirexFmt)}")
    dur = mirexFmt[0][-1][1]
    intvs = filterIntvs(mirexFmt, fun=CLF_TARGET_LABEL)
    tuneIntvs = []
    times, pitches = mels_f
    for intv in intvs:
        # snap each boundary to a melodic arousal point, then enforce a minimum
        # chorus duration clipped to the track length
        begin = arousalPoint(intv[0], times, pitches, window, True)
        end = arousalPoint(intv[1], times, pitches, window, False)
        end = min(dur, max(end, begin + chorusDur))
        if end - begin > MINIMUM_CHORUS_DUR:
            tuneIntvs.append((begin, end))
    return multiChorusSections(tuneIntvs, dur)
def feature2W(feature, size, aggregator, simFunction, wins_per_block=20, K=5):
    intervals = resize(feature, size)
    # feature[<dim>, <frame>] -> [<dim>, <interval number>], interval hop = frames // (size - 1)
    feature = librosa.util.sync(feature, intervals, aggregate=aggregator)
    # Xfeature[<interval number>, <dim> * <wins_per_block>]
    Xfeature = librosa.feature.stack_memory(
        feature, n_steps=wins_per_block, mode="edge"
    ).T
    Dfeature = simFunction(Xfeature, Xfeature)
    # Wfeature[<interval number>, <interval number>]
    Wfeature = getW(Dfeature, K)
    assert not np.isnan(np.sum(Wfeature)), f"invalid affinity, Dfeature={Dfeature}"
    logger.debug(
        f"shapes, feature{feature.shape} Xfeature{Xfeature.shape} Wfeature{Wfeature.shape}"
    )
    return Wfeature
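
# A minimal sketch (toy data, independent of the pipeline) of the delay
# embedding used above: stack_memory concatenates wins_per_block consecutive
# columns per frame, so the similarity function compares short feature
# sequences rather than single frames.
def _demoStackMemory():
    feature = np.random.rand(12, 100)  # [<dim>, <frame>], e.g. a chroma matrix
    Xfeature = librosa.feature.stack_memory(feature, n_steps=20, mode="edge").T
    print(Xfeature.shape)  # (100, 240): each row embeds 20 frames of 12 dims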
def error(origCliques, mergedCliques, size, times, show=False):
    olssm = getLabeledSSM(origCliques, size)
    mlssm = getLabeledSSM(mergedCliques, size)
    olssm[olssm > 0] = 1
    mlssm[mlssm > 0] = 1
    # false negative + false positive
    fnerr = np.sum((mlssm == 0) & olssm) / (np.sum(olssm) + EPSILON)
    fperr = np.sum(mlssm & (olssm == 0)) / (np.sum(olssm == 0) + EPSILON)
    err = fnerr + max(0, fperr - FALSE_POSITIVE_ERROR)
    logger.debug(f"errs={fnerr:.5f},{fperr:.5f} sum={err:.3f} len={len(mergedCliques)}")
    if show:
        x, xm = getLabeledSSM(origCliques, size), getLabeledSSM(mergedCliques, size)
        labels = [x[i, i] for i in range(size)]
        xm[xm > 0] = 10
        plt.imshow(x + xm)
        plt.plot(labels)
        plt.show()
    return err
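
# Self-contained example (toy binary SSMs, EPSILON replaced by a literal) of
# the two error terms above: fnerr is structure present in the original SSM
# but lost after merging; fperr is structure introduced by merging.
def _demoSSMError():
    olssm = np.array([[1, 1, 0], [1, 1, 0], [0, 0, 0]])
    mlssm = np.array([[1, 0, 0], [0, 1, 1], [0, 1, 1]])
    eps = 1e-9  # stands in for EPSILON
    fnerr = np.sum((mlssm == 0) & olssm) / (np.sum(olssm) + eps)
    fperr = np.sum(mlssm & (olssm == 0)) / (np.sum(olssm == 0) + eps)
    print(fnerr, fperr)  # 0.5 0.6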
def mergeAdjacentCliques(cliques, dis=ADJACENT_DELTA_DISTANCE, dblock=0):
    logger.debug(f"merge cliques, dis={dis} dblock={dblock}")
    size = len(cliques)
    # build adjacency lists, i < j: adjLists[j] = [..., i, ...]
    adjLists = [[] for i in range(size)]
    for i in range(size):
        for j in range(i + 1, size):
            if isAdjacent(cliques[i], cliques[j], dis=dis, dblock=dblock):
                adjLists[j].append(i)
    # merge cliques in the transitive closure
    # key: smallest clique label in the connected component
    # value: frame number list
    cliquesDic = defaultdict(list)
    labels = mergeFind(adjLists, size)
    for i in range(size):
        cliquesDic[labels[i]].extend(cliques[i])
    newCliques = list(cliquesDic.values())
    newCliques = sorted(newCliques, key=lambda c: c[0])
    return newCliques
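
# Hedged sketch of the merge step above, assuming mergeFind labels each clique
# with the smallest index in its connected component (the labels here are
# hypothetical). Cliques that share a label have their frames concatenated.
def _demoCliqueMerge():
    cliques = [[0, 1], [5, 6], [2, 3], [10, 11]]
    labels = [0, 1, 0, 1]  # hypothetical mergeFind output: 0~2 and 1~3 merged
    cliquesDic = defaultdict(list)
    for i, label in enumerate(labels):
        cliquesDic[label].extend(cliques[i])
    print(sorted(cliquesDic.values(), key=lambda c: c[0]))
    # [[0, 1, 2, 3], [5, 6, 10, 11]]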
def plotMats(matrices, titles, show=DEBUG):
    logger.debug(f"plot mats[{len(matrices)}]:")
    if len(matrices) > 3:
        _, axis = plt.subplots(2, (len(matrices) + 1) // 2)
    else:
        _, axis = plt.subplots(1, len(matrices))
    if len(matrices) == 1:
        axis = np.array([axis])
    axis = axis.flatten()
    for i, mat in enumerate(matrices):
        logger.debug(f"{titles[i]}{mat.shape}, min={np.min(mat)}, max={np.max(mat)}")
        ax = axis[i]
        ax.set_title(f"({string.ascii_lowercase[i]}) {titles[i]}")
        extent = [-1, len(mat) * SSM_TIME_STEP]
        ax.imshow(mat, interpolation="none", extent=extent + extent[::-1])
        ax.set_xlabel("time/s")
        # fig.colorbar(im, orientation=orien, ax=ax)
    plt.tight_layout()
    if show:
        plt.show()
def arousalPoint(time, times, pitches, window, begin, show=DEBUG):
    def arousalScore(t):
        # quantize pitch (Hz) to coarse 2-semitone MIDI bins, then compare the
        # pitch mass after t with the mass before it
        before = pitches[(times >= t - TUNE_SCOPE / 2) & (times <= t)]
        after = pitches[(times >= t) & (times <= t + TUNE_SCOPE / 2)]
        before = (librosa.hz_to_midi(before + 0.1) * 6 / 12).astype(int)
        after = (librosa.hz_to_midi(after + 0.1) * 6 / 12).astype(int)
        score = np.sum(after) - np.sum(before)
        return score / len(before)

    mask = (times >= time - window / 2) & (times <= time + window / 2)
    scores = [arousalScore(t) for t in times[mask]]
    point = times[mask][np.argmax(scores)] if begin else times[mask][np.argmin(scores)]
    if show:
        logger.debug(
            f"point={point} times={times[mask][0]}~{times[mask][-1]} window={window}"
        )
        plt.plot(times[mask], pitches[mask], label="pitch")
        plt.plot(times[mask], scores, label="score")
        plt.scatter(point, np.max(scores) if begin else np.min(scores))
        plt.xlabel("time/s")
        plt.ylabel("freq/Hz")
        plt.legend()
        plt.show()
    return point
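
# Toy illustration (synthetic pitch contour, literal in place of TUNE_SCOPE)
# of the score above: a melody that jumps up at a candidate point yields a
# positive before/after difference, marking a likely chorus onset.
def _demoArousalScore():
    times = np.linspace(0, 10, 101)
    pitches = np.where(times < 5, 220.0, 440.0)  # pitch jumps up at t=5 s
    scope, t = 2.0, 5.0
    before = pitches[(times >= t - scope / 2) & (times <= t)]
    after = pitches[(times >= t) & (times <= t + scope / 2)]
    print(np.sum(after) - np.sum(before))  # 2200.0: strongly positive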
def selfSimilarityMatrix(
    wavfile,
    mel=None,
    win_fac=10,
    wins_per_block=20,
    K=5,
    sr=22050,
    hop_length=512,
):
    logger.debug(f"loading:{wavfile}")
    y, sr = librosa.load(wavfile, sr=sr)
    nHops = (y.size - hop_length * (win_fac - 1)) / hop_length
    intervals = np.arange(0, nHops + 1e-6, win_fac).astype(int)
    logger.debug(
        f"nHops={nHops}=(size-hop_length*(win_fac-1))/hop_length=({y.size} - {hop_length}*({win_fac}-1))/{hop_length} intvs={intervals[-1]}"
    )
    # chroma
    chroma = librosa.feature.chroma_cqt(
        y=y, sr=sr, hop_length=hop_length, bins_per_octave=12 * 3
    )
    # mfcc
    S = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128, hop_length=hop_length)
    log_S = librosa.power_to_db(S, ref=np.max)
    mfcc = librosa.feature.mfcc(S=log_S, n_mfcc=20)
    # cepstral liftering: scale coefficient i by i**0.6
    lifterexp = 0.6
    coeffs = np.arange(mfcc.shape[0]) ** lifterexp
    coeffs[0] = 1
    mfcc = coeffs[:, None] * mfcc
    # tempogram
    SUPERFLUX_SIZE = 5
    oenv = librosa.onset.onset_strength(
        y=y, sr=sr, hop_length=hop_length, max_size=SUPERFLUX_SIZE
    )
    tempogram = librosa.feature.tempogram(
        onset_envelope=oenv, sr=sr, hop_length=hop_length
    )
    # generate W matrices
    n_frames = np.min([chroma.shape[1], mfcc.shape[1], tempogram.shape[1]])
    intervals = librosa.util.fix_frames(intervals, x_min=0, x_max=n_frames)
    times = intervals * float(hop_length) / float(sr)
    size = n_frames // win_fac
    logger.debug(
        f"frames fixed, intervals={intervals[-1]} hop={intervals[1]-intervals[0]} size={size}"
    )
    WMfcc = feature2W(mfcc, size, np.mean, getCSM, wins_per_block=wins_per_block)
    WChroma = feature2W(
        chroma,
        size,
        np.median,
        getShiftInvariantCSM(getCSMCosine, wins_per_block),
        wins_per_block=wins_per_block,
    )
    WTempo = feature2W(tempogram, size, np.mean, getCSM, wins_per_block=wins_per_block)
    printArray(WMfcc, "mfcc")
    printArray(WChroma, "chroma")
    printArray(WTempo, "tempo")
    # melody
    if mel is not None:
        _, pitches = mel
        pitches = pitchChroma(pitches)
        WPitches = feature2W(
            pitches,
            size,
            np.median,
            getShiftInvariantCSM(getCSMCosine, wins_per_block),
            wins_per_block=wins_per_block,
        )
        printArray(WPitches, "pitchChroma")
        Ws = [WMfcc, WChroma, WPitches, WTempo]
    else:
        Ws = [WMfcc, WChroma, WTempo]
    if REC_SMOOTH > 0:
        df = librosa.segment.timelag_filter(scipy.ndimage.median_filter)
        Ws = [df(W, size=(1, REC_SMOOTH)) for W in Ws]
    W = doSimilarityFusionWs(Ws, K=K, niters=3, reg_diag=1.0, reg_neighbs=0.5)
    printArray(W, "fused W")
    res = {
        "Ws": {
            "Fused": W,
            "Melody": WPitches if mel is not None else None,
        },
        "times": times,
    }
    return res
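
# Hedged usage sketch: the wav path is a placeholder and mel, when given, is a
# (times, pitches) pair such as the series produced by SSL(). Plots the fused
# affinity matrix of one track.
def _demoSelfSimilarity(wavfile="song.wav", mel=None):
    res = selfSimilarityMatrix(wavfile, mel=mel)
    plotMats([res["Ws"]["Fused"]], ["fused SSM"], show=True)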
def doSimilarityFusionWs(
    Ws,
    K=5,
    niters=20,
    reg_diag=1,
    reg_neighbs=0.5,
    verboseTimes=True,
):
    """
    Perform similarity fusion between a set of exponentially
    weighted similarity matrices
    :param Ws: A list of NxN affinity matrices
    :param K: Number of nearest neighbors
    :param niters: Number of iterations
    :param reg_diag: Identity matrix regularization parameter for
        self-similarity promotion
    :param reg_neighbs: Neighbor regularization parameter for promoting
        adjacencies in time
    :param verboseTimes: Whether to log elapsed times
    :return: A fused NxN similarity matrix
    """
    tic = time.time()
    # full probability matrices
    Ps = [getP(W) for W in Ws]
    # nearest-neighbor truncated matrices
    Ss = [getS(W, K) for W in Ws]
    # now do cross-diffusion iterations
    Pts = [np.array(P) for P in Ps]
    nextPts = [np.zeros(P.shape) for P in Pts]
    if verboseTimes:
        logger.debug("Time getting Ss and Ps: %g" % (time.time() - tic))
    N = len(Pts)
    AllTimes = []
    for it in range(niters):
        ticiter = time.time()
        for i in range(N):
            nextPts[i] *= 0
            for k in range(N):
                if i == k:
                    continue
                nextPts[i] += Pts[k]
            nextPts[i] /= float(N - 1)
            # need S*P*S^T, but have to multiply the sparse matrix on the left
            tic = time.time()
            A = Ss[i].dot(nextPts[i].T)
            nextPts[i] = Ss[i].dot(A.T)
            AllTimes.append(time.time() - tic)
            if reg_diag > 0:
                nextPts[i] += reg_diag * np.eye(nextPts[i].shape[0])
            if reg_neighbs > 0:
                arr = np.arange(nextPts[i].shape[0])
                [I, J] = np.meshgrid(arr, arr)
                # add neighbor regularization along the first off-diagonals
                nextPts[i][np.abs(I - J) == 1] += reg_neighbs
        Pts = nextPts
        if verboseTimes:
            logger.debug(
                "Elapsed Time Iter %i of %i: %g"
                % (it + 1, niters, time.time() - ticiter)
            )
    if verboseTimes:
        logger.debug("Total Time multiplying: %g" % np.sum(np.array(AllTimes)))
    FusedScores = np.zeros(Pts[0].shape)
    for Pt in Pts:
        FusedScores += Pt
    return FusedScores / N
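
# Toy check (random symmetric affinities, arbitrary K/niters) that the fusion
# above accepts a list of same-shaped matrices and returns one of that shape;
# assumes getP/getS handle arbitrary dense nonnegative inputs.
def _demoFusion():
    rng = np.random.default_rng(0)
    Ws = []
    for _ in range(3):
        M = rng.random((30, 30))
        Ws.append((M + M.T) / 2)  # symmetric toy affinity matrix
    fused = doSimilarityFusionWs(Ws, K=5, niters=3)
    print(fused.shape)  # (30, 30)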
def preprocessor(self, wavPath, sr=SAMPLE_RATE):
    wavPath = os.path.abspath(wavPath)
    title = os.path.splitext(os.path.basename(wavPath))[0]
    tmpMel = os.path.join(ALGO_BASE_DIRS["TmpDir"], f"{title}_JDC_out.csv")
    logger.debug(f"convert wav={wavPath} to mel={tmpMel}")
    return self.SSL(wavPath, tmpMel)
def main(audiofiles, outputdir, metaoutputdir, algo, force, workers):
    logger.debug(f"algo={algo}")
    logger.info("preprocess to generate features")
    ddataset = DummyDataset(audiofiles)
    transforms = [
        ExtractMel(),
        GenerateSSM(dataset=ddataset),
        ExtractCliques(dataset=ddataset),
    ]
    for tf in transforms:
        preDataset = Preprocess_Dataset(tf.identifier, ddataset)
        preDataset.build(tf.preprocessor, force=force, num_workers=workers)
    predictor = switchPred(algo)
    predictorStruct = (
        predictor
        if algo not in ["mixed", "highlighter"]
        else AlgoSeqRecur(trainFile=USE_MODEL_DIC["seqRecur"])
    )
    for i, pair in enumerate(ddataset.pathPairs):
        audioFileName, audiofile, _ = pair
        audiofile = os.path.abspath(audiofile)
        output = os.path.join(outputdir, audioFileName + ".txt")
        metaOutput = os.path.join(metaoutputdir, audioFileName + "_meta.json")
        ssm_f, mels_f = getFeatures(ddataset, i)
        cliques = predictorStruct._process(ddataset, i, ssm_f)
        mirexFmt = chorusDetection(cliques, ssm_f[0], mels_f, predictorStruct.clf)
        if algo == "multi":
            mirexFmt = tuneIntervals(
                mirexFmt, mels_f, chorusDur=CHORUS_DURATION, window=TUNE_WINDOW
            )
        elif algo == "single":
            mirexFmt = maxOverlap(
                mirexFmt, chorusDur=CHORUS_DURATION_SINGLE, centering=False
            )
            mirexFmt = tuneIntervals(
                mirexFmt,
                mels_f,
                chorusDur=CHORUS_DURATION_SINGLE,
                window=TUNE_WINDOW,
            )
        # plot mats
        tf = ExtractCliques(dataset=ddataset)
        origCliques = Preprocess_Dataset(
            tf.identifier, ddataset, transform=tf.transform
        )[i]["cliques"]
        olssm = getLabeledSSM(origCliques, ssm_f[1].shape[-1])
        lssm = getLabeledSSM(cliques, ssm_f[1].shape[-1])
        olssm = drawSegments(mirexFmt, mirexFmt, olssm, ssm_f[0])
        mats = np.array([ssm_f[1], lssm, olssm])
        titles = ["fused SSM", "result structure", "low level structure"]
        plotMats(mats, titles, show=False)
        # write output and viewer metadata
        if algo not in ["single", "multi"]:
            mirexFmt = predictor(ddataset, i)
        mirexFmt = removeNumber(mirexFmt)
        mirexFmt = mergeIntervals(mirexFmt)
        writeMirexOutput(mirexFmt, output)
        figurePath = os.path.join(os.getcwd(), f"data/test/predict_{audioFileName}.svg")
        plt.savefig(figurePath, bbox_inches="tight")
        writeJsonMetadata(audiofile, mergeIntervals(mirexFmt), figurePath, metaOutput)
        if DEBUG:
            plt.show()
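
# Hypothetical invocation sketch: file names and directories are placeholders.
# Runs the "multi" chorus detector over two tracks with 4 preprocessing workers.
def _demoMain():
    main(
        ["a.mp3", "b.mp3"],
        "output/",
        "meta/",
        algo="multi",
        force=False,
        workers=4,
    )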
def printArray(arr, name, show=False):
    logger.debug(f"{name}{arr.shape}, min={np.min(arr)} max={np.max(arr)}")
    if show:
        plt.imshow(logSSM(arr), aspect="auto")
        plt.colorbar()
        plt.show()