Esempio n. 1
0
def multiChorusSections(intvs, dur):
    """Flatten possibly overlapping chorus intervals into a partition of [0, dur].

    Every interval contributes +1 at its begin and -1 at its end. Sweeping the
    boundaries in time order gives, for each resulting segment, how many input
    intervals cover it; segments covered at least once get the target label,
    all others the non-target label.

    :param intvs: iterable of ``(begin, end)`` pairs in seconds
    :param dur: total duration in seconds, used as the end of the last segment
    :return: whatever ``mergeIntervals`` returns for the
        ``(intervals, labels)`` pair built here
    """

    def key(x):
        # Timestamp with a precision of 0.01s. round() is used instead of a
        # bare int() because e.g. 0.29 * 100 == 28.999999999999996, which
        # int() would truncate to 28.
        return int(round(x * 100))

    # value 1=chorus begin -1=chorus end
    boundaries = defaultdict(int)
    for intv in intvs:
        boundaries[key(intv[0])] += 1
        boundaries[key(intv[1])] -= 1
    intervals, labels = [[0, 0]], [CLF_NON_TARGET_LABEL]
    state = 0  # 0:others >0:chorus
    for bdr in sorted(boundaries):
        t = bdr / 100.0
        # close the running segment and open the next one at this boundary
        intervals[-1][1] = t
        intervals.append([t, 0])
        state += boundaries[bdr]
        if state > 0:
            labels.append(CLF_TARGET_LABEL)
        else:
            if state < 0:
                # An interval ended before it began. Report it, but still
                # label the segment so intervals and labels stay aligned.
                logger.error(f"invalid state, boundaries={boundaries}")
            labels.append(CLF_NON_TARGET_LABEL)
    intervals[-1][1] = dur
    mirexFmt = (np.array(intervals), np.array(labels, dtype="U16"))
    logger.debug(f"multi chorus sections, output=\n{mirexLines(mirexFmt)}")
    return mergeIntervals(mirexFmt)
Esempio n. 2
0
def buildRecurrence(cliques, times):
    """Pick the best merged-and-smoothed clique structure from a parameter grid.

    For every combination of ``DELTA_DIS_RANGE``, ``SMOOTH_KERNEL_SIZE_RANGE``
    and ``dblock`` in ``[0, 1, 2]`` the cliques are merged with
    ``mergeAdjacentCliques`` and smoothed with ``smoothCliques``. Candidates
    are then visited in order of increasing ``error`` and the first one with
    at least ``MIN_STRUCTURE_COUNT`` cliques is returned.

    :param cliques: original cliques; a deep copy is used, the argument is
        not modified by this function
    :param times: boundary times; ``len(times) - 1`` is used as the size
    :return: the selected candidate, or the first candidate of the grid when
        none has enough cliques
    """
    logger.debug("build recurrence")
    cliques = deepcopy(cliques)
    size = len(times) - 1
    mergedCliquesList = [
        smoothCliques(
            mergeAdjacentCliques(cliques, dis=dis, dblock=dblock),
            size,
            kernel_size=kernelSize,
        )
        for dis in DELTA_DIS_RANGE
        for kernelSize in SMOOTH_KERNEL_SIZE_RANGE
        for dblock in [0, 1, 2]
    ]
    errors = [error(cliques, ncs, size, times) for ncs in mergedCliquesList]
    # visit candidates from lowest to highest error
    for i in np.argsort(errors):
        newCliques = mergedCliquesList[i]
        if len(newCliques) >= MIN_STRUCTURE_COUNT:
            return newCliques
    # Logger.warn is a deprecated alias of Logger.warning
    logger.warning(
        f"seqrecur failed, cliqueLengths={[len(x) for x in mergedCliquesList]}"
    )
    return mergedCliquesList[0]
 def SSL(self, wavPath, output, sr=SAMPLE_RATE):
     """Run the external SSL melody extractor and load its pitch track.

     SSL refers to <Semi-supervised learning using teacher-student models for
     vocal melody extraction>.

     The script ``./melodyExtraction_NS.py`` is executed inside
     ``ALGO_BASE_DIRS["SSL"]``. Only the directory part of ``output`` is
     used; the result is read from ``pitch_<basename of wavPath>.txt`` in
     that directory.

     :param wavPath: path of the input audio file
     :param output: path whose directory receives the extractor output
     :param sr: unused here; kept for interface compatibility
     :return: dict with keys ``"times"`` and ``"pitches"``
     :raises AssertionError: if the extractor exits with a non-zero status
     """
     dirname = os.path.dirname(output) + "/"
     output = os.path.join(dirname, f"pitch_{os.path.basename(wavPath)}.txt")
     commands = ("python", "./melodyExtraction_NS.py", "-p", wavPath, "-o", dirname)
     logger.debug(f"SSL commands={commands}")
     ret = subprocess.call(commands, cwd=ALGO_BASE_DIRS["SSL"])
     if ret != 0:
         # Raised explicitly (not via ``assert``) so the check survives
         # ``python -O``; the exception type is unchanged for callers.
         raise AssertionError(f"return value: {ret} != 0")
     times, pitches = load_time_series(output, delimiter=r"\s+|,")
     return {"times": times, "pitches": pitches}
def tuneIntervals(mirexFmt, mels_f, chorusDur, window):
    """Snap chorus boundaries to nearby arousal points of the melody.

    The annotation is first cleaned with ``removeNumber`` and
    ``mergeIntervals``. For every target-labelled interval the begin and end
    are moved with ``arousalPoint``; the end is then pushed out so the
    interval lasts at least ``chorusDur`` but never past the total duration.
    Intervals not longer than ``MINIMUM_CHORUS_DUR`` are dropped.

    :param mirexFmt: ``(intervals, labels)`` annotation
    :param mels_f: ``(times, pitches)`` melody track
    :param chorusDur: minimum duration enforced on each tuned interval
    :param window: search window passed on to ``arousalPoint``
    :return: result of ``multiChorusSections`` on the tuned intervals
    """
    mirexFmt = mergeIntervals(removeNumber(mirexFmt))
    logger.debug(f"tune interval=\n{mirexLines(mirexFmt)}")
    # the end of the last interval is taken as the total duration
    totalDur = mirexFmt[0][-1][1]
    candidates = filterIntvs(mirexFmt, fun=CLF_TARGET_LABEL)
    times, pitches = mels_f
    tuned = []
    for candidate in candidates:
        start = arousalPoint(candidate[0], times, pitches, window, True)
        stop = arousalPoint(candidate[1], times, pitches, window, False)
        stop = min(totalDur, max(stop, start + chorusDur))
        if not (stop - start > MINIMUM_CHORUS_DUR):
            continue
        tuned.append((start, stop))
    return multiChorusSections(tuned, totalDur)
Esempio n. 5
0
def feature2W(feature, size, aggregator, simFunction, wins_per_block=20, K=5):
    """Turn a frame-level feature into an interval-level affinity matrix.

    :param feature: feature laid out as [<dim>, <frame>]
    :param size: target size handed to ``resize`` to obtain sync boundaries
    :param aggregator: aggregation function for ``librosa.util.sync``
    :param simFunction: callable taking the stacked feature twice and
        returning a pairwise matrix
    :param wins_per_block: number of steps for ``stack_memory``
    :param K: second argument of ``getW``
    :return: Wfeature[<interval number>, <interval number>]
    :raises AssertionError: if the affinity matrix sums to NaN
    """
    boundaries = resize(feature, size)
    # [<dim>, <frame>] -> [<dim>, <interval number>]
    synced = librosa.util.sync(feature, boundaries, aggregate=aggregator)
    # [<interval number>, <dim>*<wins_per_block>]
    stacked = librosa.feature.stack_memory(
        synced, n_steps=wins_per_block, mode="edge"
    ).T
    pairwise = simFunction(stacked, stacked)
    affinity = getW(pairwise, K)
    assert not np.isnan(np.sum(affinity)), f"invalid affinity, Dfeature={pairwise}"
    logger.debug(
        f"shapes, feature{synced.shape} Xfeature{stacked.shape} Wfeature{affinity.shape}"
    )
    return affinity
Esempio n. 6
0
def error(origCliques, mergedCliques, size, times, show=False):
    """Score how far a merged clique structure deviates from the original.

    Both structures are rendered with ``getLabeledSSM`` and positive entries
    are set to 1. The score is the false-negative rate plus the part of the
    false-positive rate that exceeds ``FALSE_POSITIVE_ERROR``.

    :param origCliques: reference cliques
    :param mergedCliques: candidate cliques
    :param size: size passed to ``getLabeledSSM``
    :param times: not used by this function
    :param show: when true, plot both labelled matrices overlaid
    :return: the combined error value
    """
    reference = getLabeledSSM(origCliques, size)
    candidate = getLabeledSSM(mergedCliques, size)
    reference[reference > 0] = 1
    candidate[candidate > 0] = 1
    # false negative + false positive
    missed = np.sum((candidate == 0) & reference)
    spurious = np.sum(candidate & (reference == 0))
    fnerr = missed / (np.sum(reference) + EPSILON)
    fperr = spurious / (np.sum(reference == 0) + EPSILON)
    err = fnerr + max(0, fperr - FALSE_POSITIVE_ERROR)
    logger.debug(f"errs={fnerr:.5f},{fperr:.5f} sum={err:.3f} len={len(mergedCliques)}")
    if show:
        origSSM = getLabeledSSM(origCliques, size)
        mergedSSM = getLabeledSSM(mergedCliques, size)
        diagLabels = [origSSM[k, k] for k in range(size)]
        # lift merged entries so both structures stay distinguishable
        mergedSSM[mergedSSM > 0] = 10
        plt.imshow(origSSM + mergedSSM)
        plt.plot(diagLabels)
        plt.show()
    return err
Esempio n. 7
0
def mergeAdjacentCliques(cliques, dis=ADJACENT_DELTA_DISTANCE, dblock=0):
    """Merge cliques that are transitively adjacent.

    Pairs are tested with ``isAdjacent``; ``mergeFind`` then assigns one label
    per clique and all cliques sharing a label are concatenated.

    :param cliques: list of cliques, each a list of frame numbers
    :param dis: distance parameter forwarded to ``isAdjacent``
    :param dblock: block parameter forwarded to ``isAdjacent``
    :return: merged cliques sorted by their first element
    """
    logger.debug(f"merge cliques, dis={dis} dblock={dblock}")
    count = len(cliques)
    # predecessors[hi] collects every lo < hi that is adjacent to hi
    predecessors = [[] for _ in range(count)]
    for lo in range(count):
        for hi in range(lo + 1, count):
            if isAdjacent(cliques[lo], cliques[hi], dis=dis, dblock=dblock):
                predecessors[hi].append(lo)
    # group frame numbers by the component label of their clique
    labels = mergeFind(predecessors, count)
    grouped = defaultdict(list)
    for idx in range(count):
        grouped[labels[idx]].extend(cliques[idx])
    return sorted(grouped.values(), key=lambda frames: frames[0])
Esempio n. 8
0
def plotMats(matrices, titles, show=DEBUG):
    """Draw each matrix in its own subplot with a lettered title.

    Up to three matrices are laid out in one row; more are split over two
    rows.

    :param matrices: sequence of 2-D matrices
    :param titles: one title per matrix
    :param show: call ``plt.show()`` after drawing when true
    """
    count = len(matrices)
    logger.debug(f"plot mats[{count}]:")
    if count > 3:
        rows, cols = 2, (count + 1) // 2
    else:
        rows, cols = 1, count
    _, axis = plt.subplots(rows, cols)
    if count == 1:
        # a single subplot comes back as a bare Axes; wrap it for flatten()
        axis = np.array([axis])
    axis = axis.flatten()
    for idx, mat in enumerate(matrices):
        title = titles[idx]
        logger.debug(
            f"{title}{mat.shape}, min={np.min(mat)}, max={np.max(mat)}")
        panel = axis[idx]
        panel.set_title(f"({string.ascii_lowercase[idx]}) {title}")
        lo, hi = -1, len(mat) * SSM_TIME_STEP
        panel.imshow(mat, interpolation="none", extent=[lo, hi, hi, lo])
        panel.set_xlabel("time/s")
    plt.tight_layout()
    if show:
        plt.show()
def arousalPoint(time, times, pitches, window, begin, show=DEBUG):
    """Move a boundary to the strongest pitch change inside a search window.

    Every melody frame within ``window / 2`` of ``time`` is scored by the
    difference between the summed quantised pitch in the ``TUNE_SCOPE / 2``
    seconds after it and the same span before it, divided by the number of
    frames in the "before" span.

    :param time: initial boundary position in seconds
    :param times: array of melody frame times
    :param pitches: array of pitches aligned with ``times``
    :param window: total width of the search window around ``time``
    :param begin: pick the highest score when true, the lowest otherwise
    :param show: plot pitches, scores and the chosen point when true
    :return: the chosen frame time, or ``time`` unchanged when no melody
        frame lies inside the search window
    """

    def arousalScore(t):
        before = pitches[(times >= t - TUNE_SCOPE / 2) & (times <= t)]
        after = pitches[(times >= t) & (times <= t + TUNE_SCOPE / 2)]
        # presumably the 0.1 offset keeps a pitch of 0 out of the log inside
        # hz_to_midi -- TODO confirm; * 6 / 12 halves the MIDI resolution
        before = (librosa.hz_to_midi(before + 0.1) * 6 / 12).astype(int)
        after = (librosa.hz_to_midi(after + 0.1) * 6 / 12).astype(int)
        score = np.sum(after) - np.sum(before)
        return score / len(before)

    mask = (times >= time - window / 2) & (times <= time + window / 2)
    candidates = times[mask]
    if len(candidates) == 0:
        # Nothing to score: argmax/argmin over an empty list would raise
        # ValueError, so keep the boundary where it is.
        logger.warning(f"no melody frame within window={window} of time={time}")
        return time
    scores = [arousalScore(t) for t in candidates]
    point = candidates[np.argmax(scores)] if begin else candidates[np.argmin(scores)]
    if show:
        logger.debug(
            f"point={point} times={candidates[0]}~{candidates[-1]} window={window}"
        )
        plt.plot(candidates, pitches[mask], label="pitch")
        plt.plot(candidates, scores, label="score")
        plt.scatter(point, np.max(scores) if begin else np.min(scores))
        plt.xlabel("time/s")
        plt.ylabel("freq/Hz")
        plt.legend()
        plt.show()
    return point
Esempio n. 10
0
def selfSimilarityMatrix(
    wavfile,
    mel=None,
    win_fac=10,
    wins_per_block=20,
    K=5,
    sr=22050,
    hop_length=512,
):
    """Compute a fused self-similarity matrix for one audio file.

    MFCC, chroma and tempogram features (plus a pitch-chroma feature when
    ``mel`` is given) are each converted to an affinity matrix with
    ``feature2W``, optionally median-filtered when ``REC_SMOOTH > 0``, and
    combined with ``doSimilarityFusionWs``.

    :param wavfile: path of the audio file, loaded with ``librosa.load``
    :param mel: optional ``(times, pitches)`` pair; only ``pitches`` is used
    :param win_fac: number of hops between consecutive interval boundaries
    :param wins_per_block: block length forwarded to ``feature2W`` and
        ``getShiftInvariantCSM``
    :param K: number of nearest neighbours passed to ``doSimilarityFusionWs``
    :param sr: sampling rate used for loading and for feature extraction
    :param hop_length: hop length in samples for all librosa features
    :return: dict with ``"Ws"`` (``"Fused"``: fused matrix; ``"Melody"``: the
        unfiltered pitch-chroma affinity or ``None`` when ``mel`` is
        ``None``) and ``"times"`` (interval boundaries in seconds)
    """
    logger.debug(f"loading:{wavfile}")
    y, sr = librosa.load(wavfile, sr=sr)
    nHops = (y.size - hop_length * (win_fac - 1)) / hop_length
    intervals = np.arange(0, nHops + 1e-6, win_fac).astype(int)
    logger.debug(
        f"nHops={nHops}=(size-hop_length*(win_fac-1))/hop_length=({y.size} - {hop_length}*({win_fac}-1))/{hop_length} intvs={intervals[-1]}"
    )
    # chroma
    chroma = librosa.feature.chroma_cqt(y=y,
                                        sr=sr,
                                        hop_length=hop_length,
                                        bins_per_octave=12 * 3)
    # mfcc
    # ``y`` is passed by keyword: librosa >= 0.10 rejects it positionally,
    # while older releases accept the keyword form as well.
    S = librosa.feature.melspectrogram(y=y,
                                       sr=sr,
                                       n_mels=128,
                                       hop_length=hop_length)
    log_S = librosa.power_to_db(S, ref=np.max)
    mfcc = librosa.feature.mfcc(S=log_S, n_mfcc=20)
    # weight coefficient i by i ** lifterexp, leaving coefficient 0 unscaled
    lifterexp = 0.6
    coeffs = np.arange(mfcc.shape[0])**lifterexp
    coeffs[0] = 1
    mfcc = coeffs[:, None] * mfcc
    # tempogram
    SUPERFLUX_SIZE = 5
    oenv = librosa.onset.onset_strength(y=y,
                                        sr=sr,
                                        hop_length=hop_length,
                                        max_size=SUPERFLUX_SIZE)
    tempogram = librosa.feature.tempogram(onset_envelope=oenv,
                                          sr=sr,
                                          hop_length=hop_length)

    # generate W- matrices
    # the three features may differ in frame count; use the shortest
    n_frames = np.min([chroma.shape[1], mfcc.shape[1], tempogram.shape[1]])
    intervals = librosa.util.fix_frames(intervals, x_min=0, x_max=n_frames)
    times = intervals * float(hop_length) / float(sr)
    size = n_frames // win_fac
    logger.debug(
        f"frames fixed, intervals={intervals[-1]} hop={intervals[1]-intervals[0]} size={size}"
    )
    WMfcc = feature2W(mfcc,
                      size,
                      np.mean,
                      getCSM,
                      wins_per_block=wins_per_block)
    WChroma = feature2W(
        chroma,
        size,
        np.median,
        getShiftInvariantCSM(getCSMCosine, wins_per_block),
        wins_per_block=wins_per_block,
    )
    WTempo = feature2W(tempogram,
                       size,
                       np.mean,
                       getCSM,
                       wins_per_block=wins_per_block)
    printArray(WMfcc, "mfcc")
    printArray(WChroma, "chorma")
    printArray(WTempo, "tempo")

    # melody
    if mel is not None:
        _, pitches = mel
        pitches = pitchChroma(pitches)
        WPitches = feature2W(
            pitches,
            size,
            np.median,
            getShiftInvariantCSM(getCSMCosine, wins_per_block),
            wins_per_block=wins_per_block,
        )
        printArray(WPitches, "pitchChroma")
        Ws = [WMfcc, WChroma, WPitches, WTempo]
    else:
        Ws = [WMfcc, WChroma, WTempo]

    if REC_SMOOTH > 0:
        # median filter of width REC_SMOOTH applied in time-lag space
        df = librosa.segment.timelag_filter(scipy.ndimage.median_filter)
        Ws = [df(W, size=(1, REC_SMOOTH)) for W in Ws]
    W = doSimilarityFusionWs(Ws, K=K, niters=3, reg_diag=1.0, reg_neighbs=0.5)
    printArray(W, "fused W")
    res = {
        "Ws": {
            "Fused": W,
            "Melody": WPitches if mel is not None else None,
        },
        "times": times,
    }
    return res
def doSimilarityFusionWs(
    Ws,
    K=5,
    niters=20,
    reg_diag=1,
    reg_neighbs=0.5,
    verboseTimes=True,
):
    """
    Perform similarity fusion between a set of exponentially
    weighted similarity matrices
    :param Ws: A list of square affinity matrices that all share one shape
    :param K: Number of nearest neighbors, forwarded to ``getS``
    :param niters: Number of iterations
    :param reg_diag: Identity matrix regularization parameter for
        self-similarity promotion (skipped when not > 0)
    :param reg_neighbs: Neighbor regularization parameter for promoting
        adjacencies in time (skipped when not > 0)
    :param verboseTimes: Log timing information at debug level when true
    :return: The element-wise mean of the diffused matrices, with the same
        shape as each input matrix
    """
    tic = time.time()
    # Full probability matrices
    Ps = [getP(W) for W in Ws]
    # Nearest neighbor truncated matrices
    Ss = [getS(W, K) for W in Ws]

    # Now do cross-diffusion iterations
    Pts = [np.array(P) for P in Ps]
    nextPts = [np.zeros(P.shape) for P in Pts]
    if verboseTimes:
        logger.debug("Time getting Ss and Ps: %g" % (time.time() - tic))

    N = len(Pts)
    AllTimes = []
    for it in range(niters):
        ticiter = time.time()
        for i in range(N):
            # Reset the accumulator in place, then average all other Pts.
            nextPts[i] *= 0
            # This timestamp is overwritten below before it is ever read.
            tic = time.time()
            for k in range(N):
                if i == k:
                    continue
                nextPts[i] += Pts[k]
            # NOTE(review): with a single input matrix (N == 1) this divides
            # by zero -- confirm callers always pass at least two matrices.
            nextPts[i] /= float(N - 1)

            # Need S*P*S^T, but have to multiply sparse matrix on the left
            tic = time.time()
            A = Ss[i].dot(nextPts[i].T)
            nextPts[i] = Ss[i].dot(A.T)
            toc = time.time()
            AllTimes.append(toc - tic)
            if reg_diag > 0:
                nextPts[i] += reg_diag * np.eye(nextPts[i].shape[0])
            if reg_neighbs > 0:
                arr = np.arange(nextPts[i].shape[0])
                [I, J] = np.meshgrid(arr, arr)
                # Add reg_neighbs to the entries directly next to the main
                # diagonal (|I - J| == 1)
                nextPts[i][np.abs(I - J) == 1] += reg_neighbs

        # NOTE(review): this binds Pts to the very same list object as
        # nextPts. From the second iteration on, ``nextPts[i] *= 0`` therefore
        # zeroes Pts[i] in place, and Pts[k] for k < i already holds the value
        # computed in the current iteration. Confirm whether this in-place
        # update order is intended before changing it; results depend on it.
        Pts = nextPts
        if verboseTimes:
            logger.debug(
                "Elapsed Time Iter %i of %i: %g"
                % (it + 1, niters, time.time() - ticiter)
            )
    if verboseTimes:
        logger.debug("Total Time multiplying: %g" % np.sum(np.array(AllTimes)))
    # Average the diffused matrices into the fused result.
    FusedScores = np.zeros(Pts[0].shape)
    for Pt in Pts:
        FusedScores += Pt
    return FusedScores / N
 def preprocessor(self, wavPath, sr=SAMPLE_RATE):
     """Extract the melody of ``wavPath`` through ``self.SSL``.

     The wav path is made absolute and a temporary path named
     ``<title>_JDC_out.csv`` under ``ALGO_BASE_DIRS["TmpDir"]`` is handed to
     ``SSL`` as its output argument.

     :param wavPath: path of the input audio file
     :param sr: not used by this method
     :return: whatever ``self.SSL`` returns
     """
     wavPath = os.path.abspath(wavPath)
     stem, _ = os.path.splitext(os.path.basename(wavPath))
     tmpMel = os.path.join(ALGO_BASE_DIRS["TmpDir"], f"{stem}_JDC_out.csv")
     logger.debug(f"convert wav={wavPath} to mel={tmpMel}")
     return self.SSL(wavPath, tmpMel)
Esempio n. 13
0
def main(audiofiles, outputdir, metaoutputdir, algo, force, workers):
    """Run preprocessing and chorus prediction for every audio file.

    For each file a MIREX-style text file is written to ``outputdir``, a
    ``_meta.json`` file to ``metaoutputdir`` and an SVG figure to
    ``data/test`` under the current working directory.

    :param audiofiles: input passed to ``DummyDataset``
    :param outputdir: directory for the ``<name>.txt`` outputs
    :param metaoutputdir: directory for the ``<name>_meta.json`` outputs
    :param algo: algorithm name; ``"multi"`` and ``"single"`` tune the
        detected intervals, any other value uses the predictor returned by
        ``switchPred`` for the final output
    :param force: forwarded to ``Preprocess_Dataset.build``
    :param workers: forwarded as ``num_workers`` to ``build``
    """
    logger.debug(f"algo={algo}")
    logger.info("preprocess to generate features")
    ddataset = DummyDataset(audiofiles)
    steps = (
        ExtractMel(),
        GenerateSSM(dataset=ddataset),
        ExtractCliques(dataset=ddataset),
    )
    for step in steps:
        Preprocess_Dataset(step.identifier, ddataset).build(
            step.preprocessor, force=force, num_workers=workers
        )

    predictor = switchPred(algo)
    if algo in ("mixed", "highlighter"):
        predictorStruct = AlgoSeqRecur(trainFile=USE_MODEL_DIC["seqRecur"])
    else:
        predictorStruct = predictor
    tuned = algo in ("single", "multi")

    for idx, (audioFileName, audiofile, _) in enumerate(ddataset.pathPairs):
        audiofile = os.path.abspath(audiofile)
        output = os.path.join(outputdir, audioFileName + ".txt")
        metaOutput = os.path.join(metaoutputdir, audioFileName + "_meta.json")

        ssm_f, mels_f = getFeatures(ddataset, idx)
        cliques = predictorStruct._process(ddataset, idx, ssm_f)
        mirexFmt = chorusDetection(cliques, ssm_f[0], mels_f,
                                   predictorStruct.clf)
        if tuned:
            if algo == "single":
                mirexFmt = maxOverlap(mirexFmt,
                                      chorusDur=CHORUS_DURATION_SINGLE,
                                      centering=False)
                targetDur = CHORUS_DURATION_SINGLE
            else:
                targetDur = CHORUS_DURATION
            mirexFmt = tuneIntervals(mirexFmt,
                                     mels_f,
                                     chorusDur=targetDur,
                                     window=TUNE_WINDOW)

        # plot mats
        extractor = ExtractCliques(dataset=ddataset)
        cached = Preprocess_Dataset(extractor.identifier,
                                    ddataset,
                                    transform=extractor.transform)
        origCliques = cached[idx]["cliques"]
        ssmSize = ssm_f[1].shape[-1]
        olssm = getLabeledSSM(origCliques, ssmSize)
        lssm = getLabeledSSM(cliques, ssmSize)
        olssm = drawSegments(mirexFmt, mirexFmt, olssm, ssm_f[0])
        plotMats(
            np.array([ssm_f[1], lssm, olssm]),
            ["fused SSM", "result structure", "low level structure"],
            show=False,
        )

        # write output and viewer metadata
        if not tuned:
            # the figure above still shows the structure-based detection
            mirexFmt = mergeIntervals(removeNumber(predictor(ddataset, idx)))

        writeMirexOutput(mirexFmt, output)
        figurePath = os.path.join(os.getcwd(),
                                  f"data/test/predict_{audioFileName}.svg")
        plt.savefig(figurePath, bbox_inches="tight")
        writeJsonMetadata(audiofile, mergeIntervals(mirexFmt), figurePath,
                          metaOutput)
        if DEBUG:
            plt.show()
Esempio n. 14
0
def printArray(arr, name, show=False):
    """Log shape and value range of an array, optionally displaying it.

    :param arr: array to describe
    :param name: label prepended to the log line
    :param show: when true, display ``logSSM(arr)`` with a colorbar
    """
    lowest, highest = np.min(arr), np.max(arr)
    logger.debug(f"{name}{arr.shape}, min={lowest} max={highest}")
    if not show:
        return
    plt.imshow(logSSM(arr), aspect="auto")
    plt.colorbar()
    plt.show()