Example #1
def fit_PCA(maindir,
            d,
            origcodes_f="msd_codes_k2045",
            outpca="PCA-codes.pk",
            N=50000,
            norm=False,
            pca_components=[50, 100, 200, 500, 1000]):
    """Fits a PCA transformation with N codes."""
    import binary_task as B
    import cover_id_test as CO

    logger = configure_logger()

    td = load_transform(d)
    codes = np.ones((N, 2045)) * np.nan
    k = 0

    # Load codes
    origcodes, track_ids, clique_ids = CO.load_codes(origcodes_f, -1, 30)

    while k < N:
        track_idx = np.random.random_integers(0, len(track_ids) - 1)
        while track_ids[track_idx] == -2:
            track_idx = np.random.random_integers(0, len(track_ids) - 1)
        track_id = track_ids[track_idx]
        code = origcodes[track_idx]
        if code is not None:
            if norm:
                code = dan_tools.chromnorm(code.reshape(code.shape[0],
                                                        1)).squeeze()
            codes[k] = code
            # Mark this track as used so it is not sampled again
            track_ids[track_idx] = -2
            k += 1

        if k % 100 == 0:
            logger.info("----Computing features %.1f%%" % \
                            (k/float(N) * 100))

    # Remove nans
    nan_idx = np.unique(np.where(np.isnan(codes))[0])
    codes = np.delete(codes, nan_idx, axis=0)

    # Fit PCA
    res = []
    codes = np.asarray(codes)
    for c in pca_components:
        pca = PCA(n_components=c)
        pca.fit(codes)
        res.append(pca)

    # Save Result
    save_pickle(res, outpca)
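
The key step above is fitting several scikit-learn PCA models on the same matrix of sampled codes and pickling the resulting list. Below is a minimal, self-contained sketch of just that step; the random data standing in for real MSD codes and the use of plain pickle instead of the project's save_pickle helper are assumptions made for illustration.

import pickle

import numpy as np
from sklearn.decomposition import PCA


def fit_pca_models(codes, n_components_list=(50, 100, 200),
                   out_path="PCA-sketch.pk"):
    """Fit one PCA model per requested dimensionality and pickle the list."""
    models = []
    for n in n_components_list:
        pca = PCA(n_components=n)
        pca.fit(codes)  # codes: (n_samples, n_features)
        models.append(pca)
    with open(out_path, "wb") as f:
        pickle.dump(models, f)
    return models


# Toy call: 1000 fake 2045-dimensional codes, as in fit_PCA above.
models = fit_pca_models(np.random.rand(1000, 2045))
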
def compute_codes_it(track_ids,
                     maindir,
                     d,
                     clique_ids,
                     start_idx,
                     end_idx,
                     origcodes=None,
                     norm=False):
    """Computes the features based on Humphrey, Nieto and Bello, 2013.
    Dimensionality reduction using LDA of 50, 100, and 200 components."""
    fx = load_transform(d)
    K = int(d.split("_")[1].split("E")[1])

    # Init codes. Note: `lda`, `pca`, and `logger` must be defined at module
    # level; they are not parameters of this function.
    codes = []
    if lda is not None:
        lda_components = [50, 100, 200]
        for n_comp in lda_components:
            codes.append(np.ones((end_idx - start_idx, n_comp)) * np.nan)
    else:
        codes.append(np.ones((end_idx - start_idx, K)) * np.nan)

    for i, tid in enumerate(track_ids[start_idx:end_idx]):
        if origcodes is None:
            path = utils.path_from_tid(maindir, tid)
            feats = utils.extract_feats(path)
            if feats is None:
                continue
            code = np.median(fx(feats), axis=0)
        else:
            code = origcodes[i]
        if norm:
            code = dan_tools.chromnorm(code.reshape(code.shape[0],
                                                    1)).squeeze()
        if pca is not None:
            code = pca.transform(code)
        if lda is not None:
            for lda_idx, n_comp in enumerate(lda_components):
                tmp = lda[lda_idx].transform(code)
                codes[lda_idx][i] = dan_tools.chromnorm(
                    tmp.reshape(tmp.shape[0], 1)).squeeze()
        else:
            codes[0][i] = code
        if i % 1000 == 0:
            logger.info("Computed %d of %d track(s)" %
                        (i, end_idx - start_idx))
    res = (codes, track_ids[start_idx:end_idx], clique_ids[start_idx:end_idx])
    return res
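
The per-track code above boils down to median-aggregating the patch-level features into a single vector and then L2-normalizing it. A small self-contained sketch of that aggregation follows; treating dan_tools.chromnorm on a single column vector as plain L2 normalization is an assumption.

import numpy as np


def aggregate_and_normalize(patch_feats):
    """patch_feats: (n_patches, K) array of per-patch codes for one track."""
    code = np.median(patch_feats, axis=0)  # median aggregation over patches
    norm = np.linalg.norm(code)            # L2 norm (chromnorm stand-in)
    return code / norm if norm > 0 else code


# Toy call with 40 fake patches of dimension 2045.
vec = aggregate_and_normalize(np.random.rand(40, 2045))
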
def main():
    # Args parser
    parser = argparse.ArgumentParser(
        description="Evaluates the 500 binary queries from the SHS data set",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("msd_dir",
                        action="store",
                        help="Million Song Dataset main directory")
    parser.add_argument("-dictfile",
                        action="store",
                        default="",
                        help="Pickle to the learned dictionary")
    parser.add_argument("-lda",
                        action="store",
                        nargs=2,
                        default=[None, 0],
                        help="LDA file and version",
                        metavar=('lda.pkl', 'n'))
    parser.add_argument("-pca", nargs=2, metavar=('f.pkl', 'n'),
                        default=("", 0),
                        help="pca model saved in a pickle file, " \
                        "use n dimensions")
    # Parse
    args = parser.parse_args()

    # Track time
    start_time = time.time()

    maindir = args.msd_dir
    queriesf = "SHS/list_500queries.txt"
    shsf = "SHS/shs_dataset_train.txt"
    lda = args.lda[0]
    lda_n = int(args.lda[1])
    pcafile = args.pca[0]
    pcadim = int(args.pca[1])

    # sanity checks
    utils.assert_file(maindir)
    utils.assert_file(queriesf)
    utils.assert_file(shsf)
    utils.assert_file(pcafile)

    # read queries
    queries = read_query_file(queriesf)

    # load pca
    trainedpca = None
    if pcafile != "":
        f = open(pcafile, 'rb')
        trainedpca = cPickle.load(f)
        f.close()
        assert pcadim > 0
        logger.info('trained pca loaded')

    # load lda
    if lda is not None:
        lda = utils.load_pickle(lda)

    # to keep stats
    results = []

    # iterate over queries
    logger.info("Starting the binary task...")

    # Get the dictionary transform
    td = load_transform(args.dictfile)

    for triplet in queries:
        # get features
        filenames = map(lambda tid: utils.path_from_tid(maindir, tid), triplet)
        triplet_feats = map(
            lambda f: extract_feats(f, td=td, lda_file=lda, lda_n=lda_n),
            filenames)
        if None in triplet_feats:
            continue

        # Apply pca if needed
        if trainedpca:
            triplet_feats = map(lambda feat: \
                                trainedpca.apply_newdata(feat, ndims=pcadim),
                                triplet_feats)
            assert triplet_feats[np.random.randint(3)].shape[0] == pcadim

        # Compute result
        res1 = triplet_feats[0] - triplet_feats[1]
        res1 = np.sum(res1 * res1)
        res2 = triplet_feats[0] - triplet_feats[2]
        res2 = np.sum(res2 * res2)
        if res1 < res2:
            results.append(1)
        else:
            results.append(0)

        # verbose
        if len(results) % 5 == 0:
            logger.info(' --- after %d queries, accuracy: %.1f %%' % \
                            (len(results), 100. * np.mean(results)))
    # done
    logger.info('After %d queries, accuracy: %.1f %%' %
                (len(results), 100. * np.mean(results)))
    logger.info('Done! Took %.2f seconds' % (time.time() - start_time))
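
The scoring inside the loop is a plain triplet test: a query counts as a hit when its feature vector is closer, in squared Euclidean distance, to the candidate in position 1 than to the candidate in position 2 (treating position 1 as the true cover is an assumption based on how the score is accumulated). A self-contained sketch with toy vectors:

import numpy as np


def triplet_hit(query, cover, non_cover):
    """Return True if the query is closer to the cover than to the non-cover."""
    d_cover = np.sum((query - cover) ** 2)
    d_other = np.sum((query - non_cover) ** 2)
    return d_cover < d_other


results = [triplet_hit(np.array([1.0, 0.0]),
                       np.array([0.9, 0.1]),
                       np.array([0.0, 1.0]))]
print("accuracy: %.1f%%" % (100.0 * np.mean(results)))
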
Example #7
def compute_feats(track_ids,
                  maindir,
                  d,
                  lda_file=None,
                  lda_n=0,
                  codes=None,
                  ver=True,
                  pca="",
                  pca_n=0):
    """Computes the features using the dictionary d. If it doesn't exist, 
     computes them using Thierry's method.

     The improved pipeline is composed of 11 steps:

        1.- Beat Synchronous Chroma
        2.- L2-Norm
        3.- Shingle (PATCH_LEN: 75 x 12)
        4.- 2D-FFT
        5.- L2-Norm
        6.- Log-Scale
        7.- Sparse Coding
        8.- Shrinkage
        9.- Median Aggregation
        10.- Dimensionality Reduction
        11.- L2-Norm

    Thierry's original method does not include steps 5, 6, 7, 8, and 11.
    """
    if d != "":
        fx = load_transform(d)
        K = int(d.split("_")[1].split("E")[1])
    else:
        K = PATCH_LEN

    if codes is None:
        compute_codes = True
        codes = np.ones((len(track_ids), K)) * np.nan
    else:
        compute_codes = False
        K = codes[0].shape[0]
    if lda_file is not None:
        if lda_n == 0: n_comp = 50
        elif lda_n == 1: n_comp = 100
        elif lda_n == 2: n_comp = 200
    else:
        n_comp = K

    if pca != "":
        pca = utils.load_pickle(pca)
        pca = pca[pca_n]

    final_feats = np.ones((codes.shape[0], n_comp)) * np.nan
    orig_feats = []
    for cnt, tid in enumerate(track_ids):
        if compute_codes:
            path = utils.path_from_tid(maindir, tid)

            # 1.- Beat Synchronous Chroma
            # 2.- L2-Norm
            # 3.- Shingle (PATCH_LEN: 75 x 12)
            # 4.- 2D-FFT
            feats = utils.extract_feats(path)
            #orig_feats.append(feats)    # Store orig feats
            if feats is None:
                continue

            if d != "":
                # 5.- L2-Norm
                # 6.- Log-Scale
                # 7.- Sparse Coding
                # 8.- Shrinkage
                H = fx(feats)
            else:
                H = feats
            # 9.- Median Aggregation
            H = np.median(H, axis=0)
        else:
            H = codes[cnt]

        if compute_codes:
            codes[cnt] = H.copy()

        if pca != "":
            H = pca.transform(H)

        # Apply LDA if needed
        if lda_file is not None:
            #H = dan_tools.chromnorm(H.reshape(H.shape[0], 1)).squeeze()
            # 10.- Dimensionality Reduction
            H = lda_file[lda_n].transform(H)

        # 11.- L2-Norm
        final_feats[cnt] = dan_tools.chromnorm(H.reshape(H.shape[0],
                                                         1)).squeeze()

        if ver:
            if cnt % 50 == 1:
                logger.info("----Computing features %.1f%%" % \
                            (cnt/float(len(track_ids)) * 100))

    if d == "":
        d = "orig"  # For saving purposes

    # Save codes
    utils.create_dir("results")
    if compute_codes:
        utils.save_pickle(codes,
                          "results/codes-" + os.path.basename(d) + ".pk")

    # Save features
    #utils.save_pickle(orig_feats, "results/feats-" + os.path.basename(d) + ".pk")

    logger.info("Features Computed")
    return final_feats
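
Steps 3 and 4 of the docstring (shingling the beat-synchronous chroma into 75 x 12 patches and taking a 2D-FFT) can be sketched as below; the non-overlapping hop, the use of the FFT magnitude, and the fake chromagram are assumptions made for illustration.

import numpy as np

PATCH_BEATS = 75  # 75-beat x 12-bin chroma shingles (PATCH_LEN = 75 * 12)


def shingle_2dfft(chroma, patch_beats=PATCH_BEATS):
    """chroma: (n_beats, 12) beat-synchronous chromagram."""
    patches = []
    for start in range(0, chroma.shape[0] - patch_beats + 1, patch_beats):
        patch = chroma[start:start + patch_beats]  # 3.- Shingle
        mag = np.abs(np.fft.fft2(patch))           # 4.- 2D-FFT (magnitude)
        patches.append(mag.flatten())
    return np.asarray(patches)                     # (n_patches, 900)


feats = shingle_2dfft(np.random.rand(300, 12))
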
Example #9
def fit_LDA_filter(maindir,
                   d,
                   codes_f,
                   N=9000,
                   n=9,
                   pca=None,
                   pca_n=0,
                   norm=False,
                   outlda="LDAs.pk",
                   lda_components=[50, 100, 200]):
    """Fits an LDA with a filtered version of the dataset, such that each
        clique contains at least n tracks."""

    import cover_id_test as CO

    clique_test = load_pickle("SHS/clique_ids_test.pk")
    clique_train = load_pickle("SHS/clique_ids_train.pk")
    track_test = load_pickle("SHS/track_ids_test.pk")
    track_train = load_pickle("SHS/track_ids_train.pk")

    # Resulting codes and their clique labels
    codes = []
    labels = []

    if pca is not None:
        P = load_pickle(pca)
        P = P[pca_n]

    C = CO.load_codes(codes_f, -1, 30)
    C = C[0]

    # Load the codes from the training set
    codestrain = load_pickle("results/codes-shs-train-k2045.pk")

    clique_idx = 0
    label_id = 1000001

    td = load_transform(d)

    while len(codes) < N:
        # Pick the tracks from the train set that belong to a
        # clique that has at least n tracks
        if clique_idx < len(clique_train):
            while clique_idx < len(clique_train) and \
                    len(np.where(clique_train == clique_train[clique_idx])[0]) < n:
                clique_idx += 1

            if clique_idx < len(
                    clique_train) and clique_train[clique_idx] != -2:
                for clique_id in \
                        np.where(clique_train == clique_train[clique_idx])[0]:
                    code = codestrain[clique_id]
                    if norm:
                        code = dan_tools.chromnorm(
                            code.reshape(code.shape[0], 1)).squeeze()
                    clique_train[clique_id] = -2
                    if code is None:
                        continue
                    if pca is not None:
                        code = P.transform(code)
                    codes.append(code)
                    labels.append(clique_idx)

            clique_idx += 1

        # Pick random tracks from the MSD and assign new labels
        else:
            clique_id = np.random.random_integers(0, len(C) - 1)
            while np.any(np.equal(C[clique_id],
                                  None)) or clique_test[clique_id] == -2:
                clique_id = np.random.random_integers(0, len(C) - 1)
            code = C[clique_id]
            if norm:
                code = dan_tools.chromnorm(code.reshape(code.shape[0],
                                                        1)).squeeze()
            if pca is not None:
                code = P.transform(code)
            codes.append(code)
            labels.append(label_id)
            label_id += 1
            clique_test[clique_id] = -2

        print "Computed %d out of %d codes" % (len(codes), N)

    codes_pk = "codes_filter_LDA_PCA.pk"
    cliques_pk = "cliques_filter_LDA_PCA.pk"
    save_pickle(codes, codes_pk)
    save_pickle(labels, cliques_pk)

    time.sleep(3)

    # fit LDA and save model
    fit_LDA_from_codes_file(codes_pk, cliques_pk, lda_components, outlda)
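
The final call, fit_LDA_from_codes_file, is not shown in these examples. A hedged sketch of what such a step could look like with scikit-learn is given below: one LinearDiscriminantAnalysis per requested dimensionality, pickled as a list. The function name, file name, and toy data are illustrative, not the project's actual helper.

import pickle

import numpy as np
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis


def fit_lda_models(codes, labels, n_components_list=(50, 100, 200),
                   out_path="LDAs-sketch.pk"):
    """Fit one LDA per dimensionality in n_components_list and pickle the list."""
    codes = np.asarray(codes)
    labels = np.asarray(labels)
    models = []
    for n in n_components_list:
        lda = LinearDiscriminantAnalysis(n_components=n)
        lda.fit(codes, labels)
        models.append(lda)
    with open(out_path, "wb") as f:
        pickle.dump(models, f)
    return models


# Toy call: 250 classes with 2 samples each and 512-dimensional codes, so that
# n_components up to 200 stays within min(n_classes - 1, n_features).
models = fit_lda_models(np.random.rand(500, 512), np.repeat(np.arange(250), 2))
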