def fit_PCA(maindir, d, origcodes_f="msd_codes_k2045", outpca="PCA-codes.pk",
            N=50000, norm=False, pca_components=[50, 100, 200, 500, 1000]):
    """Fits a PCA transformation with N codes."""
    import binary_task as B
    import cover_id_test as CO

    logger = configure_logger()
    td = load_transform(d)
    codes = np.ones((N, 2045)) * np.nan
    k = 0

    # Load codes
    origcodes, track_ids, clique_ids = CO.load_codes(origcodes_f, -1, 30)

    while k < N:
        track_idx = np.random.random_integers(0, len(track_ids) - 1)
        while track_ids[track_idx] == -2:
            track_idx = np.random.random_integers(0, len(track_ids) - 1)
        track_id = track_ids[track_idx]
        code = origcodes[track_idx]
        if code is not None:
            if norm:
                code = dan_tools.chromnorm(
                    code.reshape(code.shape[0], 1)).squeeze()
            codes[k] = code

        # Marked as used
        track_ids[track_idx] = -2
        k += 1

        if k % 100 == 0:
            logger.info("----Computing features %.1f%%" %
                        (k / float(N) * 100))

    # Remove nans
    nan_idx = np.unique(np.where(np.isnan(codes))[0])
    codes = np.delete(codes, nan_idx, axis=0)

    # Fit PCA
    res = []
    codes = np.asarray(codes)
    for c in pca_components:
        pca = PCA(n_components=c)
        pca.fit(codes)
        res.append(pca)

    # Save result
    save_pickle(res, outpca)
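
# Usage sketch (not part of the original module): loading the list of PCA
# models written by fit_PCA and projecting a single 2045-dimensional code
# vector. The helper name and the reshape to a (1, n_features) row are
# assumptions; the default file name matches outpca above.
def apply_fitted_pca(code, pca_file="PCA-codes.pk", model_idx=0):
    """Project one code vector with the model_idx-th fitted PCA model."""
    pcas = load_pickle(pca_file)  # list of fitted PCA objects
    return pcas[model_idx].transform(code.reshape(1, -1)).squeeze()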
def compute_codes_it(track_ids, maindir, d, clique_ids, start_idx, end_idx,
                     origcodes=None, norm=False):
    """Computes the features based on Humphrey, Nieto and Bello, 2013.
    Dimensionality reduction using LDA of 50, 100, and 200 components."""
    fx = load_transform(d)
    res = []
    K = int(d.split("_")[1].split("E")[1])

    # Init codes
    codes = []
    if lda is not None:
        lda_components = [50, 100, 200]
        for n_comp in lda_components:
            codes.append(np.ones((end_idx - start_idx, n_comp)) * np.nan)
    else:
        codes.append(np.ones((end_idx - start_idx, K)) * np.nan)

    for i, tid in enumerate(track_ids[start_idx:end_idx]):
        if origcodes is None:
            path = utils.path_from_tid(maindir, tid)
            feats = utils.extract_feats(path)
            if feats is None:
                continue
            code = np.median(fx(feats), axis=0)
        else:
            code = origcodes[i]

        if norm:
            code = dan_tools.chromnorm(
                code.reshape(code.shape[0], 1)).squeeze()
        if pca is not None:
            code = pca.transform(code)

        if lda is not None:
            for lda_idx, n_comp in enumerate(lda_components):
                tmp = lda[lda_idx].transform(code)
                codes[lda_idx][i] = dan_tools.chromnorm(
                    tmp.reshape(tmp.shape[0], 1)).squeeze()
        else:
            codes[0][i] = code

        if i % 1000 == 0:
            logger.info("Computed %d of %d track(s)" %
                        (i, end_idx - start_idx))

    res = (codes, track_ids[start_idx:end_idx], clique_ids[start_idx:end_idx])
    return res
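
# Sketch (an assumption, not from the original code) of how compute_codes_it
# could be driven in fixed-size chunks so that long track lists are processed
# incrementally; the chunk size of 1000 is illustrative.
def compute_codes_in_chunks(track_ids, maindir, d, clique_ids,
                            chunk_size=1000, origcodes=None, norm=False):
    """Yield one (codes, track_ids, clique_ids) tuple per chunk."""
    for start_idx in xrange(0, len(track_ids), chunk_size):
        end_idx = min(start_idx + chunk_size, len(track_ids))
        chunk_orig = None if origcodes is None else \
            origcodes[start_idx:end_idx]
        yield compute_codes_it(track_ids, maindir, d, clique_ids,
                               start_idx, end_idx,
                               origcodes=chunk_orig, norm=norm)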
def main():
    # Args parser
    parser = argparse.ArgumentParser(
        description="Evaluates the 500 binary queries from the SHS data set",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("msd_dir", action="store",
                        help="Million Song Dataset main directory")
    parser.add_argument("-dictfile", action="store", default="",
                        help="Pickle to the learned dictionary")
    parser.add_argument("-lda", action="store", nargs=2, default=[None, 0],
                        help="LDA file and version", metavar=('lda.pkl', 'n'))
    parser.add_argument("-pca", nargs=2, metavar=('f.pkl', 'n'),
                        default=("", 0),
                        help="pca model saved in a pickle file, "
                             "use n dimensions")

    # Parse
    args = parser.parse_args()

    # Track time
    start_time = time.time()

    maindir = args.msd_dir
    queriesf = "SHS/list_500queries.txt"
    shsf = "SHS/shs_dataset_train.txt"
    lda = args.lda[0]
    lda_n = int(args.lda[1])
    pcafile = args.pca[0]
    pcadim = int(args.pca[1])

    # sanity checks
    utils.assert_file(maindir)
    utils.assert_file(queriesf)
    utils.assert_file(shsf)
    utils.assert_file(pcafile)

    # read queries
    queries = read_query_file(queriesf)

    # load pca
    trainedpca = None
    if pcafile != "":
        f = open(pcafile, 'rb')
        trainedpca = cPickle.load(f)
        f.close()
        assert pcadim > 0
        logger.info('trained pca loaded')

    # load lda
    if lda is not None:
        lda = utils.load_pickle(lda)

    # to keep stats
    results = []

    # iterate over queries
    logger.info("Starting the binary task...")

    # Get the dictionary transform
    td = load_transform(args.dictfile)

    for triplet in queries:
        # get features
        filenames = map(lambda tid: utils.path_from_tid(maindir, tid),
                        triplet)
        triplet_feats = map(
            lambda f: extract_feats(f, td=td, lda_file=lda, lda_n=lda_n),
            filenames)
        if None in triplet_feats:
            continue

        # Apply pca if needed
        if trainedpca:
            triplet_feats = map(
                lambda feat: trainedpca.apply_newdata(feat, ndims=pcadim),
                triplet_feats)
            assert triplet_feats[np.random.randint(3)].shape[0] == pcadim

        # Compute result
        res1 = triplet_feats[0] - triplet_feats[1]
        res1 = np.sum(res1 * res1)
        res2 = triplet_feats[0] - triplet_feats[2]
        res2 = np.sum(res2 * res2)
        if res1 < res2:
            results.append(1)
        else:
            results.append(0)

        # verbose
        if len(results) % 5 == 0:
            logger.info(' --- after %d queries, accuracy: %.1f %%' %
                        (len(results), 100. * np.mean(results)))

    # done
    logger.info('After %d queries, accuracy: %.1f %%' %
                (len(results), 100. * np.mean(results)))
    logger.info('Done! Took %.2f seconds' % (time.time() - start_time))
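
# Minimal sketch (not part of the original module) of the decision rule used
# in main(): given features for (query, cover, non-cover), a query counts as
# correct when the cover is closer to the query in squared Euclidean distance.
def triplet_is_correct(query_feat, cover_feat, noncover_feat):
    """Return 1 if the cover is closer to the query than the non-cover."""
    d_cover = np.sum((query_feat - cover_feat) ** 2)
    d_noncover = np.sum((query_feat - noncover_feat) ** 2)
    return 1 if d_cover < d_noncover else 0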
def fit_LDA_filter(maindir, d, codes_f, N=9000, n=9, pca=None, pca_n=0,
                   norm=False, outlda="LDAs.pk",
                   lda_components=[50, 100, 200]):
    """Fits an LDA with a filtered version of the dataset, such that each
    clique contains at least n tracks."""
    import cover_id_test as CO

    clique_test = load_pickle("SHS/clique_ids_test.pk")
    clique_train = load_pickle("SHS/clique_ids_train.pk")
    track_test = load_pickle("SHS/track_ids_test.pk")
    track_train = load_pickle("SHS/track_ids_train.pk")

    # Resulting codes and labels
    codes = []
    labels = []

    if pca is not None:
        P = load_pickle(pca)
        P = P[pca_n]

    C = CO.load_codes(codes_f, -1, 30)
    C = C[0]

    # Load the codes from the training set
    codestrain = load_pickle("results/codes-shs-train-k2045.pk")

    clique_idx = 0
    label_id = 1000001
    td = load_transform(d)
    while len(codes) < N:
        # Pick the tracks from the train set that belong to a
        # clique that has at least n tracks
        if clique_idx < len(clique_train):
            while clique_idx < len(clique_train) and \
                    len(np.where(clique_train ==
                                 clique_train[clique_idx])[0]) < n:
                clique_idx += 1
            if clique_idx < len(clique_train) and \
                    clique_train[clique_idx] != -2:
                for clique_id in \
                        np.where(clique_train == clique_train[clique_idx])[0]:
                    code = codestrain[clique_id]
                    clique_train[clique_id] = -2
                    if code is None:
                        continue
                    if norm:
                        code = dan_tools.chromnorm(
                            code.reshape(code.shape[0], 1)).squeeze()
                    if pca is not None:
                        code = P.transform(code)
                    codes.append(code)
                    labels.append(clique_idx)
            clique_idx += 1
        # Pick random tracks from the MSD and assign new labels
        else:
            clique_id = np.random.random_integers(0, len(C) - 1)
            while np.any(np.equal(C[clique_id], None)) or \
                    clique_test[clique_id] == -2:
                clique_id = np.random.random_integers(0, len(C) - 1)
            code = C[clique_id]
            if norm:
                code = dan_tools.chromnorm(
                    code.reshape(code.shape[0], 1)).squeeze()
            if pca is not None:
                code = P.transform(code)
            codes.append(code)
            labels.append(label_id)
            label_id += 1
            clique_test[clique_id] = -2

        print "Computed %d out of %d codes" % (len(codes), N)

    codes_pk = "codes_filter_LDA_PCA.pk"
    cliques_pk = "cliques_filter_LDA_PCA.pk"
    save_pickle(codes, codes_pk)
    save_pickle(labels, cliques_pk)
    time.sleep(3)

    # fit LDA and save model
    fit_LDA_from_codes_file(codes_pk, cliques_pk, lda_components, outlda)
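
# Small sketch (an assumption, not part of the original module) of the
# filtering criterion used above: a clique only qualifies for LDA training
# when it has at least n member tracks in clique_train. The -2 marker is
# excluded because it denotes already-used entries.
def count_qualifying_cliques(clique_train, n=9):
    """Return how many distinct cliques have at least n tracks."""
    clique_train = np.asarray(clique_train)
    ids, counts = np.unique(clique_train, return_counts=True)
    return int(np.sum(counts[ids != -2] >= n))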
def compute_feats(track_ids, maindir, d, lda_file=None, lda_n=0, codes=None,
                  ver=True, pca="", pca_n=0):
    """Computes the features using the dictionary d. If it doesn't exist,
    computes them using Thierry's method.

    The improved pipeline is composed of 11 steps:

        1.- Beat Synchronous Chroma
        2.- L2-Norm
        3.- Shingle (PATCH_LEN: 75 x 12)
        4.- 2D-FFT
        5.- L2-Norm
        6.- Log-Scale
        7.- Sparse Coding
        8.- Shrinkage
        9.- Median Aggregation
        10.- Dimensionality Reduction
        11.- L2-Norm

    Original method by Thierry doesn't include steps 5, 6, 7, 8, 11.
    """
    if d != "":
        fx = load_transform(d)
        K = int(d.split("_")[1].split("E")[1])
    else:
        K = PATCH_LEN

    if codes is None:
        compute_codes = True
        codes = np.ones((len(track_ids), K)) * np.nan
    else:
        compute_codes = False
        K = codes[0].shape[0]

    if lda_file is not None:
        if lda_n == 0:
            n_comp = 50
        elif lda_n == 1:
            n_comp = 100
        elif lda_n == 2:
            n_comp = 200
    else:
        n_comp = K

    if pca != "":
        pca = utils.load_pickle(pca)
        pca = pca[pca_n]

    final_feats = np.ones((codes.shape[0], n_comp)) * np.nan
    orig_feats = []
    for cnt, tid in enumerate(track_ids):
        if compute_codes:
            path = utils.path_from_tid(maindir, tid)

            # 1.- Beat Synchronous Chroma
            # 2.- L2-Norm
            # 3.- Shingle (PATCH_LEN: 75 x 12)
            # 4.- 2D-FFT
            feats = utils.extract_feats(path)
            #orig_feats.append(feats)  # Store orig feats
            if feats is None:
                continue

            if d != "":
                # 5.- L2-Norm
                # 6.- Log-Scale
                # 7.- Sparse Coding
                # 8.- Shrinkage
                H = fx(feats)
            else:
                H = feats

            # 9.- Median Aggregation
            H = np.median(H, axis=0)
        else:
            H = codes[cnt]

        if compute_codes:
            codes[cnt] = H.copy()

        if pca != "":
            H = pca.transform(H)

        # Apply LDA if needed
        if lda_file is not None:
            #H = dan_tools.chromnorm(H.reshape(H.shape[0], 1)).squeeze()
            # 10.- Dimensionality Reduction
            H = lda_file[lda_n].transform(H)

        # 11.- L2-Norm
        final_feats[cnt] = dan_tools.chromnorm(
            H.reshape(H.shape[0], 1)).squeeze()

        if ver:
            if cnt % 50 == 1:
                logger.info("----Computing features %.1f%%" %
                            (cnt / float(len(track_ids)) * 100))

    if d == "":
        d = "orig"  # For saving purposes

    # Save codes
    utils.create_dir("results")
    if compute_codes:
        utils.save_pickle(codes,
                          "results/codes-" + os.path.basename(d) + ".pk")

    # Save features
    #utils.save_pickle(orig_feats, "results/feats-" + os.path.basename(d) + ".pk")

    logger.info("Features Computed")
    return final_feats
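
# Usage sketch (assumed, with illustrative file names): computing the final
# features for a list of tracks with the 100-component LDA model (lda_n=1)
# and the 50-component PCA model (pca_n=0). The "LDAs.pk" and "PCA-codes.pk"
# names follow the defaults used elsewhere in this module.
def example_compute_feats(track_ids, maindir, d):
    ldas = utils.load_pickle("LDAs.pk")  # list of fitted LDA models
    return compute_feats(track_ids, maindir, d,
                         lda_file=ldas, lda_n=1,
                         pca="PCA-codes.pk", pca_n=0)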