def score(feats, clique_ids, lda_idx=0, stats_len=None, ver=True):
    """Compute the scores of the entire train dataset.

    Every track whose clique id is not -1 is used as a query: all tracks are
    sorted by Euclidean distance to the query and the positions at which
    tracks of the same clique appear in that ordering are recorded.

    Args:
        feats: 2-D array-like with one feature vector per track.
        clique_ids: 1-D array-like of clique ids aligned with ``feats``.
            Tracks with id -1 are skipped as queries.
        lda_idx: Unused; kept so that existing callers keep working.
        stats_len: Length of the returned list. Defaults to ``len(feats)``.
        ver: When True, log running metrics every 400 queries.

    Returns:
        A list in which entry ``i`` holds the ``(k, 1)`` integer array of
        positions of the other clique members for query ``i``, or ``np.inf``
        when the track was not used as a query or no other member was found.
    """
    # Coerce once so plain lists work too: the fancy indexing below
    # (``clique_ids[order]``) is only defined for numpy arrays.
    feats = np.asarray(feats)
    clique_ids = np.asarray(clique_ids)

    stats = [np.inf] * (len(feats) if stats_len is None else stats_len)

    # For each track id that has a clique id
    q = 0
    for i, clique_id in enumerate(clique_ids):
        if clique_id == -1:
            continue
        D = distance.cdist(feats[i][np.newaxis, :], feats,
                           metric="euclidean")
        order = np.argsort(D)[0]
        sorted_cliques = clique_ids[order]
        # The first match is dropped; presumably it is the query itself,
        # which sits at distance zero.
        r = np.argwhere(sorted_cliques == clique_id)[1:]
        if len(r) > 0:
            stats[i] = r
        q += 1
        if ver and q % 400 == 0:
            logger.info('After %d queries: average rank per track: %.2f, '
                        'clique: %.2f, MAP: %.2f%%',
                        q,
                        anst.average_rank_per_track(stats),
                        anst.average_rank_per_clique(stats),
                        anst.mean_average_precision(stats) * 100)
    return stats
def score(feats, clique_ids, lda_idx=0, stats_len=None, ver=True):
    """Compute the scores of the entire train dataset.

    Each track with a clique id different from -1 acts as a query. All
    tracks are ranked by Euclidean distance to it, and the positions of the
    tracks sharing its clique id are stored.

    Args:
        feats: 2-D array-like with one feature vector per track.
        clique_ids: 1-D array-like of clique ids aligned with ``feats``.
            Tracks with id -1 are skipped as queries.
        lda_idx: Unused; kept so that existing callers keep working.
        stats_len: Length of the returned list. Defaults to ``len(feats)``.
        ver: When True, log running metrics every 400 queries.

    Returns:
        A list in which entry ``i`` holds the ``(k, 1)`` integer array of
        positions of the other clique members for query ``i``, or ``np.inf``
        when the track was not used as a query or no other member was found.
    """
    # numpy fancy indexing is used below, so accept any sequence by
    # converting it up front instead of failing on plain lists.
    feats = np.asarray(feats)
    clique_ids = np.asarray(clique_ids)

    if stats_len is None:
        stats_len = len(feats)
    stats = [np.inf] * stats_len

    # For each track id that has a clique id
    q = 0
    for i, clique_id in enumerate(clique_ids):
        if clique_id == -1:
            continue
        query = feats[i][np.newaxis, :]
        D = distance.cdist(query, feats, metric="euclidean")
        order = np.argsort(D)[0]
        sorted_cliques = clique_ids[order]
        # Skip the first hit; presumably that is the query track itself.
        r = np.argwhere(sorted_cliques == clique_id)[1:]
        if len(r) > 0:
            stats[i] = r
        q += 1
        if ver and q % 400 == 0:
            logger.info('After %d queries: average rank per track: %.2f, '
                        'clique: %.2f, MAP: %.2f%%',
                        q,
                        anst.average_rank_per_track(stats),
                        anst.average_rank_per_clique(stats),
                        anst.mean_average_precision(stats) * 100)
    return stats
def score(feats, clique_ids, N=5236, lda_idx=0):
    """Compute the ranking statistics of the labelled tracks over the MSD.

    Every track whose clique id is not -1 is used as a query: all tracks are
    sorted by Euclidean distance to it and the positions of the tracks that
    share its clique id are recorded. Results are stored by query number
    (not by track index).

    Args:
        feats: 2-D array-like with one feature vector per track.
        clique_ids: 1-D array-like of clique ids aligned with ``feats``.
            Tracks with id -1 are skipped as queries.
        N: Initial length of the returned list, i.e. the expected number of
            queries.
        lda_idx: Unused; kept so that existing callers keep working.

    Returns:
        A list of at least ``N`` entries. Entry ``q`` holds the ``(k, 1)``
        integer array of positions of the other clique members for the
        q-th query, or ``np.inf`` when none was found.
    """
    # Accept plain sequences: the fancy indexing below needs numpy arrays.
    feats = np.asarray(feats)
    clique_ids = np.asarray(clique_ids)

    stats = [np.inf] * N

    # For each track id that has a clique id
    logger.info("Computing scores for the MSD...")
    q = 0
    for i, clique_id in enumerate(clique_ids):
        if clique_id == -1:
            continue
        D = distance.cdist(feats[i][np.newaxis, :], feats,
                           metric="euclidean")
        order = np.argsort(D)[0]
        sorted_cliques = clique_ids[order]
        # The first match is dropped; presumably it is the query itself.
        r = np.argwhere(sorted_cliques == clique_id)[1:]
        if len(r) > 0:
            # Grow the list rather than raising IndexError when there are
            # more queries than the N the caller anticipated.
            if q >= len(stats):
                stats.extend([np.inf] * (q + 1 - len(stats)))
            stats[q] = r
        q += 1
        if q % 400 == 0:
            logger.info('After %d queries: average rank per track: %.2f'
                        ', clique: %.2f, MAP: %.2f%%',
                        q,
                        anst.average_rank_per_track(stats),
                        anst.average_rank_per_clique(stats),
                        anst.mean_average_precision(stats) * 100)
    return stats
def score(feats, clique_ids, N=5236, lda_idx=0):
    """Compute the ranking statistics of the labelled tracks over the MSD.

    Each track with a clique id different from -1 acts as a query. All
    tracks are ranked by Euclidean distance to it and the positions of the
    tracks sharing its clique id are stored under the query number.

    Args:
        feats: 2-D array-like with one feature vector per track.
        clique_ids: 1-D array-like of clique ids aligned with ``feats``.
            Tracks with id -1 are skipped as queries.
        N: Initial length of the returned list, i.e. the expected number of
            queries.
        lda_idx: Unused; kept so that existing callers keep working.

    Returns:
        A list of at least ``N`` entries. Entry ``q`` holds the ``(k, 1)``
        integer array of positions of the other clique members for the
        q-th query, or ``np.inf`` when none was found.
    """
    # Convert up front so that lists are accepted as well as arrays.
    feats = np.asarray(feats)
    clique_ids = np.asarray(clique_ids)

    stats = [np.inf] * N

    # For each track id that has a clique id
    logger.info("Computing scores for the MSD...")
    q = 0
    for i, clique_id in enumerate(clique_ids):
        if clique_id == -1:
            continue
        query = feats[i][np.newaxis, :]
        D = distance.cdist(query, feats, metric="euclidean")
        order = np.argsort(D)[0]
        sorted_cliques = clique_ids[order]
        # Skip the first hit; presumably that is the query track itself.
        r = np.argwhere(sorted_cliques == clique_id)[1:]
        if len(r) > 0:
            # More queries than N would otherwise raise IndexError here.
            missing = q + 1 - len(stats)
            if missing > 0:
                stats.extend([np.inf] * missing)
            stats[q] = r
        q += 1
        if q % 400 == 0:
            logger.info('After %d queries: average rank per track: %.2f'
                        ', clique: %.2f, MAP: %.2f%%',
                        q,
                        anst.average_rank_per_track(stats),
                        anst.average_rank_per_clique(stats),
                        anst.mean_average_precision(stats) * 100)
    return stats
def main():
    """Evaluate the test SHS over the MSD, or compute the MSD codes.

    When ``-codes`` is given, the features are loaded (from a single pickle
    file, or through ``load_codes`` when the path is not a file), scored
    with ``score`` and the statistics are saved to ``stats.pk``.

    Otherwise the codes are computed in ``-N`` parallel processes with
    ``compute_codes`` and the program exits without scoring.
    """
    # Module-level models, assigned before the pool is created; presumably
    # read by ``compute_codes`` in the worker processes.
    global lda
    global pca

    # Args parser
    parser = argparse.ArgumentParser(
        description="Evaluates the average rank and mean AP for the test SHS "
                    "over the entire MSD",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("msd_dir", action="store",
                        help="Million Song Dataset main directory")
    parser.add_argument("-dictfile", action="store", default="",
                        help="Pickle to the learned dictionary")
    parser.add_argument("-outdir", action="store", default="msd_codes",
                        help="Output directory for the features")
    parser.add_argument("-N", action="store", default=10, type=int,
                        help="Number of processors to use when computing "
                             "the codes for 1M tracks,")
    parser.add_argument("-lda", action="store", default=None,
                        help="LDA file")
    parser.add_argument("-pca", nargs=2, metavar=('f.pkl', 'n'),
                        default=(None, 0),
                        help="pca model saved in a pickle file, "
                             "use n dimensions")
    parser.add_argument("-codes", action="store", nargs=2,
                        default=[None, 0], dest="codesdir",
                        metavar=("msd_codes/", "n"),
                        help="Path to the folder with all the codes and "
                             "version to evaluate")
    parser.add_argument("-orig_codes", action="store", default=None,
                        dest="origcodesdir",
                        help="Path to the folder with all the codes without "
                             "dimensionality reduction")
    parser.add_argument("-norm", action="store_true", dest="norm",
                        default=False,
                        help="Normalize before LDA/PCA or not")
    args = parser.parse_args()

    start_time = time.time()
    maindir = args.msd_dir
    shsf = "SHS/shs_dataset_test.txt"

    # Sanity checks
    utils.assert_file(maindir)
    utils.assert_file(shsf)
    utils.create_dir(args.outdir)

    # Read all tracks (the cliques themselves are not needed here)
    _, all_tracks = utils.read_shs_file(shsf)
    track_ids = utils.load_pickle("SHS/track_ids_test.pk")
    clique_ids = utils.load_pickle("SHS/clique_ids_test.pk")

    # Read codes file
    codesdir = args.codesdir[0]
    if codesdir is not None:
        if os.path.isfile(codesdir):
            # Single pickle holding (feats, track_ids, clique_ids)
            c = utils.load_pickle(codesdir)
            feats = c[0]
            track_ids = c[1]
            clique_ids = c[2]
        else:
            feats, track_ids, clique_ids = load_codes(
                codesdir, lda_idx=int(args.codesdir[1]))
        logger.info("Codes files read")
        # Parenthesised form prints the same on Python 2 and 3
        print(feats.shape)
    else:
        # Read PCA file
        if args.pca[0] is not None:
            pca = utils.load_pickle(args.pca[0])[int(args.pca[1])]

        # Read LDA file
        lda_file = args.lda
        if lda_file is not None:
            lda = utils.load_pickle(lda_file)

        utils.assert_file(args.dictfile)

        # Prepare multiprocessing computation: one job per process, each
        # identified by its index "N"
        pca_n = int(args.pca[1])
        pool = Pool(processes=args.N)
        jobs = []
        for n in range(args.N):
            jobs.append({
                "track_ids": track_ids,
                "maindir": maindir,
                "d": args.dictfile,
                "N": n,
                "clique_ids": clique_ids,
                "outdir": args.outdir,
                "origcodesdir": args.origcodesdir,
                "pca_n": pca_n,
                "norm": args.norm,
            })

        # Start computing the codes; always release the worker processes
        try:
            pool.map(compute_codes, jobs)
        finally:
            pool.close()
            pool.join()

        # Done!
        logger.info("Codes computation done!")
        logger.info("Took %.2f seconds" % (time.time() - start_time))
        sys.exit()

    # Scores
    feats, clique_ids, track_ids = utils.clean_feats(feats, clique_ids,
                                                     track_ids)
    stats = score(feats, clique_ids, N=len(all_tracks))

    # TODO: change file name
    utils.save_pickle(stats, "stats.pk")

    # Done
    logger.info('Average rank per track: %.2f, clique: %.2f, MAP: %.2f%%'
                % (anst.average_rank_per_track(stats),
                   anst.average_rank_per_clique(stats),
                   anst.mean_average_precision(stats) * 100))
    logger.info("Done! Took %.2f seconds" % (time.time() - start_time))
def eval_models(outfile="results/eval_models.txt"):
    """Evaluate every LDA model found on disk and append results to a file.

    For each ``LDAs_*`` file in the models directory, the external script
    ``./cover_id_test.py`` is run once to compute the codes and then once
    per LDA dimension to score them. After each scoring run ``stats.pk`` is
    read and one tab-separated line of metrics is appended to ``outfile``.

    Args:
        outfile: Path of the text file the result lines are appended to.
    """
    # Read PCA files
    basedir = "/Volumes/Audio/LargeScaleCoverID/models/"
    pca_files = glob.glob(os.path.join(basedir, "PCAs_*"))
    pca_files.append(None)  # To not use PCA

    # PCA dimensions per file
    pca_dims = np.asarray([50, 100, 200, 500, 1000])

    # Read LDA files
    lda_files = glob.glob(os.path.join(basedir, "LDAs_*"))

    # LDA dimensions per file
    lda_dims = np.asarray([50, 100, 200])

    # Original codes with k=2045
    origcodes = "msd_codes_k2045"
    k = 2045
    d = "models/BasisProjection2_kE2045_actEdot_shkE0x200_anormETrue.pk"

    # Python script to compute codes and scores
    covertest = "./cover_id_test.py"

    # Eval all combinations of LDA/PCA
    for lda_file in lda_files:
        print("Computing %s" % lda_file)

        # Get parameters for saving results
        pca_dim = get_param_from_filename(lda_file, "pcaE", "int")
        if pca_dim is None:
            m = -1
        else:
            m = get_param_from_filename(lda_file, "mE", "int")
            pca_idx = np.argwhere(pca_dims == pca_dim)[0][0]
        n = get_param_from_filename(lda_file, "nE", "int")
        N = get_param_from_filename(lda_file, "NE", "int")

        # NOTE(review): which PCA file is picked depends on the order
        # returned by glob -- confirm that index 1 is the normalised model.
        if get_param_from_filename(lda_file, "normE", "bool"):
            norm = "-norm"
            pca_file = pca_files[1]
        else:
            norm = ""
            pca_file = pca_files[0]

        # Set up codes computation. Only the ".pk" suffix is removed:
        # str.strip(".pk") would strip any run of '.', 'p' and 'k'
        # characters from both ends of the name.
        model_name = os.path.basename(lda_file)
        if model_name.endswith(".pk"):
            model_name = model_name[:-len(".pk")]
        outdir = "msd_codes_" + model_name

        # Build the argument list directly so that paths containing
        # whitespace are not split into several arguments.
        cmd = [covertest, "-orig_codes", origcodes, "-lda", lda_file,
               "-outdir", outdir, "-dictfile", d]
        if pca_dim is not None:
            cmd += ["-pca", str(pca_file), "%d" % pca_idx]
        if norm:
            cmd.append(norm)
        cmd.append("MSD")
        print(" ".join(cmd))

        # Compute codes
        if subprocess.call(cmd) != 0:
            print("Codes computation failed for %s, skipping" % lda_file)
            continue

        # Compute scores
        for lda_idx, lda_dim in enumerate(lda_dims):
            cmd = [covertest, "-codes", outdir, "%d" % lda_idx, "MSD"]
            print(" ".join(cmd))
            if subprocess.call(cmd) != 0:
                # Do not record a stale stats.pk left by a previous run
                print("Scoring failed for %s (LDA dim %d), skipping"
                      % (lda_file, lda_dim))
                continue

            # Store temp results
            stats = load_pickle("stats.pk")

            # Get results
            AR = anst.average_rank_per_track(stats)
            MAP = anst.mean_average_precision(stats) * 100
            Pk1 = anst.average_precision_at_k(stats, 1) * 100
            Pk10 = anst.average_precision_at_k(stats, 10) * 100
            Pk100 = anst.average_precision_at_k(stats, 100) * 100

            # Store final results
            result = "%d\t%r\t%d\t%d\t%d\t%d\t%r\t" % \
                (k, pca_dim, m, lda_dim, n, N, norm)
            result += "%d\t%.2f\t%.2f\t%.2f\t%.2f\n" % \
                (AR, MAP, Pk1, Pk10, Pk100)
            with open(outfile, "a") as f:
                f.write(result)
def main():
    """Run cover song identification on the training SHS dataset.

    Reads the training SHS file, saves the track and clique ids as pickles,
    computes the features with ``compute_feats`` (or loads them from the
    pickle given with ``-f``), scores them with ``score`` and saves the
    resulting statistics under ``results/``.
    """
    # Args parser
    parser = argparse.ArgumentParser(
        description="Cover song ID on the training Second Hand Song dataset",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("msd_dir", action="store",
                        help="Million Song Dataset main directory")
    parser.add_argument("-dictfile", action="store", default="",
                        help="Pickle to the learned dictionary")
    parser.add_argument("-lda", action="store", nargs=2, default=[None, 0],
                        help="LDA file and version",
                        metavar=('lda.pkl', 'n'))
    parser.add_argument("-codes", action="store", default=None,
                        dest="codesfile",
                        help="Pickle to the features file")
    parser.add_argument("-f", action="store", default="", dest="featfile",
                        help="Pickle to the final features")
    parser.add_argument("-pca", nargs=2, metavar=('f.pkl', 'n'),
                        default=("", 0),
                        help="pca model saved in a pickle file, "
                             "use n dimensions")
    args = parser.parse_args()

    start_time = time.time()
    maindir = args.msd_dir
    shsf = "SHS/shs_dataset_train.txt"
    dictfile = args.dictfile

    # Sanity checks
    utils.assert_file(dictfile)
    utils.assert_file(maindir)
    utils.assert_file(shsf)

    # Read clique ids and track ids. list() keeps the ids an indexable,
    # picklable sequence on Python 3 as well, where keys() is only a view.
    cliques, all_tracks = utils.read_shs_file(shsf)
    track_ids = list(all_tracks)
    clique_ids = np.asarray(utils.compute_clique_idxs(track_ids, cliques))
    logger.info("Track ids and clique ids read")
    utils.save_pickle(clique_ids, "SHS/clique_ids_train.pk")
    utils.save_pickle(track_ids, "SHS/track_ids_train.pk")

    # Read LDA file
    lda_model = args.lda[0]
    if lda_model is not None:
        lda_model = utils.load_pickle(lda_model)
        logger.info("LDA file read")

    # Read codes file
    codes = args.codesfile
    if codes is not None:
        codes = utils.load_pickle(codes)
        logger.info("Codes file read")

    # Compute features if needed
    pca_n = int(args.pca[1])
    if args.featfile == "":
        feats = compute_feats(track_ids, maindir, dictfile,
                              lda_file=lda_model, lda_n=int(args.lda[1]),
                              codes=codes, pca=args.pca[0], pca_n=pca_n)
    else:
        feats = utils.load_pickle(args.featfile)

    # NOTE: a separate "apply PCA" step used to follow here, but it was
    # guarded by ``and False`` and could never run; the PCA file and
    # dimension are only handed to compute_feats above.

    # Scores
    feats, clique_ids, track_ids = utils.clean_feats(feats, clique_ids,
                                                     track_ids)
    stats = score(feats, clique_ids)

    # Save data
    if dictfile == "":
        dictfile = "thierry"  # For saving purposes
    utils.save_pickle(stats, "results/stats-" + os.path.basename(dictfile) +
                      ".pk")

    # Done
    logger.info('Average rank per track: %.2f, clique: %.2f, MAP: %.2f%%'
                % (anst.average_rank_per_track(stats),
                   anst.average_rank_per_clique(stats),
                   anst.mean_average_precision(stats) * 100))
    logger.info("Done! Took %.2f seconds" % (time.time() - start_time))
def main():
    """Run cover song identification on the training SHS dataset.

    The training SHS file is read and the track/clique ids are saved as
    pickles. Features are either computed with ``compute_feats`` or loaded
    from the pickle given with ``-f``; they are then scored with ``score``
    and the statistics are saved under ``results/``.
    """
    # Args parser
    parser = argparse.ArgumentParser(
        description="Cover song ID on the training Second Hand Song dataset",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("msd_dir", action="store",
                        help="Million Song Dataset main directory")
    parser.add_argument("-dictfile", action="store", default="",
                        help="Pickle to the learned dictionary")
    parser.add_argument("-lda", action="store", nargs=2, default=[None, 0],
                        help="LDA file and version",
                        metavar=('lda.pkl', 'n'))
    parser.add_argument("-codes", action="store", default=None,
                        dest="codesfile",
                        help="Pickle to the features file")
    parser.add_argument("-f", action="store", default="", dest="featfile",
                        help="Pickle to the final features")
    parser.add_argument("-pca", nargs=2, metavar=('f.pkl', 'n'),
                        default=("", 0),
                        help="pca model saved in a pickle file, "
                             "use n dimensions")
    args = parser.parse_args()

    start_time = time.time()
    maindir = args.msd_dir
    shsf = "SHS/shs_dataset_train.txt"
    dictfile = args.dictfile

    # Sanity checks
    utils.assert_file(dictfile)
    utils.assert_file(maindir)
    utils.assert_file(shsf)

    # Read clique ids and track ids. A real list (rather than keys()) stays
    # indexable and picklable on Python 3 too.
    cliques, all_tracks = utils.read_shs_file(shsf)
    track_ids = list(all_tracks)
    clique_ids = np.asarray(utils.compute_clique_idxs(track_ids, cliques))
    logger.info("Track ids and clique ids read")
    utils.save_pickle(clique_ids, "SHS/clique_ids_train.pk")
    utils.save_pickle(track_ids, "SHS/track_ids_train.pk")

    # Read LDA file
    lda_model = args.lda[0]
    if lda_model is not None:
        lda_model = utils.load_pickle(lda_model)
        logger.info("LDA file read")

    # Read codes file
    codes = args.codesfile
    if codes is not None:
        codes = utils.load_pickle(codes)
        logger.info("Codes file read")

    # Compute features if needed
    pca_n = int(args.pca[1])
    if args.featfile == "":
        feats = compute_feats(track_ids, maindir, dictfile,
                              lda_file=lda_model, lda_n=int(args.lda[1]),
                              codes=codes, pca=args.pca[0], pca_n=pca_n)
    else:
        feats = utils.load_pickle(args.featfile)

    # NOTE: the former stand-alone PCA projection was disabled with
    # ``and False`` and therefore unreachable; PCA settings are only
    # forwarded to compute_feats above.

    # Scores
    feats, clique_ids, track_ids = utils.clean_feats(feats, clique_ids,
                                                     track_ids)
    stats = score(feats, clique_ids)

    # Save data
    if dictfile == "":
        dictfile = "thierry"  # For saving purposes
    utils.save_pickle(stats, "results/stats-" + os.path.basename(dictfile) +
                      ".pk")

    # Done
    logger.info('Average rank per track: %.2f, clique: %.2f, MAP: %.2f%%'
                % (anst.average_rank_per_track(stats),
                   anst.average_rank_per_clique(stats),
                   anst.mean_average_precision(stats) * 100))
    logger.info("Done! Took %.2f seconds" % (time.time() - start_time))
def main():
    """Evaluate the test SHS over the MSD, or compute the MSD codes.

    With ``-codes``, features are loaded (from one pickle file, or through
    ``load_codes`` when the path is not a file), scored with ``score`` and
    the statistics are written to ``stats.pk``.

    Without ``-codes``, the codes are computed by ``compute_codes`` in
    ``-N`` parallel processes and the program exits without scoring.
    """
    # Module-level models, set before the pool is created; presumably
    # consumed by ``compute_codes`` inside the worker processes.
    global lda
    global pca

    # Args parser
    parser = argparse.ArgumentParser(
        description="Evaluates the average rank and mean AP for the test SHS "
                    "over the entire MSD",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("msd_dir", action="store",
                        help="Million Song Dataset main directory")
    parser.add_argument("-dictfile", action="store", default="",
                        help="Pickle to the learned dictionary")
    parser.add_argument("-outdir", action="store", default="msd_codes",
                        help="Output directory for the features")
    parser.add_argument("-N", action="store", default=10, type=int,
                        help="Number of processors to use when computing "
                             "the codes for 1M tracks,")
    parser.add_argument("-lda", action="store", default=None,
                        help="LDA file")
    parser.add_argument("-pca", nargs=2, metavar=('f.pkl', 'n'),
                        default=(None, 0),
                        help="pca model saved in a pickle file, "
                             "use n dimensions")
    parser.add_argument("-codes", action="store", nargs=2,
                        default=[None, 0], dest="codesdir",
                        metavar=("msd_codes/", "n"),
                        help="Path to the folder with all the codes and "
                             "version to evaluate")
    parser.add_argument("-orig_codes", action="store", default=None,
                        dest="origcodesdir",
                        help="Path to the folder with all the codes without "
                             "dimensionality reduction")
    parser.add_argument("-norm", action="store_true", dest="norm",
                        default=False,
                        help="Normalize before LDA/PCA or not")
    args = parser.parse_args()

    start_time = time.time()
    maindir = args.msd_dir
    shsf = "SHS/shs_dataset_test.txt"

    # Sanity checks
    utils.assert_file(maindir)
    utils.assert_file(shsf)
    utils.create_dir(args.outdir)

    # Read all tracks (the cliques themselves are unused in this script)
    _, all_tracks = utils.read_shs_file(shsf)
    track_ids = utils.load_pickle("SHS/track_ids_test.pk")
    clique_ids = utils.load_pickle("SHS/clique_ids_test.pk")

    # Read codes file
    codesdir = args.codesdir[0]
    if codesdir is not None:
        if os.path.isfile(codesdir):
            # One pickle holding (feats, track_ids, clique_ids)
            c = utils.load_pickle(codesdir)
            feats = c[0]
            track_ids = c[1]
            clique_ids = c[2]
        else:
            feats, track_ids, clique_ids = load_codes(
                codesdir, lda_idx=int(args.codesdir[1]))
        logger.info("Codes files read")
        # Parenthesised form prints identically on Python 2 and 3
        print(feats.shape)
    else:
        # Read PCA file
        if args.pca[0] is not None:
            pca = utils.load_pickle(args.pca[0])[int(args.pca[1])]

        # Read LDA file
        lda_file = args.lda
        if lda_file is not None:
            lda = utils.load_pickle(lda_file)

        utils.assert_file(args.dictfile)

        # Prepare multiprocessing computation: one job description per
        # process, distinguished by its index "N"
        pca_n = int(args.pca[1])
        pool = Pool(processes=args.N)
        jobs = []
        for n in range(args.N):
            jobs.append({
                "track_ids": track_ids,
                "maindir": maindir,
                "d": args.dictfile,
                "N": n,
                "clique_ids": clique_ids,
                "outdir": args.outdir,
                "origcodesdir": args.origcodesdir,
                "pca_n": pca_n,
                "norm": args.norm,
            })

        # Start computing the codes; release the workers even on failure
        try:
            pool.map(compute_codes, jobs)
        finally:
            pool.close()
            pool.join()

        # Done!
        logger.info("Codes computation done!")
        logger.info("Took %.2f seconds" % (time.time() - start_time))
        sys.exit()

    # Scores
    feats, clique_ids, track_ids = utils.clean_feats(feats, clique_ids,
                                                     track_ids)
    stats = score(feats, clique_ids, N=len(all_tracks))

    # TODO: change file name
    utils.save_pickle(stats, "stats.pk")

    # Done
    logger.info('Average rank per track: %.2f, clique: %.2f, MAP: %.2f%%'
                % (anst.average_rank_per_track(stats),
                   anst.average_rank_per_clique(stats),
                   anst.mean_average_precision(stats) * 100))
    logger.info("Done! Took %.2f seconds" % (time.time() - start_time))