def process_filelist_test(filelist=None, model=None, tmpfilename=None, npicks=None, winsize=None, finaldim=None, K=1, typecompress='picks'): """ Main function, process all files in the list (as long as their artist is in testartist) INPUT filelist - a list of song files model - h5 file containing feats and year for all train songs tmpfilename - where to save our processed features npicks - number of segments to pick per song winsize - size of each segment we pick finaldim - how many values do we keep K - param of KNN (default 1) typecompress - feature type, 'picks', 'corrcoeff' or 'cov' must be the same as in training """ # sanity check for arg in locals().values(): assert not arg is None, 'process_filelist_test, missing an argument, something still None' if os.path.isfile(tmpfilename): print 'ERROR: file', tmpfilename, 'already exists.' return if not os.path.isfile(model): print 'ERROR: model', model, 'does not exist.' return # create kdtree h5model = tables.openFile(model, mode='r') assert h5model.root.data.feats.shape[ 1] == finaldim, 'inconsistency in final dim' kd = ANN.kdtree(h5model.root.data.feats) # create outputfile output = tables.openFile(tmpfilename, mode='a') group = output.createGroup("/", 'data', 'TMP FILE FOR YEAR RECOGNITION') output.createEArray(group, 'year_real', tables.IntAtom(shape=()), (0, ), '', expectedrows=len(filelist)) output.createEArray(group, 'year_pred', tables.Float64Atom(shape=()), (0, ), '', expectedrows=len(filelist)) # random projection ndim = 12 # fixed in this dataset if typecompress == 'picks': randproj = RANDPROJ.proj_point5(ndim * winsize, finaldim) elif typecompress == 'corrcoeff' or typecompress == 'cov': randproj = RANDPROJ.proj_point5(ndim * ndim, finaldim) elif typecompress == 'avgcov': randproj = RANDPROJ.proj_point5(90, finaldim) else: assert False, 'Unknown type of compression: ' + str(typecompress) # go through files cnt_f = 0 for f in filelist: cnt_f += 1 if cnt_f % 5000 == 0: print 'TESTING FILE #' + str(cnt_f) # check file h5 = GETTERS.open_h5_file_read(f) artist_id = GETTERS.get_artist_id(h5) year = GETTERS.get_year(h5) track_id = GETTERS.get_track_id(h5) h5.close() if year <= 0: # probably useless but... continue if typecompress == 'picks': # we have a train artist with a song year, we're good bttimbre = get_bttimbre(f) if bttimbre is None: continue # we even have normal features, awesome! processed_feats = CBTF.extract_and_compress(bttimbre, npicks, winsize, finaldim, randproj=randproj) elif typecompress == 'corrcoeff': h5 = GETTERS.open_h5_file_read(f) timbres = GETTERS.get_segments_timbre(h5).T h5.close() processed_feats = CBTF.corr_and_compress(timbres, finaldim, randproj=randproj) elif typecompress == 'cov': h5 = GETTERS.open_h5_file_read(f) timbres = GETTERS.get_segments_timbre(h5).T h5.close() processed_feats = CBTF.cov_and_compress(timbres, finaldim, randproj=randproj) elif typecompress == 'avgcov': h5 = GETTERS.open_h5_file_read(f) timbres = GETTERS.get_segments_timbre(h5).T h5.close() processed_feats = CBTF.avgcov_and_compress(timbres, finaldim, randproj=randproj) else: assert False, 'Unknown type of compression: ' + str(typecompress) if processed_feats is None: continue if processed_feats.shape[0] == 0: continue # do prediction year_pred = do_prediction(processed_feats, kd, h5model, K) # add pred and ground truth to output if not year_pred is None: output.root.data.year_real.append([year]) output.root.data.year_pred.append([year_pred]) # close output and model del kd h5model.close() output.close() # done return
def process_filelist_train(filelist=None, testartists=None, tmpfilename=None, npicks=None, winsize=None, finaldim=None, typecompress='picks'): """ Main function, process all files in the list (as long as their artist is not in testartist) INPUT filelist - a list of song files testartists - set of artist ID that we should not use tmpfilename - where to save our processed features npicks - number of segments to pick per song winsize - size of each segment we pick finaldim - how many values do we keep typecompress - one of 'picks' (win of btchroma), 'corrcoef' (correlation coefficients), 'cov' (covariance) """ # sanity check for arg in locals().values(): assert not arg is None, 'process_filelist_train, missing an argument, something still None' if os.path.isfile(tmpfilename): print 'ERROR: file', tmpfilename, 'already exists.' return # create outputfile output = tables.openFile(tmpfilename, mode='a') group = output.createGroup("/", 'data', 'TMP FILE FOR YEAR RECOGNITION') output.createEArray(group, 'feats', tables.Float64Atom(shape=()), (0, finaldim), '', expectedrows=len(filelist)) output.createEArray(group, 'year', tables.IntAtom(shape=()), (0, ), '', expectedrows=len(filelist)) output.createEArray(group, 'track_id', tables.StringAtom(18, shape=()), (0, ), '', expectedrows=len(filelist)) # random projection ndim = 12 # fixed in this dataset if typecompress == 'picks': randproj = RANDPROJ.proj_point5(ndim * winsize, finaldim) elif typecompress == 'corrcoeff' or typecompress == 'cov': randproj = RANDPROJ.proj_point5(ndim * ndim, finaldim) elif typecompress == 'avgcov': randproj = RANDPROJ.proj_point5(90, finaldim) else: assert False, 'Unknown type of compression: ' + str(typecompress) # iterate over files cnt_f = 0 for f in filelist: cnt_f += 1 # verbose if cnt_f % 50000 == 0: print 'training... checking file #', cnt_f # check file h5 = GETTERS.open_h5_file_read(f) artist_id = GETTERS.get_artist_id(h5) year = GETTERS.get_year(h5) track_id = GETTERS.get_track_id(h5) h5.close() if year <= 0 or artist_id in testartists: continue # we have a train artist with a song year, we're good bttimbre = get_bttimbre(f) if typecompress == 'picks': if bttimbre is None: continue # we even have normal features, awesome! processed_feats = CBTF.extract_and_compress(bttimbre, npicks, winsize, finaldim, randproj=randproj) elif typecompress == 'corrcoeff': h5 = GETTERS.open_h5_file_read(f) timbres = GETTERS.get_segments_timbre(h5).T h5.close() processed_feats = CBTF.corr_and_compress(timbres, finaldim, randproj=randproj) elif typecompress == 'cov': h5 = GETTERS.open_h5_file_read(f) timbres = GETTERS.get_segments_timbre(h5).T h5.close() processed_feats = CBTF.cov_and_compress(timbres, finaldim, randproj=randproj) elif typecompress == 'avgcov': h5 = GETTERS.open_h5_file_read(f) timbres = GETTERS.get_segments_timbre(h5).T h5.close() processed_feats = CBTF.avgcov_and_compress(timbres, finaldim, randproj=randproj) else: assert False, 'Unknown type of compression: ' + str(typecompress) # save them to tmp file n_p_feats = processed_feats.shape[0] output.root.data.year.append(np.array([year] * n_p_feats)) output.root.data.track_id.append(np.array([track_id] * n_p_feats)) output.root.data.feats.append(processed_feats) # we're done, close output output.close() return
def process_filelist_train(filelist=None,testartists=None,tmpfilename=None, npicks=None,winsize=None,finaldim=None,typecompress='picks'): """ Main function, process all files in the list (as long as their artist is not in testartist) INPUT filelist - a list of song files testartists - set of artist ID that we should not use tmpfilename - where to save our processed features npicks - number of segments to pick per song winsize - size of each segment we pick finaldim - how many values do we keep typecompress - one of 'picks' (win of btchroma), 'corrcoef' (correlation coefficients), 'cov' (covariance) """ # sanity check for arg in locals().values(): assert not arg is None,'process_filelist_train, missing an argument, something still None' if os.path.isfile(tmpfilename): print 'ERROR: file',tmpfilename,'already exists.' return # create outputfile output = tables.openFile(tmpfilename, mode='a') group = output.createGroup("/",'data','TMP FILE FOR YEAR RECOGNITION') output.createEArray(group,'feats',tables.Float64Atom(shape=()),(0,finaldim),'', expectedrows=len(filelist)) output.createEArray(group,'year',tables.IntAtom(shape=()),(0,),'', expectedrows=len(filelist)) output.createEArray(group,'track_id',tables.StringAtom(18,shape=()),(0,),'', expectedrows=len(filelist)) # random projection ndim = 12 # fixed in this dataset if typecompress == 'picks': randproj = RANDPROJ.proj_point5(ndim * winsize, finaldim) elif typecompress == 'corrcoeff' or typecompress == 'cov': randproj = RANDPROJ.proj_point5(ndim * ndim, finaldim) elif typecompress == 'avgcov': randproj = RANDPROJ.proj_point5(90, finaldim) else: assert False,'Unknown type of compression: '+str(typecompress) # iterate over files cnt_f = 0 for f in filelist: cnt_f += 1 # verbose if cnt_f % 50000 == 0: print 'training... checking file #',cnt_f # check file h5 = GETTERS.open_h5_file_read(f) artist_id = GETTERS.get_artist_id(h5) year = GETTERS.get_year(h5) track_id = GETTERS.get_track_id(h5) h5.close() if year <= 0 or artist_id in testartists: continue # we have a train artist with a song year, we're good bttimbre = get_bttimbre(f) if typecompress == 'picks': if bttimbre is None: continue # we even have normal features, awesome! processed_feats = CBTF.extract_and_compress(bttimbre,npicks,winsize,finaldim, randproj=randproj) elif typecompress == 'corrcoeff': h5 = GETTERS.open_h5_file_read(f) timbres = GETTERS.get_segments_timbre(h5).T h5.close() processed_feats = CBTF.corr_and_compress(timbres,finaldim,randproj=randproj) elif typecompress == 'cov': h5 = GETTERS.open_h5_file_read(f) timbres = GETTERS.get_segments_timbre(h5).T h5.close() processed_feats = CBTF.cov_and_compress(timbres,finaldim,randproj=randproj) elif typecompress == 'avgcov': h5 = GETTERS.open_h5_file_read(f) timbres = GETTERS.get_segments_timbre(h5).T h5.close() processed_feats = CBTF.avgcov_and_compress(timbres,finaldim,randproj=randproj) else: assert False,'Unknown type of compression: '+str(typecompress) # save them to tmp file n_p_feats = processed_feats.shape[0] output.root.data.year.append( np.array( [year] * n_p_feats ) ) output.root.data.track_id.append( np.array( [track_id] * n_p_feats ) ) output.root.data.feats.append( processed_feats ) # we're done, close output output.close() return
def process_filelist_test(filelist=None,model=None,tmpfilename=None, npicks=None,winsize=None,finaldim=None,K=1, typecompress='picks'): """ Main function, process all files in the list (as long as their artist is in testartist) INPUT filelist - a list of song files model - h5 file containing feats and year for all train songs tmpfilename - where to save our processed features npicks - number of segments to pick per song winsize - size of each segment we pick finaldim - how many values do we keep K - param of KNN (default 1) typecompress - feature type, 'picks', 'corrcoeff' or 'cov' must be the same as in training """ # sanity check for arg in locals().values(): assert not arg is None,'process_filelist_test, missing an argument, something still None' if os.path.isfile(tmpfilename): print 'ERROR: file',tmpfilename,'already exists.' return if not os.path.isfile(model): print 'ERROR: model',model,'does not exist.' return # create kdtree h5model = tables.openFile(model, mode='r') assert h5model.root.data.feats.shape[1]==finaldim,'inconsistency in final dim' kd = ANN.kdtree(h5model.root.data.feats) # create outputfile output = tables.openFile(tmpfilename, mode='a') group = output.createGroup("/",'data','TMP FILE FOR YEAR RECOGNITION') output.createEArray(group,'year_real',tables.IntAtom(shape=()),(0,),'', expectedrows=len(filelist)) output.createEArray(group,'year_pred',tables.Float64Atom(shape=()),(0,),'', expectedrows=len(filelist)) # random projection ndim = 12 # fixed in this dataset if typecompress == 'picks': randproj = RANDPROJ.proj_point5(ndim * winsize, finaldim) elif typecompress == 'corrcoeff' or typecompress=='cov': randproj = RANDPROJ.proj_point5(ndim * ndim, finaldim) elif typecompress == 'avgcov': randproj = RANDPROJ.proj_point5(90, finaldim) else: assert False,'Unknown type of compression: '+str(typecompress) # go through files cnt_f = 0 for f in filelist: cnt_f += 1 if cnt_f % 5000 == 0: print 'TESTING FILE #'+str(cnt_f) # check file h5 = GETTERS.open_h5_file_read(f) artist_id = GETTERS.get_artist_id(h5) year = GETTERS.get_year(h5) track_id = GETTERS.get_track_id(h5) h5.close() if year <= 0: # probably useless but... continue if typecompress == 'picks': # we have a train artist with a song year, we're good bttimbre = get_bttimbre(f) if bttimbre is None: continue # we even have normal features, awesome! processed_feats = CBTF.extract_and_compress(bttimbre,npicks,winsize,finaldim, randproj=randproj) elif typecompress == 'corrcoeff': h5 = GETTERS.open_h5_file_read(f) timbres = GETTERS.get_segments_timbre(h5).T h5.close() processed_feats = CBTF.corr_and_compress(timbres,finaldim,randproj=randproj) elif typecompress == 'cov': h5 = GETTERS.open_h5_file_read(f) timbres = GETTERS.get_segments_timbre(h5).T h5.close() processed_feats = CBTF.cov_and_compress(timbres,finaldim,randproj=randproj) elif typecompress == 'avgcov': h5 = GETTERS.open_h5_file_read(f) timbres = GETTERS.get_segments_timbre(h5).T h5.close() processed_feats = CBTF.avgcov_and_compress(timbres,finaldim,randproj=randproj) else: assert False,'Unknown type of compression: '+str(typecompress) if processed_feats is None: continue if processed_feats.shape[0] == 0: continue # do prediction year_pred = do_prediction(processed_feats,kd,h5model,K) # add pred and ground truth to output if not year_pred is None: output.root.data.year_real.append( [year] ) output.root.data.year_pred.append( [year_pred] ) # close output and model del kd h5model.close() output.close() # done return