def __init__(self, config_dict):
    """ Initialize the command-line interface object from a dict of
        docopt-style arguments (the same shape that
        docopt.docopt(USAGE, ...) returns in main()). """
    args = config_dict
    analyzer = setup_analyzer(args)
    precomp_type = 'hashes'
    # For everything other than precompute, we need a database name
    # Check we have one
    dbasename = args['--dbase']
    # Load existing hash table file (add, match, merge)
    hash_tab = hash_table.HashTable(dbasename)
    if analyzer and 'samplerate' in hash_tab.params \
            and hash_tab.params['samplerate'] != analyzer.target_sr:
        # Adopt the database's samplerate so new hashes are compatible.
        analyzer.target_sr = hash_tab.params['samplerate']
        print("db samplerate overridden to ", analyzer.target_sr)
    # Create a matcher
    matcher = setup_matcher(args)
    self.args = args
    self.analyzer = analyzer
    self.matcher = matcher
    self.hash_table = hash_tab
    self.precomp_type = precomp_type

def local_tester():
    """ Quick local smoke test: fingerprint one file into a new
        hash table and save it. """
    test_fn = '/Users/dpwe/Downloads/carol11k.wav'
    test_ht = hash_table.HashTable()
    test_analyzer = Analyzer()
    test_analyzer.ingest(test_ht, test_fn)
    test_ht.save('httest.pklz')

def make_ht_from_list(analyzer, filelist, hashbits, depth, maxtime, pipe=None):
    """ Populate a hash table from a list, used as target for
        multiprocess division.  pipe is a pipe over which to push back
        the result, else return it. """
    # Create new ht instance
    ht = hash_table.HashTable(hashbits=hashbits, depth=depth, maxtime=maxtime)
    # Add in the files
    for filename in filelist:
        hashes = analyzer.wavfile2hashes(filename)
        ht.store(filename, hashes)
    # Pass back to caller
    if pipe:
        pipe.send(ht)
    else:
        return ht

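# The sketch below is not part of the original source; it shows one way
# make_ht_from_list can serve as a multiprocessing target: a worker process
# builds a partial table over half of the file list and ships it back over a
# Pipe, while the parent builds the other half in-process and merges.  The
# even/odd split and the helper name build_ht_two_workers are illustrative.
import multiprocessing


def build_ht_two_workers(analyzer, filelist, hashbits, depth, maxtime):
    # duplex=False gives a receive-only end and a send-only end.
    recv_end, send_end = multiprocessing.Pipe(duplex=False)
    worker = multiprocessing.Process(
        target=make_ht_from_list,
        args=(analyzer, filelist[1::2], hashbits, depth, maxtime, send_end))
    worker.start()
    # With pipe=None, make_ht_from_list returns the table directly.
    ht = make_ht_from_list(analyzer, filelist[::2], hashbits, depth, maxtime)
    # Fold the worker's partial table into ours (HashTable.merge, as used
    # by the 'merge' branch of do_cmd below); recv before join so the pipe
    # buffer can drain.
    ht.merge(recv_end.recv())
    worker.join()
    return ht
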
def glob2hashtable(pattern, density=20.0):
    """ Build a hash table from the files matching a glob pattern. """
    global g2h_analyzer
    if g2h_analyzer is None:
        g2h_analyzer = Analyzer(density=density)
    ht = hash_table.HashTable()
    filelist = glob.glob(pattern)
    initticks = time.perf_counter()
    totdur = 0.0
    tothashes = 0
    for ix, file_ in enumerate(filelist):
        print(time.ctime(), "ingesting #", ix, ":", file_, "...")
        dur, nhash = g2h_analyzer.ingest(ht, file_)
        totdur += dur
        tothashes += nhash
    elapsedtime = time.perf_counter() - initticks
    print("Added", tothashes, "(", tothashes / float(totdur),
          "hashes/sec) at ", elapsedtime / totdur, "x RT")
    return ht

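# Example usage (hypothetical paths); assumes g2h_analyzer starts out as None
# at module scope so glob2hashtable can lazily create its Analyzer:
#
#     ht = glob2hashtable("sounds/*.wav", density=20.0)
#     ht.save("sounds_fpdbase.pklz")
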
def main(argv):
    """ Main routine for the command-line interface to audfprint """
    # Other globals set from command line
    args = docopt.docopt(USAGE, version=__version__, argv=argv[1:])

    # Figure out which command was chosen
    poss_cmds = ['new', 'add', 'precompute', 'merge', 'newmerge', 'match',
                 'list', 'remove']
    cmdlist = [cmdname for cmdname in poss_cmds if args[cmdname]]
    if len(cmdlist) != 1:
        raise ValueError("must specify exactly one command")
    # The actual command as a str
    cmd = cmdlist[0]

    # Setup output function
    report = setup_reporter(args)

    # Keep track of wall time
    initticks = time.perf_counter()

    # Command line sanity.
    if args["--maxtimebits"]:
        args["--maxtimebits"] = int(args["--maxtimebits"])
    else:
        args["--maxtimebits"] = hash_table._bitsfor(int(args["--maxtime"]))

    # Setup the analyzer if we're using one
    # (i.e., unless merge/newmerge/list/remove)
    analyzer = setup_analyzer(args) if cmd not in (
        "merge", "newmerge", "list", "remove") else None

    precomp_type = 'hashes'

    # Set up the hash table, if we're using one (i.e., unless "precompute")
    if cmd != "precompute":
        # For everything other than precompute, we need a database name
        # Check we have one
        dbasename = args['--dbase']
        if not dbasename:
            raise ValueError("dbase name must be provided if not precompute")
        if cmd == "new" or cmd == "newmerge":
            # Check that the output directory can be created before we start
            ensure_dir(os.path.split(dbasename)[0])
            # Create a new hash table
            hash_tab = hash_table.HashTable(
                hashbits=int(args['--hashbits']),
                depth=int(args['--bucketsize']),
                maxtime=(1 << int(args['--maxtimebits'])))
            # Set its samplerate param
            if analyzer:
                hash_tab.params['samplerate'] = analyzer.target_sr
        else:
            # Load existing hash table file (add, match, merge)
            if args['--verbose']:
                report([time.ctime() + " Reading hash table " + dbasename])
            hash_tab = hash_table.HashTable(dbasename)
            if analyzer and 'samplerate' in hash_tab.params \
                    and hash_tab.params['samplerate'] != analyzer.target_sr:
                # Adopt the database's samplerate so hashes are compatible.
                analyzer.target_sr = hash_tab.params['samplerate']
                print("db samplerate overridden to ", analyzer.target_sr)
    else:
        # The command IS precompute: dummy empty hash table
        hash_tab = None
        if args['--precompute-peaks']:
            precomp_type = 'peaks'

    # Create a matcher
    matcher = setup_matcher(args) if cmd == 'match' else None

    filename_iter = filename_list_iterator(
        args['<file>'], args['--wavdir'], args['--wavext'], args['--list'])

    #######################
    # Run the main command
    #######################
    # How many processors to use (multiprocessing)
    ncores = int(args['--ncores'])
    if ncores > 1 and cmd not in ("merge", "newmerge", "list", "remove"):
        # merge/newmerge/list/remove are always single-thread processes
        do_cmd_multiproc(cmd, analyzer, hash_tab, filename_iter, matcher,
                         args['--precompdir'], precomp_type, report,
                         skip_existing=args['--skip-existing'],
                         strip_prefix=args['--wavdir'], ncores=ncores)
    else:
        do_cmd(cmd, analyzer, hash_tab, filename_iter, matcher,
               args['--precompdir'], precomp_type, report,
               skip_existing=args['--skip-existing'],
               strip_prefix=args['--wavdir'])

    elapsedtime = time.perf_counter() - initticks
    if analyzer and analyzer.soundfiletotaldur > 0.:
        log_format = "Processed {} files ({} s total dur) in {} s = {} x RT"
        print(log_format.format(analyzer.soundfilecount,
                                analyzer.soundfiletotaldur, elapsedtime,
                                (elapsedtime / analyzer.soundfiletotaldur)))

    # Save the hash table file if it has been modified
    if hash_tab and hash_tab.dirty:
        # We already created the directory, if "new".
        hash_tab.save(dbasename)

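# The original entry-point hookup is not shown here; a conventional one
# (assuming this module is run directly as a script) would be:
#
#     if __name__ == "__main__":
#         import sys
#         main(sys.argv)
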
def do_cmd(cmd, analyzer, hash_tab, filename_iter, matcher, outdir, type,
           report, skip_existing=False, strip_prefix=None):
    """ Breaks out the core part of running the command.
        This is just the single-core version. """
    if cmd == 'merge' or cmd == 'newmerge':
        # files are other hash tables, merge them in
        for filename in filename_iter:
            hash_tab2 = hash_table.HashTable(filename)
            if "samplerate" in hash_tab.params:
                assert hash_tab.params["samplerate"] == \
                    hash_tab2.params["samplerate"]
            else:
                # "newmerge" creates an empty table with no samplerate
                # param, so adopt it from the first table merged in.
                hash_tab.params["samplerate"] = hash_tab2.params["samplerate"]
            hash_tab.merge(hash_tab2)

    elif cmd == 'precompute':
        # just precompute fingerprints, single core
        for filename in filename_iter:
            report(file_precompute(analyzer, filename, outdir, type,
                                   skip_existing=skip_existing,
                                   strip_prefix=strip_prefix))

    elif cmd == 'match':
        # Running query, single-core mode
        for num, filename in enumerate(filename_iter):
            msgs = matcher.file_match_to_msgs(analyzer, hash_tab, filename,
                                              num).message_list
            report(msgs)

    elif cmd == 'new' or cmd == 'add':
        # Adding files
        tothashes = 0
        ix = 0
        for filename in filename_iter:
            report([time.ctime() + " ingesting #" + str(ix) + ": "
                    + filename + " ..."])
            dur, nhash = analyzer.ingest(hash_tab, filename)
            tothashes += nhash
            ix += 1
        report(["Added " + str(tothashes) + " hashes "
                + "(%.1f" % (tothashes / float(analyzer.soundfiletotaldur))
                + " hashes/sec)"])

    elif cmd == 'remove':
        # Removing files from hash table.
        for filename in filename_iter:
            hash_tab.remove(filename)

    elif cmd == 'list':
        hash_tab.list(lambda x: report([x]))

    else:
        raise ValueError("unrecognized command: " + cmd)

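# A minimal sketch (not from the original source) of calling do_cmd directly,
# here to list the tracks in an existing database.  The 'list' branch only
# touches hash_tab and report, so analyzer, matcher, outdir, and the filename
# iterator can all be empty; "fpdbase.pklz" is a hypothetical database path.
def list_database_sketch():
    ht = hash_table.HashTable("fpdbase.pklz")
    do_cmd('list', analyzer=None, hash_tab=ht, filename_iter=iter(()),
           matcher=None, outdir=None, type='hashes',
           report=lambda msgs: print("\n".join(msgs)))
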
# Keep track of wall time
initticks = time.perf_counter()

precomp_type = 'hashes'
dbasename = args['--dbase']

# Command line sanity.
if args["--maxtimebits"]:
    args["--maxtimebits"] = int(args["--maxtimebits"])
else:
    args["--maxtimebits"] = hash_table._bitsfor(int(args["--maxtime"]))

if cmd in ["new", "newmerge"]:
    hash_tab = hash_table.HashTable(hashbits=int(args['--hashbits']),
                                    depth=int(args['--bucketsize']),
                                    maxtime=(1 << int(args['--maxtimebits'])))
else:
    hash_tab = hash_table.HashTable(dbasename)

# Create a matcher
matcher = afp.setup_matcher(args)

filename_iter = afp.filename_list_iterator(args['<file>'], args['--wavdir'],
                                           args['--wavext'], args['--list'])

# How many processors to use (multiprocessing)
ncores = int(args['--ncores'])
# merge/newmerge/list/remove are always single-thread processes
if ncores > 1 and cmd not in ("merge", "newmerge", "list", "remove"):
    afp.do_cmd_multiproc(cmd, analyzer, hash_tab, filename_iter, matcher,
                         args['--precompdir'], precomp_type, report,
                         skip_existing=args['--skip-existing'],
                         strip_prefix=args['--wavdir'], ncores=ncores)
else:
    afp.do_cmd(cmd, analyzer, hash_tab, filename_iter, matcher,
               args['--precompdir'], precomp_type, report,
               skip_existing=args['--skip-existing'],
               strip_prefix=args['--wavdir'])

def new_hashtable(self):
    """ Create a fresh, empty hash table from the stored command-line args.
        Note: this passes --maxtime directly as maxtime, rather than
        deriving it as (1 << --maxtimebits) the way main() does. """
    hash_tab = hash_table.HashTable(hashbits=int(self.args['--hashbits']),
                                    depth=int(self.args['--bucketsize']),
                                    maxtime=int(self.args['--maxtime']))
    return hash_tab