def test():
    """Time a parallel map/reduce search of "button.wav" against a synthetic database.

    The database consists of 300 haystacks that all hold the button sound's
    data, keyed by their index and divided into two equally sized
    partitions.  One worker process is started for every
    (needle, partition) pair and is handed a shared manager dict; the
    values of that dict are afterwards flattened and passed to
    ``haystackreducer``.  Elapsed times are printed after all workers have
    been started, after all of them have finished, and after the reduce
    step.
    """
    sound = wavsound("button.wav")

    total_entries = 300  # Set Database Size
    partition_count = 2  # Set number of split databases
    entries_per_partition = int(total_entries / partition_count)

    # Split the database into a list of smaller databases.
    partitions = [[] for _ in range(partition_count)]
    keynames = []
    for entry_id in range(total_entries):
        keynames.append(entry_id)
        target = partitions[int(entry_id / entries_per_partition)]
        target.append(haystack(entry_id, sound.get_data()))

    needle_factory = needlestorage(sound, 1000, 50)

    print("USING MAP PROCESS and Manager")
    needles = needle_factory.get_needles()
    print(needles[0])

    # Keep a reference to the manager for as long as its dict is in use.
    manager = Manager()
    shared_emissions = manager.dict()

    needle_count = len(needles)  # number of needles, not size of each needle
    print("Number of Needles: ", needle_count)

    started_at = time.time()
    worker_total = needle_count * partition_count
    workers = []
    for needle in needles:
        for partition in partitions:
            # len(workers) is the running process number of this worker.
            worker = Process(
                target=calltomapper,
                args=(partition, needle, len(workers), worker_total,
                      shared_emissions),
            )
            workers.append(worker)
            worker.start()
    print(time.time() - started_at)

    for worker in workers:
        worker.join()  # wait for each process to end completely
    print(time.time() - started_at)

    flattened = sum(shared_emissions.values(), [])
    print("Reduce Result:")
    print(haystackreducer(flattened, keynames))
    print("Done")
    print(time.time() - started_at)

    # This is a pool implementation of parallel processing, it has been
    # commented out as it was slower than the Process method:
    #
    #     print(sound)
    #     print("Utilizing MapReduce Pattern")
    #     pool = Pool(2)  # if it is a quad-core machine it can be set to 4
    #     print(needle_factory.get_needles())
    #     emissions = pool.map(haystackmap.mapper, needle_factory.get_needles())
    #     print(emissions)
    #     print(haystackreducer(sum(emissions, [])))
    #     emissions = []
    #
    # Note kept from the original file: "The algorithm below is a serial
    # method, no optimization".
def run(query, sample_length, samples, rootdir, max_split):
    """Search the wav repository under ``rootdir`` for matches of ``query``.

    Every file found below ``rootdir`` is loaded, thinned out by keeping
    every 16th element of its data, and placed into one of ``max_split``
    partitions (100 files per partition; any overflow goes into the last
    partition).  The query is cut into needles, one mapper process is
    started per (needle, non-empty partition) pair, and one reducer
    process is started per repository file.

    Args:
        query: Path of the query wav file (passed to ``wavsound``).
        sample_length: Second argument handed to ``needlestorage``
            (presumably the needle length -- confirm against that class).
        samples: Number of partition samples; converted with ``int``.
        rootdir: Directory that is walked recursively for repository files.
        max_split: Number of partitions the repository is divided into.

    Returns:
        A list of ``[key, percentage]`` pairs (both strings), ordered by
        descending reducer value and containing only entries whose value
        is positive.  The percentage is the reducer value divided by the
        number of needles, times 100.
    """
    query_wavsound = wavsound(query)

    # Repository structure: one list of haystacks per partition.
    db_size_per_split = 100
    haystackss = [[] for _ in range(max_split)]
    key_names = []

    # Read the files in the repository.
    last_split = max_split - 1
    counter = 0
    for subdir, __, files in os.walk(rootdir):
        for filename in files:
            key = subdir + "/" + filename
            key_names.append(key)
            # Clamp to the last valid index: clamping to max_split itself
            # ran past the end of haystackss once the regular partitions
            # were full.
            split_db_key = min(last_split, counter // db_size_per_split)
            haystackss[split_db_key].append(
                haystack(key, wavsound(key).get_data()[::16]))
            counter += 1

    # Get segments of the query data as needles.
    query_needle_factory = needlestorage(query_wavsound, sample_length,
                                         int(samples))
    needles = query_needle_factory.get_needles()
    query_needle_factory.clear_needles()

    # MAP --------------------------------------------------
    # The manager owns the shared containers; keep it referenced while
    # they are in use.
    manager = Manager()
    # Shared list handed to every mapper process.
    return_emissions = manager.list()
    jobs = []
    pnum = 0
    # NOTE(review): this total also counts empty partitions although no
    # process is started for them -- confirm what calltomapper expects.
    total_jobs = len(needles) * len(haystackss)
    for needle in needles:
        for haystacks in haystackss:
            if not haystacks:
                continue
            p = Process(target=calltomapper,
                        args=(haystacks, needle, pnum, total_jobs,
                              return_emissions))
            jobs.append(p)
            p.start()
            pnum += 1
    for proc in jobs:
        proc.join()

    # SHUFFLE/REDUCE ------------------------------------------
    # Copy the shared list once instead of walking the proxy for every key.
    emissions = list(return_emissions)
    result_dict = manager.dict()
    jobs = []
    for key in key_names:
        # One 1 per emission whose first element equals this key.
        key_list = [1 for emission in emissions if emission[0] == key]
        print(key, key_list)
        q = Process(target=calltoreducer, args=(key_list, key, result_dict))
        jobs.append(q)
        q.start()
    for proc in jobs:
        proc.join()

    result_lst = []
    print(len(needles), "is length of needles")
    ranked = sorted(result_dict.items(), key=lambda pair: pair[1],
                    reverse=True)
    for key, value in ranked:
        if value > 0:
            result_lst.append(
                [str(key), str(int(value) / len(needles) * 100)])
    return result_lst
def test():
    """Time a parallel map/reduce search of 'button.wav' against a synthetic database.

    The database consists of 300 haystacks that all hold the button sound's
    data, keyed by their index and divided into two equally sized
    partitions.  One worker process is started for every
    (needle, partition) pair and is handed a shared manager dict; the
    values of that dict are afterwards flattened and passed to
    ``haystackreducer``.  Elapsed times are printed after all workers have
    been started, after all of them have finished, and after the reduce
    step.
    """
    sound = wavsound('button.wav')

    total_entries = 300  # Set Database Size
    partition_count = 2  # Set number of split databases
    entries_per_partition = int(total_entries / partition_count)

    # Split the database into a list of smaller databases.
    partitions = [[] for _ in range(partition_count)]
    keynames = []
    for entry_id in range(total_entries):
        keynames.append(entry_id)
        target = partitions[int(entry_id / entries_per_partition)]
        target.append(haystack(entry_id, sound.get_data()))

    needle_factory = needlestorage(sound, 1000, 50)

    print("USING MAP PROCESS and Manager")
    needles = needle_factory.get_needles()
    print(needles[0])

    # Keep a reference to the manager for as long as its dict is in use.
    manager = Manager()
    shared_emissions = manager.dict()

    needle_count = len(needles)  # number of needles, not size of each needle
    print("Number of Needles: ", needle_count)

    started_at = time.time()
    worker_total = needle_count * partition_count
    workers = []
    for needle in needles:
        for partition in partitions:
            # len(workers) is the running process number of this worker.
            worker = Process(
                target=calltomapper,
                args=(partition, needle, len(workers), worker_total,
                      shared_emissions),
            )
            workers.append(worker)
            worker.start()
    print(time.time() - started_at)

    for worker in workers:
        worker.join()  # wait for each process to end completely
    print(time.time() - started_at)

    flattened = sum(shared_emissions.values(), [])
    print("Reduce Result:")
    print(haystackreducer(flattened, keynames))
    print("Done")
    print(time.time() - started_at)

    # This is a pool implementation of parallel processing, it has been
    # commented out as it was slower than the Process method:
    #
    #     print(sound)
    #     print("Utilizing MapReduce Pattern")
    #     pool = Pool(2)  # if it is a quad-core machine it can be set to 4
    #     print(needle_factory.get_needles())
    #     emissions = pool.map(haystackmap.mapper, needle_factory.get_needles())
    #     print(emissions)
    #     print(haystackreducer(sum(emissions, [])))
    #     emissions = []
    #
    # Note kept from the original file: "The algorithm below is a serial
    # method, no optimization".
def run():
    """Interactively search the wav files under ``db`` for a query wav file.

    Prompts for the query file (repeating until an existing file is
    named), the number of partitions and the number of partition samples.
    One mapper process is then started per needle, the collected emissions
    are flattened and passed to ``haystackreducer``, and one "% match" line
    is printed per key of the reducer's result, followed by the elapsed
    search time in seconds.
    """
    # Use raw_input where it exists (Python 2) and input otherwise.
    try:
        read_line = raw_input
    except NameError:
        read_line = input

    while True:
        query = read_line(
            "Submit .wav file to search against database (Example: button.wav): ")
        if os.path.isfile(query):
            break

    # Instantiate the Wavsound object of the query file.
    query_wavsound = wavsound(query)

    print("\n**Higher number of partitions increases false positive rates, \nwhile lower number of partitions increases false negative rates\n")
    partition = read_line(
        "Set number of partitions of the query from 1 to "
        + str(int(len(query_wavsound.get_data()) / 3)) + ": ")
    samples = read_line(
        "Set number of samples of partitions from 1 to " + partition
        + " (Recommend < 50): ")

    # Database structure: one haystack per file below the look-up directory.
    rootdir = 'db'
    haystacks = []
    for subdir, __, files in os.walk(rootdir):
        for filename in files:
            path = subdir + "/" + filename
            haystacks.append(haystack(path, wavsound(path).get_data()))

    query_needle_factory = needlestorage(query_wavsound, int(partition),
                                         int(samples))
    haystackmap = haystackmapper(haystacks)
    needles = query_needle_factory.get_needles()
    len_needles = len(needles)

    # The manager owns the shared dict; keep it referenced while in use.
    manager = Manager()
    # Shared dict handed to every mapper process.
    return_emissions = manager.dict()
    jobs = []

    print("Number of Needles:  %d" % len_needles)

    # Database query time starts here.
    start_time = time.time()

    # Distribute the needles over one process each.
    for pnum, needle in enumerate(needles):
        p = Process(target=calltomapper,
                    args=(haystackmap, needle, pnum, len_needles,
                          return_emissions))
        jobs.append(p)
        p.start()
    for proc in jobs:
        proc.join()

    # Flatten return_emissions into a list.
    emissions_list = sum(return_emissions.values(), [])

    print("Search Result:")
    result_dict = haystackreducer(emissions_list)

    # Tabulate % match.  (Original note: wav files with 0% match are
    # excluded from the result -- presumably by haystackreducer.)
    for key in result_dict:
        name = str(key)
        # Float arithmetic: integer division would truncate the ratio to 0
        # under Python 2 before it is scaled to a percentage.
        percent = 100.0 * int(result_dict[key]) / len_needles
        print(" ".join([name, ": ", (25 - len(name)) * " ",
                        "{0:.2f}".format(percent), "% match"]))

    # Show search time.
    timelapse_parallel = time.time() - start_time
    print("%s seconds" % timelapse_parallel)
def run(query, sample_length, samples, rootdir, max_split):
    """Search the wav repository under ``rootdir`` for matches of ``query``.

    Every file found below ``rootdir`` is loaded, thinned out by keeping
    every 16th element of its data, and placed into one of ``max_split``
    partitions (100 files per partition; any overflow goes into the last
    partition).  The query is cut into needles, one mapper process is
    started per (needle, non-empty partition) pair, and one reducer
    process is started per repository file.

    Args:
        query: Path of the query wav file (passed to ``wavsound``).
        sample_length: Second argument handed to ``needlestorage``
            (presumably the needle length -- confirm against that class).
        samples: Number of partition samples; converted with ``int``.
        rootdir: Directory that is walked recursively for repository files.
        max_split: Number of partitions the repository is divided into.

    Returns:
        A list of ``[key, percentage]`` pairs (both strings), ordered by
        descending reducer value and containing only entries whose value
        is positive.  The percentage is the reducer value divided by the
        number of needles, times 100.
    """
    query_wavsound = wavsound(query)

    # Repository structure: one list of haystacks per partition.
    db_size_per_split = 100
    haystackss = [[] for _ in range(max_split)]
    key_names = []

    # Read the files in the repository.
    last_split = max_split - 1
    counter = 0
    for subdir, __, files in os.walk(rootdir):
        for filename in files:
            key = subdir + "/" + filename
            key_names.append(key)
            # Clamp to the last valid index: clamping to max_split itself
            # ran past the end of haystackss once the regular partitions
            # were full.
            split_db_key = min(last_split, counter // db_size_per_split)
            haystackss[split_db_key].append(
                haystack(key, wavsound(key).get_data()[::16]))
            counter += 1

    # Get segments of the query data as needles.
    query_needle_factory = needlestorage(query_wavsound, sample_length,
                                         int(samples))
    needles = query_needle_factory.get_needles()
    query_needle_factory.clear_needles()

    # MAP --------------------------------------------------
    # The manager owns the shared containers; keep it referenced while
    # they are in use.
    manager = Manager()
    # Shared list handed to every mapper process.
    return_emissions = manager.list()
    jobs = []
    pnum = 0
    # NOTE(review): this total also counts empty partitions although no
    # process is started for them -- confirm what calltomapper expects.
    total_jobs = len(needles) * len(haystackss)
    for needle in needles:
        for haystacks in haystackss:
            if not haystacks:
                continue
            p = Process(target=calltomapper,
                        args=(haystacks, needle, pnum, total_jobs,
                              return_emissions))
            jobs.append(p)
            p.start()
            pnum += 1
    for proc in jobs:
        proc.join()

    # SHUFFLE/REDUCE ------------------------------------------
    # Copy the shared list once instead of walking the proxy for every key.
    emissions = list(return_emissions)
    result_dict = manager.dict()
    jobs = []
    for key in key_names:
        # One 1 per emission whose first element equals this key.
        key_list = [1 for emission in emissions if emission[0] == key]
        print(key, key_list)
        q = Process(target=calltoreducer, args=(key_list, key, result_dict))
        jobs.append(q)
        q.start()
    for proc in jobs:
        proc.join()

    result_lst = []
    print(len(needles), "is length of needles")
    ranked = sorted(result_dict.items(), key=lambda pair: pair[1],
                    reverse=True)
    for key, value in ranked:
        if value > 0:
            result_lst.append(
                [str(key), str(int(value) / len(needles) * 100)])
    return result_lst
def run():
    """Interactively search the wav files under ``db`` for a query wav file.

    Prompts for the query file (repeating until an existing file is
    named), the number of partitions and the number of partition samples.
    One mapper process is then started per needle, the collected emissions
    are flattened and passed to ``haystackreducer``, and one "% match" line
    is printed per key of the reducer's result, followed by the elapsed
    search time in seconds.
    """
    # Use raw_input where it exists (Python 2) and input otherwise.
    try:
        read_line = raw_input
    except NameError:
        read_line = input

    while True:
        query = read_line(
            "Submit .wav file to search against database (Example: button.wav): ")
        if os.path.isfile(query):
            break

    # Instantiate the Wavsound object of the query file.
    query_wavsound = wavsound(query)

    print("\n**Higher number of partitions increases false positive rates, \nwhile lower number of partitions increases false negative rates\n")
    partition = read_line(
        "Set number of partitions of the query from 1 to "
        + str(int(len(query_wavsound.get_data()) / 3)) + ": ")
    samples = read_line(
        "Set number of samples of partitions from 1 to " + partition
        + " (Recommend < 50): ")

    # Database structure: one haystack per file below the look-up directory.
    rootdir = 'db'
    haystacks = []
    for subdir, __, files in os.walk(rootdir):
        for filename in files:
            path = subdir + "/" + filename
            haystacks.append(haystack(path, wavsound(path).get_data()))

    query_needle_factory = needlestorage(query_wavsound, int(partition),
                                         int(samples))
    haystackmap = haystackmapper(haystacks)
    needles = query_needle_factory.get_needles()
    len_needles = len(needles)

    # The manager owns the shared dict; keep it referenced while in use.
    manager = Manager()
    # Shared dict handed to every mapper process.
    return_emissions = manager.dict()
    jobs = []

    print("Number of Needles:  %d" % len_needles)

    # Database query time starts here.
    start_time = time.time()

    # Distribute the needles over one process each.
    for pnum, needle in enumerate(needles):
        p = Process(target=calltomapper,
                    args=(haystackmap, needle, pnum, len_needles,
                          return_emissions))
        jobs.append(p)
        p.start()
    for proc in jobs:
        proc.join()

    # Flatten return_emissions into a list.
    emissions_list = sum(return_emissions.values(), [])

    print("Search Result:")
    result_dict = haystackreducer(emissions_list)

    # Tabulate % match.  (Original note: wav files with 0% match are
    # excluded from the result -- presumably by haystackreducer.)
    for key in result_dict:
        name = str(key)
        # Float arithmetic: integer division would truncate the ratio to 0
        # under Python 2 before it is scaled to a percentage.
        percent = 100.0 * int(result_dict[key]) / len_needles
        print(" ".join([name, ": ", (25 - len(name)) * " ",
                        "{0:.2f}".format(percent), "% match"]))

    # Show search time.
    timelapse_parallel = time.time() - start_time
    print("%s seconds" % timelapse_parallel)