def batchQuery(outf,r,d,ref_db,queries,coord_file,matrix_file,targetNames, embedOnly=False ):
    """Embed each query, look up LSH candidates, refine them and write hits.

    For every line produced by the DB2DB_DISTANCE subprocess (run on the
    query database against ``ref_db``) the coordinate solver is invoked.
    With ``embedOnly`` the solver result is printed and nothing else is
    done for that query; otherwise the result is handed to the LSH
    searcher, the candidates are refined, and one line per refined
    candidate is written to ``outf``.  A timing summary is written to
    stderr at the end.

    Args:
        outf: writable object; receives ``queryName\\ttargetName\\tdist``
            lines.
        r, d: forwarded to createPuzzleFile (presumably number of
            references and embedding dimension -- TODO confirm).
        ref_db: reference database passed to DB2DB_DISTANCE.
        queries: query input handed to createQueryCdb.
        coord_file: coordinate file forwarded to createPuzzleFile.
        matrix_file: matrix file handed to LSHSearcher.
        targetNames: sequence of target names, indexed with the 1-based
            candidate index returned by refine.
        embedOnly: when true, only print the solver output per query.

    Side effects: uses the fixed file names "puzzle", "queries.cdb",
    "query.iddb" and "query.cdb" in the current working directory.
    """
    query_cdb = "query.cdb"
    puzzle_file = "puzzle"
    queries_cdb = "queries.cdb"

    createPuzzleFile(r, d, coord_file, puzzle_file)

    parsing_time = 0
    refine_time = 0
    lsh_time = 0

    coord_time, solver = time_function(CoordinateSolver, puzzle_file)
    if not embedOnly:
        # The LSH index is only needed when candidates are actually searched.
        lsh_time, lshsearcher = time_function(LSHSearcher, matrix_file, lsh_param)

    parsing_time += time_function(createQueryCdb, queries, queries_cdb)[0]

    # Close the names file deterministically instead of leaking the handle.
    with open(queries_cdb + ".names") as names_file:
        queryNames = [name.strip() for name in names_file]

    debug("targetNames: " + str(targetNames))
    debug("queryNames: " + str(queryNames))

    dist_time, subp = time_function(
        Popen, [DB2DB_DISTANCE, queries_cdb, ref_db], stdout=PIPE)
    try:
        # queryIndex is 1-based: it is written verbatim to query.iddb and
        # used with a -1 offset to index queryNames.
        for queryIndex, line in enumerate(subp.stdout, 1):
            t, solverResult = time_function(solver.solve, line)
            coord_time += t
            debug("solverResult: " + solverResult)

            if embedOnly:
                # Parenthesised single argument: same output under the
                # Python 2 print statement.
                print(solverResult)
                continue

            t, lshResult = time_function(lshsearcher.search, solverResult)
            lsh_time += t
            lshResult = lshResult.strip()
            info("lshResult %d: %s" % (queryIndex - 1, lshResult))

            # Extract the current query into its own database for refine().
            with open("query.iddb", "w") as iddb:
                iddb.write(str(queryIndex) + "\n")
            check_call([DB_SUBSET, queries_cdb, "query.iddb", query_cdb])

            t, distances = time_function(refine, QueryFile(query_cdb), lshResult)
            refine_time += t

            for candidate_index, dist in distances:
                outf.write("%s\t%s\t%s\n" % (
                    queryNames[queryIndex - 1],
                    targetNames[candidate_index - 1],
                    dist))
    finally:
        # Always release the pipe and reap the child so no zombie is left.
        subp.stdout.close()
        subp.wait()

    if subp.returncode != 0:
        # Results may be incomplete; report it without changing the
        # function's historical non-raising behaviour.
        sys.stderr.write("warning: %s exited with status %s\n"
                         % (DB2DB_DISTANCE, subp.returncode))

    sys.stderr.write('timing: parsing=%s embedding=%s lsh=%s refine=%s \n' % (parsing_time, dist_time + coord_time, lsh_time, refine_time))
def indexed_search(matrix_file, coord_query_file, output="indexed.gz", evaluation_out="indexed.performance"):
    """Perform the indexed search for the test queries and evaluate it.

    Steps, as visible in the code:
      * read the 1-based query ids from TEST_QUERIES (stored 0-based),
      * for each line of ``coord_query_file`` find candidates with the
        LSH searcher built from ``matrix_file``,
      * refine the candidates and append one space-separated
        ``index:distance`` line per query to the gzip file ``output``,
      * record the elapsed time in "index.search.timing",
      * call gen_chemical_search_results() and run
        INDEXED_SEARCH_EVALUATOR on CHEMICAL_SEARCH_RESULTS and
        ``output``, redirecting its output to ``evaluation_out``.

    Args:
        matrix_file: matrix file handed to LSHSearcher.
        coord_query_file: text file with one query coordinate line per
            test query, in the same order as TEST_QUERIES.
        output: path of the gzip-compressed result file to write.
        evaluation_out: path receiving the evaluator's output.

    Returns:
        The ``output`` path.
    """
    info("running indexed search")

    def doTest():
        from gzip import GzipFile as zfile

        # TEST_QUERIES holds 1-based ids; convert to 0-based once.
        with open(TEST_QUERIES) as id_file:
            test_queries_indicies = [int(value) - 1 for value in id_file]

        lshsearcher = LSHSearcher(matrix_file, lsh_param)
        out = zfile(output, "w")
        try:
            with open(coord_query_file) as coord_lines:
                for line_num, query_coords in enumerate(coord_lines):
                    query_id = test_queries_indicies[line_num]
                    info("query_coords( " + str(query_id) + "): " + query_coords)
                    candidates = lshsearcher.search(query_coords)

                    # refine() takes the query through a temporary id file.
                    with open("iquery.iddb", "w") as query_iddb:
                        query_iddb.write("%d\n" % query_id)
                    try:
                        results = refine(QueryIDDB("iquery.iddb"), candidates)
                    finally:
                        # Remove the temp file even when refine() fails.
                        os.unlink("iquery.iddb")

                    out.write(" ".join(["%d:%f" % pair for pair in results]) + "\n")
        finally:
            # Make sure the gzip stream is finalised on every exit path.
            out.close()

    test_time = time_function(doTest)[0]

    # clean up stuff left by LSHServer
    if os.path.isfile("coord.in"):
        os.unlink("coord.in")

    info("total time: %.1f" % test_time)
    cmd = "echo %s > index.search.timing" % test_time
    os_run(cmd)

    gen_chemical_search_results()
    cmd = "%s %s %s > %s" % (INDEXED_SEARCH_EVALUATOR, CHEMICAL_SEARCH_RESULTS, output, evaluation_out)
    os_run(cmd)
    return output
def indexed_search(matrix_file, coord_query_file,output="indexed.gz", evaluation_out="indexed.performance"):
    """Perform the indexed search for the test queries and evaluate it.

    Steps, as visible in the code:
      * read the 1-based query ids from TEST_QUERIES (stored 0-based),
      * for each line of ``coord_query_file`` find candidates with the
        LSH searcher built from ``matrix_file``,
      * refine the candidates and append one space-separated
        ``index:distance`` line per query to the gzip file ``output``,
      * record the elapsed time in "index.search.timing",
      * call gen_chemical_search_results() and run
        INDEXED_SEARCH_EVALUATOR on CHEMICAL_SEARCH_RESULTS and
        ``output``, redirecting its output to ``evaluation_out``.

    Args:
        matrix_file: matrix file handed to LSHSearcher.
        coord_query_file: text file with one query coordinate line per
            test query, in the same order as TEST_QUERIES.
        output: path of the gzip-compressed result file to write.
        evaluation_out: path receiving the evaluator's output.

    Returns:
        The ``output`` path.
    """
    info("running indexed search")

    def doTest():
        from gzip import GzipFile as zfile

        # TEST_QUERIES holds 1-based ids; convert to 0-based once.
        with open(TEST_QUERIES) as id_file:
            test_queries_indicies = [int(value)-1 for value in id_file]

        lshsearcher = LSHSearcher(matrix_file,lsh_param)
        out = zfile(output,"w")
        try:
            with open(coord_query_file) as coord_lines:
                for line_num, query_coords in enumerate(coord_lines):
                    query_id = test_queries_indicies[line_num]
                    info("query_coords( "+str(query_id)+"): "+query_coords)
                    candidates = lshsearcher.search(query_coords)

                    # refine() takes the query through a temporary id file.
                    with open("iquery.iddb","w") as query_iddb:
                        query_iddb.write("%d\n" % query_id)
                    try:
                        results = refine(QueryIDDB("iquery.iddb"),candidates)
                    finally:
                        # Remove the temp file even when refine() fails.
                        os.unlink("iquery.iddb")

                    out.write(" ".join(["%d:%f" % pair for pair in results])+"\n")
        finally:
            # Make sure the gzip stream is finalised on every exit path.
            out.close()

    test_time = time_function(doTest)[0]

    # clean up stuff left by LSHServer
    if os.path.isfile("coord.in"):
        os.unlink("coord.in")

    info( "total time: %.1f" % test_time )
    cmd = "echo %s > index.search.timing" % test_time
    os_run(cmd)

    gen_chemical_search_results()
    cmd = "%s %s %s > %s" % (INDEXED_SEARCH_EVALUATOR, CHEMICAL_SEARCH_RESULTS, output, evaluation_out)
    os_run(cmd)
    return output
def batchQuery(outf, r, d, ref_db, queries, coord_file, matrix_file, targetNames, embedOnly=False):
    """Embed each query, look up LSH candidates, refine them and write hits.

    For every line produced by the DB2DB_DISTANCE subprocess (run on the
    query database against ``ref_db``) the coordinate solver is invoked.
    With ``embedOnly`` the solver result is printed and nothing else is
    done for that query; otherwise the result is handed to the LSH
    searcher, the candidates are refined, and one line per refined
    candidate is written to ``outf``.  A timing summary is written to
    stderr at the end.

    Args:
        outf: writable object; receives ``queryName\\ttargetName\\tdist``
            lines.
        r, d: forwarded to createPuzzleFile (presumably number of
            references and embedding dimension -- TODO confirm).
        ref_db: reference database passed to DB2DB_DISTANCE.
        queries: query input handed to createQueryCdb.
        coord_file: coordinate file forwarded to createPuzzleFile.
        matrix_file: matrix file handed to LSHSearcher.
        targetNames: sequence of target names, indexed with the 1-based
            candidate index returned by refine.
        embedOnly: when true, only print the solver output per query.

    Side effects: uses the fixed file names "puzzle", "queries.cdb",
    "query.iddb" and "query.cdb" in the current working directory.
    """
    query_cdb = "query.cdb"
    puzzle_file = "puzzle"
    queries_cdb = "queries.cdb"

    createPuzzleFile(r, d, coord_file, puzzle_file)

    parsing_time = 0
    refine_time = 0
    lsh_time = 0

    coord_time, solver = time_function(CoordinateSolver, puzzle_file)
    if not embedOnly:
        # The LSH index is only needed when candidates are actually searched.
        lsh_time, lshsearcher = time_function(LSHSearcher, matrix_file, lsh_param)

    parsing_time += time_function(createQueryCdb, queries, queries_cdb)[0]

    # Close the names file deterministically instead of leaking the handle.
    with open(queries_cdb + ".names") as names_file:
        queryNames = [name.strip() for name in names_file]

    debug("targetNames: " + str(targetNames))
    debug("queryNames: " + str(queryNames))

    dist_time, subp = time_function(
        Popen, [DB2DB_DISTANCE, queries_cdb, ref_db], stdout=PIPE)
    try:
        # queryIndex is 1-based: it is written verbatim to query.iddb and
        # used with a -1 offset to index queryNames.
        for queryIndex, line in enumerate(subp.stdout, 1):
            t, solverResult = time_function(solver.solve, line)
            coord_time += t
            debug("solverResult: " + solverResult)

            if embedOnly:
                # Parenthesised single argument: same output under the
                # Python 2 print statement.
                print(solverResult)
                continue

            t, lshResult = time_function(lshsearcher.search, solverResult)
            lsh_time += t
            lshResult = lshResult.strip()
            info("lshResult %d: %s" % (queryIndex - 1, lshResult))

            # Extract the current query into its own database for refine().
            with open("query.iddb", "w") as iddb:
                iddb.write(str(queryIndex) + "\n")
            check_call([DB_SUBSET, queries_cdb, "query.iddb", query_cdb])

            t, distances = time_function(refine, QueryFile(query_cdb), lshResult)
            refine_time += t

            for candidate_index, dist in distances:
                outf.write("%s\t%s\t%s\n" % (
                    queryNames[queryIndex - 1],
                    targetNames[candidate_index - 1],
                    dist))
    finally:
        # Always release the pipe and reap the child so no zombie is left.
        subp.stdout.close()
        subp.wait()

    if subp.returncode != 0:
        # Results may be incomplete; report it without changing the
        # function's historical non-raising behaviour.
        sys.stderr.write("warning: %s exited with status %s\n"
                         % (DB2DB_DISTANCE, subp.returncode))

    sys.stderr.write(
        'timing: parsing=%s embedding=%s lsh=%s refine=%s \n' %
        (parsing_time, dist_time + coord_time, lsh_time, refine_time))