Ejemplo n.º 1
0
def batchQuery(outf,r,d,ref_db,queries,coord_file,matrix_file,targetNames, embedOnly=False ):
	"""Embed every query, fetch LSH candidates, refine them and write the hits.

	Steps performed, in order:
	  1. createPuzzleFile(r, d, coord_file, "puzzle") is called and a
	     CoordinateSolver is constructed from the "puzzle" path.
	  2. createQueryCdb(queries, "queries.cdb") is called, then the query
	     names are read from "queries.cdb.names" (one name per line).
	  3. DB2DB_DISTANCE is started on "queries.cdb" and ref_db; every line
	     of its stdout is handled as one query and passed to solver.solve.
	  4. Unless embedOnly is true, the solver result is passed to the LSH
	     searcher, the current query is extracted with DB_SUBSET into
	     "query.cdb", and the LSH result is passed to refine().

	Args:
		outf: object with a write() method; receives one
			"query_name<TAB>target_name<TAB>distance" line per refined hit.
		r, d: forwarded unchanged to createPuzzleFile.
		ref_db: second database argument given to DB2DB_DISTANCE.
		queries: forwarded to createQueryCdb.
		coord_file: forwarded to createPuzzleFile.
		matrix_file: forwarded to LSHSearcher (together with the
			module-level lsh_param).
		targetNames: sequence indexed with candidate_index - 1 to name hits.
		embedOnly: when true, only print each solver result and skip the
			LSH lookup and refinement entirely.

	Side effects:
		Creates/overwrites "query.iddb" in the working directory for every
		query, and writes a timing summary line to stderr. A warning is also
		written to stderr if DB2DB_DISTANCE exits with a non-zero status.
	"""
	query_cdb = "query.cdb"
	puzzle_file = "puzzle"
	queries_cdb = "queries.cdb"

	createPuzzleFile(r, d, coord_file, puzzle_file)

	parsing_time = 0
	refine_time = 0
	coord_time, solver = time_function(CoordinateSolver, puzzle_file)
	if embedOnly:
		# The searcher is never used in embed-only mode, so it is not built.
		lsh_time, lshsearcher = 0, None
	else:
		lsh_time, lshsearcher = time_function(LSHSearcher, matrix_file, lsh_param)

	parsing_time += time_function(createQueryCdb, queries, queries_cdb)[0]
	# Close the names file deterministically instead of leaking the handle.
	with open(queries_cdb + ".names") as names_file:
		queryNames = [name.strip() for name in names_file]

	debug("targetNames: " + str(targetNames))
	debug("queryNames: " + str(queryNames))

	# NOTE: dist_time only covers spawning the process, not its run time.
	dist_time, subp = time_function(Popen, [DB2DB_DISTANCE, queries_cdb, ref_db], stdout=PIPE)

	try:
		# queryIndex is 1-based: it is written to query.iddb as-is and used
		# with "- 1" wherever a Python list is indexed.
		for queryIndex, line in enumerate(subp.stdout, 1):
			t, solverResult = time_function(solver.solve, line)
			coord_time += t
			debug("solverResult: " + solverResult)

			if embedOnly:
				# Parenthesised form prints the same text under Python 2.
				print(solverResult)
				continue

			t, lshResult = time_function(lshsearcher.search, solverResult)
			lsh_time += t
			lshResult = lshResult.strip()

			info("lshResult %d: %s" % (queryIndex - 1, lshResult))

			# DB_SUBSET reads the wanted id from this one-line file.
			with open("query.iddb", "w") as f:
				f.write(str(queryIndex) + "\n")
			check_call([DB_SUBSET, queries_cdb, "query.iddb", query_cdb])
			t, distances = time_function(refine, QueryFile(query_cdb), lshResult)
			refine_time += t

			for candidate_index, dist in distances:
				outf.write("%s\t%s\t%s\n" %
						(queryNames[queryIndex - 1], targetNames[candidate_index - 1], dist))
	finally:
		# Always release the pipe and reap the child, even when a query fails.
		subp.stdout.close()
		returncode = subp.wait()

	if returncode != 0:
		# A failed distance run means the results above may be incomplete.
		sys.stderr.write("warning: %s exited with status %s\n" %
				(DB2DB_DISTANCE, returncode))

	sys.stderr.write('timing: parsing=%s embedding=%s lsh=%s refine=%s \n' %
				(parsing_time, dist_time + coord_time, lsh_time, refine_time))
Ejemplo n.º 2
0
def indexed_search(matrix_file,
                   coord_query_file,
                   output="indexed.gz",
                   evaluation_out="indexed.performance"):
    """Run the LSH-indexed search for the test queries and evaluate it.

    Outline:
      * read the query ids listed in TEST_QUERIES,
      * for each line of coord_query_file, find candidate neighbours with
        an LSHSearcher built from matrix_file,
      * refine the candidates and append them to the gzip file `output`,
      * call gen_chemical_search_results() and run
        INDEXED_SEARCH_EVALUATOR on CHEMICAL_SEARCH_RESULTS and `output`,
        redirecting its stdout to `evaluation_out`.

    Args:
        matrix_file: forwarded to LSHSearcher (with module-level lsh_param).
        coord_query_file: path of a text file with one query per line; the
            n-th line is paired with the n-th id read from TEST_QUERIES.
        output: path of the gzip-compressed result file to write.
        evaluation_out: path receiving the evaluator's output.

    Returns:
        The `output` path, unchanged.

    Raises:
        IndexError: if coord_query_file has more lines than TEST_QUERIES.

    Side effects:
        Temporarily creates "iquery.iddb", removes "coord.in" if present,
        and writes the elapsed time to "index.search.timing".
    """
    info("running indexed search")

    def doTest():
        from gzip import GzipFile as zfile
        # Each id is decremented by one on load (presumably 1-based ids
        # converted to 0-based -- confirm against the TEST_QUERIES format).
        with open(TEST_QUERIES) as id_file:
            test_queries_indicies = [int(value) - 1 for value in id_file]
        lshsearcher = LSHSearcher(matrix_file, lsh_param)
        out = zfile(output, "w")
        # try/finally rather than "with": GzipFile only became a context
        # manager in Python 2.7, and the output must be closed on errors too.
        try:
            with open(coord_query_file) as coord_lines:
                for line_num, query_coords in enumerate(coord_lines):
                    query_id = test_queries_indicies[line_num]
                    info("query_coords( " + str(query_id) + "): " +
                         query_coords)
                    candidates = lshsearcher.search(query_coords)
                    # QueryIDDB reads the query id from this one-line file.
                    with open("iquery.iddb", "w") as query_iddb:
                        query_iddb.write("%d\n" % query_id)
                    try:
                        results = refine(QueryIDDB("iquery.iddb"), candidates)
                    finally:
                        # Never leave the scratch file behind.
                        os.unlink("iquery.iddb")
                    out.write(
                        " ".join(["%d:%f" % pair for pair in results]) + "\n")
        finally:
            out.close()

    # time_function's first element is used as the elapsed time.
    test_time = time_function(doTest)[0]

    # clean up stuff left by LSHServer
    if os.path.isfile("coord.in"):
        os.unlink("coord.in")

    info("total time: %.1f" % test_time)
    cmd = "echo %s > index.search.timing" % test_time
    os_run(cmd)

    gen_chemical_search_results()
    cmd = "%s %s %s > %s" % (INDEXED_SEARCH_EVALUATOR, CHEMICAL_SEARCH_RESULTS,
                             output, evaluation_out)
    os_run(cmd)
    return output
Ejemplo n.º 3
0
def indexed_search(matrix_file, coord_query_file,output="indexed.gz",
		evaluation_out="indexed.performance"):
	"""Perform the indexed (LSH) search for the test queries and evaluate it.

	Outline:
	  * load the query ids listed in TEST_QUERIES,
	  * for every line of coord_query_file, ask an LSHSearcher built from
	    matrix_file for candidate neighbours,
	  * refine the candidates and write them to the gzip file `output`,
	  * call gen_chemical_search_results() and run
	    INDEXED_SEARCH_EVALUATOR on CHEMICAL_SEARCH_RESULTS and `output`,
	    redirecting its stdout to `evaluation_out`.

	Args:
		matrix_file: forwarded to LSHSearcher (with module-level lsh_param).
		coord_query_file: text file with one query per line; line n is
			paired with the n-th id read from TEST_QUERIES.
		output: path of the gzip-compressed result file to write.
		evaluation_out: path receiving the evaluator's output.

	Returns:
		The `output` path, unchanged.

	Raises:
		IndexError: if coord_query_file has more lines than TEST_QUERIES.

	Side effects:
		Temporarily creates "iquery.iddb", removes "coord.in" if present,
		and writes the elapsed time to "index.search.timing".
	"""
	info("running indexed search")

	def doTest():
		from gzip import GzipFile as zfile
		# Ids are decremented by one on load (presumably 1-based to 0-based;
		# confirm against the TEST_QUERIES format).
		with open(TEST_QUERIES) as id_file:
			test_queries_indicies = [int(value)-1 for value in id_file]
		lshsearcher = LSHSearcher(matrix_file,lsh_param)
		out = zfile(output,"w")
		# GzipFile is not a context manager before Python 2.7, hence the
		# explicit try/finally to guarantee the output gets closed.
		try:
			with open(coord_query_file) as coord_lines:
				for line_num, query_coords in enumerate(coord_lines):
					query_id = test_queries_indicies[line_num]
					info("query_coords( "+str(query_id)+"): "+query_coords)
					candidates = lshsearcher.search(query_coords)
					# QueryIDDB reads the query id from this one-line file.
					with open("iquery.iddb","w") as query_iddb:
						query_iddb.write("%d\n" % query_id)
					try:
						results = refine(QueryIDDB("iquery.iddb"),candidates)
					finally:
						# Remove the scratch file even if refine() fails.
						os.unlink("iquery.iddb")
					out.write(" ".join(["%d:%f" % pair for pair in results])+"\n")
		finally:
			out.close()

	# time_function's first element is used as the elapsed time.
	test_time = time_function(doTest)[0]

	#clean up stuff left by LSHServer
	if os.path.isfile("coord.in"):
		os.unlink("coord.in")

	info( "total time: %.1f" % test_time )
	cmd = "echo %s > index.search.timing" % test_time
	os_run(cmd)

	gen_chemical_search_results()
	cmd = "%s %s %s > %s" % (INDEXED_SEARCH_EVALUATOR,
			CHEMICAL_SEARCH_RESULTS, output, evaluation_out)
	os_run(cmd)
	return output
Ejemplo n.º 4
0
def batchQuery(outf,
               r,
               d,
               ref_db,
               queries,
               coord_file,
               matrix_file,
               targetNames,
               embedOnly=False):
    """Embed every query, fetch LSH candidates, refine them and write hits.

    Steps performed, in order:
      1. createPuzzleFile(r, d, coord_file, "puzzle") is called and a
         CoordinateSolver is constructed from the "puzzle" path.
      2. createQueryCdb(queries, "queries.cdb") is called, then the query
         names are read from "queries.cdb.names" (one name per line).
      3. DB2DB_DISTANCE is started on "queries.cdb" and ref_db; every line
         of its stdout is handled as one query and passed to solver.solve.
      4. Unless embedOnly is true, the solver result is passed to the LSH
         searcher, the current query is extracted with DB_SUBSET into
         "query.cdb", and the LSH result is passed to refine().

    Args:
        outf: object with a write() method; receives one
            "query_name<TAB>target_name<TAB>distance" line per refined hit.
        r, d: forwarded unchanged to createPuzzleFile.
        ref_db: second database argument given to DB2DB_DISTANCE.
        queries: forwarded to createQueryCdb.
        coord_file: forwarded to createPuzzleFile.
        matrix_file: forwarded to LSHSearcher (together with the
            module-level lsh_param).
        targetNames: sequence indexed with candidate_index - 1 to name hits.
        embedOnly: when true, only print each solver result and skip the
            LSH lookup and refinement entirely.

    Side effects:
        Creates/overwrites "query.iddb" in the working directory for every
        query, and writes a timing summary line to stderr. A warning is also
        written to stderr if DB2DB_DISTANCE exits with a non-zero status.
    """
    query_cdb = "query.cdb"
    puzzle_file = "puzzle"
    queries_cdb = "queries.cdb"

    createPuzzleFile(r, d, coord_file, puzzle_file)

    parsing_time = 0
    refine_time = 0
    coord_time, solver = time_function(CoordinateSolver, puzzle_file)
    if embedOnly:
        # The searcher is never used in embed-only mode, so it is not built.
        lsh_time, lshsearcher = 0, None
    else:
        lsh_time, lshsearcher = time_function(LSHSearcher, matrix_file,
                                              lsh_param)

    parsing_time += time_function(createQueryCdb, queries, queries_cdb)[0]
    # Close the names file deterministically instead of leaking the handle.
    with open(queries_cdb + ".names") as names_file:
        queryNames = [name.strip() for name in names_file]

    debug("targetNames: " + str(targetNames))
    debug("queryNames: " + str(queryNames))

    # NOTE: dist_time only covers spawning the process, not its run time.
    dist_time, subp = time_function(Popen,
                                    [DB2DB_DISTANCE, queries_cdb, ref_db],
                                    stdout=PIPE)

    try:
        # queryIndex is 1-based: it is written to query.iddb as-is and used
        # with "- 1" wherever a Python list is indexed.
        for queryIndex, line in enumerate(subp.stdout, 1):
            t, solverResult = time_function(solver.solve, line)
            coord_time += t
            debug("solverResult: " + solverResult)

            if embedOnly:
                # Parenthesised form prints the same text under Python 2.
                print(solverResult)
                continue

            t, lshResult = time_function(lshsearcher.search, solverResult)
            lsh_time += t
            lshResult = lshResult.strip()

            info("lshResult %d: %s" % (queryIndex - 1, lshResult))

            # DB_SUBSET reads the wanted id from this one-line file.
            with open("query.iddb", "w") as f:
                f.write(str(queryIndex) + "\n")
            check_call([DB_SUBSET, queries_cdb, "query.iddb", query_cdb])
            t, distances = time_function(refine, QueryFile(query_cdb),
                                         lshResult)
            refine_time += t

            for candidate_index, dist in distances:
                outf.write("%s\t%s\t%s\n" %
                           (queryNames[queryIndex - 1],
                            targetNames[candidate_index - 1], dist))
    finally:
        # Always release the pipe and reap the child, even on failure.
        subp.stdout.close()
        returncode = subp.wait()

    if returncode != 0:
        # A failed distance run means the results above may be incomplete.
        sys.stderr.write("warning: %s exited with status %s\n" %
                         (DB2DB_DISTANCE, returncode))

    sys.stderr.write(
        'timing: parsing=%s embedding=%s lsh=%s refine=%s \n' %
        (parsing_time, dist_time + coord_time, lsh_time, refine_time))