def main():
    """Run the single query given on the command line.

    ``sys.argv[1]`` is the query string; passing any additional argument
    switches debug output on for this run. The query is echoed to stderr.
    A query containing ``[`` within its first three characters is parsed
    with ``parse_json_query`` and the module-level ``POST`` flag is set;
    every other query is parsed with ``process_params``.

    Id and sample constraints are first resolved into a set of snaptron ids.
    If such constraints were supplied but produced no ids, the function
    returns without running a query (strict AND policy). Otherwise exactly
    one back end is used, in this order of preference: interval query,
    snaptron-id query, range query. When ``RESULT_COUNT`` is set, the number
    of snaptron ids found is written to stdout.
    """
    global POST
    input_ = sys.argv[1]
    # Any extra command-line argument forces debug mode on.
    DEBUG_MODE_ = True if len(sys.argv) > 2 else DEBUG_MODE
    sys.stderr.write("%s\n" % input_)

    # Slicing (rather than indexing positions 0..2) keeps queries shorter
    # than three characters from raising IndexError.
    if '[' in input_[:3]:
        (intervalq, rangeq, sampleq, idq) = parse_json_query(input_)
        POST = True
    else:
        # simple '&' CGI format
        (intervalq, idq, rangeq, sampleq) = process_params(input_)

    sample_map = snample.load_sample_metadata(snapconf.SAMPLE_MD_FILE)
    if DEBUG_MODE_:
        sys.stderr.write("loaded %d samples metadata\n" % (len(sample_map)))

    # First build the filter-by-snaptron_id set from the ids passed in
    # directly and/or the ids derived from the sample query.
    # NOTE: this is the only place with OR logic: the snaptron_ids passed in
    # and the snaptron_ids derived from the sample query are OR'd together.
    snaptron_ids = set()
    if len(idq) >= 1:
        # Return value is ignored; presumably query_ids fills snaptron_ids
        # in place -- confirm against its definition.
        query_ids(idq, snaptron_ids)

    # NOTE: sample-id querying is not currently supported.
    if len(sampleq) >= 1:
        snaptron_ids = snample.query_samples(sampleq, sample_map, snaptron_ids)

    # Constraints were given but nothing matched: we are done, in keeping
    # with the strict AND policy.
    # TODO: update this when OR is supported in the POSTs.
    if len(snaptron_ids) == 0 and (len(idq) >= 1 or len(sampleq) >= 1):
        return

    # Normal query processing: 1) interval, 2) snaptron ids, 3) range.
    # Interval and id queries are preferred over range-only searches
    # because of speed.
    found_snaptron_ids = set()
    found_sample_ids = set()
    if len(intervalq) >= 1:
        (found_snaptron_ids, found_sample_ids) = query_regions(
            intervalq, rangeq, snaptron_ids, filtering=RESULT_COUNT)
    elif len(snaptron_ids) >= 1:
        rquery = range_query_parser(rangeq, snaptron_ids)
        (found_snaptron_ids, found_sample_ids) = search_introns_by_ids(
            snaptron_ids, rquery, filtering=RESULT_COUNT)
    elif len(rangeq) >= 1:
        # No interval or id query available: fall back to a range search,
        # still filtered by any ids.
        (found_snaptron_ids, found_sample_ids) = search_ranges_lucene(
            rangeq, snaptron_ids, stream_back=True, filtering=RESULT_COUNT)

    if RESULT_COUNT:
        sys.stdout.write("%d\n" % (len(found_snaptron_ids)))
def run_toplevel_AND_query(intervalq, rangeq, sampleq, idq, sample_map=None, ra=default_region_args):
    """Run one top-level AND query over interval, range, sample and id parts.

    Id and sample constraints are first resolved into a set of snaptron ids.
    If such constraints were supplied but produced no ids, the function
    returns immediately (strict AND policy). Otherwise exactly one back end
    is used, in this order of preference: interval query, snaptron-id query,
    range query. When ``ra.result_count`` is set, the number of snaptron ids
    found is written to stdout.

    Args:
        intervalq: interval query part; only its length is inspected here
            before it is handed to ``query_regions``.
        rangeq: range query part; used as a filter by the interval and id
            back ends, or as the main query via ``run_sqlite3``.
        sampleq: sample query part, passed to ``snample.query_samples``.
        idq: snaptron-id query part, passed to ``query_ids``.
        sample_map: sample metadata passed through to
            ``snample.query_samples``. Defaults to a fresh empty list per
            call, so no default object is shared between calls.
        ra: region-args record; ``result_count`` and ``_replace`` are used
            here and the record is forwarded to the back ends.

    Returns:
        None.
    """
    if sample_map is None:
        sample_map = []

    # First build the filter-by-snaptron_id set from the ids passed in
    # directly and/or the ids derived from the sample query.
    # NOTE: this is the only place with OR logic: the snaptron_ids passed in
    # and the snaptron_ids derived from the sample query are OR'd together.
    snaptron_ids = set()
    if len(idq) >= 1:
        # Return value is ignored; presumably query_ids fills snaptron_ids
        # in place -- confirm against its definition.
        query_ids(idq, snaptron_ids)

    # NOTE: sample-id querying is not currently supported.
    if len(sampleq) >= 1:
        snaptron_ids = snample.query_samples(sampleq, sample_map, snaptron_ids, ra)

    # Constraints were given but nothing matched: we are done, in keeping
    # with the strict AND policy.
    # TODO: update this when OR is supported in the POSTs.
    if len(snaptron_ids) == 0 and (len(idq) >= 1 or len(sampleq) >= 1):
        return

    # Normal query processing: 1) interval, 2) snaptron ids, 3) range.
    # Interval and id queries are preferred over range-only searches
    # because of speed.
    found_snaptron_ids = set()
    found_sample_ids = set()
    if len(intervalq) >= 1:
        # Favor intervals over everything else.
        (found_snaptron_ids, found_sample_ids) = query_regions(
            intervalq, rangeq, snaptron_ids,
            filtering=ra.result_count, region_args=ra)
    elif len(snaptron_ids) >= 1:
        rquery = range_query_parser(rangeq, snaptron_ids)
        # Id lookups go against the snaptron_id tabix database with
        # stream_back switched on.
        ra_ = ra._replace(tabix_db_file=snapconf.TABIX_DBS['snaptron_id'], stream_back=True)
        (found_snaptron_ids, found_sample_ids) = search_introns_by_ids(
            snaptron_ids, rquery,
            filtering=ra_.result_count, region_args=ra_)
    elif len(rangeq) >= 1:
        # No interval or id query available: run the range query through
        # run_sqlite3, still filtered by any ids.
        (found_snaptron_ids, found_sample_ids) = run_sqlite3(
            None, rangeq, snaptron_ids, region_args=ra)

    if ra.result_count:
        sys.stdout.write("%d\n" % (len(found_snaptron_ids)))