def main():
    """Parse the query given in ``sys.argv[1]`` and run it.

    The raw query string is echoed to stderr.  If a ``[`` appears within its
    first three characters it is handed to ``parse_json_query`` and the
    module-level ``POST`` flag is set to True; otherwise it is handed to
    ``process_params`` (the simple '&' CGI format).  Any additional
    command-line argument turns debug output on.

    The id and sample sub-queries are resolved first into a set of
    snaptron_ids used as a filter; then exactly one of the interval,
    id-only, or range searches is run.  When ``RESULT_COUNT`` is truthy the
    number of snaptron_ids found is written to stdout.

    Returns None.  Raises IndexError if no query argument was supplied.
    """
    global POST
    input_ = sys.argv[1]
    DEBUG_MODE_ = DEBUG_MODE
    if len(sys.argv) > 2:
        DEBUG_MODE_ = True
    sys.stderr.write("%s\n" % input_)

    # Slice instead of indexing positions 0..2 one by one: queries shorter
    # than three characters must not raise IndexError here.
    if '[' in input_[:3]:
        (intervalq, rangeq, sampleq, idq) = parse_json_query(input_)
        POST = True
    else:
        # simple '&' CGI format
        (intervalq, idq, rangeq, sampleq) = process_params(input_)

    sample_map = snample.load_sample_metadata(snapconf.SAMPLE_MD_FILE)
    if DEBUG_MODE_:
        sys.stderr.write("loaded %d samples metadata\n" % (len(sample_map)))

    # First build the filter-by-snaptron_id set from the ids passed in
    # directly and/or the ids derived from the sample query.
    # NOTE: this is the only place with OR logic, i.e. the snaptron_ids
    # passed in and the snaptron_ids derived from the sample query are OR'd
    # together for the filtering.
    snaptron_ids = set()
    if len(idq) >= 1:
        query_ids(idq, snaptron_ids)

    # If there are any sample-related queries, run them to extend the
    # snaptron_id filter set.
    # NOTE: sample-id querying is NOT currently supported.
    if len(sampleq) >= 1:
        snaptron_ids = snample.query_samples(sampleq, sample_map, snaptron_ids)

    # An id/sample query was given but matched nothing: we are done, in
    # keeping with the (current) strict AND policy.
    # TODO: this will need to change once OR is supported in the POSTs.
    # NOTE(review): nothing is written to stdout on this path, even when
    # RESULT_COUNT is set -- confirm that is intended.
    if len(snaptron_ids) == 0 and (len(idq) >= 1 or len(sampleq) >= 1):
        return

    # Normal query processing, one of: 1) interval, 2) range, or 3) just
    # snaptron ids.  Note: 1) and 3) use tabix, 2) uses lucene.
    # UPDATE: prefer tabix queries of either interval or snaptron_ids over a
    # lucene search of range queries, due to speed.
    found_snaptron_ids = set()
    found_sample_ids = set()
    if len(intervalq) >= 1:
        # Interval queries come first, possibly filtered by the point range
        # queries and/or ids.
        (found_snaptron_ids, found_sample_ids) = query_regions(
            intervalq, rangeq, snaptron_ids, filtering=RESULT_COUNT)
    elif len(snaptron_ids) >= 1:
        rquery = range_query_parser(rangeq, snaptron_ids)
        (found_snaptron_ids, found_sample_ids) = search_introns_by_ids(
            snaptron_ids, rquery, filtering=RESULT_COUNT)
    elif len(rangeq) >= 1:
        # Finally, with no interval OR id query to use with tabix, fall back
        # to a point range query in lucene, filtered by any ids.
        (found_snaptron_ids, found_sample_ids) = search_ranges_lucene(
            rangeq, snaptron_ids, stream_back=True, filtering=RESULT_COUNT)

    if RESULT_COUNT:
        sys.stdout.write("%d\n" % (len(found_snaptron_ids)))
def main():
    """Command-line entry point: run the query given in ``sys.argv[1]``.

    The total argument count selects at most one switch: three arguments
    enable debug output, four set the global ``FORCE_SQLITE``, and five set
    the global ``FORCE_TABIX``.  The raw query is echoed to stderr and the
    sample metadata is loaded before any query runs.

    A query containing ``[`` is parsed with ``process_post_params`` into
    parallel lists of sub-queries, each run through
    ``run_toplevel_AND_query``.  Any other query is parsed with
    ``process_params`` (simple '&' CGI format) and run once.

    Returns None.
    """
    global FORCE_SQLITE, FORCE_TABIX

    raw_query = sys.argv[1]
    debug_enabled = DEBUG_MODE
    argc = len(sys.argv)
    if argc == 3:
        debug_enabled = True
    elif argc == 4:
        FORCE_SQLITE = True
    elif argc == 5:
        FORCE_TABIX = True

    sys.stderr.write("%s\n" % raw_query)
    sample_map = snample.load_sample_metadata(snapconf.SAMPLE_MD_FILE)
    if debug_enabled:
        sys.stderr.write("loaded %d samples metadata\n" % (len(sample_map)))

    # Start from the default region args; both parsers below return their own.
    ra = default_region_args

    if '[' not in raw_query:
        # simple '&' CGI format
        (intervals, ids, ranges, samples, ra) = process_params(raw_query)
        run_toplevel_AND_query(intervals, ranges, samples, ids,
                               sample_map=sample_map, ra=ra)
        return

    (or_intervals, or_ranges, or_samples, or_ids, ra) = process_post_params(raw_query)
    for pos, intervals in enumerate(or_intervals):
        run_toplevel_AND_query(intervals, or_ranges[pos], or_samples[pos],
                               or_ids[pos], sample_map=sample_map, ra=ra)
        # Every sub-query after the first runs with print_header switched
        # off (presumably so the header is emitted only once -- confirm).
        ra = ra._replace(print_header=False)
def main():
    """Read the query from ``sys.argv[1]``, parse it, and execute it.

    Switches are chosen by the total number of command-line arguments:
    3 turns on debug output, 4 sets the global ``FORCE_SQLITE``, 5 sets the
    global ``FORCE_TABIX``.  The query string is written to stderr and the
    sample metadata is loaded up front.

    Queries containing ``[`` go through ``process_post_params`` and are run
    as a sequence of sub-queries; all others go through ``process_params``
    (simple '&' CGI format) and are run as a single query.

    Returns None.
    """
    global FORCE_SQLITE, FORCE_TABIX

    query_text = sys.argv[1]
    verbose = DEBUG_MODE
    n_args = len(sys.argv)
    if n_args == 3:
        verbose = True
    elif n_args == 4:
        FORCE_SQLITE = True
    elif n_args == 5:
        FORCE_TABIX = True

    sys.stderr.write("%s\n" % query_text)
    sample_map = snample.load_sample_metadata(snapconf.SAMPLE_MD_FILE)
    if verbose:
        sys.stderr.write("loaded %d samples metadata\n" % (len(sample_map)))

    # Default region args; replaced by whichever parser handles the query.
    ra = default_region_args

    if '[' not in query_text:
        # simple '&' CGI format
        (interval_q, id_q, range_q, sample_q, ra) = process_params(query_text)
        run_toplevel_AND_query(interval_q, range_q, sample_q, id_q,
                               sample_map=sample_map, ra=ra)
        return

    (or_intervals, or_ranges, or_samples, or_ids, ra) = process_post_params(query_text)
    for i, interval_q in enumerate(or_intervals):
        run_toplevel_AND_query(interval_q, or_ranges[i], or_samples[i],
                               or_ids[i], sample_map=sample_map, ra=ra)
        # After the first sub-query, later ones run with print_header=False.
        ra = ra._replace(print_header=False)
def main():
    """Command-line entry point: run a bulk, POST-style, or simple query.

    The query is taken from ``sys.argv[1]`` and echoed to stderr.  The total
    argument count selects at most one switch: 3 enables debug output, 4
    sets the global ``FORCE_SQLITE``, 5 sets the global ``FORCE_TABIX``.
    Sample metadata is loaded before any query runs.

    Dispatch, in order:
      * the query contains ``group=``: bulk mode; it is split on
        ``snapconfshared.BULK_QUERY_DELIMITER`` and each piece is parsed
        with ``process_params`` and run;
      * the query contains ``[``: parsed with ``process_post_params`` and
        run as a sequence of sub-queries;
      * otherwise: parsed with ``process_params`` (simple '&' CGI format)
        and run once.

    Returns None.
    """
    global FORCE_SQLITE, FORCE_TABIX

    raw_query = sys.argv[1]
    debug_enabled = DEBUG_MODE
    argc = len(sys.argv)
    if argc == 3:
        debug_enabled = True
    elif argc == 4:
        FORCE_SQLITE = True
    elif argc == 5:
        FORCE_TABIX = True

    sys.stderr.write("%s\n" % raw_query)
    sample_map = snample.load_sample_metadata(snapconf.SAMPLE_MD_FILE)
    if debug_enabled:
        sys.stderr.write("loaded %d samples metadata\n" % (len(sample_map)))

    # Start from the default region args; every parser below returns its own.
    ra = default_region_args

    # Bulk query mode.  Somewhat ad hoc: the cheap prefix test goes first to
    # try to avoid a pattern search across the whole input string, which
    # could be large.
    if raw_query.startswith('group=') or 'group=' in raw_query:
        for piece in re.split(snapconfshared.BULK_QUERY_DELIMITER, raw_query):
            (intervals, ids, ranges, samples, ra) = process_params(piece)
            run_toplevel_AND_query(intervals, ranges, samples, ids,
                                   sample_map=sample_map, ra=ra)
        return

    if '[' in raw_query:
        (or_intervals, or_ranges, or_samples, or_ids, ra) = process_post_params(raw_query)
        for pos, intervals in enumerate(or_intervals):
            run_toplevel_AND_query(intervals, or_ranges[pos], or_samples[pos],
                                   or_ids[pos], sample_map=sample_map, ra=ra)
            # Sub-queries after the first run with print_header switched off.
            ra = ra._replace(print_header=False)
        return

    # simple '&' CGI format
    (intervals, ids, ranges, samples, ra) = process_params(raw_query)
    run_toplevel_AND_query(intervals, ranges, samples, ids,
                           sample_map=sample_map, ra=ra)