def _string_equality_queries(self, field, dist, pdf_lower, pdf_upper):
    """
    Generates and returns a query_batch object that holds the logic for
    creating aggregators for the queries, and also contains the logic for
    processing the results and printing the query
    """
    query_dicts = []
    for _ in xrange(CLAUSE_OVER_GENERATION_RATIO):
        value = self.__dists[field].generate_pdf(pdf_lower, pdf_upper, {})
        qid = qids.query_id()
        where = 'SUBSTR(%s,1,9) = \'\'%s\'\'' % (sv.sql_info[field][0],
                                                 value[:9])
        query_dicts.append({qs.QRY_ENUM : qs.CAT.P7_FINAL,
                            qs.QRY_QID : qids.full_where_has_been_seen(qid,
                                                                       where),
                            qs.QRY_DBNUMRECORDS : self.__db_size,
                            qs.QRY_DBRECORDSIZE : self.__row_width,
                            qs.QRY_PERF : self.__perf,
                            qs.QRY_CAT : 'P7',
                            qs.QRY_SUBCAT : 'final',
                            qs.QRY_WHERECLAUSE : where,
                            qs.QRY_FIELD : sv.sql_info[field][0],
                            qs.QRY_NEGATE : False,
                            qs.QRY_FIELDTYPE : sv.sql_info[field][1],
                            qs.QRY_KEYWORDLEN : 9,
                            qs.QRY_SEARCHFOR : value[:9],
                            qs.QRY_SEARCHDELIMNUM : 1})
    return aqb.WildcardQueryBatch(query_dicts, CLAUSE_OVER_GENERATION_RATIO,
                                  1, False)
def _equality_queries(self, field, dist, pdf_lower, pdf_upper):
    """
    Generates and returns a query_batch object that holds the logic for
    creating aggregators for the queries, and also contains the logic for
    processing the results and printing the query
    """
    query_dicts = []
    for _ in xrange(CLAUSE_OVER_GENERATION_RATIO):
        value = self.__dists[field].generate_pdf(pdf_lower, pdf_upper, {})
        qid = qids.query_id()
        (value, where) = aqb.EqualityFishingQueryBatch.format_value_and_where(
            field, value)
        query_dicts.append({qs.QRY_ENUM : qs.CAT.EQ,
                            qs.QRY_QID : qids.full_where_has_been_seen(qid,
                                                                       where),
                            qs.QRY_DBNUMRECORDS : self.__db_size,
                            qs.QRY_DBRECORDSIZE : self.__row_width,
                            qs.QRY_PERF : self.__perf,
                            qs.QRY_CAT : 'EQ',
                            qs.QRY_SUBCAT : '',
                            qs.QRY_WHERECLAUSE : where,
                            qs.QRY_FIELD : sv.sql_info[field][0],
                            qs.QRY_NEGATE : False,
                            qs.QRY_FIELDTYPE : sv.sql_info[field][1],
                            qs.QRY_VALUE : value})
    return aqb.EqualityQueryBatch(query_dicts, CLAUSE_OVER_GENERATION_RATIO,
                                  1, False)
def refine_queries(self, agg_result):
    """
    Takes in 'agg_result', which is the result from the aggregator for
    this BOQ, and selects which queries should be recorded in the results
    database. To do this it creates a new list of the selected queries
    and pairs them with their results.
    """
    # select the queries that match
    queries = []
    assert len(self.queries) == len(agg_result[qs.QRY_SUBRESULTS])
    for (q, r) in zip(self.queries, agg_result[qs.QRY_SUBRESULTS]):
        assert q
        assert r
        assert q[qs.QRY_QID] >= r[qs.QRY_QID]
        potential_queries = []
        for (value, value_result) in \
                r[qs.QRY_FISHING_MATCHES_FOUND].iteritems():
            count = len(value_result)
            if qbs.get_rss_lower(q[qs.QRY_ENUM]) <= count and \
               qbs.get_rss_upper(q[qs.QRY_ENUM]) >= count:
                (value, where) = self.format_value_and_where(
                    sv.sql_name_to_enum(q[qs.QRY_FIELD]), value)
                q[qs.QRY_VALUE] = value
                q[qs.QRY_WHERECLAUSE] = where
                r[rdb.DBF_MATCHINGRECORDIDS] = value_result
                potential_queries.append((q, r))
        if potential_queries:
            chosen_q = random.sample(potential_queries, 1)[0]
            chosen_q[0][qs.QRY_QID] = qids.full_where_has_been_seen(
                chosen_q[0][qs.QRY_QID], chosen_q[0][qs.QRY_WHERECLAUSE])
            queries.append(chosen_q)
    # capping at choose-num number of queries
    self.refined_queries_results = queries
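# --- Illustrative sketch (not part of the original module) ----------------
# A minimal, self-contained rendering of the selection rule above: keep only
# the fishing matches whose hit count falls inside the [rss_lower, rss_upper]
# window, then pick one survivor at random. The names below (matches,
# rss_lower, rss_upper) are hypothetical stand-ins for
# QRY_FISHING_MATCHES_FOUND and qbs.get_rss_lower/get_rss_upper.
import random

def pick_matching_value(matches, rss_lower, rss_upper):
    # matches: {value: [matching record ids]}
    candidates = [(v, ids) for (v, ids) in matches.items()
                  if rss_lower <= len(ids) <= rss_upper]
    return random.choice(candidates) if candidates else None

# e.g. pick_matching_value({'ann': [1, 2], 'bob': [3]}, 2, 5) -> ('ann', [1, 2])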
def _generate_queries(self):
    """
    Generates and returns query_batch objects that hold the logic for
    creating aggregators for the queries, and also contain the logic for
    processing the results and printing the query
    """
    for (field, dist) in self.__dists.iteritems():
        for q_template in self.__queries:
            query_dicts = []
            for count in xrange(q_template['no_queries'] *
                                OVER_GENERATION_RATIO):
                self.__count += 1
                LOGGER.info('P9: Created %d out of %d queries' %
                            (self.__count, self.__total))
                lrss = q_template[qs.QRY_LRSS] / self.__db_size
                urss = q_template[qs.QRY_URSS] / self.__db_size
                (word_one, word_two) = dist.generate_alarmword(lrss, urss)
                where = 'WORD_PROXIMITY(%s, \'\'%s\'\', \'\'%s\'\')' % \
                        (sv.sql_info[field][0], word_one, word_two)
                where_clause = '%s <= %d ORDER BY %s' % \
                               (where, q_template['distance'], where)
                qid = qids.query_id()
                if qid != qids.full_where_has_been_seen(qid, where_clause):
                    continue
                query_dicts.append({
                    qs.QRY_ENUM : qs.CAT.P9_ALARM_WORDS,
                    qs.QRY_QID : qid,
                    qs.QRY_CAT : self.__cat,
                    qs.QRY_SUBCAT : 'alarm-words',
                    qs.QRY_DBNUMRECORDS : self.__db_size,
                    qs.QRY_DBRECORDSIZE : self.__row_width,
                    qs.QRY_PERF : self.__perf,
                    qs.QRY_FIELD : sv.sql_info[field][0],
                    qs.QRY_FIELDTYPE : sv.sql_info[field][1],
                    qs.QRY_WHERECLAUSE : where_clause,
                    qs.QRY_LRSS : q_template[qs.QRY_LRSS],
                    qs.QRY_URSS : q_template[qs.QRY_URSS],
                    qs.QRY_ALARMWORDONE : word_one,
                    qs.QRY_ALARMWORDTWO : word_two,
                    qs.QRY_ALARMWORDDISTANCE : q_template['distance']})
            self.__bobs.append(
                aqb.AlarmQueryBatch(query_dicts, count,
                                    int(count / OVER_GENERATION_RATIO),
                                    True))
    return self.__bobs
def _generate_range_queries(self, field, dist, q):
    """
    Generates and returns a query_batch object that holds the logic for
    creating aggregators for the queries, and also contains the logic for
    processing the results and printing the query
    """
    query_dicts = []
    num_queries = 0
    for count in xrange(q['no_queries'] * OVER_GENERATION_RATIO):
        num_queries = count
        self.__count += 1
        LOGGER.info('P2: Created %d out of %d queries' %
                    (self.__count, self.__total))
        r_lower_cdf = q['rss_lower_cdf']
        r_upper_cdf = q['rss_upper_cdf']
        (lower, upper) = dist.generate_double_range(r_lower_cdf, r_upper_cdf,
                                                    db_size=self.__db_size)
        qid = qids.query_id()
        if field in [sv.VARS.INCOME, sv.VARS.LAST_UPDATED]:
            where_clause = '%s BETWEEN %s AND %s' % \
                           (sv.sql_info[field][0],
                            sv.VAR_CONVERTERS[field].to_csv(lower),
                            sv.VAR_CONVERTERS[field].to_csv(upper))
        else:
            where_clause = '%s BETWEEN \'\'%s\'\' AND \'\'%s\'\'' % \
                           (sv.sql_info[field][0],
                            sv.VAR_CONVERTERS[field].to_csv(lower),
                            sv.VAR_CONVERTERS[field].to_csv(upper))
        if qid != qids.full_where_has_been_seen(qid, where_clause):
            continue
        query_dicts.append({
            qs.QRY_ENUM: qs.CAT.P2_RANGE,
            qs.QRY_QID: qid,
            qs.QRY_DBNUMRECORDS: self.__db_size,
            qs.QRY_DBRECORDSIZE: self.__row_width,
            qs.QRY_PERF: self.__perf,
            qs.QRY_CAT: self.__cat,
            qs.QRY_SUBCAT: 'range',
            qs.QRY_WHERECLAUSE: where_clause,
            qs.QRY_FIELD: sv.sql_info[field][0],
            qs.QRY_NEGATE: False,
            qs.QRY_FIELDTYPE: sv.sql_info[field][1],
            qs.QRY_LRSS: q[qs.QRY_LRSS],
            qs.QRY_URSS: q[qs.QRY_URSS],
            qs.QRY_LBOUND: lower,
            qs.QRY_UBOUND: upper,
            qs.QRY_RANGE: 0})
    return aqb.RangeQueryBatch(
        query_dicts, num_queries,
        int((num_queries + 1) / OVER_GENERATION_RATIO), True)
def _generate_short_queries(self, dist, q):
    '''
    Generates queries of the form .//LEAF
    '''
    query_dicts = []
    query_count = 0
    for count in xrange(q['no_queries'] * OVER_GENERATION_RATIO):
        self.__count += 1
        query_count = count
        LOGGER.info('P11: Created %d out of %d queries' %
                    (self.__count, self.__total))
        r_lower = q[qs.QRY_LRSS] / (self.__db_size * xg.XML_DEPTH *
                                    xg.FAN_OUT)
        r_upper = q[qs.QRY_URSS] / (self.__db_size * xg.XML_DEPTH *
                                    xg.FAN_OUT)
        (field, value) = self._create_equality_leaf(dist, r_lower, r_upper)
        value = sv.VAR_CONVERTERS[sv.sql_name_to_enum(field)].to_csv(value)
        try:
            value = value.replace('\'', '\'\'')
        except (TypeError, AttributeError):
            pass
        if field in ['foo', 'age', 'income']:
            where = 'xml_value(xml,\'//%s\', %s)' % (field, value)
        else:
            where = 'xml_value(xml,\'//%s\', \'%s\')' % (field, value)
        xpath = field
        qid = qids.query_id()
        if qid != qids.full_where_has_been_seen(qid, where):
            continue
        query_dicts.append({
            qs.QRY_ENUM: qs.CAT.P11_SHORT,
            qs.QRY_QID: qid,
            qs.QRY_DBNUMRECORDS: self.__db_size,
            qs.QRY_DBRECORDSIZE: self.__row_width,
            qs.QRY_PERF: self.__perf,
            qs.QRY_CAT: self.__cat,
            qs.QRY_SUBCAT: 'eq-double-slash',
            qs.QRY_WHERECLAUSE: where,
            qs.QRY_FIELD: sv.sql_info[sv.VARS.XML][0],
            qs.QRY_NEGATE: False,
            qs.QRY_FIELDTYPE: 'string',
            qs.QRY_LRSS: q[qs.QRY_LRSS],
            qs.QRY_URSS: q[qs.QRY_URSS],
            qs.QRY_VALUE: value,
            qs.QRY_XPATH: xpath})
    return aqb.XmlQueryBatch(
        query_dicts, query_count,
        max(int((query_count + 1) / OVER_GENERATION_RATIO), 1), True)
def _than_queries(self, field, dist, pdf_lower, pdf_upper, range_type):
    '''
    Generates greater-than / less-than clause queries for the given field
    '''
    query_dicts = []
    for _ in xrange(CLAUSE_OVER_GENERATION_RATIO):
        # generate the range-specific aspects of the queries
        if range_type == 'greater':
            value = dist.generate_greater_than(pdf_lower, pdf_upper,
                                               db_size=self.__db_size)
            enum = qs.CAT.P2_GREATER
            tail = ''
            comp = '>='
        else:
            value = dist.generate_less_than(pdf_lower, pdf_upper,
                                            db_size=self.__db_size)
            enum = qs.CAT.P2_LESS
            tail = TAIL
            comp = '<='
        # generate the where clauses
        if field in [sv.VARS.INCOME, sv.VARS.LAST_UPDATED]:
            where_clause = '%s %s %s' % \
                           (sv.sql_info[field][0], comp,
                            sv.VAR_CONVERTERS[field].to_csv(value))
        elif field in [sv.VARS.FIRST_NAME, sv.VARS.LAST_NAME]:
            where_clause = 'SUBSTR(%s,1,9) %s \'\'%s\'\'' % \
                           (sv.sql_info[field][0], comp,
                            sv.VAR_CONVERTERS[field].to_csv(value)[:9])
            value = value + tail
        else:
            where_clause = '%s %s \'\'%s\'\'' % \
                           (sv.sql_info[field][0], comp,
                            sv.VAR_CONVERTERS[field].to_csv(value))
        qid = qids.query_id()
        query_dicts.append({qs.QRY_ENUM : enum,
                            qs.QRY_QID : qids.full_where_has_been_seen(
                                qid, where_clause),
                            qs.QRY_DBNUMRECORDS : self.__db_size,
                            qs.QRY_DBRECORDSIZE : self.__row_width,
                            qs.QRY_PERF : self.__perf,
                            qs.QRY_CAT : 'P2',
                            qs.QRY_SUBCAT : range_type,
                            qs.QRY_WHERECLAUSE : where_clause,
                            qs.QRY_FIELD : sv.sql_info[field][0],
                            qs.QRY_NEGATE : False,
                            qs.QRY_FIELDTYPE : sv.sql_info[field][1],
                            qs.QRY_VALUE : value,
                            qs.QRY_RANGE : 0})
    return aqb.RangeQueryBatch(query_dicts, CLAUSE_OVER_GENERATION_RATIO,
                               1, False)
def _equality_queries(self, field, dist):
    """
    Generates query_batch objects that hold the logic for creating
    aggregators for the queries, and also contain the logic for processing
    the results and printing the query
    """
    for q in xrange(len(self.__queries)):
        query_dicts = []
        for count in xrange(self.__queries[q]['no_queries'] *
                            OVER_GENERATION_RATIO):
            self.__count += 1
            logger.info('EQ: Created %d out of %d queries' %
                        (self.__count, self.__total))
            r_lower_cdf = self.__queries[q]['rss_lower_cdf']
            r_upper_cdf = self.__queries[q]['rss_upper_cdf']
            value = self.__dists[field].generate_pdf(r_lower_cdf,
                                                     r_upper_cdf, {})
            qid = qids.query_id()
            (value, where) = \
                aqb.EqualityFishingQueryBatch.format_value_and_where(field,
                                                                     value)
            if qid != qids.full_where_has_been_seen(qid, where):
                continue
            query_dicts.append({
                qs.QRY_ENUM: qs.CAT.EQ,
                qs.QRY_QID: qid,
                qs.QRY_DBNUMRECORDS: self.__db_size,
                qs.QRY_DBRECORDSIZE: self.__row_width,
                qs.QRY_PERF: self.__perf,
                qs.QRY_CAT: self.__cat,
                qs.QRY_SUBCAT: '',
                qs.QRY_WHERECLAUSE: where,
                qs.QRY_FIELD: sv.sql_info[field][0],
                qs.QRY_NEGATE: False,
                qs.QRY_FIELDTYPE: sv.sql_info[field][1],
                qs.QRY_LRSS: self.__queries[q][qs.QRY_LRSS],
                qs.QRY_URSS: self.__queries[q][qs.QRY_URSS],
                qs.QRY_VALUE: value})
        self.__bobs.append(
            aqb.EqualityQueryBatch(query_dicts, count,
                                   int((count + 1) / OVER_GENERATION_RATIO),
                                   True))
def make_equality_queries(self, field, value):
    '''
    Creates equality query dictionaries
    '''
    (value, where) = aqb.EqualityFishingQueryBatch.format_value_and_where(
        field, value)
    qid = qids.query_id()
    return {qs.QRY_ENUM: qs.CAT.EQ,
            qs.QRY_QID: qids.full_where_has_been_seen(qid, where),
            qs.QRY_DBNUMRECORDS: self._db_size,
            qs.QRY_DBRECORDSIZE: self._row_width,
            qs.QRY_CAT: 'EQ',
            qs.QRY_SUBCAT: '',
            qs.QRY_WHERECLAUSE: where,
            qs.QRY_FIELD: sv.sql_info[field][0],
            qs.QRY_NEGATE: False,
            qs.QRY_FIELDTYPE: sv.sql_info[field][1],
            qs.QRY_VALUE: value}
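# --- Illustrative sketch (not part of the original module) ----------------
# A standalone approximation of the value/where formatting these methods
# delegate to EqualityFishingQueryBatch.format_value_and_where: numeric
# fields are compared bare, string fields with the doubled-single-quote
# escaping used throughout these clauses (they are later embedded in another
# quoted string). field_is_numeric is a hypothetical stand-in for the
# sv.sql_info type lookup; the real helper may differ.
def format_where(column, value, field_is_numeric):
    if field_is_numeric:
        return '%s = %s' % (column, value)
    return "%s = ''%s''" % (column, value)

# format_where('income', 50000, True)  -> "income = 50000"
# format_where('fname', 'JOHN', False) -> "fname = ''JOHN''"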
def _range_queries(self, field, dist, pdf_lower, pdf_upper):
    query_dicts = []
    for _ in xrange(CLAUSE_OVER_GENERATION_RATIO):
        (lower, upper) = dist.generate_double_range(pdf_lower, pdf_upper,
                                                    db_size=self.__db_size)
        qid = qids.query_id()
        if field in [sv.VARS.INCOME, sv.VARS.LAST_UPDATED]:
            where_clause = '%s BETWEEN %s AND %s' % \
                           (sv.sql_info[field][0],
                            sv.VAR_CONVERTERS[field].to_csv(lower),
                            sv.VAR_CONVERTERS[field].to_csv(upper))
        elif field in [sv.VARS.FIRST_NAME, sv.VARS.LAST_NAME]:
            where_clause = \
                'SUBSTR(%s,1,9) BETWEEN \'\'%s\'\' AND \'\'%s\'\'' % \
                (sv.sql_info[field][0],
                 sv.VAR_CONVERTERS[field].to_csv(lower)[:9],
                 sv.VAR_CONVERTERS[field].to_csv(upper)[:9])
            lower = sv.VAR_CONVERTERS[field].to_csv(lower)[:9]
            upper = sv.VAR_CONVERTERS[field].to_csv(upper)[:9] + TAIL
        else:
            where_clause = '%s BETWEEN \'\'%s\'\' AND \'\'%s\'\'' % \
                           (sv.sql_info[field][0],
                            sv.VAR_CONVERTERS[field].to_csv(lower),
                            sv.VAR_CONVERTERS[field].to_csv(upper))
        query_dicts.append({qs.QRY_ENUM : qs.CAT.P2_RANGE,
                            qs.QRY_QID : qids.full_where_has_been_seen(
                                qid, where_clause),
                            qs.QRY_DBNUMRECORDS : self.__db_size,
                            qs.QRY_DBRECORDSIZE : self.__row_width,
                            qs.QRY_PERF : self.__perf,
                            qs.QRY_CAT : 'P2',
                            qs.QRY_SUBCAT : 'range',
                            qs.QRY_WHERECLAUSE : where_clause,
                            qs.QRY_FIELD : sv.sql_info[field][0],
                            qs.QRY_NEGATE : False,
                            qs.QRY_FIELDTYPE : sv.sql_info[field][1],
                            qs.QRY_LBOUND : lower,
                            qs.QRY_UBOUND : upper,
                            qs.QRY_RANGE : 0})
    return aqb.RangeQueryBatch(query_dicts, CLAUSE_OVER_GENERATION_RATIO,
                               1, False)
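# --- Illustrative sketch (not part of the original module) ----------------
# Why the recorded upper bound gets a 'Z...' tail: the clause compares only
# the first 9 characters (SUBSTR(col,1,9) BETWEEN lo AND hi), so the stored
# plaintext bounds must cover every full string whose 9-char prefix falls in
# the window. Padding the upper prefix with the TAIL constant gives that
# cover. prefix_window is a hypothetical helper name.
TAIL = 'ZZZZZZZZZZZZZ'  # assumed to mirror the module-level constant above

def prefix_window(lower, upper, width=9):
    return lower[:width], upper[:width] + TAIL

# prefix_window('ANDERSON', 'BAXTERFIELD') -> ('ANDERSON', 'BAXTERFIE' + 'Z'*13)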
def process_results(self, agg_results, db_object, query_file_handle,
                    refined_queries=None):
    """
    Takes in the aggregator results and, with those results, determines
    which queries in the batch are 'interesting'. It then instantiates
    query_results for those queries and uses them to write to the results
    database. The refined_queries argument is a list of already-refined
    queries, for callers that do not wish to rely on the pre-defined
    refine_queries function.
    """
    # refine queries if not already refined
    if refined_queries:
        self.refined_queries_results = refined_queries
        for (comp_q, comp_q_results) in self.refined_queries_results:
            qr.QueryResultBase.write_to_full_to_atomic_table(
                comp_q, comp_q_results, db_object)
            qr.QueryResultBase.write_to_full_table(comp_q, comp_q_results,
                                                   db_object)
            comp_q[qs.QRY_SUBBOBS][0].process_results(
                None, db_object, query_file_handle,
                zip(comp_q['sub_queries'],
                    comp_q_results[qs.QRY_SUBRESULTS]))
            # print out the query
            self._print_query(comp_q, query_file_handle)
    else:
        refined_total = 0
        refined_queries = []
        for x in xrange(len(self.queries)):
            comp_q = self.queries[x]
            sub_results = agg_results[qs.QRY_SUBRESULTS]
            try:
                num_clauses = comp_q[qs.QRY_NUMCLAUSES]
            except KeyError:
                num_clauses = comp_q[qs.QRY_N]
            sub_bobs = comp_q[qs.QRY_SUBBOBS]
            clause_q_b = []
            working_clauses = None
            # create the list of possible queries that can make up the
            # clauses (paired with the bobs that created them)
            for b in sub_bobs:
                clause_q = b.produce_queries()
                clause_q_b += [(q, b) for q in clause_q]
            # create the list of results that go with those queries
            clause_r = []
            for (q, _) in clause_q_b:
                clause_r.append(
                    sub_results[self.result_to_agg_map[q[qs.QRY_WHERECLAUSE]]])
            comp_q_results = {qs.QRY_SUBRESULTS: clause_r}
            # create a list of queries, their bobs, and their results
            clause_q_r = zip(clause_q_b, clause_r)
            clause_q_r = sorted(
                clause_q_r,
                key=lambda ((q, b), r): len(r[rdb.DBF_MATCHINGRECORDIDS]))
            # try all possible combinations of the queries to test whether
            # any have the correct combination to match the required ftm
            # and rss
            seen_where_group = []
            working_clauses = []
            q_refined = False
            for clause in clause_q_r:
                # no need to check permutations if ftm doesn't match
                if q_refined:
                    continue
                ftm_match = len(clause[1][rdb.DBF_MATCHINGRECORDIDS])
                if not all([ftm_match >= qbs.get_tm_rss_lower(
                                comp_q[qs.QRY_ENUM]),
                            ftm_match <= qbs.get_tm_rss_upper(
                                comp_q[qs.QRY_ENUM])]):
                    continue
                # ftm matches, so check the rest of the clauses
                for clause_set in itertools.combinations(clause_q_r,
                                                         num_clauses - 1):
                    # query has already been refined
                    if q_refined:
                        continue
                    clause_list = [clause] + list(clause_set)
                    # check whether any of the clauses or their fields are
                    # the same; if so, the intersection is one we are not
                    # interested in
                    values = [q[qs.QRY_WHERECLAUSE]
                              for ((q, _), _) in clause_list]
                    fields = [q[qs.QRY_FIELD]
                              for ((q, _), _) in clause_list]
                    # duplicate values/fields, or this where has been seen
                    if len(values) != len(set(values)) or \
                       len(fields) != len(set(fields)) or \
                       values in seen_where_group:
                        continue
                    seen_where_group.append(values)
                    # check conditions
                    matching_ids_set = reduce(
                        set.intersection,
                        [set(r[rdb.DBF_MATCHINGRECORDIDS])
                         for (_, r) in clause_list])
                    count = len(matching_ids_set)
                    if not all([count >= qbs.get_rss_lower(
                                    comp_q[qs.QRY_ENUM]),
                                count <= qbs.get_rss_upper(
                                    comp_q[qs.QRY_ENUM]),
                                ftm_match >= qbs.get_tm_rss_lower(
                                    comp_q[qs.QRY_ENUM]),
                                ftm_match <= qbs.get_tm_rss_upper(
                                    comp_q[qs.QRY_ENUM])]):
                        continue
                    # this combination worked, so no need to refine further
                    # for this particular query
                    q_refined = True
                    refined_total += 1
                    # reorder clauses
                    working_clauses = clause_list
                    reordered_clauses = working_clauses[:1]
                    working_clauses.remove(reordered_clauses[0])
                    cumulative_set = set(
                        reordered_clauses[0][1][rdb.DBF_MATCHINGRECORDIDS])
                    while len(working_clauses) > 0:
                        next_clause = working_clauses[0]
                        current_set = cumulative_set.intersection(
                            working_clauses[0][1][rdb.DBF_MATCHINGRECORDIDS])
                        for clauses in working_clauses:
                            potential_set = cumulative_set.intersection(
                                clauses[1][rdb.DBF_MATCHINGRECORDIDS])
                            if len(potential_set) < len(current_set):
                                next_clause = clauses
                                current_set = potential_set
                        working_clauses.remove(next_clause)
                        reordered_clauses.append(next_clause)
                        cumulative_set = current_set
                    working_clauses = reordered_clauses
                    # update query with chosen clauses
                    whereclauses = [q[qs.QRY_WHERECLAUSE]
                                    for ((q, _), _) in working_clauses]
                    comp_q[qs.QRY_WHERECLAUSE] = " AND ".join(whereclauses)
                    comp_q['sub_queries'] = [q for ((q, _), _)
                                             in working_clauses]
                    comp_q[qs.QRY_SUBBOBS] = [b for ((_, b), _)
                                              in working_clauses]
                    ftm_match = len(
                        working_clauses[0][1][rdb.DBF_MATCHINGRECORDIDS])
                    matching_ids_set = reduce(
                        set.intersection,
                        [set(r[rdb.DBF_MATCHINGRECORDIDS])
                         for (_, r) in working_clauses])
                    comp_q_results[qs.QRY_SUBRESULTS] = \
                        [r for (_, r) in working_clauses]
                    comp_q_results[rdb.DBF_MATCHINGRECORDIDS] = \
                        matching_ids_set
                    comp_q_results[qs.QRY_NUMRECORDSMATCHINGFIRSTTERM] = \
                        ftm_match
                    # get the ids lined up
                    comp_q[qs.QRY_QID] = qids.full_where_has_been_seen(
                        comp_q[qs.QRY_QID], comp_q[qs.QRY_WHERECLAUSE])
                    comp_q_results[qs.QRY_QID] = comp_q[qs.QRY_QID]
                    for (sub_q, sub_r) in zip(
                            comp_q['sub_queries'],
                            comp_q_results[qs.QRY_SUBRESULTS]):
                        sub_q[qs.QRY_QID] = qids.atomic_where_has_been_seen(
                            sub_q[qs.QRY_QID], sub_q[qs.QRY_WHERECLAUSE])
                        sub_r[qs.QRY_QID] = sub_q[qs.QRY_QID]
                    # write the results to the results database
                    qr.QueryResultBase.write_to_full_to_atomic_table(
                        comp_q, comp_q_results, db_object)
                    qr.QueryResultBase.write_to_full_table(
                        comp_q, comp_q_results, db_object)
                    comp_q[qs.QRY_SUBBOBS][0].process_results(
                        None, db_object, query_file_handle,
                        zip(comp_q['sub_queries'],
                            comp_q_results[qs.QRY_SUBRESULTS]))
                    # print out the query
                    self._print_query(comp_q, query_file_handle)
                    refined_queries.append((comp_q, comp_q_results))
            logger.info("FINISHED QUERY %d of %d, TOTAL THAT WORK %d" %
                        (x, len(self.queries), refined_total))
            if q_refined:
                logger.info("WORKING QUERY INFORMATION qid = %d, "
                            "where_clause = %s, ftm = %d, rss = %d" %
                            (comp_q[qs.QRY_QID], comp_q[qs.QRY_WHERECLAUSE],
                             ftm_match, count))
        # capping at choose-num number of queries
        self.refined_queries_results = refined_queries
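# --- Illustrative sketch (not part of the original module) ----------------
# The clause reordering above, reduced to plain sets: keep the first-term
# clause fixed, then greedily append whichever remaining clause shrinks the
# running intersection the most (smallest next intersection first).
def greedy_reorder(clause_sets):
    ordered = [clause_sets[0]]
    remaining = list(clause_sets[1:])
    cumulative = set(clause_sets[0])
    while remaining:
        best = min(remaining, key=lambda s: len(cumulative & s))
        remaining.remove(best)
        cumulative &= best
        ordered.append(best)
    return ordered

# greedy_reorder([{1,2,3,4}, {1,2,3}, {1,4}]) -> [{1,2,3,4}, {1,4}, {1,2,3}]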
def process_results(self, agg_results, db_object, query_file_handle,
                    refined_queries=None):
    """
    Takes in the aggregator results and, with those results, determines
    which queries in the batch are 'interesting'. It then instantiates
    query_results for those queries and uses them to write to the results
    database. The refined_queries argument is a list of already-refined
    queries, for callers that do not wish to rely on the pre-defined
    refine_queries function.
    """
    # refine queries if not already refined
    if refined_queries:
        self.refined_queries_results = refined_queries
        for (comp_q, comp_q_results) in self.refined_queries_results:
            qr.QueryResultBase.write_to_full_to_atomic_table(
                comp_q, comp_q_results, db_object)
            qr.QueryResultBase.write_to_full_table(comp_q, comp_q_results,
                                                   db_object)
            comp_q[qs.QRY_SUBBOBS][0].process_results(
                None, db_object, query_file_handle,
                zip(comp_q['sub_queries'],
                    comp_q_results[qs.QRY_SUBRESULTS]))
            # print out the query
            self._print_query(comp_q, query_file_handle)
    else:
        refined_total = 0
        refined_queries = []
        for x in xrange(len(self.queries)):
            comp_q = self.queries[x]
            sub_results = agg_results[qs.QRY_SUBRESULTS]
            num_clauses = comp_q[qs.QRY_NUMCLAUSES]
            sub_bobs = comp_q[qs.QRY_SUBBOBS]
            clause_q_b = []
            working_clauses = None
            # create the list of possible queries that can make up the
            # clauses (paired with the bobs that created them)
            for b in sub_bobs:
                clause_q = b.produce_queries()
                clause_q_b += [(q, b) for q in clause_q]
            # create the list of results that go with those queries
            clause_r = []
            for (q, _) in clause_q_b:
                clause_r.append(
                    sub_results[self.result_to_agg_map[q[qs.QRY_WHERECLAUSE]]])
            comp_q_results = {qs.QRY_SUBRESULTS: clause_r}
            # create a list of queries, their bobs, and their results
            clause_q_r = zip(clause_q_b, clause_r)
            clause_q_r = [((q, b), r) for ((q, b), r) in clause_q_r
                          if len(r[rdb.DBF_MATCHINGRECORDIDS]) <=
                          qbs.get_tm_rss_upper(comp_q[qs.QRY_ENUM])]
            if len(clause_q_r) < num_clauses:
                continue
            # try all possible combinations of the queries to test whether
            # any have the correct combination to match the required ftm
            # and rss
            seen_where_group = []
            working_clauses = []
            q_refined = False
            for clause_set in itertools.combinations(clause_q_r,
                                                     num_clauses):
                # query has already been refined
                if q_refined:
                    continue
                clause_list = list(clause_set)
                values = [q[qs.QRY_WHERECLAUSE]
                          for ((q, _), _) in clause_list]
                # duplicate values, or this where has already been seen
                if len(values) != len(set(values)) or \
                   values in seen_where_group:
                    continue
                seen_where_group.append(values)
                # check conditions
                matching_ids_set = reduce(
                    set.union,
                    [set(r[rdb.DBF_MATCHINGRECORDIDS])
                     for (_, r) in clause_list])
                count = len(matching_ids_set)
                all_match = sum(
                    map(len, [r[rdb.DBF_MATCHINGRECORDIDS]
                              for (_, r) in clause_list]))
                if not all([count >= qbs.get_rss_lower(comp_q[qs.QRY_ENUM]),
                            count <= qbs.get_rss_upper(comp_q[qs.QRY_ENUM]),
                            all_match >= qbs.get_tm_rss_lower(
                                comp_q[qs.QRY_ENUM]),
                            all_match <= qbs.get_tm_rss_upper(
                                comp_q[qs.QRY_ENUM])]):
                    continue
                # this combination worked, so no need to refine further
                # for this particular query
                q_refined = True
                refined_total += 1
                working_clauses = clause_list
                # update the query with the chosen clauses and the
                # aggregator results with the chosen results
                whereclauses = [q[qs.QRY_WHERECLAUSE]
                                for ((q, _), _) in working_clauses]
                comp_q[qs.QRY_WHERECLAUSE] = " OR ".join(whereclauses)
                comp_q['sub_queries'] = [q for ((q, _), _)
                                         in working_clauses]
                comp_q[qs.QRY_SUBBOBS] = [b for ((_, b), _)
                                          in working_clauses]
                ftm_match = len(
                    working_clauses[0][1][rdb.DBF_MATCHINGRECORDIDS])
                comp_q_results[qs.QRY_SUBRESULTS] = \
                    [r for (_, r) in working_clauses]
                comp_q_results[rdb.DBF_MATCHINGRECORDIDS] = matching_ids_set
                comp_q_results[qs.QRY_SUMRECORDSMATCHINGEACHTERM] = all_match
                comp_q_results[qs.QRY_NUMRECORDSMATCHINGFIRSTTERM] = \
                    ftm_match
                # make sure duplicate queries (and their atomic
                # sub-components) have the same qids
                comp_q[qs.QRY_QID] = qids.full_where_has_been_seen(
                    comp_q[qs.QRY_QID], comp_q[qs.QRY_WHERECLAUSE])
                comp_q_results[qs.QRY_QID] = comp_q[qs.QRY_QID]
                for (sub_q, sub_r) in zip(comp_q['sub_queries'],
                                          comp_q_results[qs.QRY_SUBRESULTS]):
                    sub_q[qs.QRY_QID] = qids.atomic_where_has_been_seen(
                        sub_q[qs.QRY_QID], sub_q[qs.QRY_WHERECLAUSE])
                    sub_r[qs.QRY_QID] = sub_q[qs.QRY_QID]
                # create result objects and write to ground truth database
                qr.QueryResultBase.write_to_full_to_atomic_table(
                    comp_q, comp_q_results, db_object)
                qr.QueryResultBase.write_to_full_table(
                    comp_q, comp_q_results, db_object)
                comp_q[qs.QRY_SUBBOBS][0].process_results(
                    None, db_object, query_file_handle,
                    zip(comp_q['sub_queries'],
                        comp_q_results[qs.QRY_SUBRESULTS]))
                refined_queries.append((comp_q, comp_q_results))
                # print query
                self._print_query(comp_q, query_file_handle)
            logger.info("FINISHED QUERY %d of %d, TOTAL THAT WORK %d" %
                        (x, len(self.queries), refined_total))
            if q_refined:
                logger.info("WORKING QUERY INFORMATION where_clause = %s, "
                            "sftm = %d, rss = %d" %
                            (comp_q[qs.QRY_WHERECLAUSE], all_match, count))
        self.refined_queries_results = refined_queries
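# --- Illustrative sketch (not part of the original module) ----------------
# The disjunction check above in miniature: a candidate OR query passes if
# the size of the union of the per-clause matches sits in the rss window and
# the *sum* of the per-clause match counts sits in the total-match window.
def or_query_passes(clause_sets, rss_lo, rss_hi, tm_lo, tm_hi):
    union_count = len(set().union(*clause_sets))
    summed = sum(len(s) for s in clause_sets)
    return rss_lo <= union_count <= rss_hi and tm_lo <= summed <= tm_hi

# or_query_passes([{1, 2}, {2, 3}], 2, 4, 3, 5) -> True (union 3, sum 4)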
def _generate_full_queries(self, dist, q):
    '''
    Generates queries of the form ./node1/node2/LEAF
    '''
    query_dicts = []
    for count in xrange(q['no_queries'] * OVER_GENERATION_RATIO):
        self.__count += 1
        LOGGER.info('P11: Created %d out of %d queries' %
                    (self.__count, self.__total))
        r_lower_total = q[qs.QRY_LRSS] / self.__db_size
        r_upper_total = q[qs.QRY_URSS] / self.__db_size
        branch_r_lower = pow(r_lower_total / xg.XML_DEPTH,
                             1.0 / xg.XML_DEPTH)
        branch_r_upper = pow(r_upper_total / xg.XML_DEPTH,
                             1.0 / xg.XML_DEPTH)
        tags = []
        for level in xrange(xg.XML_DEPTH - 1):
            tags.append(dist.generate_node_pdf(level, branch_r_lower,
                                               branch_r_upper))
        tag_string = ''
        for tag in tags:
            tag_string += '/%s' % tag
        (field, value) = self._create_equality_leaf(dist, branch_r_lower,
                                                    branch_r_upper)
        value = sv.VAR_CONVERTERS[sv.sql_name_to_enum(field)].to_csv(value)
        try:
            value = value.replace('\'', '\'\'')
        except (TypeError, AttributeError):
            pass
        if field in ['foo', 'age', 'income']:
            where = 'xml_value(xml,\'/xml%s/%s\',%s)' % (tag_string, field,
                                                         value)
        else:
            where = 'xml_value(xml,\'/xml%s/%s\',\'%s\')' % (tag_string,
                                                             field, value)
        xpath = ['xml'] + tags
        xpath.append(field)
        qid = qids.query_id()
        if qid != qids.full_where_has_been_seen(qid, where):
            continue
        query_dicts.append({
            qs.QRY_ENUM: qs.CAT.P11_FULL,
            qs.QRY_QID: qid,
            qs.QRY_DBNUMRECORDS: self.__db_size,
            qs.QRY_DBRECORDSIZE: self.__row_width,
            qs.QRY_CAT: self.__cat,
            qs.QRY_SUBCAT: 'eq-full',
            qs.QRY_PERF: self.__perf,
            qs.QRY_WHERECLAUSE: where,
            qs.QRY_FIELD: sv.sql_info[sv.VARS.XML][0],
            qs.QRY_NEGATE: False,
            qs.QRY_FIELDTYPE: 'string',
            qs.QRY_LRSS: q[qs.QRY_LRSS],
            qs.QRY_URSS: q[qs.QRY_URSS],
            qs.QRY_VALUE: value,
            qs.QRY_XPATH: xpath})
    return aqb.XmlQueryBatch(
        query_dicts, count,
        max(int((count + 1) / OVER_GENERATION_RATIO), 1), True)
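# --- Illustrative sketch (not part of the original module) ----------------
# How the per-branch selectivity above falls out of the target overall
# selectivity: the per-level rate is the XML_DEPTH-th root of
# (total_rate / XML_DEPTH), mirroring the pow(...) calls in
# _generate_full_queries. branch_rate is a hypothetical helper name.
def branch_rate(total_rate, depth):
    return (total_rate / depth) ** (1.0 / depth)

# e.g. branch_rate(0.01, 4) -> ~0.224, the per-level selectivity used for
# both the intermediate node tags and the equality leaf.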
def process_results(self, agg_results, db_object, query_file_handle,
                    refined_queries=None):
    """
    Takes in the aggregator results and, with those results, determines
    which queries in the batch are 'interesting'. It then instantiates
    query_results for those queries and uses them to write to the results
    database. The refined_queries argument is a list of already-refined
    queries, for callers that do not wish to rely on the pre-defined
    refine_queries function.
    """
    # refine queries if not already refined
    if refined_queries is not None:
        self.refined_queries_results = refined_queries
        for (q, r) in self.refined_queries_results:
            qr.QueryResultBase.write_to_full_to_atomic_table(q, r,
                                                             db_object)
            qr.QueryResultBase.write_to_full_table(q, r, db_object)
            q[qs.QRY_SUBBOBS][0].process_results(
                None, db_object, query_file_handle,
                zip(q['sub_queries'], r[qs.QRY_SUBRESULTS]))
            self._print_query(q, query_file_handle)
            try:
                q[qs.QRY_PERF].remove('IBM1')
            except ValueError:
                pass
            q[qs.QRY_WHERECLAUSE] = q[qs.QRY_WHERECLAUSE] + " ORDER BY " + \
                                    q[qs.QRY_WHERECLAUSE] + " DESC"
            q[qs.QRY_ENUM] = qs.CAT.P9_EQ
            q[qs.QRY_CAT] = 'P9'
            q[qs.QRY_QID] = qids.full_where_has_been_seen(
                qids.query_id(), q[qs.QRY_WHERECLAUSE])
            r[qs.QRY_QID] = q[qs.QRY_QID]
            qr.QueryResultBase.write_to_full_to_atomic_table(q, r,
                                                             db_object)
            qr.QueryResultBase.write_to_full_table(q, r, db_object)
            self._print_query(q, query_file_handle)
            q[qs.QRY_SUBBOBS][0].process_results(
                None, db_object, query_file_handle,
                zip(q['sub_queries'], r[qs.QRY_SUBRESULTS]))
    else:
        refined_total = 0
        refined_queries = []
        for x in xrange(len(self.queries)):
            comp_q = self.queries[x]
            sub_results = agg_results[qs.QRY_SUBRESULTS]
            num_clauses = comp_q[qs.QRY_N]
            sub_bobs = comp_q[qs.QRY_SUBBOBS]
            clause_q_b = []
            # create the list of possible queries that can make up the
            # clauses (paired with the bobs that created them)
            for b in sub_bobs:
                clause_q = b.produce_queries()
                clause_q_b += [(q, b) for q in clause_q]
            # create the list of results that go with those queries
            clause_r = []
            for (q, _) in clause_q_b:
                clause_r.append(
                    sub_results[self.result_to_agg_map[q[qs.QRY_WHERECLAUSE]]])
            comp_q_results = {qs.QRY_SUBRESULTS: clause_r}
            # create a list of queries, their bobs, and their results
            clause_q_r = zip(clause_q_b, clause_r)
            clause_q_r = sorted(
                clause_q_r,
                key=lambda ((q, b), r): len(r[rdb.DBF_MATCHINGRECORDIDS]))
            # try all possible combinations of the queries to test whether
            # any have the correct combination to match the required stfm
            # and rss
            seen_where_group = []
            comp_q_refined = False
            for clause_set in itertools.combinations(clause_q_r,
                                                     num_clauses):
                if comp_q_refined:
                    continue
                clause_list = list(clause_set)
                values = [q[qs.QRY_WHERECLAUSE]
                          for ((q, _), _) in clause_list]
                if len(values) != len(set(values)) or \
                   values in seen_where_group:
                    continue
                seen_where_group.append(values)
                # check whether this combination works; if stfm doesn't
                # match, don't bother continuing
                stfm = 0
                for offset in xrange(comp_q[qs.QRY_N] -
                                     comp_q[qs.QRY_M] + 1):
                    (_, r) = clause_list[offset]
                    stfm += len(r[rdb.DBF_MATCHINGRECORDIDS])
                if not all([stfm >= qbs.get_tm_rss_lower(
                                comp_q[qs.QRY_ENUM]),
                            stfm <= qbs.get_tm_rss_upper(
                                comp_q[qs.QRY_ENUM])]):
                    continue
                # if stfm does match, calculate the set intersection
                matching_ids_set = set()
                for m_set in itertools.combinations(clause_list,
                                                    comp_q[qs.QRY_M]):
                    matching_ids_set.update(
                        reduce(set.intersection,
                               [set(r[rdb.DBF_MATCHINGRECORDIDS])
                                for (_, r) in m_set]))
                count = len(matching_ids_set)
                # check overall compliance
                if not all([count >= qbs.get_rss_lower(comp_q[qs.QRY_ENUM]),
                            count <= qbs.get_rss_upper(
                                comp_q[qs.QRY_ENUM])]):
                    continue
                comp_q_refined = True
                refined_total += 1
                # process the working clause list
                working_clauses = clause_list
                whereclauses = [q[qs.QRY_WHERECLAUSE]
                                for ((q, _), _) in working_clauses]
                where = ", ".join(whereclauses)
                where = 'M_OF_N(%d, %d, %s)' % (comp_q[qs.QRY_M],
                                                comp_q[qs.QRY_N], where)
                # update query with chosen clauses
                comp_q[qs.QRY_WHERECLAUSE] = where
                comp_q['sub_queries'] = [q for ((q, _), _)
                                         in working_clauses]
                comp_q[qs.QRY_SUBBOBS] = [b for ((_, b), _)
                                          in working_clauses]
                # build the counts of how many records match N terms,
                # N-1 terms, ... down to M (e.g. of the form 34 | 384 | 1094)
                records_matching_count = dict(
                    zip(range(comp_q[qs.QRY_M], comp_q[qs.QRY_N] + 1),
                        [0] * comp_q[qs.QRY_N]))
                for id in matching_ids_set:
                    matching_terms = [
                        1 if id in clause[1][rdb.DBF_MATCHINGRECORDIDS]
                        else 0 for clause in working_clauses]
                    term_matches = sum(matching_terms)
                    records_matching_count[term_matches] += 1
                matching_records_counts = sorted(
                    records_matching_count.values(), reverse=True)
                # update the results dictionary with the newly calculated
                # values
                comp_q_results[qs.QRY_SUBRESULTS] = \
                    [r for (_, r) in working_clauses]
                comp_q_results[rdb.DBF_MATCHINGRECORDIDS] = matching_ids_set
                comp_q_results[qs.QRY_MATCHINGRECORDCOUNTS] = \
                    matching_records_counts
                # make sure duplicate queries (and their atomic
                # sub-components) have the same qids
                comp_q[qs.QRY_QID] = qids.full_where_has_been_seen(
                    comp_q[qs.QRY_QID], comp_q[qs.QRY_WHERECLAUSE])
                comp_q_results[qs.QRY_QID] = comp_q[qs.QRY_QID]
                for (sub_q, sub_r) in zip(comp_q['sub_queries'],
                                          comp_q_results[qs.QRY_SUBRESULTS]):
                    sub_q[qs.QRY_QID] = qids.atomic_where_has_been_seen(
                        sub_q[qs.QRY_QID], sub_q[qs.QRY_WHERECLAUSE])
                    sub_r[qs.QRY_QID] = sub_q[qs.QRY_QID]
                # write queries to the results database
                qr.QueryResultBase.write_to_full_to_atomic_table(
                    comp_q, comp_q_results, db_object)
                qr.QueryResultBase.write_to_full_table(
                    comp_q, comp_q_results, db_object)
                comp_q[qs.QRY_SUBBOBS][0].process_results(
                    None, db_object, query_file_handle,
                    zip(comp_q['sub_queries'],
                        comp_q_results[qs.QRY_SUBRESULTS]))
                self._print_query(comp_q, query_file_handle)
                try:
                    comp_q[qs.QRY_PERF].remove('IBM1')
                except ValueError:
                    pass
                comp_q[qs.QRY_WHERECLAUSE] = \
                    comp_q[qs.QRY_WHERECLAUSE] + " ORDER BY " + \
                    comp_q[qs.QRY_WHERECLAUSE] + " DESC"
                comp_q[qs.QRY_ENUM] = qs.CAT.P9_EQ
                comp_q[qs.QRY_CAT] = 'P9'
                comp_q[qs.QRY_QID] = qids.full_where_has_been_seen(
                    qids.query_id(), comp_q[qs.QRY_WHERECLAUSE])
                comp_q_results[qs.QRY_QID] = comp_q[qs.QRY_QID]
                qr.QueryResultBase.write_to_full_to_atomic_table(
                    comp_q, comp_q_results, db_object)
                qr.QueryResultBase.write_to_full_table(
                    comp_q, comp_q_results, db_object)
                comp_q[qs.QRY_SUBBOBS][0].process_results(
                    None, db_object, query_file_handle,
                    zip(comp_q['sub_queries'],
                        comp_q_results[qs.QRY_SUBRESULTS]))
                self._print_query(comp_q, query_file_handle)
                refined_queries.append((comp_q, comp_q_results))
            logger.info("FINISHED QUERY %d of %d, TOTAL THAT WORK %d" %
                        (x, len(self.queries), refined_total))
            if comp_q_refined:
                logger.info("WORKING QUERY INFORMATION where_clause = %s, "
                            "sftm = %d, rss = %d" %
                            (comp_q[qs.QRY_WHERECLAUSE], stfm, count))
        self.refined_queries_results = refined_queries
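# --- Illustrative sketch (not part of the original module) ----------------
# The M_OF_N accounting above with plain sets: a record matches the query if
# it appears in at least m of the n clause result sets (equivalent to the
# union-of-intersections over m-combinations computed above); the histogram
# buckets the matching records by exactly how many clauses they satisfy.
def m_of_n_counts(clause_sets, m):
    n = len(clause_sets)
    counts = dict((k, 0) for k in range(m, n + 1))
    all_ids = set().union(*clause_sets)
    for rec in all_ids:
        hits = sum(1 for s in clause_sets if rec in s)
        if hits >= m:
            counts[hits] += 1
    return counts

# m_of_n_counts([{1,2,3}, {2,3}, {3,4}], 2) -> {2: 1, 3: 1}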
def produce_query_batches(self):
    """
    Generates and returns query_batch objects that hold the logic for
    creating aggregators for the queries, and also contain the logic for
    processing the results and printing the query
    """
    self.bobs = []
    for f in xrange(len(self.__dists)):
        for q in xrange(len(self.__queries)):
            query_dicts = []
            for count in xrange(self.__queries[q]['no_queries'] *
                                OVER_GENERATION_RATIO):
                for r in xrange(self.__queries[q][qs.QRY_RANGEEXPL],
                                self.__queries[q][qs.QRY_RANGEEXPU] + 1):
                    self.__count += 1
                    logger.debug('P2-foo: Created %d out of %d queries' %
                                 (self.__count, self.__total))
                    field = sv.sql_info[self.__fields[f]][0]
                    rss_lower = self.__queries[q][qs.QRY_LRSS]
                    rss_upper = self.__queries[q][qs.QRY_URSS]
                    rss_avg = (rss_lower + rss_upper) / 2
                    range_size = 2 ** r
                    if self.__queries[q][qs.QRY_TYPE] == 'range':
                        try:
                            (lower, upper) = self.__dists[
                                self.__fields[f]].generate_two_sided(
                                    rss_avg, range_size, self.__db_size)
                        except bd.FooInputs:
                            (lower, upper) = (0, 0)
                        enum = qs.CAT.P2_RANGE_FOO
                        where_clause = '%s BETWEEN %d AND %d' % (field,
                                                                 lower,
                                                                 upper)
                    else:
                        try:
                            lower = self.__dists[
                                self.__fields[f]].generate_greater_than(
                                    rss_avg, self.__db_size)
                        except bd.FooInputs:
                            lower = 0
                        upper = 2 ** 64 - 1
                        enum = qs.CAT.P2_GREATER_FOO
                        where_clause = '%s >= %d' % (field, lower)
                    qid = qids.query_id()
                    if qid != qids.full_where_has_been_seen(qid,
                                                            where_clause):
                        continue
                    query_dicts.append({
                        qs.QRY_ENUM: enum,
                        qs.QRY_QID: qid,
                        qs.QRY_CAT: self.__cat,
                        qs.QRY_SUBCAT: self.__queries[q][qs.QRY_TYPE],
                        qs.QRY_DBNUMRECORDS: self.__db_size,
                        qs.QRY_DBRECORDSIZE: self.__row_width,
                        qs.QRY_PERF: self.__perf,
                        qs.QRY_WHERECLAUSE: where_clause,
                        qs.QRY_FIELD: field,
                        qs.QRY_FIELDTYPE: sv.sql_info[self.__fields[f]][1],
                        qs.QRY_TYPE: self.__queries[q][qs.QRY_TYPE],
                        qs.QRY_LRSS: rss_lower,
                        qs.QRY_URSS: rss_upper,
                        qs.QRY_RANGEEXP: r,
                        qs.QRY_LBOUND: lower,
                        qs.QRY_UBOUND: upper,
                        qs.QRY_RANGE: upper - lower})
            self.bobs.append(
                aqb.FooRangeQueryBatch(
                    query_dicts, len(query_dicts),
                    int(len(query_dicts) / OVER_GENERATION_RATIO), True))
    return self.bobs
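# --- Illustrative sketch (not part of the original module) ----------------
# The exploding-range loop above in isolation: one candidate query is
# generated per range exponent r between QRY_RANGEEXPL and QRY_RANGEEXPU,
# with a range width of 2**r handed to the two-sided generator.
def range_widths(exp_lower, exp_upper):
    return [2 ** r for r in range(exp_lower, exp_upper + 1)]

# range_widths(3, 6) -> [8, 16, 32, 64]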
def testFullWhere(self):
    self.assertEqual(qids.full_where_has_been_seen(1, "it"), 1)
    self.assertEqual(qids.full_where_has_been_seen(2, "it"), 1)
    qids.reset_full_where()
    self.assertEqual(qids.full_where_has_been_seen(1, "it"), 1)
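# --- Illustrative companion test (hypothetical, mirrors the test above) ---
# full_where_has_been_seen acts as a registry keyed by where-clause text:
# the first qid registered for a clause wins, and later callers get that
# original qid back. This is why the generators above drop a candidate when
# qid != qids.full_where_has_been_seen(qid, where_clause). A sketch of a
# further test, assuming a fresh registry:
def testFullWhereDistinctClauses(self):
    qids.reset_full_where()
    self.assertEqual(qids.full_where_has_been_seen(1, "it"), 1)
    self.assertEqual(qids.full_where_has_been_seen(2, "that"), 2)
    self.assertEqual(qids.full_where_has_been_seen(3, "that"), 2)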
def produce_query_batches(self):
    """
    Generates and returns query_batch objects that hold the logic for
    creating aggregators for the queries, and also contain the logic for
    processing the results and printing the query
    """
    self.bobs = []
    for f in xrange(len(self.__dists)):
        for q in xrange(len(self.__queries)):
            query_dicts = []
            for count in xrange(self.__queries[q]['no_queries'] *
                                OVER_GENERATION_RATIO):
                self.__count += 1
                logger.info('P3/P4: Created %d out of %d queries' %
                            (self.__count, self.__total))
                field = self.__fields[f]
                r_u_pdf = self.__queries[q]['rss_u_pdf']
                r_l_pdf = self.__queries[q]['rss_l_pdf']
                kw_len = self.__queries[q][qs.QRY_KEYWORDLEN]
                if self.__queries[q][qs.QRY_TYPE] == 'word':
                    enum = qs.CAT.P3
                    value = self.__dists[field].generate_word(kw_len,
                                                              r_l_pdf,
                                                              r_u_pdf)
                    where_clause = 'CONTAINED_IN(%s, \'\'%s\'\')' % \
                                   (sv.sql_info[field][0],
                                    value.replace('\'', "\\'").lower())
                else:
                    enum = qs.CAT.P4
                    (value, word) = self.__dists[field].generate_antistem(
                        kw_len, r_l_pdf, r_u_pdf)
                    where_clause = 'CONTAINS_STEM(%s, \'\'%s\'\')' % \
                                   (sv.sql_info[field][0],
                                    word.replace('\'', "\\'").lower())
                try:
                    RSS = self.__queries[q][qs.QRY_RSS]
                    LRSS = int(self.__queries[q][qs.QRY_RSS] / 1.1)
                    URSS = int(self.__queries[q][qs.QRY_RSS] * 1.1)
                except KeyError:
                    RSS = (self.__queries[q][qs.QRY_LRSS] +
                           self.__queries[q][qs.QRY_URSS]) / 2.0
                    LRSS = self.__queries[q][qs.QRY_LRSS]
                    URSS = self.__queries[q][qs.QRY_URSS]
                qid = qids.query_id()
                if qid != qids.full_where_has_been_seen(qid, where_clause):
                    continue
                query_dicts.append({
                    qs.QRY_ENUM: enum,
                    qs.QRY_QID: qid,
                    qs.QRY_CAT: self.__cat,
                    qs.QRY_SUBCAT: '',
                    qs.QRY_DBNUMRECORDS: self.__db_size,
                    qs.QRY_DBRECORDSIZE: self.__row_width,
                    qs.QRY_PERF: self.__perf,
                    qs.QRY_FIELD: sv.sql_info[field][0],
                    qs.QRY_FIELDTYPE: sv.sql_info[field][1],
                    qs.QRY_WHERECLAUSE: where_clause,
                    qs.QRY_TYPE: self.__queries[q][qs.QRY_TYPE],
                    qs.QRY_RSS: RSS,
                    qs.QRY_LRSS: LRSS,
                    qs.QRY_URSS: URSS,
                    qs.QRY_KEYWORDLEN: kw_len,
                    qs.QRY_SEARCHFOR: value})
            self.bobs.append(
                aqb.KeywordQueryBatch(
                    query_dicts, count,
                    int((count + 1) / OVER_GENERATION_RATIO), True))
    return self.bobs
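# --- Illustrative sketch (not part of the original module) ----------------
# The RSS-window fallback above in miniature: when a query template gives a
# single explicit target result-set size, it is widened by +/-10% into a
# [lower, upper] window; when it gives an explicit window, the midpoint is
# used as the nominal RSS. rss_window is a hypothetical helper name.
def rss_window(rss=None, lrss=None, urss=None):
    if rss is not None:
        return int(rss / 1.1), int(rss * 1.1)
    return lrss, urss

# rss_window(rss=100) -> (90, 110)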
def process_results(self, agg_results, db_object, query_file_handle,
                    refined_queries=None):
    """
    Takes in the aggregator results and, with those results, determines
    which queries in the batch are 'interesting'. It then instantiates
    query_results for those queries and uses them to write to the results
    database. The refined_queries argument is a list of already-refined
    queries, for callers that do not wish to rely on the pre-defined
    refine_queries function.
    """
    # refine queries if not already refined
    if refined_queries:
        self.refined_queries_results = refined_queries
    else:
        refined_queries = []
        refined_total = 0
        for x in xrange(len(self.queries)):
            comp_q = self.queries[x]
            sub_results = agg_results[qs.QRY_SUBRESULTS]
            try:
                num_clauses = comp_q[qs.QRY_NUMCLAUSES]
            except KeyError:
                num_clauses = comp_q[qs.QRY_N]
            sub_bobs = comp_q[qs.QRY_SUBBOBS]
            clause_q_b = []
            working_clauses = None
            # create the list of possible queries that can make up the
            # clauses (paired with the bobs that created them)
            for b in sub_bobs:
                clause_q = b.produce_queries()
                clause_q_b += [(q, b) for q in clause_q]
            # create the list of results that go with those queries
            clause_r = []
            for (q, _) in clause_q_b:
                clause_r.append(
                    sub_results[self.result_to_agg_map[q[qs.QRY_WHERECLAUSE]]])
            comp_q_results = {qs.QRY_SUBRESULTS: clause_r}
            # create a list of queries, their bobs, and their results
            clause_q_r = zip(clause_q_b, clause_r)
            clause_q_r = [((q, b), r) for ((q, b), r) in clause_q_r
                          if r[qs.QRY_VALID]]
            # try all possible combinations of the queries to test whether
            # any have the correct combination to match the required ftm
            # and rss
            seen_where_group = []
            working_clauses = []
            q_refined = False
            for clause in clause_q_r:
                for clause_set in itertools.combinations(clause_q_r,
                                                         num_clauses - 1):
                    if q_refined:
                        continue
                    clause_list = [clause] + list(clause_set)
                    values = [q[qs.QRY_WHERECLAUSE]
                              for ((q, _), _) in clause_list]
                    if len(values) != len(set(values)) or \
                       values in seen_where_group:
                        continue
                    seen_where_group.append(values)
                    matching_ids_set = reduce(
                        set.intersection,
                        [set(r[rdb.DBF_MATCHINGRECORDIDS])
                         for (_, r) in clause_list])
                    count = len(matching_ids_set)
                    P2_cats = [q for ((q, _), _) in clause_list
                               if q[qs.QRY_CAT] == 'P2']
                    if not all([count >= qbs.get_rss_lower(
                                    comp_q[qs.QRY_ENUM]),
                                count <= qbs.get_rss_upper(
                                    comp_q[qs.QRY_ENUM]),
                                len(P2_cats) <= 1]):
                        continue
                    # this combination worked, so no need to refine further
                    # for this particular query
                    q_refined = True
                    refined_total += 1
                    working_clauses = clause_list
                    # reorder clauses so that any P2 clause comes last
                    re_ordered_clauses = []
                    last_clause = None
                    for ((q, b), r) in working_clauses:
                        if q[qs.QRY_CAT] == 'P2':
                            last_clause = ((q, b), r)
                        else:
                            re_ordered_clauses.append(((q, b), r))
                    if last_clause:
                        re_ordered_clauses.append(last_clause)
                    working_clauses = re_ordered_clauses
                    # update the query with the chosen clauses and the
                    # aggregator results with the chosen results
                    whereclauses = [q[qs.QRY_WHERECLAUSE]
                                    for ((q, _), _) in working_clauses]
                    comp_q[qs.QRY_WHERECLAUSE] = " AND ".join(whereclauses)
                    comp_q['sub_queries'] = [q for ((q, _), _)
                                             in working_clauses]
                    comp_q[qs.QRY_SUBBOBS] = [b for ((_, b), _)
                                              in working_clauses]
                    ftm_match = len(
                        working_clauses[0][1][rdb.DBF_MATCHINGRECORDIDS])
                    matching_ids_set = reduce(
                        set.intersection,
                        [set(r[rdb.DBF_MATCHINGRECORDIDS])
                         for (_, r) in working_clauses])
                    comp_q_results[qs.QRY_SUBRESULTS] = \
                        [r for (_, r) in working_clauses]
                    comp_q_results[rdb.DBF_MATCHINGRECORDIDS] = \
                        matching_ids_set
                    comp_q_results[qs.QRY_NUMRECORDSMATCHINGFIRSTTERM] = \
                        ftm_match
                    refined_queries.append((comp_q, comp_q_results))
            logger.info("FINISHED QUERY %d of %d, TOTAL THAT WORK %d" %
                        (x, len(self.queries), refined_total))
            if q_refined:
                logger.info("WORKING QUERY INFORMATION where_clause = %s, "
                            "ftm = %d, rss = %d" %
                            (comp_q[qs.QRY_WHERECLAUSE], ftm_match, count))
        # make sure duplicate queries (and their atomic sub-components)
        # have the same qids
        for (q, r) in refined_queries:
            q[qs.QRY_QID] = qids.full_where_has_been_seen(
                q[qs.QRY_QID], q[qs.QRY_WHERECLAUSE])
            r[qs.QRY_QID] = q[qs.QRY_QID]
            for (sub_q, sub_r) in zip(q['sub_queries'],
                                      r[qs.QRY_SUBRESULTS]):
                sub_q[qs.QRY_QID] = qids.atomic_where_has_been_seen(
                    sub_q[qs.QRY_QID], sub_q[qs.QRY_WHERECLAUSE])
                sub_r[qs.QRY_QID] = sub_q[qs.QRY_QID]
        # capping at choose-num number of queries
        self.refined_queries_results = refined_queries
    # create result objects and write to ground truth database
    for (q, r) in self.refined_queries_results:
        qr.QueryResultBase.write_to_full_to_atomic_table(q, r, db_object)
        qr.QueryResultBase.write_to_full_table(q, r, db_object)
        q[qs.QRY_SUBBOBS][0].process_results(
            None, db_object, query_file_handle,
            zip(q['sub_queries'], r[qs.QRY_SUBRESULTS]))
    # write queries in SQL format to the file
    for (q, _) in self.refined_queries_results:
        if q is not None:
            self._print_query(q, query_file_handle)