def _equality_queries(self, field, dist, pdf_lower, pdf_upper):
    """Build an EqualityQueryBatch of over-generated equality clauses.

    Draws CLAUSE_OVER_GENERATION_RATIO candidate values for *field* from
    its stored distribution and wraps each in a query dictionary; the
    returned batch object holds the aggregation, result-processing and
    printing logic for those queries.
    """
    sql_field = sv.sql_info[field][0]
    sql_type = sv.sql_info[field][1]
    query_dicts = []
    for _ in xrange(CLAUSE_OVER_GENERATION_RATIO):
        raw_value = self.__dists[field].generate_pdf(pdf_lower,
                                                     pdf_upper, {})
        qid = qids.query_id()
        (value, where) = aqb.EqualityFishingQueryBatch.format_value_and_where(
            field, raw_value)
        query_dicts.append({
            qs.QRY_ENUM: qs.CAT.EQ,
            # Duplicate where clauses are mapped back to their first qid.
            qs.QRY_QID: qids.full_where_has_been_seen(qid, where),
            qs.QRY_DBNUMRECORDS: self.__db_size,
            qs.QRY_DBRECORDSIZE: self.__row_width,
            qs.QRY_PERF: self.__perf,
            qs.QRY_CAT: 'EQ',
            qs.QRY_SUBCAT: '',
            qs.QRY_WHERECLAUSE: where,
            qs.QRY_FIELD: sql_field,
            qs.QRY_NEGATE: False,
            qs.QRY_FIELDTYPE: sql_type,
            qs.QRY_VALUE: value})
    return aqb.EqualityQueryBatch(query_dicts, CLAUSE_OVER_GENERATION_RATIO,
                                  1, False)
def _string_equality_queries(self, field, dist, pdf_lower, pdf_upper):
    """Build a WildcardQueryBatch of 9-character prefix-match clauses.

    Draws CLAUSE_OVER_GENERATION_RATIO candidate string values for
    *field* and matches on their first nine characters with SUBSTR; the
    returned batch holds the aggregation / result-processing / printing
    logic.
    """
    prefix_len = 9
    sql_field = sv.sql_info[field][0]
    sql_type = sv.sql_info[field][1]
    query_dicts = []
    for _ in xrange(CLAUSE_OVER_GENERATION_RATIO):
        value = self.__dists[field].generate_pdf(pdf_lower, pdf_upper, {})
        qid = qids.query_id()
        prefix = value[:prefix_len]
        where = "SUBSTR(%s,1,9) = ''%s''" % (sql_field, prefix)
        query_dicts.append({
            qs.QRY_ENUM: qs.CAT.P7_FINAL,
            # Duplicate where clauses are mapped back to their first qid.
            qs.QRY_QID: qids.full_where_has_been_seen(qid, where),
            qs.QRY_DBNUMRECORDS: self.__db_size,
            qs.QRY_DBRECORDSIZE: self.__row_width,
            qs.QRY_PERF: self.__perf,
            qs.QRY_CAT: 'P7',
            qs.QRY_SUBCAT: 'final',
            qs.QRY_WHERECLAUSE: where,
            qs.QRY_FIELD: sql_field,
            qs.QRY_NEGATE: False,
            qs.QRY_FIELDTYPE: sql_type,
            qs.QRY_KEYWORDLEN: prefix_len,
            qs.QRY_SEARCHFOR: prefix,
            qs.QRY_SEARCHDELIMNUM: 1})
    return aqb.WildcardQueryBatch(query_dicts, CLAUSE_OVER_GENERATION_RATIO,
                                  1, False)
def compound_eq(self, q):
    """Build a P1 eq-AND compound query dictionary.

    The requested record-set-size window is split geometrically across
    all distribution fields, one equality sub-batch is created per field
    (name fields use 9-char prefix matching), and the returned dict
    references those sub-batches for later refinement.
    """
    n_fields = len(self.__dists)
    # Per-field selectivity so that the product over all fields lands in
    # the requested overall window.
    per_field_lower = pow(q[qs.QRY_LRSS] / self.__db_size, 1.0 / n_fields)
    per_field_upper = pow(q[qs.QRY_URSS] / self.__db_size, 1.0 / n_fields)
    sub_bobs = []
    for (field, dist) in self.__dists.iteritems():
        if field in [sv.VARS.FIRST_NAME, sv.VARS.LAST_NAME]:
            batch = self._string_equality_queries(field, dist,
                                                  per_field_lower,
                                                  per_field_upper)
        else:
            batch = self._equality_queries(field, dist, per_field_lower,
                                           per_field_upper)
        sub_bobs.append(batch)
    return {
        qs.QRY_ENUM: qs.CAT.P1_EQ_AND,
        qs.QRY_QID: qids.query_id(),
        qs.QRY_DBNUMRECORDS: self.__db_size,
        qs.QRY_DBRECORDSIZE: self.__row_width,
        qs.QRY_PERF: self.__perf,
        qs.QRY_CAT: self.__cat,
        qs.QRY_SUBCAT: 'eq-and' if n_fields > 1 else '',
        qs.QRY_WHERECLAUSE: '',
        qs.QRY_NEGATE: False,
        qs.QRY_LRSS: q['r_lower'],
        qs.QRY_URSS: q['r_upper'],
        qs.QRY_NUMCLAUSES: n_fields,
        qs.QRY_NUMTERMSPERCLAUSE: 1,
        qs.QRY_FTMLOWER: 1,
        qs.QRY_FTMUPPER: self.__db_size,
        qs.QRY_SUBBOBS: sub_bobs}
def process_atomic_wheres(atomic_where_clauses, resultdb, record_size, num_records, qid):
    """Record each atomic where clause and link it to full query *qid*.

    For every clause: look up its aqid in atomic_queries; if absent,
    insert a new row (num_matching_records hard-coded to 1) under a
    freshly generated aqid.  Either way, insert a mapping row into
    full_to_atomic_junction.

    NOTE(review): every SQL statement here is built by string
    concatenation -- a clause containing a double quote breaks the
    statement, and the values are injectable.  Confirm whether
    resultdb._execute supports parameterized queries and switch if so.
    """
    for atomic_where_clause in atomic_where_clauses:
        atomic_where_clause = atomic_where_clause.strip()
        # Look for an existing row keyed by the exact clause text.
        cmd = "select aqid from atomic_queries where where_clause = \"" + atomic_where_clause + "\""
        resultdb._execute(cmd)
        aqid = resultdb._fetchall()
        if aqid == []:
            # New clause: the field name is assumed to be the first
            # whitespace-delimited token of the clause text.
            afield = atomic_where_clause.split( )[0].strip();
            afield_type = FIELD_TO_TYPE[afield]
            atomic_where_aqid = qids.query_id()
            cmd = "insert into atomic_queries (where_clause, db_record_size,"+\
                " db_num_records, category, field_type, field, num_matching_records, aqid) values "+\
                "(\""+str(atomic_where_clause)+"\", "+str(record_size)+", "+str(num_records)+", \"EQ\",\""+str(afield_type)+"\",\""+str(afield)+\
                "\", 1, "+str(atomic_where_aqid)+")"
            print cmd
            resultdb._execute(cmd)
            atomic_ids = resultdb._fetchall()
        else:
            # Reuse the previously assigned atomic query id.
            atomic_where_aqid = aqid[0][0]
        #rebuild mapping between full and atomic queries
        cmd = "insert into full_to_atomic_junction (full_row_id, atomic_row_id) values ("+\
            str(qid)+","+str(atomic_where_aqid)+")"
        print cmd
        resultdb._execute(cmd)
        null = resultdb._fetchall()
def _generate_queries(self):
    """Generate P9 alarm-word query batches, one per (field, template).

    For every distribution field and query template, over-generates
    WORD_PROXIMITY queries (no_queries * OVER_GENERATION_RATIO draws),
    skips where clauses that have already been seen, and appends one
    AlarmQueryBatch per template to self.__bobs.

    Returns:
        list: the accumulated AlarmQueryBatch objects (self.__bobs).
    """
    for (field, dist) in self.__dists.iteritems():
        for q_template in self.__queries:
            query_dicts = []
            # BUG FIX: the batch constructor below previously read the
            # loop variable `count` after the loop, raising NameError
            # when no queries were requested; track it explicitly.
            num_generated = 0
            for count in xrange(q_template['no_queries'] *
                                OVER_GENERATION_RATIO):
                num_generated = count
                self.__count += 1
                LOGGER.info('P9: Created %d out of %d queries' %
                            (self.__count, self.__total))
                # NOTE(review): with integer operands Python 2 `/`
                # truncates -- confirm db_size/LRSS types upstream.
                lrss = (q_template[qs.QRY_LRSS] / self.__db_size)
                urss = (q_template[qs.QRY_URSS] / self.__db_size)
                (word_one, word_two) = dist.generate_alarmword(lrss, urss)
                where = 'WORD_PROXIMITY(%s, \'\'%s\'\', \'\'%s\'\')' % \
                    (sv.sql_info[field][0], word_one, word_two)
                where_clause = "%s <= %d ORDER BY %s" % (
                    where, q_template['distance'], where)
                qid = qids.query_id()
                # Skip duplicates: a previously seen where clause returns
                # its earlier qid.
                if qid != qids.full_where_has_been_seen(qid, where_clause):
                    continue
                query_dicts.append({
                    qs.QRY_ENUM: qs.CAT.P9_ALARM_WORDS,
                    qs.QRY_QID: qid,
                    qs.QRY_CAT: self.__cat,
                    qs.QRY_SUBCAT: 'alarm-words',
                    qs.QRY_DBNUMRECORDS: self.__db_size,
                    qs.QRY_DBRECORDSIZE: self.__row_width,
                    qs.QRY_PERF: self.__perf,
                    qs.QRY_FIELD: sv.sql_info[field][0],
                    qs.QRY_FIELDTYPE: sv.sql_info[field][1],
                    qs.QRY_WHERECLAUSE: where_clause,
                    qs.QRY_LRSS: q_template[qs.QRY_LRSS],
                    qs.QRY_URSS: q_template[qs.QRY_URSS],
                    qs.QRY_ALARMWORDONE: word_one,
                    qs.QRY_ALARMWORDTWO: word_two,
                    qs.QRY_ALARMWORDDISTANCE: q_template['distance']})
            self.__bobs.append(aqb.AlarmQueryBatch(
                query_dicts, num_generated,
                int(num_generated / OVER_GENERATION_RATIO), True))
    return self.__bobs
def _generate_range_queries(self, field, dist, q):
    """Build a RangeQueryBatch of two-sided BETWEEN queries for *field*.

    Over-generates no_queries * OVER_GENERATION_RATIO candidate
    (lower, upper) ranges from the field distribution, skipping any
    where clause already seen; the returned batch holds the aggregation,
    result-processing and printing logic.
    """
    sql_field = sv.sql_info[field][0]
    to_csv = sv.VAR_CONVERTERS[field].to_csv
    query_dicts = []
    num_queries = 0
    for trial in xrange(q['no_queries'] * OVER_GENERATION_RATIO):
        num_queries = trial
        self.__count += 1
        LOGGER.info('P2: Created %d out of %d queries' %
                    (self.__count, self.__total))
        (lower, upper) = dist.generate_double_range(
            q['rss_lower_cdf'], q['rss_upper_cdf'],
            db_size=self.__db_size)
        qid = qids.query_id()
        if field in [sv.VARS.INCOME, sv.VARS.LAST_UPDATED]:
            # Numeric / date fields are not quoted.
            where_clause = '%s BETWEEN %s AND %s' % (
                sql_field, to_csv(lower), to_csv(upper))
        else:
            where_clause = "%s BETWEEN ''%s'' AND ''%s''" % (
                sql_field, to_csv(lower), to_csv(upper))
        # Skip previously seen where clauses.
        if qid != qids.full_where_has_been_seen(qid, where_clause):
            continue
        query_dicts.append({
            qs.QRY_ENUM: qs.CAT.P2_RANGE,
            qs.QRY_QID: qid,
            qs.QRY_DBNUMRECORDS: self.__db_size,
            qs.QRY_DBRECORDSIZE: self.__row_width,
            qs.QRY_PERF: self.__perf,
            qs.QRY_CAT: self.__cat,
            qs.QRY_SUBCAT: 'range',
            qs.QRY_WHERECLAUSE: where_clause,
            qs.QRY_FIELD: sql_field,
            qs.QRY_NEGATE: False,
            qs.QRY_FIELDTYPE: sv.sql_info[field][1],
            qs.QRY_LRSS: q[qs.QRY_LRSS],
            qs.QRY_URSS: q[qs.QRY_URSS],
            qs.QRY_LBOUND: lower,
            qs.QRY_UBOUND: upper,
            qs.QRY_RANGE: 0})
    return aqb.RangeQueryBatch(
        query_dicts, num_queries,
        int((num_queries + 1) / OVER_GENERATION_RATIO), True)
def _generate_short_queries(self, dist, q):
    """Generate P11 './/LEAF' XML equality queries.

    Over-generates equality leaves drawn from *dist*, builds
    xml_value(...'//field'...) where clauses, skips duplicates, and
    wraps the survivors in an XmlQueryBatch.
    """
    query_dicts = []
    query_count = 0
    for count in xrange(q['no_queries'] * OVER_GENERATION_RATIO):
        self.__count += 1
        # BUG FIX: this line was `query_cout = count` (typo), so
        # query_count stayed 0 and the batch was always constructed
        # with a zero query count.
        query_count = count
        LOGGER.info('P11: Created %d out of %d queries' %
                    (self.__count, self.__total))
        # Scale the record-set-size bounds down to per-leaf selectivity.
        # NOTE(review): with integer operands Python 2 `/` truncates --
        # confirm these are float-valued upstream.
        r_lower = q[qs.QRY_LRSS] / (self.__db_size * xg.XML_DEPTH *
                                    xg.FAN_OUT)
        r_upper = q[qs.QRY_URSS] / (self.__db_size * xg.XML_DEPTH *
                                    xg.FAN_OUT)
        (field, value) = self._create_equality_leaf(dist, r_lower, r_upper)
        value = sv.VAR_CONVERTERS[sv.sql_name_to_enum(field)].to_csv(value)
        try:
            # Escape embedded single quotes for the SQL string.
            value = value.replace('\'', '\'\'')
        except TypeError:
            pass  # non-string values need no escaping
        except AttributeError:
            pass
        if field in ['foo', 'age', 'income']:
            # Numeric leaves are unquoted.
            where = "xml_value(xml,\'//%s\', %s)" % (field, value)
        else:
            where = "xml_value(xml,\'//%s\', \'%s\')" % (field, value)
        xpath = field
        qid = qids.query_id()
        if qid != qids.full_where_has_been_seen(qid, where):
            continue
        query_dicts.append({
            qs.QRY_ENUM: qs.CAT.P11_SHORT,
            qs.QRY_QID: qid,
            qs.QRY_DBNUMRECORDS: self.__db_size,
            qs.QRY_DBRECORDSIZE: self.__row_width,
            qs.QRY_PERF: self.__perf,
            qs.QRY_CAT: self.__cat,
            qs.QRY_SUBCAT: 'eq-double-slash',
            qs.QRY_WHERECLAUSE: where,
            qs.QRY_FIELD: sv.sql_info[sv.VARS.XML][0],
            qs.QRY_NEGATE: False,
            qs.QRY_FIELDTYPE: 'string',
            qs.QRY_LRSS: q[qs.QRY_LRSS],
            qs.QRY_URSS: q[qs.QRY_URSS],
            qs.QRY_VALUE: value,
            qs.QRY_XPATH: xpath})
    return aqb.XmlQueryBatch(
        query_dicts, query_count,
        max(int((query_count + 1) / OVER_GENERATION_RATIO), 1), True)
def _than_queries(self, field, dist, pdf_lower, pdf_upper, range_type):
    """Build a RangeQueryBatch of one-sided (>= or <=) clauses.

    *range_type* selects the direction: 'greater' produces >= clauses,
    anything else produces <= clauses (appending the module-level TAIL
    to string values, per the existing less-than convention).
    """
    sql_field = sv.sql_info[field][0]
    sql_type = sv.sql_info[field][1]
    to_csv = sv.VAR_CONVERTERS[field].to_csv
    query_dicts = []
    for _ in xrange(CLAUSE_OVER_GENERATION_RATIO):
        # Direction-specific value generation.
        if range_type == 'greater':
            value = dist.generate_greater_than(pdf_lower, pdf_upper,
                                               db_size=self.__db_size)
            enum, tail, comp = qs.CAT.P2_GREATER, '', '>='
        else:
            value = dist.generate_less_than(pdf_lower, pdf_upper,
                                            db_size=self.__db_size)
            enum, tail, comp = qs.CAT.P2_LESS, TAIL, '<='
        # Field-specific where clause.
        if field in [sv.VARS.INCOME, sv.VARS.LAST_UPDATED]:
            where_clause = '%s %s %s' % (sql_field, comp, to_csv(value))
        elif field in [sv.VARS.FIRST_NAME, sv.VARS.LAST_NAME]:
            # Name fields compare on a 9-character prefix.
            where_clause = "SUBSTR(%s,1,9) %s ''%s''" % (
                sql_field, comp, to_csv(value)[:9])
            value = value + tail
        else:
            where_clause = "%s %s ''%s''" % (sql_field, comp,
                                             to_csv(value))
        qid = qids.query_id()
        query_dicts.append({
            qs.QRY_ENUM: enum,
            # Duplicate where clauses map back to their first qid.
            qs.QRY_QID: qids.full_where_has_been_seen(qid, where_clause),
            qs.QRY_DBNUMRECORDS: self.__db_size,
            qs.QRY_DBRECORDSIZE: self.__row_width,
            qs.QRY_PERF: self.__perf,
            qs.QRY_CAT: 'P2',
            qs.QRY_SUBCAT: range_type,
            qs.QRY_WHERECLAUSE: where_clause,
            qs.QRY_FIELD: sql_field,
            qs.QRY_NEGATE: False,
            qs.QRY_FIELDTYPE: sql_type,
            qs.QRY_VALUE: value,
            qs.QRY_RANGE: 0})
    return aqb.RangeQueryBatch(query_dicts, CLAUSE_OVER_GENERATION_RATIO,
                               1, False)
def _equality_queries(self, field, dist):
    """Generate EqualityQueryBatch objects, one per query spec.

    Over-generates equality values for *field* (no_queries *
    OVER_GENERATION_RATIO draws per spec), skips already-seen where
    clauses, and appends a batch per spec to self.__bobs.  The batch
    objects hold the aggregation, result-processing and printing logic.
    """
    query_dicts = []
    for q in xrange(len(self.__queries)):
        # Reset for each query spec; one batch is emitted per spec.
        query_dicts = []
        for count in xrange(self.__queries[q]['no_queries'] *
                            OVER_GENERATION_RATIO):
            self.__count += 1
            logger.info('EQ: Created %d out of %d queries' %
                        (self.__count, self.__total))
            r_lower_cdf = self.__queries[q]['rss_lower_cdf']
            r_upper_cdf = self.__queries[q]['rss_upper_cdf']
            value = self.__dists[field].generate_pdf(
                r_lower_cdf, r_upper_cdf, {})
            qid = qids.query_id()
            (value, where) = aqb.EqualityFishingQueryBatch.format_value_and_where(
                field, value)
            # Skip duplicates: a previously seen where clause returns
            # its earlier qid.
            if qid != qids.full_where_has_been_seen(qid, where):
                continue
            query_dicts.append({
                qs.QRY_ENUM: qs.CAT.EQ,
                qs.QRY_QID: qid,
                qs.QRY_DBNUMRECORDS: self.__db_size,
                qs.QRY_DBRECORDSIZE: self.__row_width,
                qs.QRY_PERF: self.__perf,
                qs.QRY_CAT: self.__cat,
                qs.QRY_SUBCAT: '',
                qs.QRY_WHERECLAUSE: where,
                qs.QRY_FIELD: sv.sql_info[field][0],
                qs.QRY_NEGATE: False,
                qs.QRY_FIELDTYPE: sv.sql_info[field][1],
                qs.QRY_LRSS: self.__queries[q][qs.QRY_LRSS],
                qs.QRY_URSS: self.__queries[q][qs.QRY_URSS],
                qs.QRY_VALUE: value
            })
        # NOTE(review): `count` is the inner loop variable read after the
        # loop -- this raises NameError if no_queries is 0; confirm specs
        # always request at least one query.
        self.__bobs.append(
            aqb.EqualityQueryBatch(
                query_dicts, count, int(
                    (count + 1) / OVER_GENERATION_RATIO), True))
def _equality_fishing_queries(self, f, dist):
    """Generate EqualityFishingQueryBatch objects for high-entropy fields.

    For fields whose values cannot be pre-generated, draws a search
    value and (lbound, ubound) window per query via
    self._get_fishing_values and records them with an empty where
    clause; one batch is appended to self.__bobs per query spec.
    """
    query_dicts = []
    for q in xrange(len(self.__queries)):
        # Reset for each query spec; one batch is emitted per spec.
        query_dicts = []
        for count in xrange(self.__queries[q]['no_queries'] *
                            OVER_GENERATION_RATIO):
            self.__count += 1
            logger.info('EQ: Created %d out of %d queries' %
                        (self.__count, self.__total))
            field = f
            r_lower_cdf = self.__queries[q]['rss_lower_cdf']
            r_upper_cdf = self.__queries[q]['rss_upper_cdf']
            (search_value, lbound, ubound, enum) = \
                self._get_fishing_values(f, dist, r_lower_cdf, r_upper_cdf)
            query_dicts.append({
                qs.QRY_ENUM: enum,
                qs.QRY_QID: qids.query_id(),
                qs.QRY_DBNUMRECORDS: self.__db_size,
                qs.QRY_DBRECORDSIZE: self.__row_width,
                qs.QRY_PERF: self.__perf,
                qs.QRY_CAT: self.__cat,
                qs.QRY_SUBCAT: '',
                # Fishing queries have no concrete where clause yet.
                qs.QRY_WHERECLAUSE: '',
                qs.QRY_FIELD: sv.sql_info[field][0],
                qs.QRY_NEGATE: False,
                qs.QRY_FIELDTYPE: sv.sql_info[field][1],
                qs.QRY_LRSS: self.__queries[q][qs.QRY_LRSS],
                qs.QRY_URSS: self.__queries[q][qs.QRY_URSS],
                qs.QRY_VALUE: None,
                qs.QRY_SEARCHFOR: search_value,
                qs.QRY_LBOUND: lbound,
                qs.QRY_UBOUND: ubound
            })
        # NOTE(review): `count` is read after the inner loop -- NameError
        # if no_queries is 0; confirm specs always request >= 1 query.
        self.__bobs.append(
            aqb.EqualityFishingQueryBatch(
                query_dicts, count, int(
                    (count + 1) / OVER_GENERATION_RATIO), True))
def make_query_template(self, q, bob):
    """Return the skeleton dictionary for an m-of-n compound query.

    The where clause is left empty here; it is filled in later once the
    sub-clauses held by *bob* have been refined.
    """
    template = {}
    template[qs.QRY_ENUM] = self.enum()
    template[qs.QRY_QID] = qids.query_id()
    template[qs.QRY_DBNUMRECORDS] = self._db_size
    template[qs.QRY_DBRECORDSIZE] = self._row_width
    template[qs.QRY_PERF] = self._perf
    template[qs.QRY_CAT] = self._cat
    template[qs.QRY_SUBCAT] = 'eq'
    template[qs.QRY_WHERECLAUSE] = ''
    template[qs.QRY_NEGATE] = False
    template[qs.QRY_LRSS] = q['r_lower']
    template[qs.QRY_URSS] = q['r_upper']
    template[qs.QRY_FTMLOWER] = q['tm_lower']
    template[qs.QRY_FTMUPPER] = q['tm_upper']
    template[qs.QRY_N] = q['n']
    template[qs.QRY_M] = q['m']
    template[qs.QRY_SUBBOBS] = bob
    return template
def make_equality_queries(self, field, value):
    """Build a single EQ query dictionary for *field* matching *value*.

    The value/where pair is normalised by the fishing batch helper, and
    the qid is de-duplicated against previously seen where clauses.
    """
    (value, where) = aqb.EqualityFishingQueryBatch.format_value_and_where(
        field, value)
    query = {}
    query[qs.QRY_ENUM] = qs.CAT.EQ
    query[qs.QRY_QID] = qids.full_where_has_been_seen(qids.query_id(),
                                                      where)
    query[qs.QRY_DBNUMRECORDS] = self._db_size
    query[qs.QRY_DBRECORDSIZE] = self._row_width
    query[qs.QRY_CAT] = 'EQ'
    query[qs.QRY_SUBCAT] = ''
    query[qs.QRY_WHERECLAUSE] = where
    query[qs.QRY_FIELD] = sv.sql_info[field][0]
    query[qs.QRY_NEGATE] = False
    query[qs.QRY_FIELDTYPE] = sv.sql_info[field][1]
    query[qs.QRY_VALUE] = value
    return query
def _range_queries(self, field, dist, pdf_lower, pdf_upper):
    """Build a RangeQueryBatch of two-sided BETWEEN clause candidates.

    Name fields are compared on their first nine characters via SUBSTR,
    and the recorded upper bound gets the module-level TAIL suffix
    appended so it covers every value sharing that 9-char prefix.
    """
    # FIX: removed a dead local `tail = 'ZZZZZZZZZZZZZ'` that was never
    # read -- the code below appends the module-level TAIL instead, and
    # the unused local misleadingly suggested a different suffix.
    sql_field = sv.sql_info[field][0]
    to_csv = sv.VAR_CONVERTERS[field].to_csv
    query_dicts = []
    for _ in xrange(CLAUSE_OVER_GENERATION_RATIO):
        (lower, upper) = dist.generate_double_range(
            pdf_lower, pdf_upper, db_size=self.__db_size)
        qid = qids.query_id()
        if field in [sv.VARS.INCOME, sv.VARS.LAST_UPDATED]:
            # Numeric / date fields are not quoted.
            where_clause = '%s BETWEEN %s AND %s' % (
                sql_field, to_csv(lower), to_csv(upper))
        elif field in [sv.VARS.FIRST_NAME, sv.VARS.LAST_NAME]:
            where_clause = "SUBSTR(%s,1,9) BETWEEN ''%s'' AND ''%s''" % (
                sql_field, to_csv(lower)[:9], to_csv(upper)[:9])
            # Record the truncated bounds; TAIL makes the upper bound
            # inclusive of all values sharing the 9-char prefix.
            lower = to_csv(lower)[:9]
            upper = to_csv(upper)[:9] + TAIL
        else:
            where_clause = "%s BETWEEN ''%s'' AND ''%s''" % (
                sql_field, to_csv(lower), to_csv(upper))
        query_dicts.append({
            qs.QRY_ENUM: qs.CAT.P2_RANGE,
            # Duplicate where clauses map back to their first qid.
            qs.QRY_QID: qids.full_where_has_been_seen(qid, where_clause),
            qs.QRY_DBNUMRECORDS: self.__db_size,
            qs.QRY_DBRECORDSIZE: self.__row_width,
            qs.QRY_PERF: self.__perf,
            qs.QRY_CAT: 'P2',
            qs.QRY_SUBCAT: 'range',
            qs.QRY_WHERECLAUSE: where_clause,
            qs.QRY_FIELD: sql_field,
            qs.QRY_NEGATE: False,
            qs.QRY_FIELDTYPE: sv.sql_info[field][1],
            qs.QRY_LBOUND: lower,
            qs.QRY_UBOUND: upper,
            qs.QRY_RANGE: 0})
    return aqb.RangeQueryBatch(query_dicts, CLAUSE_OVER_GENERATION_RATIO,
                               1, False)
def process_results(self, agg_results, db_object, query_file_handle, refined_queries=None):
    """Refine m-of-n compound queries against aggregator results and
    persist the survivors.

    If *refined_queries* is supplied, each (query, result) pair is
    written straight to the results database, printed, then re-written a
    second time as a derived P9 ORDER BY variant.  Otherwise the method
    searches, per compound query, for a combination of sub-clauses whose
    first-term-match sum (stfm) and m-of-n intersection size fall inside
    the required bounds, mutates the compound query in place with the
    chosen clauses, and records it (again twice: once as-is, once as the
    P9 variant).  The refined pairs are stored on
    self.refined_queries_results.
    """
    #refine queries if not already refined.
    if refined_queries != None:
        self.refined_queries_results = refined_queries
        for (q, r) in self.refined_queries_results:
            qr.QueryResultBase.write_to_full_to_atomic_table(
                q, r, db_object)
            qr.QueryResultBase.write_to_full_table(q, r, db_object)
            q[qs.QRY_SUBBOBS][0].process_results(
                None, db_object, query_file_handle,
                zip(q['sub_queries'], r[qs.QRY_SUBRESULTS]))
            self._print_query(q, query_file_handle)
            # 'IBM1' performers do not get the P9 variant below.
            try:
                q[qs.QRY_PERF].remove('IBM1')
            except ValueError:
                pass
            # Rewrite the query as its P9 ORDER BY ... DESC variant and
            # record it a second time under a fresh qid.
            q[qs.QRY_WHERECLAUSE] = q[qs.QRY_WHERECLAUSE] + " ORDER BY " +\
                q[qs.QRY_WHERECLAUSE] + " DESC"
            q[qs.QRY_ENUM] = qs.CAT.P9_EQ
            q[qs.QRY_CAT] = 'P9'
            q[qs.QRY_QID] = qids.full_where_has_been_seen(
                qids.query_id(), q[qs.QRY_WHERECLAUSE])
            r[qs.QRY_QID] = q[qs.QRY_QID]
            qr.QueryResultBase.write_to_full_to_atomic_table(
                q, r, db_object)
            qr.QueryResultBase.write_to_full_table(q, r, db_object)
            self._print_query(q, query_file_handle)
            q[qs.QRY_SUBBOBS][0].process_results(
                None, db_object, query_file_handle,
                zip(q['sub_queries'], r[qs.QRY_SUBRESULTS]))
    else:
        refined_total = 0
        refined_queries = []
        for x in xrange(len(self.queries)):
            comp_q = self.queries[x]
            sub_results = agg_results[qs.QRY_SUBRESULTS]
            num_clauses = comp_q[qs.QRY_N]
            sub_bobs = comp_q[qs.QRY_SUBBOBS]
            clause_q_b = []
            #create the list of possible queries that can make up the clauses
            #(they are also paired with the bobs that create them)
            for b in sub_bobs:
                clause_q = b.produce_queries()
                clause_q_b += [(q, b) for q in clause_q]
            clause_r = []
            #create list of results that go with those queries
            for (q, _) in clause_q_b:
                clause_r.append(sub_results[self.result_to_agg_map[q[
                    qs.QRY_WHERECLAUSE]]])
            comp_q_results = {qs.QRY_SUBRESULTS: clause_r}
            #create a list of queries, their bobs, and their results
            # Sorted by ascending match count (Python 2 tuple-parameter
            # lambda -- this file is Python 2 only).
            clause_q_r = zip(clause_q_b, clause_r)
            clause_q_r = sorted(
                clause_q_r,
                key=lambda ((q, b), r): len(r[rdb.DBF_MATCHINGRECORDIDS]))
            #try all possible cominbations of the queries to test if any
            #have the correct combinations to match the required ftm and ress
            seen_where_group = []
            comp_q_refined = False
            for clause_set in itertools.combinations(
                    clause_q_r, num_clauses):
                # Once one combination works, skip the rest.
                if comp_q_refined == True:
                    continue
                clause_list = list(clause_set)
                values = [
                    q[qs.QRY_WHERECLAUSE] for ((q, _), _) in clause_list
                ]
                # Reject combinations with repeated clauses or clause
                # groups already attempted.
                if len(values) != len(
                        set(values)) or values in seen_where_group:
                    continue
                seen_where_group.append(values)
                #check to see if it is working
                #if stfm doesn't match, don't bother continuing
                # stfm sums the n-m+1 smallest clause match counts (the
                # list is sorted ascending above).
                stfm = 0
                for offset in xrange(comp_q[qs.QRY_N] -
                                     comp_q[qs.QRY_M] + 1):
                    (_, r) = clause_list[offset]
                    stfm += len(r[rdb.DBF_MATCHINGRECORDIDS])
                if not all([
                        stfm >= qbs.get_tm_rss_lower(comp_q[qs.QRY_ENUM]),
                        stfm <= qbs.get_tm_rss_upper(comp_q[qs.QRY_ENUM])
                ]):
                    continue
                #if stfm does match, calculate the set intersection
                # Union over all m-sized clause subsets of their record
                # id intersections = records matching at least m clauses.
                matching_ids_set = set()
                for m_set in itertools.combinations(
                        clause_list, comp_q[qs.QRY_M]):
                    matching_ids_set.update(
                        reduce(set.intersection, [
                            set(r[rdb.DBF_MATCHINGRECORDIDS])
                            for (_, r) in m_set
                        ]))
                count = len(matching_ids_set)
                #check overall compliance
                if not all([
                        count >= qbs.get_rss_lower(comp_q[qs.QRY_ENUM]),
                        count <= qbs.get_rss_upper(comp_q[qs.QRY_ENUM])
                ]):
                    continue
                comp_q_refined = True
                refined_total += 1
                ##PROCESSING THE WORKING CLAUSE_LIST
                working_clauses = clause_list
                whereclauses = [
                    q[qs.QRY_WHERECLAUSE]
                    for ((q, _), _) in working_clauses
                ]
                where = ", ".join(whereclauses)
                where = 'M_OF_N(%d, %d, %s)' % (comp_q[qs.QRY_M],
                                                comp_q[qs.QRY_N], where)
                #update query with chosen clauses
                comp_q[qs.QRY_WHERECLAUSE] = where
                comp_q['sub_queries'] = [
                    q for ((q, _), _) in working_clauses
                ]
                comp_q[qs.QRY_SUBBOBS] = [
                    b for ((_, b), _) in working_clauses
                ]
                #have to create a list of counts of how many that match N terms, n-1 terms...
                #until m. Such of the form 34 | 384 | 1094
                records_matching_count = dict(
                    zip(range(comp_q[qs.QRY_M], comp_q[qs.QRY_N] + 1),
                        [0] * comp_q[qs.QRY_N]))
                for id in matching_ids_set:
                    matching_terms = [
                        1 if id in clause[1][rdb.DBF_MATCHINGRECORDIDS]
                        else 0 for clause in working_clauses
                    ]
                    term_matches = sum(matching_terms)
                    records_matching_count[term_matches] += 1
                matching_records_counts = sorted(
                    records_matching_count.values(), reverse=True)
                #update the results dictionary with the new calculated values
                comp_q_results[qs.QRY_SUBRESULTS] = [
                    r for (_, r) in working_clauses
                ]
                comp_q_results[
                    rdb.DBF_MATCHINGRECORDIDS] = matching_ids_set
                comp_q_results[
                    qs.QRY_MATCHINGRECORDCOUNTS] = matching_records_counts
                #make sure duplicate queries (and their atomic sub_components) have the same qids
                comp_q[qs.QRY_QID] = qids.full_where_has_been_seen(
                    comp_q[qs.QRY_QID], comp_q[qs.QRY_WHERECLAUSE])
                comp_q_results[qs.QRY_QID] = comp_q[qs.QRY_QID]
                for (sub_q, sub_r) in zip(
                        comp_q['sub_queries'],
                        comp_q_results[qs.QRY_SUBRESULTS]):
                    sub_q[qs.QRY_QID] = qids.atomic_where_has_been_seen(
                        sub_q[qs.QRY_QID], sub_q[qs.QRY_WHERECLAUSE])
                    sub_r[qs.QRY_QID] = sub_q[qs.QRY_QID]
                #write queries to the results database
                qr.QueryResultBase.write_to_full_to_atomic_table(
                    comp_q, comp_q_results, db_object)
                qr.QueryResultBase.write_to_full_table(
                    comp_q, comp_q_results, db_object)
                comp_q[qs.QRY_SUBBOBS][0].process_results(
                    None, db_object, query_file_handle,
                    zip(comp_q['sub_queries'],
                        comp_q_results[qs.QRY_SUBRESULTS]))
                self._print_query(comp_q, query_file_handle)
                # 'IBM1' performers do not get the P9 variant below.
                try:
                    comp_q[qs.QRY_PERF].remove('IBM1')
                except ValueError:
                    pass
                # Re-record the refined query as its P9 ORDER BY variant.
                comp_q[qs.QRY_WHERECLAUSE] = comp_q[qs.QRY_WHERECLAUSE] + " ORDER BY " +\
                    comp_q[qs.QRY_WHERECLAUSE] + " DESC"
                comp_q[qs.QRY_ENUM] = qs.CAT.P9_EQ
                comp_q[qs.QRY_CAT] = 'P9'
                comp_q[qs.QRY_QID] = qids.full_where_has_been_seen(
                    qids.query_id(), comp_q[qs.QRY_WHERECLAUSE])
                comp_q_results[qs.QRY_QID] = comp_q[qs.QRY_QID]
                qr.QueryResultBase.write_to_full_to_atomic_table(
                    comp_q, comp_q_results, db_object)
                qr.QueryResultBase.write_to_full_table(
                    comp_q, comp_q_results, db_object)
                comp_q[qs.QRY_SUBBOBS][0].process_results(
                    None, db_object, query_file_handle,
                    zip(comp_q['sub_queries'],
                        comp_q_results[qs.QRY_SUBRESULTS]))
                self._print_query(comp_q, query_file_handle)
                refined_queries.append((comp_q, comp_q_results))
            logger.info("FINISHED QUERY %d of %d, TOTAL THAT WORK %d" %
                        (x, len(self.queries), refined_total))
            # stfm/count survive from the successful combination's loop
            # iteration above.
            if comp_q_refined == True:
                logger.info(
                    "WORKING QUERY INFORMATION where_clause = %s, sftm = %d, rss = %d"
                    % (comp_q[qs.QRY_WHERECLAUSE], stfm, count))
        self.refined_queries_results = refined_queries
def _generate_full_queries(self, dist, q):
    """Generate P11 '/xml/node1/.../LEAF' full-path XML queries.

    Over-generates full xpath equality queries: a branch of tags drawn
    per level of the XML tree plus an equality leaf, skipping duplicate
    where clauses, wrapped in an XmlQueryBatch.
    """
    query_dicts = []
    # BUG FIX: the XmlQueryBatch call previously read the loop variable
    # `count` after the loop, raising NameError when zero queries were
    # requested; track the last index explicitly instead.
    last_count = -1
    for count in xrange(q['no_queries'] * OVER_GENERATION_RATIO):
        last_count = count
        self.__count += 1
        LOGGER.info('P11: Created %d out of %d queries' %
                    (self.__count, self.__total))
        # NOTE(review): with integer operands Python 2 `/` truncates --
        # confirm these are float-valued upstream.
        r_lower_total = q[qs.QRY_LRSS] / self.__db_size
        r_upper_total = q[qs.QRY_URSS] / self.__db_size
        # Spread the record-set-size requirement geometrically across
        # the branch depth.
        branch_r_lower = pow(r_lower_total / xg.XML_DEPTH,
                             1.0 / (xg.XML_DEPTH))
        branch_r_upper = pow(r_upper_total / xg.XML_DEPTH,
                             1.0 / (xg.XML_DEPTH))
        tags = []
        for level in xrange(xg.XML_DEPTH - 1):
            tags.append(
                dist.generate_node_pdf(level, branch_r_lower,
                                       branch_r_upper))
        tag_string = ''
        for tag in tags:
            tag_string += "/%s" % (tag)
        (field, value) = self._create_equality_leaf(dist, branch_r_lower,
                                                    branch_r_upper)
        value = sv.VAR_CONVERTERS[sv.sql_name_to_enum(field)].to_csv(value)
        try:
            # Escape embedded single quotes for the SQL string.
            value = value.replace('\'', '\'\'')
        except TypeError:
            pass  # non-string values need no escaping
        except AttributeError:
            pass
        if field in ['foo', 'age', 'income']:
            # Numeric leaves are unquoted.
            where = "xml_value(xml,\'/xml%s/%s\',%s)" % (tag_string,
                                                         field, value)
        else:
            where = "xml_value(xml,\'/xml%s/%s\',\'%s\')" % (tag_string,
                                                             field, value)
        xpath = ['xml'] + tags
        xpath.append(field)
        qid = qids.query_id()
        if qid != qids.full_where_has_been_seen(qid, where):
            continue
        query_dicts.append({
            qs.QRY_ENUM: qs.CAT.P11_FULL,
            qs.QRY_QID: qid,
            qs.QRY_DBNUMRECORDS: self.__db_size,
            qs.QRY_DBRECORDSIZE: self.__row_width,
            qs.QRY_CAT: self.__cat,
            qs.QRY_SUBCAT: 'eq-full',
            qs.QRY_PERF: self.__perf,
            qs.QRY_WHERECLAUSE: where,
            qs.QRY_FIELD: sv.sql_info[sv.VARS.XML][0],
            qs.QRY_NEGATE: False,
            qs.QRY_FIELDTYPE: 'string',
            qs.QRY_LRSS: q[qs.QRY_LRSS],
            qs.QRY_URSS: q[qs.QRY_URSS],
            qs.QRY_VALUE: value,
            qs.QRY_XPATH: xpath})
    return aqb.XmlQueryBatch(
        query_dicts, last_count,
        max(int((last_count + 1) / OVER_GENERATION_RATIO), 1), True)
def produce_query_batches(self):
    """Generate P2 'foo' range / greater-than query batches.

    For every (field, query spec) pair and every range exponent in the
    spec's [RANGEEXPL, RANGEEXPU] window, builds a two-sided BETWEEN or
    a one-sided >= clause sized around the average requested record-set
    size, then wraps each spec's dictionaries in a FooRangeQueryBatch.

    Returns:
        list: the accumulated FooRangeQueryBatch objects (self.bobs).
    """
    self.bobs = []
    for f in xrange(len(self.__dists)):
        for q in xrange(len(self.__queries)):
            query_dicts = []
            # Loop variable unused: only the repetition matters.
            for _ in xrange(self.__queries[q]['no_queries'] *
                            OVER_GENERATION_RATIO):
                for r in xrange(self.__queries[q][qs.QRY_RANGEEXPL],
                                self.__queries[q][qs.QRY_RANGEEXPU] + 1):
                    self.__count += 1
                    logger.debug('P2-foo: Created %d out of %d queries' %
                                 (self.__count, self.__total))
                    field = sv.sql_info[self.__fields[f]][0]
                    rss_lower = self.__queries[q][qs.QRY_LRSS]
                    rss_upper = self.__queries[q][qs.QRY_URSS]
                    rss_avg = (rss_lower + rss_upper) / 2
                    # FIX: renamed from `range`, which shadowed the
                    # builtin.
                    range_width = 2 ** r
                    if self.__queries[q][qs.QRY_TYPE] == 'range':
                        try:
                            (lower, upper) = self.__dists[
                                self.__fields[f]].generate_two_sided(
                                    rss_avg, range_width, self.__db_size)
                        except bd.FooInputs:
                            # Unsatisfiable inputs: fall back to a
                            # degenerate empty range.
                            (lower, upper) = (0, 0)
                        enum = qs.CAT.P2_RANGE_FOO
                        where_clause = '%s BETWEEN %d AND %d' % (
                            field, lower, upper)
                    else:
                        try:
                            lower = self.__dists[
                                self.__fields[f]].generate_greater_than(
                                    rss_avg, self.__db_size)
                        except bd.FooInputs:
                            lower = 0
                        upper = 2 ** 64 - 1
                        enum = qs.CAT.P2_GREATER_FOO
                        where_clause = '%s >= %d' % (field, lower)
                    qid = qids.query_id()
                    # Skip previously seen where clauses.
                    if qid != qids.full_where_has_been_seen(
                            qid, where_clause):
                        continue
                    query_dicts.append({
                        qs.QRY_ENUM: enum,
                        qs.QRY_QID: qid,
                        qs.QRY_CAT: self.__cat,
                        qs.QRY_SUBCAT: self.__queries[q][qs.QRY_TYPE],
                        qs.QRY_DBNUMRECORDS: self.__db_size,
                        qs.QRY_DBRECORDSIZE: self.__row_width,
                        qs.QRY_PERF: self.__perf,
                        qs.QRY_WHERECLAUSE: where_clause,
                        qs.QRY_FIELD: field,
                        qs.QRY_FIELDTYPE:
                            sv.sql_info[self.__fields[f]][1],
                        qs.QRY_TYPE: self.__queries[q][qs.QRY_TYPE],
                        qs.QRY_LRSS: rss_lower,
                        qs.QRY_URSS: rss_upper,
                        qs.QRY_RANGEEXP: r,
                        qs.QRY_LBOUND: lower,
                        qs.QRY_UBOUND: upper,
                        qs.QRY_RANGE: upper - lower})
            self.bobs.append(
                aqb.FooRangeQueryBatch(
                    query_dicts, len(query_dicts),
                    int(len(query_dicts) / OVER_GENERATION_RATIO), True))
    return self.bobs
def produce_query_batches(self):
    """Generate P3/P4 keyword (word / stem) query batches.

    For every (field, query spec) pair, over-generates CONTAINED_IN
    (P3, 'word') or CONTAINS_STEM (P4) queries, skips already-seen
    where clauses, and appends one KeywordQueryBatch per spec to
    self.bobs.

    Returns:
        list: the accumulated KeywordQueryBatch objects.
    """
    self.bobs = []
    for f in xrange(len(self.__dists)):
        for q in xrange(len(self.__queries)):
            query_dicts = []
            # BUG FIX: `count` was read after the loop below; guard
            # against NameError when no_queries is 0.
            count = -1
            for count in xrange(self.__queries[q]['no_queries'] *
                                OVER_GENERATION_RATIO):
                self.__count += 1
                logger.info('P3/P4: Created %d out of %d queries' %
                            (self.__count, self.__total))
                field = self.__fields[f]
                r_u_pdf = self.__queries[q]['rss_u_pdf']
                r_l_pdf = self.__queries[q]['rss_l_pdf']
                kw_len = self.__queries[q][qs.QRY_KEYWORDLEN]
                if self.__queries[q][qs.QRY_TYPE] == 'word':
                    enum = qs.CAT.P3
                    value = self.__dists[field].generate_word(
                        kw_len, r_l_pdf, r_u_pdf)
                    where_clause = 'CONTAINED_IN(%s, \'\'%s\'\')' % (
                        sv.sql_info[field][0],
                        value.replace('\'', "\\'").lower())
                else:
                    enum = qs.CAT.P4
                    (value, word) = self.__dists[field].generate_antistem(
                        kw_len, r_l_pdf, r_u_pdf)
                    where_clause = 'CONTAINS_STEM(%s, \'\'%s\'\')' % (
                        sv.sql_info[field][0],
                        word.replace('\'', "\\'").lower())
                try:
                    RSS = self.__queries[q][qs.QRY_RSS]
                    # BUG FIX: the bounds were swapped -- LRSS (lower)
                    # was RSS*1.1 and URSS (upper) was RSS/1.1, giving
                    # an inverted, always-empty acceptance window.
                    LRSS = int(RSS / 1.1)
                    URSS = int(RSS * 1.1)
                except KeyError:
                    RSS = (self.__queries[q][qs.QRY_LRSS] +
                           self.__queries[q][qs.QRY_URSS]) / 2.0
                    LRSS = self.__queries[q][qs.QRY_LRSS]
                    URSS = self.__queries[q][qs.QRY_URSS]
                qid = qids.query_id()
                # Skip previously seen where clauses.
                if qid != qids.full_where_has_been_seen(qid, where_clause):
                    continue
                query_dicts.append({
                    qs.QRY_ENUM: enum,
                    qs.QRY_QID: qid,
                    qs.QRY_CAT: self.__cat,
                    qs.QRY_SUBCAT: '',
                    qs.QRY_DBNUMRECORDS: self.__db_size,
                    qs.QRY_DBRECORDSIZE: self.__row_width,
                    qs.QRY_PERF: self.__perf,
                    qs.QRY_FIELD: sv.sql_info[field][0],
                    qs.QRY_FIELDTYPE: sv.sql_info[field][1],
                    qs.QRY_WHERECLAUSE: where_clause,
                    qs.QRY_TYPE: self.__queries[q][qs.QRY_TYPE],
                    qs.QRY_RSS: RSS,
                    qs.QRY_LRSS: LRSS,
                    qs.QRY_URSS: URSS,
                    qs.QRY_KEYWORDLEN: kw_len,
                    qs.QRY_SEARCHFOR: value})
            self.bobs.append(
                aqb.KeywordQueryBatch(
                    query_dicts, count,
                    int((count + 1) / OVER_GENERATION_RATIO), True))
    return self.bobs