def _string_equality_queries(self, field, dist, pdf_lower, pdf_upper):
        """
        This generates returns a query_batch object that holds the logic
        for creating aggregators for the queries, and also contains the 
        logic for processing the results and printing the query
        """

        query_dicts = []
        for _ in xrange(CLAUSE_OVER_GENERATION_RATIO):
            value = self.__dists[field].generate_pdf(pdf_lower, pdf_upper, {})
            qid = qids.query_id()
            where = 'SUBSTR(%s,1,9) = \'\'%s\'\'' % (sv.sql_info[field][0], value[:9])
            query_dicts.append({qs.QRY_ENUM : qs.CAT.P7_FINAL, 
                                qs.QRY_QID : qids.full_where_has_been_seen(qid, where),
                                qs.QRY_DBNUMRECORDS : self.__db_size,
                                qs.QRY_DBRECORDSIZE : self.__row_width, 
                                qs.QRY_PERF : self.__perf,
                                qs.QRY_CAT : 'P7',
                                qs.QRY_SUBCAT : 'final', 
                                qs.QRY_WHERECLAUSE : where,
                                qs.QRY_FIELD : sv.sql_info[field][0],
                                qs.QRY_NEGATE : False,
                                qs.QRY_FIELDTYPE : sv.sql_info[field][1],
                                qs.QRY_KEYWORDLEN : 9,
                                qs.QRY_SEARCHFOR : value[:9],
                                qs.QRY_SEARCHDELIMNUM : 1 })
    
        return aqb.WildcardQueryBatch(query_dicts, CLAUSE_OVER_GENERATION_RATIO,
                                      1, False)     
    def _equality_queries(self, field, dist, pdf_lower, pdf_upper):
        """
        This generates returns a query_batch object that holds the logic
        for creating aggregators for the queries, and also contains the 
        logic for processing the results and printing the query
        """

        query_dicts = []
        for _ in xrange(CLAUSE_OVER_GENERATION_RATIO):
            value = self.__dists[field].generate_pdf(pdf_lower, pdf_upper, {})
            qid = qids.query_id()
            (value, where) = aqb.EqualityFishingQueryBatch.format_value_and_where(field, value)                       
            query_dicts.append({qs.QRY_ENUM : qs.CAT.EQ, 
                                qs.QRY_QID : qids.full_where_has_been_seen(qid, where),
                                qs.QRY_DBNUMRECORDS : self.__db_size,
                                qs.QRY_DBRECORDSIZE : self.__row_width, 
                                qs.QRY_PERF : self.__perf,
                                qs.QRY_CAT : 'EQ',
                                qs.QRY_SUBCAT : '', 
                                qs.QRY_WHERECLAUSE : where,
                                qs.QRY_FIELD : sv.sql_info[field][0],
                                qs.QRY_NEGATE : False,
                                qs.QRY_FIELDTYPE : sv.sql_info[field][1],
                                qs.QRY_VALUE : value})
    
        return aqb.EqualityQueryBatch(query_dicts, CLAUSE_OVER_GENERATION_RATIO,
                                      1, False)  
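
The helpers above share one over-generation pattern: CLAUSE_OVER_GENERATION_RATIO candidate query dicts are built up front, and the returned batch object is told how many were generated and how many to keep (here 1). A minimal sketch of that contract, using a hypothetical ToyQueryBatch stand-in (the real aqb batch classes and the meaning of their trailing boolean are not shown in this listing):

    import random

    CLAUSE_OVER_GENERATION_RATIO = 10

    class ToyQueryBatch(object):
        """Holds over-generated candidates and keeps `choose` of them."""
        def __init__(self, query_dicts, generated, choose):
            self.query_dicts = query_dicts
            self.generated = generated
            self.choose = choose

        def refine(self):
            #stand-in for result-driven refinement: sample `choose` candidates
            return random.sample(self.query_dicts, self.choose)

    candidates = [{'where': "fname = 'NAME%d'" % i}
                  for i in xrange(CLAUSE_OVER_GENERATION_RATIO)]
    print(ToyQueryBatch(candidates, CLAUSE_OVER_GENERATION_RATIO, 1).refine())
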
Example #3
    def refine_queries(self, agg_result):
        """
        Takes in 'agg_result', which is the result from the aggregator
        for this BOQ, and selects which queries should be recorded in
        the results database. To do this it creates a new list of the
        selected queries and pairs them with their results.
        """
        #selecting queries that match
        queries = []
        assert len(self.queries) == len(agg_result[qs.QRY_SUBRESULTS])
        for q, r in zip(self.queries, agg_result[qs.QRY_SUBRESULTS]):
            assert q
            assert r
            assert q[qs.QRY_QID] >= r[qs.QRY_QID]
            potential_queries = []
            for (value,
                 value_result) in r[qs.QRY_FISHING_MATCHES_FOUND].iteritems():
                count = len(value_result)
                if qbs.get_rss_lower(q[qs.QRY_ENUM]) <= count and\
                   qbs.get_rss_upper(q[qs.QRY_ENUM]) >= count:
                    (value, where) = self.format_value_and_where(
                        sv.sql_name_to_enum(q[qs.QRY_FIELD]), value)
                    q[qs.QRY_VALUE] = value
                    q[qs.QRY_WHERECLAUSE] = where
                    r[rdb.DBF_MATCHINGRECORDIDS] = value_result
                    potential_queries.append((q, r))
            if potential_queries:
                chosen_q = random.sample(potential_queries, 1)[0]
                chosen_q[0][qs.QRY_QID] = \
                    qids.full_where_has_been_seen(chosen_q[0][qs.QRY_QID],
                                                  chosen_q[0][qs.QRY_WHERECLAUSE])
                queries.append(chosen_q)
        #capping at choose-num number of queries
        self.refined_queries_results = queries
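
The record-set-size (rss) window test above is the core of the refinement: a fishing value is only eligible if its match count falls between the query category's lower and upper bounds, and one eligible value is then sampled at random. A self-contained sketch of that idea on toy data (the fixed bounds stand in for qbs.get_rss_lower/get_rss_upper):

    import random

    def pick_in_window(matches_by_value, rss_lower, rss_upper):
        #keep values whose match count lies within [rss_lower, rss_upper]
        candidates = [(v, ids) for (v, ids) in matches_by_value.items()
                      if rss_lower <= len(ids) <= rss_upper]
        return random.choice(candidates) if candidates else None

    toy = {'SMITH': set([1, 2, 3, 4]), 'JONES': set([5]),
           'LEE': set(range(50))}
    print(pick_in_window(toy, 2, 10))  # -> ('SMITH', set([1, 2, 3, 4]))
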
    def _generate_queries(self):
        """
        This generates returns a query_batch object that holds the logic
        for creating aggregators for the queries, and also contains the 
        logic for processing the results and printing the query
        """
        query_dicts = []
        for (field, dist) in self.__dists.iteritems():
            for q_template in self.__queries:
                query_dicts = []
                for count in xrange(q_template['no_queries']*\
                                    OVER_GENERATION_RATIO):
                    self.__count += 1
                    LOGGER.info('P9: Created %d out of %d queries' % \
                                (self.__count, self.__total))
                    lrss = (q_template[qs.QRY_LRSS]/self.__db_size)
                    urss = (q_template[qs.QRY_URSS]/self.__db_size)
                    (word_one, word_two) = dist.generate_alarmword(lrss, urss)
                    where = 'WORD_PROXIMITY(%s, \'\'%s\'\', \'\'%s\'\')' % \
                                        (sv.sql_info[field][0],
                                         word_one, word_two)
                    where_clause = "%s <= %d ORDER BY %s" % (where, q_template['distance'], where)
                    qid = qids.query_id()
                    if qid != qids.full_where_has_been_seen(qid,where_clause):
                        continue
                    query_dicts.append({ 
                           qs.QRY_ENUM : qs.CAT.P9_ALARM_WORDS,
                           qs.QRY_QID : qid,
                           qs.QRY_CAT : self.__cat, 
                           qs.QRY_SUBCAT : 'alarm-words', 
                           qs.QRY_DBNUMRECORDS : self.__db_size,
                           qs.QRY_DBRECORDSIZE : self.__row_width,
                           qs.QRY_PERF : self.__perf,
                           qs.QRY_FIELD: sv.sql_info[field][0],
                           qs.QRY_FIELDTYPE : sv.sql_info[field][1],
                           qs.QRY_WHERECLAUSE : where_clause,
                           qs.QRY_LRSS : q_template[qs.QRY_LRSS],
                           qs.QRY_URSS : q_template[qs.QRY_URSS],
                           qs.QRY_ALARMWORDONE : word_one,
                           qs.QRY_ALARMWORDTWO : word_two,
                           qs.QRY_ALARMWORDDISTANCE : q_template['distance']})

                self.__bobs.append(aqb.AlarmQueryBatch(query_dicts, count,
                                                       int(count/OVER_GENERATION_RATIO),
                                                       True))        
        return self.__bobs

Example #5
    def _generate_range_queries(self, field, dist, q):
        """
        This generates returns a query_batch object that holds the logic
        for creating aggregators for the queries, and also contains the 
        logic for processing the results and printing the query
        """

        query_dicts = []
        num_queries = 0
        for count in xrange(q['no_queries'] * OVER_GENERATION_RATIO):
            num_queries = count
            self.__count += 1
            LOGGER.info('P2: Created %d out of %d queries' %
                        (self.__count, self.__total))
            r_lower_cdf = q['rss_lower_cdf']
            r_upper_cdf = q['rss_upper_cdf']
            (lower, upper) = dist.generate_double_range(r_lower_cdf,
                                                        r_upper_cdf,
                                                        db_size=self.__db_size)
            qid = qids.query_id()
            if field in [sv.VARS.INCOME, sv.VARS.LAST_UPDATED]:
                where_clause = '%s BETWEEN %s AND %s' % \
                                    (sv.sql_info[field][0],
                                    sv.VAR_CONVERTERS[field].to_csv(lower),
                                    sv.VAR_CONVERTERS[field].to_csv(upper))
            else:
                where_clause = '%s BETWEEN \'\'%s\'\' AND \'\'%s\'\'' % \
                                    (sv.sql_info[field][0],
                                    sv.VAR_CONVERTERS[field].to_csv(lower),
                                    sv.VAR_CONVERTERS[field].to_csv(upper))
            if qid != qids.full_where_has_been_seen(qid, where_clause):
                continue
            query_dicts.append({
                qs.QRY_ENUM: qs.CAT.P2_RANGE,
                qs.QRY_QID: qid,
                qs.QRY_DBNUMRECORDS: self.__db_size,
                qs.QRY_DBRECORDSIZE: self.__row_width,
                qs.QRY_PERF: self.__perf,
                qs.QRY_CAT: self.__cat,
                qs.QRY_SUBCAT: 'range',
                qs.QRY_WHERECLAUSE: where_clause,
                qs.QRY_FIELD: sv.sql_info[field][0],
                qs.QRY_NEGATE: False,
                qs.QRY_FIELDTYPE: sv.sql_info[field][1],
                qs.QRY_LRSS: q[qs.QRY_LRSS],
                qs.QRY_URSS: q[qs.QRY_URSS],
                qs.QRY_LBOUND: lower,
                qs.QRY_UBOUND: upper,
                qs.QRY_RANGE: 0
            })

        return aqb.RangeQueryBatch(
            query_dicts, num_queries,
            int((num_queries + 1) / OVER_GENERATION_RATIO), True)
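
Note the accounting in the return value above: with q['no_queries'] = 5 and OVER_GENERATION_RATIO = 10, the loop attempts 50 candidates (some are skipped as duplicate where clauses by the full_where_has_been_seen check), num_queries ends at 49, and the batch is asked to keep int((49 + 1) / 10) = 5 queries. A sketch of the duplicate-detection registry itself appears after the testFullWhere unit test near the end of this listing.
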
    def _generate_short_queries(self, dist, q):
        '''
        Generates queries of the form .//LEAF
        '''
        query_dicts = []
        query_count = 0
        for count in xrange(q['no_queries'] * OVER_GENERATION_RATIO):
            self.__count += 1
            query_count = count
            LOGGER.info('P11: Created %d out of %d queries' % \
                        (self.__count, self.__total))
            r_lower = q[qs.QRY_LRSS] / (self.__db_size * xg.XML_DEPTH *
                                        xg.FAN_OUT)
            r_upper = q[qs.QRY_URSS] / (self.__db_size * xg.XML_DEPTH *
                                        xg.FAN_OUT)
            (field, value) = self._create_equality_leaf(dist, r_lower, r_upper)
            value = sv.VAR_CONVERTERS[sv.sql_name_to_enum(field)].to_csv(value)
            try:
                value = value.replace('\'', '\'\'')
            except (TypeError, AttributeError):
                pass
            if field in ['foo', 'age', 'income']:
                where = "xml_value(xml,\'//%s\', %s)" % (field, value)
            else:
                where = "xml_value(xml,\'//%s\', \'%s\')" % (field, value)
            xpath = field
            qid = qids.query_id()
            if qid != qids.full_where_has_been_seen(qid, where):
                continue
            query_dicts.append({
                qs.QRY_ENUM: qs.CAT.P11_SHORT,
                qs.QRY_QID: qid,
                qs.QRY_DBNUMRECORDS: self.__db_size,
                qs.QRY_DBRECORDSIZE: self.__row_width,
                qs.QRY_PERF: self.__perf,
                qs.QRY_CAT: self.__cat,
                qs.QRY_SUBCAT: 'eq-double-slash',
                qs.QRY_WHERECLAUSE: where,
                qs.QRY_FIELD: sv.sql_info[sv.VARS.XML][0],
                qs.QRY_NEGATE: False,
                qs.QRY_FIELDTYPE: 'string',
                qs.QRY_LRSS: q[qs.QRY_LRSS],
                qs.QRY_URSS: q[qs.QRY_URSS],
                qs.QRY_VALUE: value,
                qs.QRY_XPATH: xpath
            })
        return aqb.XmlQueryBatch(
            query_dicts, query_count,
            max(int((query_count + 1) / OVER_GENERATION_RATIO), 1), True)
    def _than_queries(self, field, dist, pdf_lower, pdf_upper, range_type):
        query_dicts = []
        for _ in xrange(CLAUSE_OVER_GENERATION_RATIO):
            #generate the range specific aspects of the queries
            if range_type == 'greater':
                value = dist.generate_greater_than(pdf_lower, pdf_upper,
                                                   db_size = self.__db_size)
                enum = qs.CAT.P2_GREATER
                tail = ''
                comp = '>='
            else:
                value = dist.generate_less_than(pdf_lower, pdf_upper,
                                                db_size = self.__db_size)
                enum = qs.CAT.P2_LESS
                tail = TAIL
                comp = '<='
            #generate the where clauses    
            if field in [sv.VARS.INCOME, sv.VARS.LAST_UPDATED]:
                where_clause = '%s %s %s' % (sv.sql_info[field][0], comp,
                                             sv.VAR_CONVERTERS[field].to_csv(value))
            elif field in [sv.VARS.FIRST_NAME, sv.VARS.LAST_NAME]:
                
                where_clause = 'SUBSTR(%s,1,9) %s \'\'%s\'\'' % (sv.sql_info[field][0], comp, 
                                                 sv.VAR_CONVERTERS[field].to_csv(value)[:9])
                value = value + tail
            else:
                where_clause = '%s %s \'\'%s\'\'' % (sv.sql_info[field][0], comp,
                                             sv.VAR_CONVERTERS[field].to_csv(value))
            
            qid = qids.query_id()
            query_dicts.append({qs.QRY_ENUM : enum, 
                                qs.QRY_QID : qids.full_where_has_been_seen(qid, where_clause),
                                qs.QRY_DBNUMRECORDS : self.__db_size,
                                qs.QRY_DBRECORDSIZE : self.__row_width, 
                                qs.QRY_PERF : self.__perf,
                                qs.QRY_CAT : 'P2',
                                qs.QRY_SUBCAT : range_type,
                                qs.QRY_WHERECLAUSE : where_clause,
                                qs.QRY_FIELD : sv.sql_info[field][0],
                                qs.QRY_NEGATE : False,
                                qs.QRY_FIELDTYPE : sv.sql_info[field][1],
                                qs.QRY_VALUE : value,
                                qs.QRY_RANGE : 0
                                })
        return aqb.RangeQueryBatch(query_dicts, CLAUSE_OVER_GENERATION_RATIO,
                                      1, False)   
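
The TAIL padding in the less-than branch exists because the name-field clause compares only a 9-character prefix: SUBSTR(field,1,9) <= 'bound' admits every string whose truncated prefix sorts at or below the bound, so the recorded query value must be extended with high-sorting characters to describe the same set. A small illustration (TAIL's value is not shown in this listing; 'ZZZZZZZZZZZZZ' is assumed, mirroring the unused local in _range_queries below):

    TAIL = 'ZZZZZZZZZZZZZ'  #assumed value

    bound = 'JOHNSONAB'  #a 9-character truncated bound
    print('JOHNSONABBOTT'[:9] <= bound)     # True: the prefix comparison matches
    print('JOHNSONABBOTT' <= bound)         # False: the full string overshoots
    print('JOHNSONABBOTT' <= bound + TAIL)  # True: TAIL restores the match
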

            
Example #8
    def _equality_queries(self, field, dist):
        """
        This generates returns a query_batch object that holds the logic
        for creating aggregators for the queries, and also contains the 
        logic for processing the results and printing the query
        """

        query_dicts = []
        for q in xrange(len(self.__queries)):
            query_dicts = []
            for count in xrange(self.__queries[q]['no_queries'] *
                                OVER_GENERATION_RATIO):
                self.__count += 1
                logger.info('EQ: Created %d out of %d queries' %
                            (self.__count, self.__total))
                r_lower_cdf = self.__queries[q]['rss_lower_cdf']
                r_upper_cdf = self.__queries[q]['rss_upper_cdf']
                value = self.__dists[field].generate_pdf(
                    r_lower_cdf, r_upper_cdf, {})
                qid = qids.query_id()
                (value,
                 where) = aqb.EqualityFishingQueryBatch.format_value_and_where(
                     field, value)
                if qid != qids.full_where_has_been_seen(qid, where):
                    continue
                query_dicts.append({
                    qs.QRY_ENUM: qs.CAT.EQ,
                    qs.QRY_QID: qid,
                    qs.QRY_DBNUMRECORDS: self.__db_size,
                    qs.QRY_DBRECORDSIZE: self.__row_width,
                    qs.QRY_PERF: self.__perf,
                    qs.QRY_CAT: self.__cat,
                    qs.QRY_SUBCAT: '',
                    qs.QRY_WHERECLAUSE: where,
                    qs.QRY_FIELD: sv.sql_info[field][0],
                    qs.QRY_NEGATE: False,
                    qs.QRY_FIELDTYPE: sv.sql_info[field][1],
                    qs.QRY_LRSS: self.__queries[q][qs.QRY_LRSS],
                    qs.QRY_URSS: self.__queries[q][qs.QRY_URSS],
                    qs.QRY_VALUE: value
                })

            self.__bobs.append(
                aqb.EqualityQueryBatch(
                    query_dicts, count, int(
                        (count + 1) / OVER_GENERATION_RATIO), True))
    def make_equality_queries(self, field, value):
        '''
        Creates equality query dictionaries.
        '''
        (value, where) = aqb.EqualityFishingQueryBatch.format_value_and_where(
            field, value)
        qid = qids.query_id()
        return {
            qs.QRY_ENUM: qs.CAT.EQ,
            qs.QRY_QID: qids.full_where_has_been_seen(qid, where),
            qs.QRY_DBNUMRECORDS: self._db_size,
            qs.QRY_DBRECORDSIZE: self._row_width,
            qs.QRY_CAT: 'EQ',
            qs.QRY_SUBCAT: '',
            qs.QRY_WHERECLAUSE: where,
            qs.QRY_FIELD: sv.sql_info[field][0],
            qs.QRY_NEGATE: False,
            qs.QRY_FIELDTYPE: sv.sql_info[field][1],
            qs.QRY_VALUE: value
        }
    def _range_queries(self, field, dist, pdf_lower, pdf_upper):
        query_dicts = []
        for _ in xrange(CLAUSE_OVER_GENERATION_RATIO):
            (lower, upper) = dist.generate_double_range(pdf_lower, pdf_upper,
                                                        db_size=self.__db_size)
            qid = qids.query_id()
            if field in [sv.VARS.INCOME, sv.VARS.LAST_UPDATED]:
                where_clause = '%s BETWEEN %s AND %s' % \
                    (sv.sql_info[field][0],
                     sv.VAR_CONVERTERS[field].to_csv(lower),
                     sv.VAR_CONVERTERS[field].to_csv(upper))
            elif field in [sv.VARS.FIRST_NAME, sv.VARS.LAST_NAME]:
                where_clause = 'SUBSTR(%s,1,9) BETWEEN \'\'%s\'\' AND \'\'%s\'\'' % \
                    (sv.sql_info[field][0],
                     sv.VAR_CONVERTERS[field].to_csv(lower)[:9],
                     sv.VAR_CONVERTERS[field].to_csv(upper)[:9])
                #record the truncated bounds; TAIL keeps the recorded upper
                #bound consistent with the 9-character prefix comparison
                lower = sv.VAR_CONVERTERS[field].to_csv(lower)[:9]
                upper = sv.VAR_CONVERTERS[field].to_csv(upper)[:9] + TAIL
            else:
                where_clause = '%s BETWEEN \'\'%s\'\' AND \'\'%s\'\'' % \
                    (sv.sql_info[field][0],
                     sv.VAR_CONVERTERS[field].to_csv(lower),
                     sv.VAR_CONVERTERS[field].to_csv(upper))
            query_dicts.append({qs.QRY_ENUM : qs.CAT.P2_RANGE,
                                qs.QRY_QID : qids.full_where_has_been_seen(qid, where_clause),
                                qs.QRY_DBNUMRECORDS : self.__db_size,
                                qs.QRY_DBRECORDSIZE : self.__row_width,
                                qs.QRY_PERF : self.__perf,
                                qs.QRY_CAT : 'P2',
                                qs.QRY_SUBCAT : 'range',
                                qs.QRY_WHERECLAUSE : where_clause,
                                qs.QRY_FIELD : sv.sql_info[field][0],
                                qs.QRY_NEGATE : False,
                                qs.QRY_FIELDTYPE : sv.sql_info[field][1],
                                qs.QRY_LBOUND : lower,
                                qs.QRY_UBOUND : upper,
                                qs.QRY_RANGE : 0
                                })

        return aqb.RangeQueryBatch(query_dicts, CLAUSE_OVER_GENERATION_RATIO,
                                   1, False)
Example #11
    def process_results(self,
                        agg_results,
                        db_object,
                        query_file_handle,
                        refined_queries=None):
        """
        Takes in the aggregator results, with those results, determines
        which queries in the batch are 'interesting' it then instantiates
        query_results for those queries and uses it to write it to the 
        results database. 
        
        Refine arguement is a list of already refined queries if the user 
        does not wish to rely on the pre-defined refine queries function
        """
        #refine queries if not already refined.
        if refined_queries:
            self.refined_queries_results = refined_queries
            for (comp_q, comp_q_results) in self.refined_queries_results:
                qr.QueryResultBase.write_to_full_to_atomic_table(
                    comp_q, comp_q_results, db_object)
                qr.QueryResultBase.write_to_full_table(comp_q, comp_q_results,
                                                       db_object)
                comp_q[qs.QRY_SUBBOBS][0].process_results(
                    None, db_object, query_file_handle,
                    zip(comp_q['sub_queries'],
                        comp_q_results[qs.QRY_SUBRESULTS]))
                #print out the query
                self._print_query(comp_q, query_file_handle)
        else:
            refined_total = 0
            refined_queries = []
            for x in xrange(len(self.queries)):
                comp_q = self.queries[x]
                sub_results = agg_results[qs.QRY_SUBRESULTS]
                try:
                    num_clauses = comp_q[qs.QRY_NUMCLAUSES]
                except KeyError:
                    num_clauses = comp_q[qs.QRY_N]
                sub_bobs = comp_q[qs.QRY_SUBBOBS]
                clause_q_b = []
                working_clauses = None
                #create the list of possible queries that can make up the clauses
                #(they are also paired with the bobs that create them)
                for b in sub_bobs:
                    clause_q = b.produce_queries()
                    clause_q_b += [(q, b) for q in clause_q]
                clause_r = []
                #create list of results that go with those queries

                for (q, _) in clause_q_b:
                    clause_r.append(sub_results[self.result_to_agg_map[q[
                        qs.QRY_WHERECLAUSE]]])
                comp_q_results = {qs.QRY_SUBRESULTS: clause_r}
                #create a list of queries, their bobs, and their results
                clause_q_r = zip(clause_q_b, clause_r)

                clause_q_r = sorted(
                    clause_q_r,
                    key=lambda ((q, b), r): len(r[rdb.DBF_MATCHINGRECORDIDS]))
                #try all possible combinations of the queries to test if any
                #have the correct combinations to match the required ftm and rss
                seen_where_group = []
                working_clauses = []
                q_refined = False
                for clause in clause_q_r:
                    #don't need to check permutations if ftm doesn't match
                    if q_refined == True:
                        continue
                    ftm_match = len(clause[1][rdb.DBF_MATCHINGRECORDIDS])
                    if not all([
                            ftm_match >= qbs.get_tm_rss_lower(
                                comp_q[qs.QRY_ENUM]), ftm_match <=
                            qbs.get_tm_rss_upper(comp_q[qs.QRY_ENUM])
                    ]):
                        continue
                    #alright ftm matches, let's check the rest of the clauses
                    for clause_set in itertools.combinations(
                            clause_q_r, num_clauses - 1):
                        #query has already been refined
                        if q_refined == True:
                            continue
                        clause_list = [clause] + list(clause_set)
                        #check to see if any of the clauses or their fields are the same
                        #if so we know the intersection is one we are not interested in
                        values = [
                            q[qs.QRY_WHERECLAUSE]
                            for ((q, _), _) in clause_list
                        ]
                        fields = [
                            q[qs.QRY_FIELD] for ((q, _), _) in clause_list
                        ]
                        #there are duplicate values or this where has already been seen
                        if len(values)!=len(set(values)) or\
                           len(fields)!=len(set(fields)) or\
                            values in seen_where_group:
                            continue
                        seen_where_group.append(values)

                        #check conditions
                        matching_ids_set = reduce(set.intersection, [
                            set(r[rdb.DBF_MATCHINGRECORDIDS])
                            for (_, r) in clause_list
                        ])
                        count = len(matching_ids_set)

                        if not all([
                                count >= qbs.get_rss_lower(
                                    comp_q[qs.QRY_ENUM]), count <=
                                qbs.get_rss_upper(comp_q[qs.QRY_ENUM]),
                                ftm_match >= qbs.get_tm_rss_lower(
                                    comp_q[qs.QRY_ENUM]), ftm_match <=
                                qbs.get_tm_rss_upper(comp_q[qs.QRY_ENUM])
                        ]):
                            continue

                        #this combination worked, so don't need to refine further for this
                        #particular query
                        q_refined = True
                        refined_total += 1
                        #reorder clauses
                        working_clauses = clause_list
                        reordered_clauses = working_clauses[:1]
                        working_clauses.remove(reordered_clauses[0])
                        cumulative_set = set(
                            reordered_clauses[0][1][rdb.DBF_MATCHINGRECORDIDS])
                        while len(working_clauses) > 0:
                            next_clause = working_clauses[0]
                            current_set = cumulative_set.intersection(
                                working_clauses[0][1][
                                    rdb.DBF_MATCHINGRECORDIDS])
                            for clauses in working_clauses:
                                potential_set = cumulative_set.intersection(
                                    clauses[1][rdb.DBF_MATCHINGRECORDIDS])
                                if len(potential_set) < len(current_set):
                                    next_clause = clauses
                                    current_set = potential_set
                            working_clauses.remove(next_clause)
                            reordered_clauses.append(next_clause)
                            cumulative_set = current_set

                        working_clauses = reordered_clauses

                        #update query with chosen clauses
                        whereclauses = [
                            q[qs.QRY_WHERECLAUSE]
                            for ((q, _), _) in working_clauses
                        ]
                        comp_q[qs.QRY_WHERECLAUSE] = " AND ".join(whereclauses)
                        comp_q['sub_queries'] = [
                            q for ((q, _), _) in working_clauses
                        ]
                        comp_q[qs.QRY_SUBBOBS] = [
                            b for ((_, b), _) in working_clauses
                        ]

                        ftm_match = len(
                            working_clauses[0][1][rdb.DBF_MATCHINGRECORDIDS])
                        matching_ids_set = reduce(set.intersection, [
                            set(r[rdb.DBF_MATCHINGRECORDIDS])
                            for (_, r) in working_clauses
                        ])
                        comp_q_results[qs.QRY_SUBRESULTS] = [
                            r for (_, r) in working_clauses
                        ]
                        comp_q_results[
                            rdb.DBF_MATCHINGRECORDIDS] = matching_ids_set
                        comp_q_results[
                            qs.QRY_NUMRECORDSMATCHINGFIRSTTERM] = ftm_match

                        #get the id's lined up
                        comp_q[qs.QRY_QID] = qids.full_where_has_been_seen(
                            comp_q[qs.QRY_QID], comp_q[qs.QRY_WHERECLAUSE])
                        comp_q_results[qs.QRY_QID] = comp_q[qs.QRY_QID]
                        for (sub_q,
                             sub_r) in zip(comp_q['sub_queries'],
                                           comp_q_results[qs.QRY_SUBRESULTS]):
                            sub_q[
                                qs.QRY_QID] = qids.atomic_where_has_been_seen(
                                    sub_q[qs.QRY_QID],
                                    sub_q[qs.QRY_WHERECLAUSE])
                            sub_r[qs.QRY_QID] = sub_q[qs.QRY_QID]

                        #write the results to the results database
                        qr.QueryResultBase.write_to_full_to_atomic_table(
                            comp_q, comp_q_results, db_object)
                        qr.QueryResultBase.write_to_full_table(
                            comp_q, comp_q_results, db_object)
                        comp_q[qs.QRY_SUBBOBS][0].process_results(
                            None, db_object, query_file_handle,
                            zip(comp_q['sub_queries'],
                                comp_q_results[qs.QRY_SUBRESULTS]))
                        #print out the query
                        self._print_query(comp_q, query_file_handle)
                        refined_queries.append((comp_q, comp_q_results))

                logger.info("FINISHED QUERY %d of %d, TOTAL THAT WORK %d" %
                            (x, len(self.queries), refined_total))
                if q_refined == True:
                    logger.info(
                        "WORKING QUERY INFORMATION qid = %d, where_clause = %s, ftm = %d, rss = %d"
                        % (comp_q[qs.QRY_QID], comp_q[qs.QRY_WHERECLAUSE],
                           ftm_match, count))

            #capping at choose-num number of queries
            self.refined_queries_results = refined_queries
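
The reordering pass in the middle of this method (the while loop over working_clauses) is a greedy heuristic: keeping the first-term clause fixed, it repeatedly appends whichever remaining clause shrinks the running intersection the most, so the most selective conjuncts come first in the AND. A self-contained sketch of the same heuristic on bare sets, in place of the ((query, bob), result) tuples:

    def greedy_reorder(clause_sets):
        #order clauses so each step minimizes the running intersection
        remaining = list(clause_sets)
        ordered = [remaining.pop(0)]  #first term stays first (ftm)
        cumulative = set(ordered[0])
        while remaining:
            nxt = min(remaining, key=lambda s: len(cumulative & s))
            remaining.remove(nxt)
            ordered.append(nxt)
            cumulative &= nxt
        return ordered

    print(greedy_reorder([set([1, 2, 3, 4]), set([2, 3, 4, 5]), set([3, 4])]))
    # -> [set([1, 2, 3, 4]), set([3, 4]), set([2, 3, 4, 5])]
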
Example #12
    def process_results(self,
                        agg_results,
                        db_object,
                        query_file_handle,
                        refined_queries=None):
        """
        Takes in the aggregator results, with those results, determines
        which queries in the batch are 'interesting' it then instantiates
        query_results for those queries and uses it to write it to the 
        results database. 
        
        Refine arguement is a list of already refined queries if the user 
        does not wish to rely on the pre-defined refine queries function
        """
        #refine queries if not already refined.
        if refined_queries:
            self.refined_queries_results = refined_queries
            for (comp_q, comp_q_results) in self.refined_queries_results:
                qr.QueryResultBase.write_to_full_to_atomic_table(
                    comp_q, comp_q_results, db_object)
                qr.QueryResultBase.write_to_full_table(comp_q, comp_q_results,
                                                       db_object)
                comp_q[qs.QRY_SUBBOBS][0].process_results(
                    None, db_object, query_file_handle,
                    zip(comp_q['sub_queries'],
                        comp_q_results[qs.QRY_SUBRESULTS]))
                #print out the query
                self._print_query(comp_q, query_file_handle)
        else:
            refined_total = 0
            refined_queries = []
            for x in xrange(len(self.queries)):
                comp_q = self.queries[x]
                sub_results = agg_results[qs.QRY_SUBRESULTS]

                num_clauses = comp_q[qs.QRY_NUMCLAUSES]

                sub_bobs = comp_q[qs.QRY_SUBBOBS]
                clause_q_b = []
                working_clauses = None
                #create the list of possible queries that can make up the clauses
                #(they are also paired with the bobs that create them)
                for b in sub_bobs:
                    clause_q = b.produce_queries()
                    clause_q_b += [(q, b) for q in clause_q]
                clause_r = []
                #create list of results that go with those queries

                for (q, _) in clause_q_b:
                    clause_r.append(sub_results[self.result_to_agg_map[q[
                        qs.QRY_WHERECLAUSE]]])
                comp_q_results = {qs.QRY_SUBRESULTS: clause_r}
                #create a list of queries, their bobs, and their results
                clause_q_r = zip(clause_q_b, clause_r)
                clause_q_r = [((q, b), r) for ((q, b), r) in clause_q_r
                              if len(r[rdb.DBF_MATCHINGRECORDIDS]) <=
                              qbs.get_tm_rss_upper(comp_q[qs.QRY_ENUM])]
                if len(clause_q_r) < num_clauses:
                    continue
                #try all possible combinations of the queries to test if any
                #have the correct combinations to match the required ftm and rss
                seen_where_group = []
                working_clauses = []
                q_refined = False
                for clause_set in itertools.combinations(
                        clause_q_r, num_clauses):
                    #query has already been refined
                    if q_refined == True:
                        continue
                    clause_list = list(clause_set)
                    values = [
                        q[qs.QRY_WHERECLAUSE] for ((q, _), _) in clause_list
                    ]
                    #there are duplicate values or this where has already been seen
                    if len(values)!=len(set(values)) or\
                        values in seen_where_group:
                        continue
                    seen_where_group.append(values)

                    #check conditions
                    matching_ids_set = reduce(set.union, [
                        set(r[rdb.DBF_MATCHINGRECORDIDS])
                        for (_, r) in clause_list
                    ])
                    count = len(matching_ids_set)
                    all_match = sum(
                        map(len, [
                            r[rdb.DBF_MATCHINGRECORDIDS]
                            for (_, r) in clause_list
                        ]))
                    if not all([
                            count >= qbs.get_rss_lower(comp_q[qs.QRY_ENUM]),
                            count <= qbs.get_rss_upper(comp_q[qs.QRY_ENUM]),
                            all_match >= qbs.get_tm_rss_lower(
                                comp_q[qs.QRY_ENUM]), all_match <=
                            qbs.get_tm_rss_upper(comp_q[qs.QRY_ENUM])
                    ]):
                        continue
                    #this combination worked, so don't need to refine further for this
                    #particular query
                    q_refined = True
                    refined_total += 1
                    working_clauses = clause_list
                    #update query with chosen clauses
                    whereclauses = [
                        q[qs.QRY_WHERECLAUSE]
                        for ((q, _), _) in working_clauses
                    ]
                    comp_q[qs.QRY_WHERECLAUSE] = " OR ".join(whereclauses)
                    comp_q['sub_queries'] = [
                        q for ((q, _), _) in working_clauses
                    ]
                    comp_q[qs.QRY_SUBBOBS] = [
                        b for ((_, b), _) in working_clauses
                    ]

                    ftm_match = len(
                        working_clauses[0][1][rdb.DBF_MATCHINGRECORDIDS])

                    comp_q_results[qs.QRY_SUBRESULTS] = [
                        r for (_, r) in working_clauses
                    ]
                    comp_q_results[
                        rdb.DBF_MATCHINGRECORDIDS] = matching_ids_set
                    comp_q_results[
                        qs.QRY_SUMRECORDSMATCHINGEACHTERM] = all_match
                    comp_q_results[
                        qs.QRY_NUMRECORDSMATCHINGFIRSTTERM] = ftm_match

                    #make sure duplicate queries (and their atomic sub_components) have the same qids
                    comp_q[qs.QRY_QID] = qids.full_where_has_been_seen(
                        comp_q[qs.QRY_QID], comp_q[qs.QRY_WHERECLAUSE])
                    comp_q_results[qs.QRY_QID] = comp_q[qs.QRY_QID]
                    for (sub_q,
                         sub_r) in zip(comp_q['sub_queries'],
                                       comp_q_results[qs.QRY_SUBRESULTS]):
                        sub_q[qs.QRY_QID] = qids.atomic_where_has_been_seen(
                            sub_q[qs.QRY_QID], sub_q[qs.QRY_WHERECLAUSE])
                        sub_r[qs.QRY_QID] = sub_q[qs.QRY_QID]

                    #create result objects and write to ground truth database
                    qr.QueryResultBase.write_to_full_to_atomic_table(
                        comp_q, comp_q_results, db_object)
                    qr.QueryResultBase.write_to_full_table(
                        comp_q, comp_q_results, db_object)
                    comp_q[qs.QRY_SUBBOBS][0].process_results(
                        None, db_object, query_file_handle,
                        zip(comp_q['sub_queries'],
                            comp_q_results[qs.QRY_SUBRESULTS]))
                    refined_queries.append((comp_q, comp_q_results))

                    #print query
                    self._print_query(comp_q, query_file_handle)

                #make the where clause, update the query with the chosen
                #clauses and the aggregator results with the chosen results
                logger.info("FINISHED QUERY %d of %d, TOTAL THAT WORK %d" %
                            (x, len(self.queries), refined_total))
                if q_refined == True:
                    logger.info(
                        "WORKING QUERY INFORMATION where_clause = %s, sftm = %d, rss = %d"
                        % (comp_q[qs.QRY_WHERECLAUSE], all_match, count))

            self.refined_queries_results = refined_queries
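
Where the AND variant above intersects result sets, this OR variant unions them (reduce(set.union, ...)) and additionally tracks all_match, the sum of the per-term match counts, against the first-term bounds. A toy check of that accounting (an illustration, not the real qbs thresholds):

    from functools import reduce  #also a builtin on Python 2

    clause_results = [set([1, 2, 3]), set([3, 4]), set([5])]
    count = len(reduce(set.union, clause_results))   #distinct matches: 5
    all_match = sum(len(r) for r in clause_results)  #sum over terms: 6
    print(count, all_match)
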
    def _generate_full_queries(self, dist, q):
        '''
        Generates queries of the form ./node1/node2/LEAF
        '''
        query_dicts = []
        for count in xrange(q['no_queries'] * OVER_GENERATION_RATIO):
            self.__count += 1
            LOGGER.info('P11: Created %d out of %d queries' % \
                        (self.__count, self.__total))
            r_lower_total = q[qs.QRY_LRSS] / self.__db_size
            r_upper_total = q[qs.QRY_URSS] / self.__db_size
            branch_r_lower = pow(r_lower_total / xg.XML_DEPTH,
                                 1.0 / (xg.XML_DEPTH))
            branch_r_upper = pow(r_upper_total / xg.XML_DEPTH,
                                 1.0 / (xg.XML_DEPTH))

            tags = []
            for level in xrange(xg.XML_DEPTH - 1):
                tags.append(
                    dist.generate_node_pdf(level, branch_r_lower,
                                           branch_r_upper))
            tag_string = ''
            for tag in tags:
                tag_string += "/%s" % (tag)
            (field,
             value) = self._create_equality_leaf(dist, branch_r_lower,
                                                 branch_r_upper)

            value = sv.VAR_CONVERTERS[sv.sql_name_to_enum(field)].to_csv(value)
            try:
                value = value.replace('\'', '\'\'')
            except TypeError:
                pass
            except AttributeError:
                pass
            if field in ['foo', 'age', 'income']:
                where = "xml_value(xml,\'/xml%s/%s\',%s)" % (tag_string, field,
                                                             value)
            else:
                where = "xml_value(xml,\'/xml%s/%s\',\'%s\')" % (tag_string,
                                                                 field, value)

            xpath = ['xml'] + tags
            xpath.append(field)
            qid = qids.query_id()
            if qid != qids.full_where_has_been_seen(qid, where):
                continue
            query_dicts.append({
                qs.QRY_ENUM: qs.CAT.P11_FULL,
                qs.QRY_QID: qid,
                qs.QRY_DBNUMRECORDS: self.__db_size,
                qs.QRY_DBRECORDSIZE: self.__row_width,
                qs.QRY_CAT: self.__cat,
                qs.QRY_SUBCAT: 'eq-full',
                qs.QRY_PERF: self.__perf,
                qs.QRY_WHERECLAUSE: where,
                qs.QRY_FIELD: sv.sql_info[sv.VARS.XML][0],
                qs.QRY_NEGATE: False,
                qs.QRY_FIELDTYPE: 'string',
                qs.QRY_LRSS: q[qs.QRY_LRSS],
                qs.QRY_URSS: q[qs.QRY_URSS],
                qs.QRY_VALUE: value,
                qs.QRY_XPATH: xpath
            })
        return aqb.XmlQueryBatch(
            query_dicts, count, max(int((count + 1) / OVER_GENERATION_RATIO),
                                    1), True)
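
The branch selectivity arithmetic above can be read as follows: a full path touches XML_DEPTH levels, so to land the whole query's hit rate near the total target, each level is assigned roughly the XML_DEPTH-th root of r_total / XML_DEPTH. A worked example under assumed values (XML_DEPTH = 4, target rate 0.01):

    XML_DEPTH = 4  #assumed for illustration
    r_total = 0.01
    branch_r = (r_total / XML_DEPTH) ** (1.0 / XML_DEPTH)
    print(branch_r)                           # ~0.224 per level
    print(XML_DEPTH * branch_r ** XML_DEPTH)  # ~0.01, recovering the target
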
Example #14
    def process_results(self,
                        agg_results,
                        db_object,
                        query_file_handle,
                        refined_queries=None):
        """
        Takes in the aggregator results, with those results, determines
        which queries in the batch are 'interesting' it then instantiates
        query_results for those queries and uses it to write it to the 
        results database. 
        
        Refine arguement is a list of already refined queries if the user 
        does not wish to rely on the pre-defined refine queries function
        """
        #refine queries if not already refined.
        if refined_queries != None:
            self.refined_queries_results = refined_queries
            for (q, r) in self.refined_queries_results:
                qr.QueryResultBase.write_to_full_to_atomic_table(
                    q, r, db_object)
                qr.QueryResultBase.write_to_full_table(q, r, db_object)
                q[qs.QRY_SUBBOBS][0].process_results(
                    None, db_object, query_file_handle,
                    zip(q['sub_queries'], r[qs.QRY_SUBRESULTS]))
                self._print_query(q, query_file_handle)

                try:
                    q[qs.QRY_PERF].remove('IBM1')
                except ValueError:
                    pass

                q[qs.QRY_WHERECLAUSE] = q[qs.QRY_WHERECLAUSE] + " ORDER BY " +\
                                        q[qs.QRY_WHERECLAUSE] + " DESC"
                q[qs.QRY_ENUM] = qs.CAT.P9_EQ
                q[qs.QRY_CAT] = 'P9'
                q[qs.QRY_QID] = qids.full_where_has_been_seen(
                    qids.query_id(), q[qs.QRY_WHERECLAUSE])
                r[qs.QRY_QID] = q[qs.QRY_QID]
                qr.QueryResultBase.write_to_full_to_atomic_table(
                    q, r, db_object)
                qr.QueryResultBase.write_to_full_table(q, r, db_object)
                self._print_query(q, query_file_handle)
                q[qs.QRY_SUBBOBS][0].process_results(
                    None, db_object, query_file_handle,
                    zip(q['sub_queries'], r[qs.QRY_SUBRESULTS]))
        else:
            refined_total = 0
            refined_queries = []
            for x in xrange(len(self.queries)):
                comp_q = self.queries[x]
                sub_results = agg_results[qs.QRY_SUBRESULTS]
                num_clauses = comp_q[qs.QRY_N]
                sub_bobs = comp_q[qs.QRY_SUBBOBS]
                clause_q_b = []
                #create the list of possible queries that can make up the clauses
                #(they are also paired with the bobs that create them)
                for b in sub_bobs:
                    clause_q = b.produce_queries()
                    clause_q_b += [(q, b) for q in clause_q]
                clause_r = []
                #create list of results that go with those queries

                for (q, _) in clause_q_b:
                    clause_r.append(sub_results[self.result_to_agg_map[q[
                        qs.QRY_WHERECLAUSE]]])
                comp_q_results = {qs.QRY_SUBRESULTS: clause_r}
                #create a list of queries, their bobs, and their results
                clause_q_r = zip(clause_q_b, clause_r)
                clause_q_r = sorted(
                    clause_q_r,
                    key=lambda ((q, b), r): len(r[rdb.DBF_MATCHINGRECORDIDS]))
                #try all possible combinations of the queries to test if any
                #have the correct combinations to match the required ftm and rss
                seen_where_group = []
                comp_q_refined = False
                for clause_set in itertools.combinations(
                        clause_q_r, num_clauses):
                    if comp_q_refined == True:
                        continue
                    clause_list = list(clause_set)
                    values = [
                        q[qs.QRY_WHERECLAUSE] for ((q, _), _) in clause_list
                    ]
                    if len(values) != len(
                            set(values)) or values in seen_where_group:
                        continue
                    seen_where_group.append(values)

                    #check to see if it is working
                    #if stfm doesn't match, don't bother continuing
                    stfm = 0
                    for offset in xrange(comp_q[qs.QRY_N] - comp_q[qs.QRY_M] +
                                         1):
                        (_, r) = clause_list[offset]
                        stfm += len(r[rdb.DBF_MATCHINGRECORDIDS])
                    if not all([
                            stfm >= qbs.get_tm_rss_lower(comp_q[qs.QRY_ENUM]),
                            stfm <= qbs.get_tm_rss_upper(comp_q[qs.QRY_ENUM])
                    ]):
                        continue
                    #if stfm does match, calculate the set intersection
                    matching_ids_set = set()
                    for m_set in itertools.combinations(
                            clause_list, comp_q[qs.QRY_M]):
                        matching_ids_set.update(
                            reduce(set.intersection, [
                                set(r[rdb.DBF_MATCHINGRECORDIDS])
                                for (_, r) in m_set
                            ]))
                    count = len(matching_ids_set)

                    #check overall compliance
                    if not all([
                            count >= qbs.get_rss_lower(comp_q[qs.QRY_ENUM]),
                            count <= qbs.get_rss_upper(comp_q[qs.QRY_ENUM])
                    ]):
                        continue

                    comp_q_refined = True
                    refined_total += 1
                    ##PROCESSING THE WORKING CLAUSE_LIST
                    working_clauses = clause_list
                    whereclauses = [
                        q[qs.QRY_WHERECLAUSE]
                        for ((q, _), _) in working_clauses
                    ]
                    where = ", ".join(whereclauses)
                    where = 'M_OF_N(%d, %d, %s)' % (comp_q[qs.QRY_M],
                                                    comp_q[qs.QRY_N], where)
                    #update query with chosen clauses
                    comp_q[qs.QRY_WHERECLAUSE] = where
                    comp_q['sub_queries'] = [
                        q for ((q, _), _) in working_clauses
                    ]
                    comp_q[qs.QRY_SUBBOBS] = [
                        b for ((_, b), _) in working_clauses
                    ]

                    #create a list of counts of how many records match N terms,
                    #N-1 terms, ..., down to M, e.g. of the form 34 | 384 | 1094
                    records_matching_count = dict(
                        zip(range(comp_q[qs.QRY_M], comp_q[qs.QRY_N] + 1),
                            [0] * comp_q[qs.QRY_N]))
                    for id in matching_ids_set:
                        matching_terms = [
                            1 if id in clause[1][rdb.DBF_MATCHINGRECORDIDS]
                            else 0 for clause in working_clauses
                        ]
                        term_matches = sum(matching_terms)
                        records_matching_count[term_matches] += 1
                    matching_records_counts = sorted(
                        records_matching_count.values(), reverse=True)
                    #update the results dictionary with the new calculated values
                    comp_q_results[qs.QRY_SUBRESULTS] = [
                        r for (_, r) in working_clauses
                    ]
                    comp_q_results[
                        rdb.DBF_MATCHINGRECORDIDS] = matching_ids_set
                    comp_q_results[
                        qs.QRY_MATCHINGRECORDCOUNTS] = matching_records_counts

                    #make sure duplicate queries (and their atomic sub_components) have the same qids
                    comp_q[qs.QRY_QID] = qids.full_where_has_been_seen(
                        comp_q[qs.QRY_QID], comp_q[qs.QRY_WHERECLAUSE])
                    comp_q_results[qs.QRY_QID] = comp_q[qs.QRY_QID]
                    for (sub_q,
                         sub_r) in zip(comp_q['sub_queries'],
                                       comp_q_results[qs.QRY_SUBRESULTS]):
                        sub_q[qs.QRY_QID] = qids.atomic_where_has_been_seen(
                            sub_q[qs.QRY_QID], sub_q[qs.QRY_WHERECLAUSE])
                        sub_r[qs.QRY_QID] = sub_q[qs.QRY_QID]

                    #write queries to the results database
                    qr.QueryResultBase.write_to_full_to_atomic_table(
                        comp_q, comp_q_results, db_object)
                    qr.QueryResultBase.write_to_full_table(
                        comp_q, comp_q_results, db_object)
                    comp_q[qs.QRY_SUBBOBS][0].process_results(
                        None, db_object, query_file_handle,
                        zip(comp_q['sub_queries'],
                            comp_q_results[qs.QRY_SUBRESULTS]))
                    self._print_query(comp_q, query_file_handle)

                    try:
                        comp_q[qs.QRY_PERF].remove('IBM1')
                    except ValueError:
                        pass

                    comp_q[qs.QRY_WHERECLAUSE] = comp_q[qs.QRY_WHERECLAUSE] + " ORDER BY " +\
                                                 comp_q[qs.QRY_WHERECLAUSE] + " DESC"
                    comp_q[qs.QRY_ENUM] = qs.CAT.P9_EQ
                    comp_q[qs.QRY_CAT] = 'P9'
                    comp_q[qs.QRY_QID] = qids.full_where_has_been_seen(
                        qids.query_id(), comp_q[qs.QRY_WHERECLAUSE])
                    comp_q_results[qs.QRY_QID] = comp_q[qs.QRY_QID]
                    qr.QueryResultBase.write_to_full_to_atomic_table(
                        comp_q, comp_q_results, db_object)
                    qr.QueryResultBase.write_to_full_table(
                        comp_q, comp_q_results, db_object)
                    comp_q[qs.QRY_SUBBOBS][0].process_results(
                        None, db_object, query_file_handle,
                        zip(comp_q['sub_queries'],
                            comp_q_results[qs.QRY_SUBRESULTS]))
                    self._print_query(comp_q, query_file_handle)
                    refined_queries.append((comp_q, comp_q_results))
                logger.info("FINISHED QUERY %d of %d, TOTAL THAT WORK %d" %
                            (x, len(self.queries), refined_total))
                if comp_q_refined == True:
                    logger.info(
                        "WORKING QUERY INFORMATION where_clause = %s, sftm = %d, rss = %d"
                        % (comp_q[qs.QRY_WHERECLAUSE], stfm, count))
        self.refined_queries_results = refined_queries
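
The records_matching_count bookkeeping above tallies, for every record in the M-of-N result set, how many of the N clauses it satisfies, and reports the counts sorted from most-matched down. A self-contained sketch with M = 2, N = 3:

    M, N = 2, 3
    clause_ids = [set([1, 2, 3]), set([2, 3]), set([3, 4])]
    matching = set(i for i in set.union(*clause_ids)
                   if sum(i in c for c in clause_ids) >= M)
    counts = dict((k, 0) for k in range(M, N + 1))
    for i in matching:
        counts[sum(i in c for c in clause_ids)] += 1
    print(sorted(counts.values(), reverse=True))  # -> [1, 1]
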
Example #15
    def produce_query_batches(self):
        """
        This generates returns a query_batch object that holds the logic
        for creating aggregators for the queries, and also contains the 
        logic for processing the results and printing the query
        """

        self.bobs = []
        query_dicts = []
        for f in xrange(len(self.__dists)):
            for q in xrange(len(self.__queries)):
                query_dicts = []
                for count in xrange(self.__queries[q]['no_queries'] *
                                    OVER_GENERATION_RATIO):
                    for r in xrange(self.__queries[q][qs.QRY_RANGEEXPL],
                                    self.__queries[q][qs.QRY_RANGEEXPU] + 1):
                        self.__count += 1
                        logger.debug('P2-foo: Created %d out of %d queries' %
                                     (self.__count, self.__total))
                        field = sv.sql_info[self.__fields[f]][0]
                        rss_lower = self.__queries[q][qs.QRY_LRSS]
                        rss_upper = self.__queries[q][qs.QRY_URSS]
                        rss_avg = (rss_lower + rss_upper) / 2
                        range = 2**r
                        if self.__queries[q][qs.QRY_TYPE] == 'range':
                            try:
                                (lower, upper) = self.__dists[
                                    self.__fields[f]].generate_two_sided(
                                        rss_avg, range, self.__db_size)
                            except bd.FooInputs:
                                (lower, upper) = (0, 0)
                            enum = qs.CAT.P2_RANGE_FOO
                            where_clause = '%s BETWEEN %d AND %d' % (
                                field, lower, upper)
                        else:
                            try:
                                lower = self.__dists[
                                    self.__fields[f]].generate_greater_than(
                                        rss_avg, self.__db_size)
                            except bd.FooInputs:
                                lower = 0
                            upper = 2**64 - 1
                            enum = qs.CAT.P2_GREATER_FOO
                            where_clause = '%s >= %d' % (field, lower)
                        qid = qids.query_id()
                        if qid != qids.full_where_has_been_seen(
                                qid, where_clause):
                            continue
                        query_dicts.append({
                            qs.QRY_ENUM: enum,
                            qs.QRY_QID: qid,
                            qs.QRY_CAT: self.__cat,
                            qs.QRY_SUBCAT: self.__queries[q][qs.QRY_TYPE],
                            qs.QRY_DBNUMRECORDS: self.__db_size,
                            qs.QRY_DBRECORDSIZE: self.__row_width,
                            qs.QRY_PERF: self.__perf,
                            qs.QRY_WHERECLAUSE: where_clause,
                            qs.QRY_FIELD: field,
                            qs.QRY_FIELDTYPE: sv.sql_info[self.__fields[f]][1],
                            qs.QRY_TYPE: self.__queries[q][qs.QRY_TYPE],
                            qs.QRY_LRSS: rss_lower,
                            qs.QRY_URSS: rss_upper,
                            qs.QRY_RANGEEXP: r,
                            qs.QRY_LBOUND: lower,
                            qs.QRY_UBOUND: upper,
                            qs.QRY_RANGE: upper - lower
                        })

                self.bobs.append(
                    aqb.FooRangeQueryBatch(
                        query_dicts, len(query_dicts),
                        int(len(query_dicts) / OVER_GENERATION_RATIO), True))
        return self.bobs
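
The (generated, chosen) pair handed to the batch constructor above encodes
the over-generation scheme used throughout these generators: roughly
OVER_GENERATION_RATIO times as many candidates are produced as will survive
refinement, leaving headroom for candidates whose ground-truth counts miss
the target window. A minimal sketch of that arithmetic, assuming a ratio of
10 and a hypothetical trim_batch helper (neither is taken from the real aqb
module):

OVER_GENERATION_RATIO = 10  #assumed value; the module defines the real one

def trim_batch(query_dicts):
    #mirror the (generated, chosen) arguments passed to the batch
    #constructors above: keep roughly 1-in-RATIO of the candidates
    generated = len(query_dicts)
    chosen = int(generated / OVER_GENERATION_RATIO)
    return generated, chosen

For example, 50 generated candidates with a ratio of 10 leave 5 to be
chosen. For the range flavor itself, a range exponent of r = 10 gives a
BETWEEN window of width 2**10 = 1024 values, while the 'greater' flavor
pins the upper bound at 2**64 - 1.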
Example #16
    def testFullWhere(self):
        #the first qid registered for a where clause is returned as-is
        self.assertEqual(qids.full_where_has_been_seen(1, "it"), 1)
        #a later qid with the same clause gets the original qid back
        self.assertEqual(qids.full_where_has_been_seen(2, "it"), 1)
        qids.reset_full_where()
        #after a reset the clause counts as unseen again
        self.assertEqual(qids.full_where_has_been_seen(1, "it"), 1)
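
The test above pins down the contract these generators rely on: the first
qid registered for a given full where clause becomes its owner, every later
call with the same clause hands back that original qid, and reset_full_where
forgets everything. This is why produce_query_batches skips a candidate
whenever the returned qid differs from its own. A minimal dict-backed sketch
consistent with the test (the real qids module may be implemented
differently):

_seen_full_wheres = {}  #where clause -> first qid registered for it

def full_where_has_been_seen(qid, where):
    #register qid for an unseen clause; otherwise return the owner's qid
    return _seen_full_wheres.setdefault(where, qid)

def reset_full_where():
    _seen_full_wheres.clear()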
Example #17
    def produce_query_batches(self):
        """
        This generates and returns a query_batch object that holds the logic
        for creating aggregators for the queries, and also contains the 
        logic for processing the results and printing the query
        """
        self.bobs = []
        for f in xrange(len(self.__dists)):
            for q in xrange(len(self.__queries)):
                query_dicts = []
                for count in xrange(self.__queries[q]['no_queries'] *
                                    OVER_GENERATION_RATIO):
                    self.__count += 1
                    logger.info('P3/P4: Created %d out of %d queries' %
                                (self.__count, self.__total))
                    field = self.__fields[f]
                    r_u_pdf = self.__queries[q]['rss_u_pdf']
                    r_l_pdf = self.__queries[q]['rss_l_pdf']
                    kw_len = self.__queries[q][qs.QRY_KEYWORDLEN]
                    if self.__queries[q][qs.QRY_TYPE] == 'word':
                        enum = qs.CAT.P3
                        value = self.__dists[field].generate_word(
                            kw_len, r_l_pdf, r_u_pdf)
                        where_clause = 'CONTAINED_IN(%s, \'\'%s\'\')' % (
                            sv.sql_info[field][0], value.replace(
                                '\'', "\\'").lower())
                    else:
                        enum = qs.CAT.P4
                        (value, word) = self.__dists[field].generate_antistem(
                            kw_len, r_l_pdf, r_u_pdf)
                        where_clause = 'CONTAINS_STEM(%s, \'\'%s\'\')' % (
                            sv.sql_info[field][0], word.replace('\'',
                                                                "\\'").lower())
                    try:
                        #a single target RSS is widened into a +/-10% band;
                        #the lower bound must be the smaller of the two
                        RSS = self.__queries[q][qs.QRY_RSS]
                        LRSS = int(self.__queries[q][qs.QRY_RSS] / 1.1)
                        URSS = int(self.__queries[q][qs.QRY_RSS] * 1.1)
                    except KeyError:
                        RSS = (self.__queries[q][qs.QRY_LRSS] +
                               self.__queries[q][qs.QRY_URSS]) / 2.0
                        LRSS = self.__queries[q][qs.QRY_LRSS]
                        URSS = self.__queries[q][qs.QRY_URSS]

                    qid = qids.query_id()
                    if qid != qids.full_where_has_been_seen(qid, where_clause):
                        continue
                    query_dicts.append({qs.QRY_ENUM : enum,
                                        qs.QRY_QID : qid,
                                        qs.QRY_CAT : self.__cat,
                                        qs.QRY_SUBCAT : '',
                                        qs.QRY_DBNUMRECORDS : self.__db_size,
                                        qs.QRY_DBRECORDSIZE : self.__row_width,
                                        qs.QRY_PERF : self.__perf,
                                        qs.QRY_FIELD : sv.sql_info[field][0],
                                        qs.QRY_FIELDTYPE : sv.sql_info[field][1],
                                        qs.QRY_WHERECLAUSE : where_clause,
                                        qs.QRY_TYPE : self.__queries[q][qs.QRY_TYPE],
                                        qs.QRY_RSS : RSS,
                                        qs.QRY_LRSS : LRSS,
                                        qs.QRY_URSS : URSS,
                                        qs.QRY_KEYWORDLEN : kw_len,
                                        qs.QRY_SEARCHFOR : value})

                self.bobs.append(
                    aqb.KeywordQueryBatch(
                        query_dicts, count + 1,
                        int((count + 1) / OVER_GENERATION_RATIO), True))
        return self.bobs
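
The try/except fallback above accepts two query-spec styles: a single target
record-set size (qs.QRY_RSS), which is widened into a roughly +/-10%
acceptance band, or an explicit lower/upper pair (qs.QRY_LRSS, qs.QRY_URSS),
whose midpoint becomes the target. A condensed sketch of the same logic,
with plain-string keys standing in for the qs constants:

def rss_bounds(query_spec):
    #'rss', 'rss_lower', and 'rss_upper' stand in for qs.QRY_RSS,
    #qs.QRY_LRSS, and qs.QRY_URSS
    try:
        rss = query_spec['rss']
        return rss, int(rss / 1.1), int(rss * 1.1)
    except KeyError:
        lower, upper = query_spec['rss_lower'], query_spec['rss_upper']
        return (lower + upper) / 2.0, lower, upper

For example, a single target of 1000 records yields the band (909, 1100).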
Example #18
    def process_results(self,
                        agg_results,
                        db_object,
                        query_file_handle,
                        refined_queries=None):
        """
        Takes in the aggregator results and, with those results, determines
        which queries in the batch are 'interesting'. It then instantiates
        query_results for those queries and uses them to write the results
        to the results database.
        
        The refined_queries argument is a list of already-refined queries,
        for use when the caller does not wish to rely on the pre-defined
        refine_queries function.
        """
        #refine queries if not already refined.
        if refined_queries:
            self.refined_queries_results = refined_queries
        else:
            refined_queries = []
            refined_total = 0
            for x in xrange(len(self.queries)):
                comp_q = self.queries[x]
                sub_results = agg_results[qs.QRY_SUBRESULTS]
                try:
                    num_clauses = comp_q[qs.QRY_NUMCLAUSES]
                except KeyError:
                    num_clauses = comp_q[qs.QRY_N]
                sub_bobs = comp_q[qs.QRY_SUBBOBS]
                clause_q_b = []
                #create the list of possible queries that can make up the clauses
                #(they are also paired with the bobs that create them)
                for b in sub_bobs:
                    clause_q = b.produce_queries()
                    clause_q_b += [(q, b) for q in clause_q]
                clause_r = []
                #create list of results that go with those queries
                for (q, _) in clause_q_b:
                    clause_r.append(sub_results[self.result_to_agg_map[q[
                        qs.QRY_WHERECLAUSE]]])
                comp_q_results = {qs.QRY_SUBRESULTS: clause_r}
                #create a list of queries, their bobs, and their results
                clause_q_r = zip(clause_q_b, clause_r)
                clause_q_r = [((q, b), r) for ((q, b), r) in clause_q_r
                              if r[qs.QRY_VALID]]
                #try all possible combinations of the queries to test if any
                #have the right clauses to match the required ftm and rss
                seen_where_group = []
                working_clauses = []
                q_refined = False
                for clause in clause_q_r:
                    if q_refined:
                        break
                    for clause_set in itertools.combinations(
                            clause_q_r, num_clauses - 1):
                        if q_refined:
                            break
                        clause_list = [clause] + list(clause_set)
                        values = [
                            q[qs.QRY_WHERECLAUSE]
                            for ((q, _), _) in clause_list
                        ]
                        if len(values) != len(
                                set(values)) or values in seen_where_group:
                            continue
                        seen_where_group.append(values)
                        matching_ids_set = reduce(set.intersection, [
                            set(r[rdb.DBF_MATCHINGRECORDIDS])
                            for (_, r) in clause_list
                        ])
                        count = len(matching_ids_set)
                        P2_cats = [
                            q for ((q, _), _) in clause_list
                            if q[qs.QRY_CAT] == 'P2'
                        ]
                        rss_ok = (qbs.get_rss_lower(comp_q[qs.QRY_ENUM]) <=
                                  count <=
                                  qbs.get_rss_upper(comp_q[qs.QRY_ENUM]))
                        if not (rss_ok and len(P2_cats) <= 1):
                            continue

                        #this combination worked, so don't need to refine further for this
                        #particular query
                        q_refined = True
                        refined_total += 1
                        working_clauses = clause_list
                        #reorder clauses so any P2 clause comes last
                        re_ordered_clauses = []
                        last_clause = None
                        for ((q, b), r) in working_clauses:
                            if q[qs.QRY_CAT] == 'P2':
                                last_clause = ((q, b), r)
                            else:
                                re_ordered_clauses.append(((q, b), r))
                        if last_clause:
                            re_ordered_clauses.append(last_clause)

                        working_clauses = re_ordered_clauses

                        #update query with chosen clauses
                        whereclauses = [
                            q[qs.QRY_WHERECLAUSE]
                            for ((q, _), _) in working_clauses
                        ]
                        comp_q[qs.QRY_WHERECLAUSE] = " AND ".join(whereclauses)
                        comp_q['sub_queries'] = [
                            q for ((q, _), _) in working_clauses
                        ]
                        comp_q[qs.QRY_SUBBOBS] = [
                            b for ((_, b), _) in working_clauses
                        ]
                        ftm_match = len(
                            working_clauses[0][1][rdb.DBF_MATCHINGRECORDIDS])
                        matching_ids_set = reduce(set.intersection, [
                            set(r[rdb.DBF_MATCHINGRECORDIDS])
                            for (_, r) in working_clauses
                        ])
                        comp_q_results[qs.QRY_SUBRESULTS] = [
                            r for (_, r) in working_clauses
                        ]
                        comp_q_results[
                            rdb.DBF_MATCHINGRECORDIDS] = matching_ids_set
                        comp_q_results[
                            qs.QRY_NUMRECORDSMATCHINGFIRSTTERM] = ftm_match

                        refined_queries.append((comp_q, comp_q_results))

                #the where clause, chosen sub-queries, and aggregator results
                #have all been updated above for the working combination
                logger.info("FINISHED QUERY %d of %d, TOTAL THAT WORK %d" %
                            (x, len(self.queries), refined_total))
                if q_refined == True:
                    logger.info(
                        "WORKING QUERY INFORMATION where_clause = %s, ftm = %d, rss = %d"
                        % (comp_q[qs.QRY_WHERECLAUSE], ftm_match, count))

            for (q, r) in refined_queries:
                q[qs.QRY_QID] = qids.full_where_has_been_seen(
                    q[qs.QRY_QID], q[qs.QRY_WHERECLAUSE])
                r[qs.QRY_QID] = q[qs.QRY_QID]
                for (sub_q, sub_r) in zip(q['sub_queries'],
                                          r[qs.QRY_SUBRESULTS]):
                    sub_q[qs.QRY_QID] = qids.atomic_where_has_been_seen(
                        sub_q[qs.QRY_QID], sub_q[qs.QRY_WHERECLAUSE])
                    sub_r[qs.QRY_QID] = sub_q[qs.QRY_QID]

            #capping at the chosen number of queries
            self.refined_queries_results = refined_queries
        #create result objects and write to ground truth database
        for (q, r) in self.refined_queries_results:
            qr.QueryResultBase.write_to_full_to_atomic_table(q, r, db_object)
            qr.QueryResultBase.write_to_full_table(q, r, db_object)
            q[qs.QRY_SUBBOBS][0].process_results(
                None, db_object, query_file_handle,
                zip(q['sub_queries'], r[qs.QRY_SUBRESULTS]))
        #writing queries in sql format to file
        for (q, _) in self.refined_queries_results:
            if q is not None:
                self._print_query(q, query_file_handle)
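
Stripped of the bookkeeping, the refinement above is a combinatorial search:
pair each candidate clause with its set of matching record ids, then look
for num_clauses distinct clauses whose intersected match count lands inside
the composite query's record-set-size window. A simplified sketch, assuming
candidates arrive as (where_clause, id_set) pairs:

import itertools

def find_working_clauses(candidates, num_clauses, rss_lower, rss_upper):
    #return the first combination whose intersection size falls inside
    #[rss_lower, rss_upper], or (None, None) if no combination works
    for combo in itertools.combinations(candidates, num_clauses):
        matching = set.intersection(*[ids for (_, ids) in combo])
        if rss_lower <= len(matching) <= rss_upper:
            return list(combo), matching
    return None, None

The real loop layers three more rules on top of this: clause groups are
deduplicated through seen_where_group, at most one P2 clause is allowed per
combination, and any P2 clause is reordered to the end so the first-term
match count comes from a non-P2 clause.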