Example #1
    def testRSSUpper(self):
        #upper bounds only widen: a smaller re-set value is ignored,
        #a larger one sticks
        qbs.set_result_set_size_range_upper('one', 10)
        qbs.set_result_set_size_range_upper('two', 100)
        qbs.set_result_set_size_range_upper('one', 1)
        qbs.set_result_set_size_range_upper('two', 1000)
        self.assertEqual(qbs.get_rss_upper('one'), 10)
        self.assertEqual(qbs.get_rss_upper('two'), 1000)
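
The assertions above pin down the setter's behaviour: re-setting an upper bound never narrows it ('one' keeps 10 after being set to 1, while 'two' widens from 100 to 1000). A minimal stand-in for the qbs module that is consistent with that behaviour, offered purely as an illustrative sketch and not as the real query_bounds implementation:

_RSS_UPPER = {}

def set_result_set_size_range_upper(category, upper):
    #keep the widest upper bound seen so far; smaller values are ignored
    current = _RSS_UPPER.get(category)
    if current is None or upper > current:
        _RSS_UPPER[category] = upper

def get_rss_upper(category):
    #raises KeyError for an unknown category, which callers such as
    #Example #5 below guard against with try/except
    return _RSS_UPPER[category]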
Example #2
    def refine_queries(self, agg_result):
        """
        Takes in 'agg_result' which is the result from the aggregator
        for this BOQ.
        
        Selects which queries should be recorded in the results database. 
        To discard a query it simply drops it and the associated result.
        """
        range_seen = set()

        #selecting queries that match.
        queries = []
        assert len(self.queries) == len(agg_result[qs.QRY_SUBRESULTS])
        for q, r in zip(self.queries, agg_result[qs.QRY_SUBRESULTS]):
            assert q
            assert r
            assert q[qs.QRY_QID] >= r[qs.QRY_QID]
            count = len(r[rdb.DBF_MATCHINGRECORDIDS])
            #Weed out incorrect counts and previously seen range values
            if all([
                    qbs.get_rss_lower(q[qs.QRY_ENUM]) <= count,
                    qbs.get_rss_upper(q[qs.QRY_ENUM]) >= count,
                    q[qs.QRY_RANGEEXP] not in range_seen, r[qs.QRY_VALID]
            ]):
                queries.append((q, r))
                range_seen.add(q[qs.QRY_RANGEEXP])

        #capping at choose-num number of queries
        self.refined_queries_results = queries
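
The inclusive bounds check in the if-block above recurs throughout these refine_queries variants. A small hypothetical helper that captures the pattern (qbs and the query enum are assumed to be the same objects used in the examples; this helper is not part of the original code):

def count_within_rss_bounds(query_enum, count):
    """Return True when 'count' falls inside the configured result set
    size range for this query category, inclusive on both ends."""
    return qbs.get_rss_lower(query_enum) <= count <= qbs.get_rss_upper(query_enum)

With it, the filter above reduces to checking count_within_rss_bounds(q[qs.QRY_ENUM], count), the not-yet-seen range expression, and r[qs.QRY_VALID].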
Example #3
    def refine_queries(self, agg_result):
        """
        Takes in 'agg_result', which is the result from the aggregator
        for this BOQ.
        Selects which queries should be recorded in the results database.
        To do this it builds a new list of the selected queries paired
        with their results.
        """
        #selecting queries that match.
        queries = []
        assert len(self.queries) == len(agg_result[qs.QRY_SUBRESULTS])
        for q, r in zip(self.queries, agg_result[qs.QRY_SUBRESULTS]):
            assert q
            assert r
            assert q[qs.QRY_QID] >= r[qs.QRY_QID]
            potential_queries = []
            for (value,
                 value_result) in r[qs.QRY_FISHING_MATCHES_FOUND].iteritems():
                count = len(value_result)
                if qbs.get_rss_lower(q[qs.QRY_ENUM]) <= count and\
                   qbs.get_rss_upper(q[qs.QRY_ENUM]) >= count:
                    (value, where) = self.format_value_and_where(
                        sv.sql_name_to_enum(q[qs.QRY_FIELD]), value)
                    q[qs.QRY_VALUE] = value
                    q[qs.QRY_WHERECLAUSE] = where
                    r[rdb.DBF_MATCHINGRECORDIDS] = value_result
                    potential_queries.append((q, r))
            if potential_queries:
                chosen_q = random.sample(potential_queries, 1)[0]
                chosen_q[0][qs.QRY_QID] = \
                    qids.full_where_has_been_seen(chosen_q[0][qs.QRY_QID],
                                                  chosen_q[0][qs.QRY_WHERECLAUSE])
                queries.append(chosen_q)
        #capping at choose-num number of queries
        self.refined_queries_results = queries
Example #4
    def __init__(self, query):
        self._qid = query[qs.QRY_QID]
        self._field = sv.sql_name_to_enum(query[qs.QRY_FIELD])
        self._alarmwords = set([query[qs.QRY_ALARMWORDONE], query[qs.QRY_ALARMWORDTWO]])
        self._alarmword_distance = query[qs.QRY_ALARMWORDDISTANCE]
        #the configured upper bound on result set size doubles as the
        #processing cutoff
        self._process_cutoff = qbs.get_rss_upper(query[qs.QRY_ENUM])
        self._count = 0
Example #5
    def __init__(self, query):
        ''' Initialize the needed class variables from the query '''
        self._qid = query[qs.QRY_QID]
        self._field = sv.sql_name_to_enum(query[qs.QRY_FIELD])
        # try/except block is mostly for backwards compatibility
        # with unit tests
        try:
            self._process_cutoff = qbs.get_rss_upper(query[qs.QRY_ENUM])
        except KeyError:
            self._process_cutoff = 100000
        self._count = 0
        #If the query is atomic (i.e. top level), we want to apply a limit
        #on what it can collect, otherwise we want no process limit in
        #effect
        try:
            self._top_level = query['top_level']
        except KeyError:
            self._top_level = True
Example #6
    def refine_queries(self, agg_result):
        '''
        Selects the queries that work given the chosen result set size,
        and once those are selected, orders the results so that rows whose
        words are closer together appear towards the front of the id list.
        '''
        queries = []
        assert len(self.queries) == len(agg_result[qs.QRY_SUBRESULTS])
        for q, r in zip(self.queries, agg_result[qs.QRY_SUBRESULTS]):
            assert q
            assert r
            assert q[qs.QRY_QID] >= r[qs.QRY_QID]
            row_dist = r[qs.QRY_MATCHINGROWIDANDDISTANCES]
            count = len(row_dist)
            if all([
                    qbs.get_rss_lower(q[qs.QRY_ENUM]) <= count,
                    qbs.get_rss_upper(q[qs.QRY_ENUM]) >= count, r[qs.QRY_VALID]
            ]):
                dist_dict = {}
                for (row_id, dist) in row_dist:
                    try:
                        dist_dict[dist].append(row_id)
                    except KeyError:
                        dist_dict[dist] = [row_id]
                ids = []
                counts = []
                for (dist, row_ids) in sorted(dist_dict.iteritems(),
                                              reverse=False):
                    ids += row_ids
                    counts.append(len(row_ids))
                r[rdb.DBF_MATCHINGRECORDIDS] = ids
                r[qs.QRY_MATCHINGRECORDCOUNTS] = '|'.join(
                    map(str, sorted(counts, reverse=False)))
                queries.append((q, r))

        #capping at choose-num number of queries
        self.refined_queries_results = queries
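
The distance grouping in Example #6 can also be written with collections.defaultdict. The sketch below reproduces the same ordering (smallest word distance first) and the same '|'-joined counts string, assuming row_dist is a list of (row_id, distance) pairs; it is an illustrative rewrite, not the original helper:

from collections import defaultdict

def order_ids_by_distance(row_dist):
    #group record ids by the distance between the matched words
    dist_dict = defaultdict(list)
    for row_id, dist in row_dist:
        dist_dict[dist].append(row_id)
    #emit ids for the smallest distances first, as Example #6 does
    ids = []
    counts = []
    for dist in sorted(dist_dict):
        ids += dist_dict[dist]
        counts.append(len(dist_dict[dist]))
    return ids, '|'.join(map(str, sorted(counts)))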
Example #7
    def refine_queries(self, agg_result):
        """
        Takes in 'agg_result' which is the result from the aggregator
        for this BOQ.
        Selects which queries should be recorded in the results database. 
        To do this it creates a new list of associated selected queries
        and pairs them with their results. 
        """
        #selecting queries that match.
        queries = []
        assert len(self.queries) == len(agg_result[qs.QRY_SUBRESULTS])
        for q, r in zip(self.queries, agg_result[qs.QRY_SUBRESULTS]):
            assert q
            assert r
            assert q[qs.QRY_QID] >= r[qs.QRY_QID]
            count = len(r[rdb.DBF_MATCHINGRECORDIDS])
            if all([
                    qbs.get_rss_lower(q[qs.QRY_ENUM]) <= count,
                    qbs.get_rss_upper(q[qs.QRY_ENUM]) >= count, r[qs.QRY_VALID]
            ]):
                queries.append((q, r))

        #capping at choose-num number of queries
        self.refined_queries_results = queries
Example #8
    def process_results(self,
                        agg_results,
                        db_object,
                        query_file_handle,
                        refined_queries=None):
        """
        Takes in the aggregator results, with those results, determines
        which queries in the batch are 'interesting' it then instantiates
        query_results for those queries and uses it to write it to the 
        results database. 
        
        Refine arguement is a list of already refined queries if the user 
        does not wish to rely on the pre-defined refine queries function
        """
        #refine queries if not already refined.
        if refined_queries is not None:
            self.refined_queries_results = refined_queries
            for (q, r) in self.refined_queries_results:
                qr.QueryResultBase.write_to_full_to_atomic_table(
                    q, r, db_object)
                qr.QueryResultBase.write_to_full_table(q, r, db_object)
                q[qs.QRY_SUBBOBS][0].process_results(
                    None, db_object, query_file_handle,
                    zip(q['sub_queries'], r[qs.QRY_SUBRESULTS]))
                self._print_query(q, query_file_handle)

                try:
                    q[qs.QRY_PERF].remove('IBM1')
                except ValueError:
                    pass

                q[qs.QRY_WHERECLAUSE] = q[qs.QRY_WHERECLAUSE] + " ORDER BY " +\
                                        q[qs.QRY_WHERECLAUSE] + " DESC"
                q[qs.QRY_ENUM] = qs.CAT.P9_EQ
                q[qs.QRY_CAT] = 'P9'
                q[qs.QRY_QID] = qids.full_where_has_been_seen(
                    qids.query_id(), q[qs.QRY_WHERECLAUSE])
                r[qs.QRY_QID] = q[qs.QRY_QID]
                qr.QueryResultBase.write_to_full_to_atomic_table(
                    q, r, db_object)
                qr.QueryResultBase.write_to_full_table(q, r, db_object)
                self._print_query(q, query_file_handle)
                q[qs.QRY_SUBBOBS][0].process_results(
                    None, db_object, query_file_handle,
                    zip(q['sub_queries'], r[qs.QRY_SUBRESULTS]))
        else:
            refined_total = 0
            refined_queries = []
            for x in xrange(len(self.queries)):
                comp_q = self.queries[x]
                sub_results = agg_results[qs.QRY_SUBRESULTS]
                num_clauses = comp_q[qs.QRY_N]
                sub_bobs = comp_q[qs.QRY_SUBBOBS]
                clause_q_b = []
                #create the list of possible queries that can make up the clauses
                #(they are also paired with the bobs that create them)
                for b in sub_bobs:
                    clause_q = b.produce_queries()
                    clause_q_b += [(q, b) for q in clause_q]
                clause_r = []
                #create list of results that go with those queries

                for (q, _) in clause_q_b:
                    clause_r.append(sub_results[self.result_to_agg_map[q[
                        qs.QRY_WHERECLAUSE]]])
                comp_q_results = {qs.QRY_SUBRESULTS: clause_r}
                #create a list of queries, their bobs, and their results
                clause_q_r = zip(clause_q_b, clause_r)
                clause_q_r = sorted(
                    clause_q_r,
                    key=lambda ((q, b), r): len(r[rdb.DBF_MATCHINGRECORDIDS]))
                #try all possible combinations of the queries to test if any
                #have the correct combinations to match the required ftm and rss
                seen_where_group = []
                comp_q_refined = False
                for clause_set in itertools.combinations(
                        clause_q_r, num_clauses):
                    if comp_q_refined:
                        continue
                    clause_list = list(clause_set)
                    values = [
                        q[qs.QRY_WHERECLAUSE] for ((q, _), _) in clause_list
                    ]
                    if len(values) != len(
                            set(values)) or values in seen_where_group:
                        continue
                    seen_where_group.append(values)

                    #check to see if it is working
                    #if stfm doesn't match, don't bother continuing
                    stfm = 0
                    for offset in xrange(comp_q[qs.QRY_N] - comp_q[qs.QRY_M] +
                                         1):
                        (_, r) = clause_list[offset]
                        stfm += len(r[rdb.DBF_MATCHINGRECORDIDS])
                    if not all([
                            stfm >= qbs.get_tm_rss_lower(comp_q[qs.QRY_ENUM]),
                            stfm <= qbs.get_tm_rss_upper(comp_q[qs.QRY_ENUM])
                    ]):
                        continue
                    #if stfm does match, calculate the set intersection
                    matching_ids_set = set()
                    for m_set in itertools.combinations(
                            clause_list, comp_q[qs.QRY_M]):
                        matching_ids_set.update(
                            reduce(set.intersection, [
                                set(r[rdb.DBF_MATCHINGRECORDIDS])
                                for (_, r) in m_set
                            ]))
                    count = len(matching_ids_set)

                    #check overall compliance
                    if not all([
                            count >= qbs.get_rss_lower(comp_q[qs.QRY_ENUM]),
                            count <= qbs.get_rss_upper(comp_q[qs.QRY_ENUM])
                    ]):
                        continue

                    comp_q_refined = True
                    refined_total += 1
                    ##PROCESSING THE WORKING CLAUSE_LIST
                    working_clauses = clause_list
                    whereclauses = [
                        q[qs.QRY_WHERECLAUSE]
                        for ((q, _), _) in working_clauses
                    ]
                    where = ", ".join(whereclauses)
                    where = 'M_OF_N(%d, %d, %s)' % (comp_q[qs.QRY_M],
                                                    comp_q[qs.QRY_N], where)
                    #update query with chosen clauses
                    comp_q[qs.QRY_WHERECLAUSE] = where
                    comp_q['sub_queries'] = [
                        q for ((q, _), _) in working_clauses
                    ]
                    comp_q[qs.QRY_SUBBOBS] = [
                        b for ((_, b), _) in working_clauses
                    ]

                    #have to create a list of counts of how many records match N terms,
                    #N-1 terms, and so on down to M, e.g. of the form 34 | 384 | 1094
                    records_matching_count = dict(
                        zip(range(comp_q[qs.QRY_M], comp_q[qs.QRY_N] + 1),
                            [0] * comp_q[qs.QRY_N]))
                    for id in matching_ids_set:
                        matching_terms = [
                            1 if id in clause[1][rdb.DBF_MATCHINGRECORDIDS]
                            else 0 for clause in working_clauses
                        ]
                        term_matches = sum(matching_terms)
                        records_matching_count[term_matches] += 1
                    matching_records_counts = sorted(
                        records_matching_count.values(), reverse=True)
                    #update the results dictionary with the new calculated values
                    comp_q_results[qs.QRY_SUBRESULTS] = [
                        r for (_, r) in working_clauses
                    ]
                    comp_q_results[
                        rdb.DBF_MATCHINGRECORDIDS] = matching_ids_set
                    comp_q_results[
                        qs.QRY_MATCHINGRECORDCOUNTS] = matching_records_counts

                    #make sure duplicate queries (and their atomic sub_components) have the same qids
                    comp_q[qs.QRY_QID] = qids.full_where_has_been_seen(
                        comp_q[qs.QRY_QID], comp_q[qs.QRY_WHERECLAUSE])
                    comp_q_results[qs.QRY_QID] = comp_q[qs.QRY_QID]
                    for (sub_q,
                         sub_r) in zip(comp_q['sub_queries'],
                                       comp_q_results[qs.QRY_SUBRESULTS]):
                        sub_q[qs.QRY_QID] = qids.atomic_where_has_been_seen(
                            sub_q[qs.QRY_QID], sub_q[qs.QRY_WHERECLAUSE])
                        sub_r[qs.QRY_QID] = sub_q[qs.QRY_QID]

                    #write queries to the results database
                    qr.QueryResultBase.write_to_full_to_atomic_table(
                        comp_q, comp_q_results, db_object)
                    qr.QueryResultBase.write_to_full_table(
                        comp_q, comp_q_results, db_object)
                    comp_q[qs.QRY_SUBBOBS][0].process_results(
                        None, db_object, query_file_handle,
                        zip(comp_q['sub_queries'],
                            comp_q_results[qs.QRY_SUBRESULTS]))
                    self._print_query(comp_q, query_file_handle)

                    try:
                        comp_q[qs.QRY_PERF].remove('IBM1')
                    except ValueError:
                        pass

                    comp_q[qs.QRY_WHERECLAUSE] = comp_q[qs.QRY_WHERECLAUSE] + " ORDER BY " +\
                                                 comp_q[qs.QRY_WHERECLAUSE] + " DESC"
                    comp_q[qs.QRY_ENUM] = qs.CAT.P9_EQ
                    comp_q[qs.QRY_CAT] = 'P9'
                    comp_q[qs.QRY_QID] = qids.full_where_has_been_seen(
                        qids.query_id(), comp_q[qs.QRY_WHERECLAUSE])
                    comp_q_results[qs.QRY_QID] = comp_q[qs.QRY_QID]
                    qr.QueryResultBase.write_to_full_to_atomic_table(
                        comp_q, comp_q_results, db_object)
                    qr.QueryResultBase.write_to_full_table(
                        comp_q, comp_q_results, db_object)
                    comp_q[qs.QRY_SUBBOBS][0].process_results(
                        None, db_object, query_file_handle,
                        zip(comp_q['sub_queries'],
                            comp_q_results[qs.QRY_SUBRESULTS]))
                    self._print_query(comp_q, query_file_handle)
                    refined_queries.append((comp_q, comp_q_results))
                logger.info("FINISHED QUERY %d of %d, TOTAL THAT WORK %d" %
                            (x, len(self.queries), refined_total))
                if comp_q_refined:
                    logger.info(
                        "WORKING QUERY INFORMATION where_clause = %s, sftm = %d, rss = %d"
                        % (comp_q[qs.QRY_WHERECLAUSE], stfm, count))
        self.refined_queries_results = refined_queries
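
The heart of Example #8 is the union-of-intersections step that collects every record id appearing in at least M of the N chosen clauses. A standalone sketch of that step over plain sets of record ids (the function name and arguments are illustrative, not part of the original code):

import itertools
from functools import reduce  #a builtin in Python 2, imported here for clarity

def ids_matching_at_least_m(clause_id_sets, m):
    #an id lands in the result exactly when it occurs in at least m of the
    #clause sets, i.e. in the intersection of some m-sized combination
    matching = set()
    for subset in itertools.combinations(clause_id_sets, m):
        matching.update(reduce(set.intersection, subset))
    return matching

Example #8 then checks that the size of this set lies within the rss bounds before accepting the clause combination.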
Example #9
    def process_results(self,
                        agg_results,
                        db_object,
                        query_file_handle,
                        refined_queries=None):
        """
        Takes in the aggregator results, with those results, determines
        which queries in the batch are 'interesting' it then instantiates
        query_results for those queries and uses it to write it to the 
        results database. 
        
        Refine arguement is a list of already refined queries if the user 
        does not wish to rely on the pre-defined refine queries function
        """
        #refine queries if not already refined.
        if refined_queries:
            self.refined_queries_results = refined_queries
            for (comp_q, comp_q_results) in self.refined_queries_results:
                qr.QueryResultBase.write_to_full_to_atomic_table(
                    comp_q, comp_q_results, db_object)
                qr.QueryResultBase.write_to_full_table(comp_q, comp_q_results,
                                                       db_object)
                comp_q[qs.QRY_SUBBOBS][0].process_results(
                    None, db_object, query_file_handle,
                    zip(comp_q['sub_queries'],
                        comp_q_results[qs.QRY_SUBRESULTS]))
                #print out the query
                self._print_query(comp_q, query_file_handle)
        else:
            refined_total = 0
            refined_queries = []
            for x in xrange(len(self.queries)):
                comp_q = self.queries[x]
                sub_results = agg_results[qs.QRY_SUBRESULTS]

                num_clauses = comp_q[qs.QRY_NUMCLAUSES]

                sub_bobs = comp_q[qs.QRY_SUBBOBS]
                clause_q_b = []
                working_clauses = None
                #create the list of possible queries that can make up the clauses
                #(they are also paired with the bobs that create them)
                for b in sub_bobs:
                    clause_q = b.produce_queries()
                    clause_q_b += [(q, b) for q in clause_q]
                clause_r = []
                #create list of results that go with those queries

                for (q, _) in clause_q_b:
                    clause_r.append(sub_results[self.result_to_agg_map[q[
                        qs.QRY_WHERECLAUSE]]])
                comp_q_results = {qs.QRY_SUBRESULTS: clause_r}
                #create a list of queries, their bobs, and their results
                clause_q_r = zip(clause_q_b, clause_r)
                clause_q_r = [((q, b), r) for ((q, b), r) in clause_q_r
                              if len(r[rdb.DBF_MATCHINGRECORDIDS]) <=
                              qbs.get_tm_rss_upper(comp_q[qs.QRY_ENUM])]
                if len(clause_q_r) < num_clauses:
                    continue
                #try all possible combinations of the queries to test if any
                #have the correct combinations to match the required ftm and rss
                seen_where_group = []
                working_clauses = []
                q_refined = False
                for clause_set in itertools.combinations(
                        clause_q_r, num_clauses):
                    #query has already been refined
                    if q_refined:
                        continue
                    clause_list = list(clause_set)
                    values = [
                        q[qs.QRY_WHERECLAUSE] for ((q, _), _) in clause_list
                    ]
                    #there are duplicate values or this where has already been seen
                    if len(values)!=len(set(values)) or\
                        values in seen_where_group:
                        continue
                    seen_where_group.append(values)

                    #check conditions
                    matching_ids_set = reduce(set.union, [
                        set(r[rdb.DBF_MATCHINGRECORDIDS])
                        for (_, r) in clause_list
                    ])
                    count = len(matching_ids_set)
                    all_match = sum(
                        map(len, [
                            r[rdb.DBF_MATCHINGRECORDIDS]
                            for (_, r) in clause_list
                        ]))
                    if not all([
                            count >= qbs.get_rss_lower(comp_q[qs.QRY_ENUM]),
                            count <= qbs.get_rss_upper(comp_q[qs.QRY_ENUM]),
                            all_match >= qbs.get_tm_rss_lower(
                                comp_q[qs.QRY_ENUM]), all_match <=
                            qbs.get_tm_rss_upper(comp_q[qs.QRY_ENUM])
                    ]):
                        continue
                    #this combination worked, so don't need to refine further for this
                    #particular query
                    q_refined = True
                    refined_total += 1
                    working_clauses = clause_list
                    #update query with chosen clauses
                    whereclauses = [
                        q[qs.QRY_WHERECLAUSE]
                        for ((q, _), _) in working_clauses
                    ]
                    comp_q[qs.QRY_WHERECLAUSE] = " OR ".join(whereclauses)
                    comp_q['sub_queries'] = [
                        q for ((q, _), _) in working_clauses
                    ]
                    comp_q[qs.QRY_SUBBOBS] = [
                        b for ((_, b), _) in working_clauses
                    ]

                    ftm_match = len(
                        working_clauses[0][1][rdb.DBF_MATCHINGRECORDIDS])

                    comp_q_results[qs.QRY_SUBRESULTS] = [
                        r for (_, r) in working_clauses
                    ]
                    comp_q_results[
                        rdb.DBF_MATCHINGRECORDIDS] = matching_ids_set
                    comp_q_results[
                        qs.QRY_SUMRECORDSMATCHINGEACHTERM] = all_match
                    comp_q_results[
                        qs.QRY_NUMRECORDSMATCHINGFIRSTTERM] = ftm_match

                    #make sure duplicate queries (and their atomic sub_components) have the same qids
                    comp_q[qs.QRY_QID] = qids.full_where_has_been_seen(
                        comp_q[qs.QRY_QID], comp_q[qs.QRY_WHERECLAUSE])
                    comp_q_results[qs.QRY_QID] = comp_q[qs.QRY_QID]
                    for (sub_q,
                         sub_r) in zip(comp_q['sub_queries'],
                                       comp_q_results[qs.QRY_SUBRESULTS]):
                        sub_q[qs.QRY_QID] = qids.atomic_where_has_been_seen(
                            sub_q[qs.QRY_QID], sub_q[qs.QRY_WHERECLAUSE])
                        sub_r[qs.QRY_QID] = sub_q[qs.QRY_QID]

                    #create result objects and write to ground truth database
                    qr.QueryResultBase.write_to_full_to_atomic_table(
                        comp_q, comp_q_results, db_object)
                    qr.QueryResultBase.write_to_full_table(
                        comp_q, comp_q_results, db_object)
                    comp_q[qs.QRY_SUBBOBS][0].process_results(
                        None, db_object, query_file_handle,
                        zip(comp_q['sub_queries'],
                            comp_q_results[qs.QRY_SUBRESULTS]))
                    refined_queries.append((comp_q, comp_q_results))

                    #print query
                    self._print_query(comp_q, query_file_handle)

                #make where clause, update the query with the chosen clauses and the
                #aggregator results with the chosen results
                logger.info("FINISHED QUERY %d of %d, TOTAL THAT WORK %d" %
                            (x, len(self.queries), refined_total))
                if q_refined:
                    logger.info(
                        "WORKING QUERY INFORMATION where_clause = %s, sftm = %d, rss = %d"
                        % (comp_q[qs.QRY_WHERECLAUSE], all_match, count))

            self.refined_queries_results = refined_queries
Example #10
    def process_results(self,
                        agg_results,
                        db_object,
                        query_file_handle,
                        refined_queries=None):
        """
        Takes in the aggregator results, with those results, determines
        which queries in the batch are 'interesting' it then instantiates
        query_results for those queries and uses it to write it to the 
        results database. 
        
        Refine arguement is a list of already refined queries if the user 
        does not wish to rely on the pre-defined refine queries function
        """
        #refine queries if not already refined.
        if refined_queries:
            self.refined_queries_results = refined_queries
        else:
            refined_queries = []
            refined_total = 0
            for x in xrange(len(self.queries)):
                comp_q = self.queries[x]
                sub_results = agg_results[qs.QRY_SUBRESULTS]
                try:
                    num_clauses = comp_q[qs.QRY_NUMCLAUSES]
                except KeyError:
                    num_clauses = comp_q[qs.QRY_N]
                sub_bobs = comp_q[qs.QRY_SUBBOBS]
                clause_q_b = []
                working_clauses = None
                #create the list of possible queries that can make up the clauses
                #(they are also paired with the bobs that create them)
                for b in sub_bobs:
                    clause_q = b.produce_queries()
                    clause_q_b += [(q, b) for q in clause_q]
                clause_r = []
                #create list of results that go with those queries
                for (q, _) in clause_q_b:
                    clause_r.append(sub_results[self.result_to_agg_map[q[
                        qs.QRY_WHERECLAUSE]]])
                comp_q_results = {qs.QRY_SUBRESULTS: clause_r}
                #create a list of queries, their bobs, and their results
                clause_q_r = zip(clause_q_b, clause_r)
                clause_q_r = [((q, b), r) for ((q, b), r) in clause_q_r
                              if r[qs.QRY_VALID]]
                #try all possible combinations of the queries to test if any
                #have the correct combinations to match the required ftm and rss
                seen_where_group = []
                working_clauses = []
                q_refined = False
                for clause in clause_q_r:
                    for clause_set in itertools.combinations(
                            clause_q_r, num_clauses - 1):
                        if q_refined:
                            continue
                        clause_list = [clause] + list(clause_set)
                        values = [
                            q[qs.QRY_WHERECLAUSE]
                            for ((q, _), _) in clause_list
                        ]
                        if len(values) != len(
                                set(values)) or values in seen_where_group:
                            continue
                        seen_where_group.append(values)
                        matching_ids_set = reduce(set.intersection, [
                            set(r[rdb.DBF_MATCHINGRECORDIDS])
                            for (_, r) in clause_list
                        ])
                        count = len(matching_ids_set)
                        P2_cats = [
                            q for ((q, _), _) in clause_list
                            if q[qs.QRY_CAT] == 'P2'
                        ]
                        if not all([
                                count >= qbs.get_rss_lower(
                                    comp_q[qs.QRY_ENUM]), count <=
                                qbs.get_rss_upper(comp_q[qs.QRY_ENUM]),
                                len(P2_cats) <= 1
                        ]):
                            continue

                        #this combination worked, so don't need to refine further for this
                        #particular query
                        q_refined = True
                        refined_total += 1
                        working_clauses = clause_list
                        #reorder clauses
                        re_ordered_clauses = []
                        last_clause = None
                        for ((q, b), r) in working_clauses:
                            if q[qs.QRY_CAT] == 'P2':
                                last_clause = ((q, b), r)
                            else:
                                re_ordered_clauses.append(((q, b), r))
                        if last_clause:
                            re_ordered_clauses.append(last_clause)

                        working_clauses = re_ordered_clauses

                        #update query with chosen clauses
                        whereclauses = [
                            q[qs.QRY_WHERECLAUSE]
                            for ((q, _), _) in working_clauses
                        ]
                        comp_q[qs.QRY_WHERECLAUSE] = " AND ".join(whereclauses)
                        comp_q['sub_queries'] = [
                            q for ((q, _), _) in working_clauses
                        ]
                        comp_q[qs.QRY_SUBBOBS] = [
                            b for ((_, b), _) in working_clauses
                        ]
                        ftm_match = len(
                            working_clauses[0][1][rdb.DBF_MATCHINGRECORDIDS])
                        matching_ids_set = reduce(set.intersection, [
                            set(r[rdb.DBF_MATCHINGRECORDIDS])
                            for (_, r) in working_clauses
                        ])
                        comp_q_results[qs.QRY_SUBRESULTS] = [
                            r for (_, r) in working_clauses
                        ]
                        comp_q_results[
                            rdb.DBF_MATCHINGRECORDIDS] = matching_ids_set
                        comp_q_results[
                            qs.QRY_NUMRECORDSMATCHINGFIRSTTERM] = ftm_match

                        refined_queries.append((comp_q, comp_q_results))

                #make where clause, update the query with the chosen clauses and the
                #aggregator results with the chosen results
                logger.info("FINISHED QUERY %d of %d, TOTAL THAT WORK %d" %
                            (x, len(self.queries), refined_total))
                if q_refined:
                    logger.info(
                        "WORKING QUERY INFORMATION where_clause = %s, ftm = %d, rss = %d"
                        % (comp_q[qs.QRY_WHERECLAUSE], ftm_match, count))

            for (q, r) in refined_queries:
                q[qs.QRY_QID] = qids.full_where_has_been_seen(
                    q[qs.QRY_QID], q[qs.QRY_WHERECLAUSE])
                r[qs.QRY_QID] = q[qs.QRY_QID]
                for (sub_q, sub_r) in zip(q['sub_queries'],
                                          r[qs.QRY_SUBRESULTS]):
                    sub_q[qs.QRY_QID] = qids.atomic_where_has_been_seen(
                        sub_q[qs.QRY_QID], sub_q[qs.QRY_WHERECLAUSE])
                    sub_r[qs.QRY_QID] = sub_q[qs.QRY_QID]

            #capping at choose-num number of queries
            self.refined_queries_results = refined_queries
            #create result objects and write to ground truth database
        for (q, r) in self.refined_queries_results:
            qr.QueryResultBase.write_to_full_to_atomic_table(q, r, db_object)
            qr.QueryResultBase.write_to_full_table(q, r, db_object)
            q[qs.QRY_SUBBOBS][0].process_results(
                None, db_object, query_file_handle,
                zip(q['sub_queries'], r[qs.QRY_SUBRESULTS]))
        #writing queries in sql format to file
        for (q, _) in self.refined_queries_results:
            if q is not None:
                self._print_query(q, query_file_handle)
Example #11
    def process_results(self,
                        agg_results,
                        db_object,
                        query_file_handle,
                        refined_queries=None):
        """
        Takes in the aggregator results, with those results, determines
        which queries in the batch are 'interesting' it then instantiates
        query_results for those queries and uses it to write it to the 
        results database. 
        
        Refine arguement is a list of already refined queries if the user 
        does not wish to rely on the pre-defined refine queries function
        """
        #refine queries if not already refined.
        if refined_queries:
            self.refined_queries_results = refined_queries
            for (comp_q, comp_q_results) in self.refined_queries_results:
                qr.QueryResultBase.write_to_full_to_atomic_table(
                    comp_q, comp_q_results, db_object)
                qr.QueryResultBase.write_to_full_table(comp_q, comp_q_results,
                                                       db_object)
                comp_q[qs.QRY_SUBBOBS][0].process_results(
                    None, db_object, query_file_handle,
                    zip(comp_q['sub_queries'],
                        comp_q_results[qs.QRY_SUBRESULTS]))
                #print out the query
                self._print_query(comp_q, query_file_handle)
        else:
            refined_total = 0
            refined_queries = []
            for x in xrange(len(self.queries)):
                comp_q = self.queries[x]
                sub_results = agg_results[qs.QRY_SUBRESULTS]
                try:
                    num_clauses = comp_q[qs.QRY_NUMCLAUSES]
                except KeyError:
                    num_clauses = comp_q[qs.QRY_N]
                sub_bobs = comp_q[qs.QRY_SUBBOBS]
                clause_q_b = []
                working_clauses = None
                #create the list of possible queries that can make up the clauses
                #(they are also paired with the bobs that create them)
                for b in sub_bobs:
                    clause_q = b.produce_queries()
                    clause_q_b += [(q, b) for q in clause_q]
                clause_r = []
                #create list of results that go with those queries

                for (q, _) in clause_q_b:
                    clause_r.append(sub_results[self.result_to_agg_map[q[
                        qs.QRY_WHERECLAUSE]]])
                comp_q_results = {qs.QRY_SUBRESULTS: clause_r}
                #create a list of queries, their bobs, and their results
                clause_q_r = zip(clause_q_b, clause_r)

                clause_q_r = sorted(
                    clause_q_r,
                    key=lambda ((q, b), r): len(r[rdb.DBF_MATCHINGRECORDIDS]))
                #try all possible combinations of the queries to test if any
                #have the correct combinations to match the required ftm and rss
                seen_where_group = []
                working_clauses = []
                q_refined = False
                for clause in clause_q_r:
                    #don't need to check permutations if ftm doesn't match
                    if q_refined:
                        continue
                    ftm_match = len(clause[1][rdb.DBF_MATCHINGRECORDIDS])
                    if not all([
                            ftm_match >= qbs.get_tm_rss_lower(
                                comp_q[qs.QRY_ENUM]), ftm_match <=
                            qbs.get_tm_rss_upper(comp_q[qs.QRY_ENUM])
                    ]):
                        continue
                    #alright ftm matches, let's check the rest of the clauses
                    for clause_set in itertools.combinations(
                            clause_q_r, num_clauses - 1):
                        #query has already been refined
                        if q_refined:
                            continue
                        clause_list = [clause] + list(clause_set)
                        #check to see if any of the clauses or their fields are the same
                        #if so we know the intersection is one we are not interested in
                        values = [
                            q[qs.QRY_WHERECLAUSE]
                            for ((q, _), _) in clause_list
                        ]
                        fields = [
                            q[qs.QRY_FIELD] for ((q, _), _) in clause_list
                        ]
                        #there are duplicate values or this where has already been seen
                        if len(values)!=len(set(values)) or\
                           len(fields)!=len(set(fields)) or\
                            values in seen_where_group:
                            continue
                        seen_where_group.append(values)

                        #check conditions
                        matching_ids_set = reduce(set.intersection, [
                            set(r[rdb.DBF_MATCHINGRECORDIDS])
                            for (_, r) in clause_list
                        ])
                        count = len(matching_ids_set)

                        if not all([
                                count >= qbs.get_rss_lower(
                                    comp_q[qs.QRY_ENUM]), count <=
                                qbs.get_rss_upper(comp_q[qs.QRY_ENUM]),
                                ftm_match >= qbs.get_tm_rss_lower(
                                    comp_q[qs.QRY_ENUM]), ftm_match <=
                                qbs.get_tm_rss_upper(comp_q[qs.QRY_ENUM])
                        ]):
                            continue

                        #this combination worked, so don't need to refine further for this
                        #particular query
                        q_refined = True
                        refined_total += 1
                        #reorder clauses
                        working_clauses = clause_list
                        reordered_clauses = working_clauses[:1]
                        working_clauses.remove(reordered_clauses[0])
                        cumulative_set = set(
                            reordered_clauses[0][1][rdb.DBF_MATCHINGRECORDIDS])
                        while len(working_clauses) > 0:
                            next_clause = working_clauses[0]
                            current_set = cumulative_set.intersection(
                                working_clauses[0][1][
                                    rdb.DBF_MATCHINGRECORDIDS])
                            for clauses in working_clauses:
                                potential_set = cumulative_set.intersection(
                                    clauses[1][rdb.DBF_MATCHINGRECORDIDS])
                                if len(potential_set) < len(current_set):
                                    next_clause = clauses
                                    current_set = potential_set
                            working_clauses.remove(next_clause)
                            reordered_clauses.append(next_clause)
                            cumulative_set = current_set

                        working_clauses = reordered_clauses

                        #update query with chosen clauses
                        whereclauses = [
                            q[qs.QRY_WHERECLAUSE]
                            for ((q, _), _) in working_clauses
                        ]
                        comp_q[qs.QRY_WHERECLAUSE] = " AND ".join(whereclauses)
                        comp_q['sub_queries'] = [
                            q for ((q, _), _) in working_clauses
                        ]
                        comp_q[qs.QRY_SUBBOBS] = [
                            b for ((_, b), _) in working_clauses
                        ]

                        ftm_match = len(
                            working_clauses[0][1][rdb.DBF_MATCHINGRECORDIDS])
                        matching_ids_set = reduce(set.intersection, [
                            set(r[rdb.DBF_MATCHINGRECORDIDS])
                            for (_, r) in working_clauses
                        ])
                        comp_q_results[qs.QRY_SUBRESULTS] = [
                            r for (_, r) in working_clauses
                        ]
                        comp_q_results[
                            rdb.DBF_MATCHINGRECORDIDS] = matching_ids_set
                        comp_q_results[
                            qs.QRY_NUMRECORDSMATCHINGFIRSTTERM] = ftm_match

                        #get the id's lined up
                        comp_q[qs.QRY_QID] = qids.full_where_has_been_seen(
                            comp_q[qs.QRY_QID], comp_q[qs.QRY_WHERECLAUSE])
                        comp_q_results[qs.QRY_QID] = comp_q[qs.QRY_QID]
                        for (sub_q,
                             sub_r) in zip(comp_q['sub_queries'],
                                           comp_q_results[qs.QRY_SUBRESULTS]):
                            sub_q[
                                qs.QRY_QID] = qids.atomic_where_has_been_seen(
                                    sub_q[qs.QRY_QID],
                                    sub_q[qs.QRY_WHERECLAUSE])
                            sub_r[qs.QRY_QID] = sub_q[qs.QRY_QID]

                        #write the results to the results database
                        qr.QueryResultBase.write_to_full_to_atomic_table(
                            comp_q, comp_q_results, db_object)
                        qr.QueryResultBase.write_to_full_table(
                            comp_q, comp_q_results, db_object)
                        comp_q[qs.QRY_SUBBOBS][0].process_results(
                            None, db_object, query_file_handle,
                            zip(comp_q['sub_queries'],
                                comp_q_results[qs.QRY_SUBRESULTS]))
                        #print out the query
                        self._print_query(comp_q, query_file_handle)
                        refined_queries.append((comp_q, comp_q_results))

                logger.info("FINISHED QUERY %d of %d, TOTAL THAT WORK %d" %
                            (x, len(self.queries), refined_total))
                if q_refined:
                    logger.info(
                        "WORKING QUERY INFORMATION qid = %d, where_clause = %s, ftm = %d, rss = %d"
                        % (comp_q[qs.QRY_QID], comp_q[qs.QRY_WHERECLAUSE],
                           ftm_match, count))

            #capping at choose-num number of queries
            self.refined_queries_results = refined_queries
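
The clause-reordering loop in Example #11 keeps the first (ftm) clause in place and then greedily appends whichever remaining clause shrinks the running intersection the most. A compact restatement over plain sets of record ids (illustrative names, not the original helpers):

def reorder_by_shrinking_intersection(id_sets):
    remaining = list(id_sets)
    ordered = [remaining.pop(0)]  #the first-term-match clause stays first
    cumulative = set(ordered[0])
    while remaining:
        #pick the clause whose ids leave the smallest running intersection;
        #ties keep the earliest clause, matching the original loop
        next_set = min(remaining, key=lambda s: len(cumulative & s))
        remaining.remove(next_set)
        ordered.append(next_set)
        cumulative &= set(next_set)
    return ordered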