def refine_queries(self, agg_result):
    """
    Takes 'agg_result', the result from the aggregator for this BOQ,
    and selects which queries should be recorded in the results
    database. It builds a new list of the selected queries paired
    with their results.
    """
    # Select the queries whose match counts fall within the RSS window.
    queries = []
    assert len(self.queries) == len(agg_result[qs.QRY_SUBRESULTS])
    for q, r in zip(self.queries, agg_result[qs.QRY_SUBRESULTS]):
        assert q
        assert r
        assert q[qs.QRY_QID] >= r[qs.QRY_QID]
        potential_matches = []
        for (value, value_result) in \
                r[qs.QRY_FISHING_MATCHES_FOUND].iteritems():
            count = len(value_result)
            if qbs.get_rss_lower(q[qs.QRY_ENUM]) <= count and \
               qbs.get_rss_upper(q[qs.QRY_ENUM]) >= count:
                (value, where) = self.format_value_and_where(
                    sv.sql_name_to_enum(q[qs.QRY_FIELD]), value)
                # Record the candidate values instead of mutating q/r
                # here: mutating in place would make every appended
                # pair alias the same dicts, so the random choice
                # below would be meaningless.
                potential_matches.append((value, where, value_result))
        if potential_matches:
            (value, where, value_result) = \
                random.sample(potential_matches, 1)[0]
            q[qs.QRY_VALUE] = value
            q[qs.QRY_WHERECLAUSE] = where
            r[rdb.DBF_MATCHINGRECORDIDS] = value_result
            q[qs.QRY_QID] = qids.full_where_has_been_seen(
                q[qs.QRY_QID], q[qs.QRY_WHERECLAUSE])
            queries.append((q, r))
    # Store the selected (query, result) pairs.
    self.refined_queries_results = queries
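# A minimal standalone sketch of the RSS-window filter that drives the
# selection in refine_queries above. The bounds and the match lists
# here are hypothetical stand-ins for qbs.get_rss_lower/get_rss_upper
# and the aggregator's fishing matches.
def in_rss_window(matching_record_ids, rss_lower=5, rss_upper=50):
    # A value is eligible when its match count lies inside the window.
    count = len(matching_record_ids)
    return rss_lower <= count <= rss_upper

fishing_matches = {'alice': [1, 2, 3, 4, 5], 'bob': list(range(100))}
eligible = [value for (value, ids) in fishing_matches.items()
            if in_rss_window(ids)]
# eligible == ['alice']; 'bob' matched too many records to be chosen.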
def setUp(self):
    self.seed = int(time.time())
    self.seed_msg = "Random seed used for this test: %s" % self.seed
    self.longMessage = True
    spar_random.seed(self.seed)

    # Set up initialization values.
    self.dummy_logger = logging.getLogger('dummy')
    self.dummy_logger.addHandler(logging.NullHandler())

    class Options(object):
        pass

    learner_options = Options()
    learner_options.verbose = False
    pums_files = \
        [("mock pums",
          stringio.StringIO(mock_data_files.mock_pums_data))]
    pums_dict = \
        learn_distributions.learn_pums_dists(learner_options,
                                             self.dummy_logger,
                                             pums_files)
    names_files = \
        [('male_first_names.txt',
          stringio.StringIO(mock_data_files.mock_male_first_names)),
         ('female_first_names.txt',
          stringio.StringIO(mock_data_files.mock_female_first_names)),
         ('last_names.txt',
          stringio.StringIO(mock_data_files.mock_last_names))]
    names_dict = \
        learn_distributions.learn_name_dists(learner_options,
                                             self.dummy_logger,
                                             names_files)

    var_order = [sv.VARS.SEX, sv.VARS.CITIZENSHIP, sv.VARS.AGE,
                 sv.VARS.RACE, sv.VARS.STATE, sv.VARS.FIRST_NAME,
                 sv.VARS.LAST_NAME]
    var_names = [sv.VARS.to_string(x) for x in var_order]
    dist_dict = {}
    dist_dict.update(pums_dict)
    dist_dict.update(names_dict)
    dist_holder = dh.DistributionHolder(var_order, var_names,
                                        dist_dict)

    fields = [sv.sql_name_to_enum('xml')]
    self._dist1 = xml_generator.XmlGenerator(dist_holder)
    dists = [self._dist1]
    other_fields = ['no_queries', 'r_lower', 'r_upper', 'path_type']
    other_cols_full = [[5, 1, 10, 'full']]
    other_cols_short = [[5, 1, 10, 'short']]
    self.full_generator = xqg.XmlQueryGenerator(
        'P11', '', ["LL"], dists, fields, 1000, 100,
        other_fields, other_cols_full)
    self.short_generator = xqg.XmlQueryGenerator(
        'P11', '', ["LL"], dists, fields, 1000, 100,
        other_fields, other_cols_short)
def __init__(self, query):
    '''
    Initialize the needed class variables from the query.
    '''
    self._qid = query[qs.QRY_QID]
    self._field = sv.sql_name_to_enum(query[qs.QRY_FIELD])
    self._alarmwords = set([query[qs.QRY_ALARMWORDONE],
                            query[qs.QRY_ALARMWORDTWO]])
    self._alarmword_distance = query[qs.QRY_ALARMWORDDISTANCE]
    self._process_cutoff = qbs.get_rss_upper(query[qs.QRY_ENUM])
    self._count = 0
def _generate_short_queries(self, dist, q):
    '''
    Generates queries of the form .//LEAF
    '''
    query_dicts = []
    query_count = 0
    for count in xrange(q['no_queries'] * OVER_GENERATION_RATIO):
        self.__count += 1
        query_count = count
        LOGGER.info('P11: Created %d out of %d queries' %
                    (self.__count, self.__total))
        # Target selectivity ratio for a single leaf: spread the
        # desired record-set size across depth * fan-out nodes.
        # float() guards against Python 2 integer division.
        r_lower = float(q[qs.QRY_LRSS]) / \
            (self.__db_size * xg.XML_DEPTH * xg.FAN_OUT)
        r_upper = float(q[qs.QRY_URSS]) / \
            (self.__db_size * xg.XML_DEPTH * xg.FAN_OUT)
        (field, value) = self._create_equality_leaf(dist, r_lower,
                                                    r_upper)
        value = sv.VAR_CONVERTERS[
            sv.sql_name_to_enum(field)].to_csv(value)
        try:
            # Escape single quotes for SQL.
            value = value.replace('\'', '\'\'')
        except (TypeError, AttributeError):
            # Non-string values need no escaping.
            pass
        if field in ['foo', 'age', 'income']:
            where = "xml_value(xml,\'//%s\', %s)" % (field, value)
        else:
            where = "xml_value(xml,\'//%s\', \'%s\')" % (field, value)
        xpath = field
        qid = qids.query_id()
        if qid != qids.full_where_has_been_seen(qid, where):
            # Skip duplicates of previously generated where clauses.
            continue
        query_dicts.append({
            qs.QRY_ENUM: qs.CAT.P11_SHORT,
            qs.QRY_QID: qid,
            qs.QRY_DBNUMRECORDS: self.__db_size,
            qs.QRY_DBRECORDSIZE: self.__row_width,
            qs.QRY_PERF: self.__perf,
            qs.QRY_CAT: self.__cat,
            qs.QRY_SUBCAT: 'eq-double-slash',
            qs.QRY_WHERECLAUSE: where,
            qs.QRY_FIELD: sv.sql_info[sv.VARS.XML][0],
            qs.QRY_NEGATE: False,
            qs.QRY_FIELDTYPE: 'string',
            qs.QRY_LRSS: q[qs.QRY_LRSS],
            qs.QRY_URSS: q[qs.QRY_URSS],
            qs.QRY_VALUE: value,
            qs.QRY_XPATH: xpath})
    return aqb.XmlQueryBatch(
        query_dicts, query_count,
        max(int((query_count + 1) / OVER_GENERATION_RATIO), 1), True)
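# Worked example of the short-query selectivity target above, with
# hypothetical numbers (db_size, depth, and fan-out are illustrative,
# not taken from xg). The desired record-set size is spread across
# every node of every branch:
#     r = RSS / (db_size * XML_DEPTH * FAN_OUT)
db_size, xml_depth, fan_out = 1000.0, 4, 10
lrss, urss = 10, 100
r_lower = lrss / (db_size * xml_depth * fan_out)   # 0.00025
r_upper = urss / (db_size * xml_depth * fan_out)   # 0.0025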
def setUp(self):
    self.seed = int(time.time())
    self.seed_msg = "Random seed used for this test: %s" % self.seed
    self.longMessage = True
    spar_random.seed(self.seed)

    # Set up initialization values.
    sub_cat = 'foo-range'
    self._foo_dist = bespoke_distribution.FooDistribution()
    fields = [sv.sql_name_to_enum('foo')]
    dists = [self._foo_dist]
    other_fields = ['no_queries', 'r_lower', 'r_upper',
                    'r_exp_lower', 'r_exp_upper', 'type']
    other_cols = [[2, 1, 100, 21, 21, 'range'],
                  [2, 1, 100, 32, 32, 'range'],
                  [2, 1, 200, 21, 21, 'greater'],
                  [2, 1, 200, 25, 25, 'greater']]
    self.generator = frqg.FooRangeQueryGenerator(
        'P2', sub_cat, ["LL"], dists, fields, 50000, 100,
        other_fields, other_cols)
def __init__(self, query):
    '''
    Initialize the needed class variables from the query.
    '''
    self._qid = query[qs.QRY_QID]
    self._field = sv.sql_name_to_enum(query[qs.QRY_FIELD])
    # The try/except block is mostly for backwards compatibility
    # with unit tests.
    try:
        self._process_cutoff = qbs.get_rss_upper(query[qs.QRY_ENUM])
    except KeyError:
        self._process_cutoff = 100000
    self._count = 0
    # If the query is atomic (i.e. top level), apply a limit on what
    # it can collect; otherwise no process limit is in effect.
    try:
        self._top_level = query['top_level']
    except KeyError:
        self._top_level = True
def testGenerateQuery(self):
    """
    Tests the range query generator against a 'db' to make sure it
    is generating the right queries.
    """
    # Generate a 'db' to test against.
    rows = []
    for _ in xrange(1000):
        row_dict = {}
        for var in self.fields_to_gen:
            dist = self.dist_holder.dist_dict[var]
            v = dist.generate(row_dict)
            if var != sv.VARS.DOB:
                row_dict[var] = sv.VAR_CONVERTERS[var].to_csv(v)
            else:
                row_dict[var] = v
        rows.append(row_dict)

    # Generate queries.
    query_batches = self.generator.produce_query_batches()
    queries = []
    for query_batch in query_batches:
        queries += query_batch.produce_queries()

    # Check queries against the 'db' to make sure the match counts
    # fall within a factor of ten of the requested bounds.
    count = 0
    fail_msg = ''
    for q in queries:
        if count % 3 == 0:
            fail_count = 0
        count += 1
        if q[qs.QRY_SUBCAT] == 'range':
            minin = q[qs.QRY_LBOUND]
            maxin = q[qs.QRY_UBOUND]
            val = (minin, maxin)
            x = lambda y: y >= minin and y <= maxin
        elif q[qs.QRY_SUBCAT] == 'greater':
            val = q[qs.QRY_VALUE]
            x = lambda y: y >= val
            val = str(val)
        else:
            val = q[qs.QRY_VALUE]
            x = lambda y: y <= val
            val = str(val)
        count_match = len([
            row for row in rows
            if x(row[sv.sql_name_to_enum(q[qs.QRY_FIELD])])])
        msg = 'Query %d was: \n' \
              'sub_cat: %s\n' \
              'field: %s\n' \
              'type: %s\n' \
              'r_lower: %d\n' \
              'r_upper: %d\n' \
              'count: %d\n' \
              'value: %s\n' % (q[qs.QRY_QID], q[qs.QRY_SUBCAT],
                               q[qs.QRY_FIELD], q[qs.QRY_SUBCAT],
                               q[qs.QRY_LRSS], q[qs.QRY_URSS],
                               count, val)
        if count_match > q[qs.QRY_URSS] * 10 or \
           count_match < q[qs.QRY_LRSS] / 10:
            fail_count += 1
            fail_msg = msg
    self.assertLessEqual(fail_count, 6, fail_msg)
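# Standalone sketch of the per-subcategory predicate construction used
# in the loop above; the subcategory names mirror the test, while the
# bounds are illustrative.
def make_predicate(sub_cat, lower=None, upper=None, value=None):
    if sub_cat == 'range':       # lower <= y <= upper
        return lambda y: lower <= y <= upper
    elif sub_cat == 'greater':   # y >= value
        return lambda y: y >= value
    else:                        # remaining 'less-than' style: y <= value
        return lambda y: y <= value

pred = make_predicate('range', lower=20, upper=40)
assert pred(30) and not pred(50)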
def setUp(self):
    self.seed = int(time.time())
    self.seed_msg = "Random seed used for this test: %s" % self.seed
    self.longMessage = True
    spar_random.seed(self.seed)

    # Set up initialization values.
    class Object(object):
        pass

    self.learner_options = Object()
    self.dummy_logger = logging.getLogger('dummy')
    self.dummy_logger.addHandler(logging.NullHandler())
    self.dummy_object = Object()
    pums_files = \
        [("mock pums",
          stringio.StringIO(mock_data_files.mock_pums_data))]
    pums_dict = \
        learn_distributions.learn_pums_dists(self.learner_options,
                                             self.dummy_logger,
                                             pums_files)
    names_files = \
        [('male_first_names.txt',
          stringio.StringIO(mock_data_files.mock_male_first_names)),
         ('female_first_names.txt',
          stringio.StringIO(mock_data_files.mock_female_first_names)),
         ('last_names.txt',
          stringio.StringIO(mock_data_files.mock_last_names))]
    names_dict = \
        learn_distributions.learn_name_dists(self.learner_options,
                                             self.dummy_logger,
                                             names_files)
    zipcode_files = \
        [('mock_zipcodes',
          stringio.StringIO(mock_data_files.mock_zipcodes))]
    zipcode_dict = \
        learn_distributions.learn_zipcode_dists(self.learner_options,
                                                self.dummy_logger,
                                                zipcode_files)
    text_files = \
        [('mock_text',
          stringio.StringIO(mock_data_files.mock_text_files))]
    text_engine = \
        learn_distributions.train_text_engine(self.learner_options,
                                              self.dummy_logger,
                                              text_files)
    streets_files = \
        [('mock street file',
          stringio.StringIO(mock_data_files.mock_street_names))]
    address_dict = \
        learn_distributions.learn_street_address_dists(
            self.learner_options, self.dummy_logger, streets_files)
    self.dist_holder = \
        learn_distributions.make_distribution_holder(
            self.learner_options, self.dummy_logger, pums_dict,
            names_dict, zipcode_dict, address_dict, text_engine)

    self.fields_to_gen = [sv.VARS.SEX, sv.VARS.FIRST_NAME,
                          sv.VARS.LAST_NAME]
    sub_cat = 'eq'
    fields = [sv.sql_name_to_enum('fname'),
              sv.sql_name_to_enum('lname')]
    dists1 = [self.dist_holder.dist_dict[sv.VARS.FIRST_NAME],
              self.dist_holder.dist_dict[sv.VARS.LAST_NAME]]
    other_fields = ['no_queries', 'r_lower', 'r_upper']
    other_cols = [[5, 1, 10], [5, 11, 100]]
    self.generator = eqg.EqualityQueryGenerator(
        'EQ', sub_cat, ["LL"], dists1, fields, 1000, 100,
        other_fields, other_cols)
def testGenerateQuery(self):
    """
    Tests equality query generator against a 'db' to make sure it
    is generating the right queries.
    """
    # Generate a 'db' to test against.
    rows = []
    for _ in xrange(1000):
        row_dict = {}
        for var in self.fields_to_gen:
            dist = self.dist_holder.dist_dict[var]
            v = dist.generate(row_dict)
            row_dict[var] = v
        rows.append(row_dict)

    # Generate queries.
    query_batches = self.generator.produce_query_batches()
    queries = []
    for query_batch in query_batches:
        queries += query_batch.produce_queries()

    q_dist1 = 0
    q_dist2 = 0
    # Check queries against the 'db' to make sure they match within
    # a factor of two.
    working_queries = 0
    non_working_queries = []
    for q in queries:
        self.assertEqual('', q[qs.QRY_SUBCAT], self.seed_msg)
        if q[qs.QRY_FIELD] == 'fname':
            q_dist1 += 1
        elif q[qs.QRY_FIELD] == 'lname':
            q_dist2 += 1
        count_match = len([
            x for x in rows
            if x[sv.sql_name_to_enum(q[qs.QRY_FIELD])] ==
               q[qs.QRY_VALUE]])
        msg = 'Query %d was: \n' \
              'sub_cat: %s\n' \
              'field: %s\n' \
              'r_lower: %d\n' \
              'r_upper: %d\n' \
              'value: %s\n' % (q[qs.QRY_QID], q[qs.QRY_SUBCAT],
                               q[qs.QRY_FIELD], q[qs.QRY_LRSS],
                               q[qs.QRY_URSS], q[qs.QRY_VALUE])
        if count_match <= q[qs.QRY_URSS] * 2 and \
           count_match >= q[qs.QRY_LRSS] / 2:
            working_queries += 1
        else:
            non_working_queries.append(msg)
    fail_msg = ''
    for msg in non_working_queries[:3]:
        fail_msg += msg
    self.assertGreaterEqual(working_queries, 10, fail_msg)

    # Check that each field had the correct number of queries.
    # Ideally this number would be greater than 5 (the requested
    # amount), but because the distribution used for unit testing is
    # so small there is a greater margin of error at this scale.
    self.assertGreaterEqual(q_dist1, 4, self.seed_msg)
    self.assertGreaterEqual(q_dist2, 4, self.seed_msg)
def testGenerateQuery(self):
    """
    Tests threshold query generator against a 'db' to make sure it
    is generating the right queries.
    """
    # Generate a 'db' to test against.
    rows = []
    for x in xrange(1000):
        row_dict = {sv.VARS.ID: x}
        for var in self.fields_to_gen:
            dist = self.dist_holder.dist_dict[var]
            v = dist.generate(row_dict)
            row_dict[var] = sv.VAR_CONVERTERS[var].to_agg_fmt(v)
        rows.append(row_dict)

    # Generate queries.
    query_batches = self.generator.produce_query_batches()
    query_value_sets = []
    for query_batch in query_batches:
        queries = query_batch.produce_queries()
        for query in queries:
            for (a, b, c) in itertools.permutations(range(6), 3):
                query_value_sets.append({
                    'first_clause':
                        query['sub_queries'][0][a][qs.QRY_VALUE],
                    'first_clause_field':
                        query['sub_queries'][0][a][qs.QRY_FIELD],
                    'second_clause':
                        query['sub_queries'][0][b][qs.QRY_VALUE],
                    'second_clause_field':
                        query['sub_queries'][0][b][qs.QRY_FIELD],
                    'third_clause':
                        query['sub_queries'][0][c][qs.QRY_VALUE],
                    'third_clause_field':
                        query['sub_queries'][0][c][qs.QRY_FIELD],
                    'r_lower': query['r_lower'],
                    'r_upper': query['r_upper'],
                    'sftm_lower': query['ftm_lower'],
                    'sftm_upper': query['ftm_upper']})

    # Check that the right number of queries was generated.
    self.assertEqual(len(query_value_sets), 2400, self.seed_msg)

    # Check queries against the 'db' to make sure they match within
    # a factor of two.
    working_queries = 0
    non_working_queries = []
    id_var = sv.VARS.ID
    for q in query_value_sets:
        first_field = sv.sql_name_to_enum(q['first_clause_field'])
        second_field = sv.sql_name_to_enum(q['second_clause_field'])
        third_field = sv.sql_name_to_enum(q['third_clause_field'])
        ft = [x[id_var] for x in rows
              if x[first_field] == q['first_clause']]
        st = [x[id_var] for x in rows
              if x[second_field] == q['second_clause']]
        tt = [x[id_var] for x in rows
              if x[third_field] == q['third_clause']]
        # A record matches the 2-of-3 threshold when its id appears
        # in at least two of the three per-clause id lists.
        matching_ids_set = set()
        for m_set in itertools.combinations([ft, st, tt], 2):
            matching_ids_set.update(
                reduce(set.intersection, [set(x) for x in m_set]))
        count_match = len(matching_ids_set)
        total_match = len(ft) + len(st)
        msg = 'Query was\n' \
              'where: %s=%s AND %s=%s\n' \
              'sftm_lower: %d\n' \
              'sftm_upper: %d\n' \
              'r_lower: %d\n' \
              'r_upper: %d\n' \
              'sftm_match: %d\n' \
              'count_match: %d\n' \
              '\n' % (q['first_clause_field'], q['first_clause'],
                      q['second_clause_field'], q['second_clause'],
                      q['sftm_lower'], q['sftm_upper'],
                      q['r_lower'], q['r_upper'],
                      total_match, count_match)
        if count_match <= q[qs.QRY_URSS] * 2 and \
           count_match >= q[qs.QRY_LRSS] / 2 and \
           total_match <= q['sftm_upper'] * 2 and \
           total_match >= q['sftm_lower'] / 2:
            working_queries += 1
        else:
            non_working_queries.append(msg)
    fail_msg = ''
    for msg in non_working_queries[:3]:
        fail_msg += msg
    self.assertGreaterEqual(working_queries, 10, fail_msg)
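# Standalone sketch of the 2-of-3 threshold count computed above: a
# record id matches when it appears in at least two of the three
# per-clause id lists (the lists here are illustrative).
import itertools

ft, st, tt = [1, 2, 3], [2, 3, 4], [3, 5]
matching_ids_set = set()
for pair in itertools.combinations([ft, st, tt], 2):
    matching_ids_set.update(set(pair[0]) & set(pair[1]))
# matching_ids_set == set([2, 3]); only ids 2 and 3 satisfy at least
# two of the three clauses.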
def testGenerateQuery(self):
    """
    Tests or query generator against a 'db' to make sure it is
    generating the right queries.
    """
    # Generate a 'db' to test against.
    rows = []
    for _ in xrange(1000):
        row_dict = {}
        for var in self.fields_to_gen:
            dist = self.dist_holder.dist_dict[var]
            v = dist.generate(row_dict)
            row_dict[var] = sv.VAR_CONVERTERS[var].to_agg_fmt(v)
        rows.append(row_dict)

    # Generate queries.
    query_batches = self.generator.produce_query_batches()
    query_value_sets = []
    for query_batch in query_batches:
        queries = query_batch.produce_queries()
        for query in queries:
            for (a, b) in itertools.permutations(range(6), 2):
                query_value_sets.append({
                    'first_clause':
                        query['sub_queries'][0][a][qs.QRY_VALUE],
                    'first_clause_field':
                        query['sub_queries'][0][a][qs.QRY_FIELD],
                    'second_clause':
                        query['sub_queries'][0][b][qs.QRY_VALUE],
                    'second_clause_field':
                        query['sub_queries'][0][b][qs.QRY_FIELD],
                    'r_lower': query['r_lower'],
                    'r_upper': query['r_upper'],
                    'stm_lower': query['ftm_lower'],
                    'stm_upper': query['ftm_upper']})

    # Check that the right number of queries was generated.
    self.assertEqual(len(query_value_sets), 600, self.seed_msg)

    # Check queries against the 'db' to make sure they match within
    # a factor of two.
    working_queries = 0
    non_working_queries = []
    for q in query_value_sets:
        first_field = sv.sql_name_to_enum(q['first_clause_field'])
        second_field = sv.sql_name_to_enum(q['second_clause_field'])
        ftm_match = len([x for x in rows
                         if x[first_field] == q['first_clause']])
        stm_match = len([x for x in rows
                         if x[second_field] == q['second_clause']])
        total_match = ftm_match + stm_match
        count_match = len([
            x for x in rows
            if x[first_field] == q['first_clause'] or
               x[second_field] == q['second_clause']])
        msg = 'Query was\n' \
              'where: %s=%s OR %s=%s\n' \
              'stm_lower: %d\n' \
              'stm_upper: %d\n' \
              'r_lower: %d\n' \
              'r_upper: %d\n' \
              'ftm_match: %d\n' \
              'count_match: %d\n' \
              '\n' % (q['first_clause_field'], q['first_clause'],
                      q['second_clause_field'], q['second_clause'],
                      q['stm_lower'], q['stm_upper'],
                      q['r_lower'], q['r_upper'],
                      ftm_match, count_match)
        if count_match <= q[qs.QRY_URSS] * 2 and \
           count_match >= q[qs.QRY_LRSS] / 2 and \
           total_match <= q['stm_upper'] * 2 and \
           total_match >= q['stm_lower'] / 2:
            working_queries += 1
        else:
            non_working_queries.append(msg)
    fail_msg = ''
    for msg in non_working_queries[:3]:
        fail_msg += msg
    self.assertGreaterEqual(working_queries, 10, fail_msg)
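# Sketch of the relationship between the two counts checked above: for
# an OR query the union count and the sum of per-term counts obey
# inclusion-exclusion, |A or B| = |A| + |B| - |A and B| (the id sets
# below are illustrative).
a_ids = set([1, 2, 3])
b_ids = set([3, 4])
count_match = len(a_ids | b_ids)        # 4 (records matching either term)
total_match = len(a_ids) + len(b_ids)   # 5 (sum of per-term matches)
assert count_match == total_match - len(a_ids & b_ids)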
def testGenerateQueryRanges(self):
    """
    Tests the and-ta2 query generator against a 'db' to make sure it
    is generating the right queries. Does not include a fishing term.
    """
    # Generate a 'db' to test against.
    rows = []
    for _ in xrange(1000):
        row_dict = {}
        for var in self.fields_to_gen:
            dist = self.dist_holder.dist_dict[var]
            v = dist.generate(row_dict)
            row_dict[var] = v
        rows.append(row_dict)

    # Generate queries.
    query_batches = self.generator3.produce_query_batches()
    query_value_sets = []
    for query_batch in query_batches:
        queries = query_batch.produce_queries()
        for query in queries:
            query['sub_queries'] = list(
                itertools.chain.from_iterable(query['sub_queries']))
            for (a, b) in itertools.permutations(range(0, 6), 2):
                try:
                    value = query['sub_queries'][a][qs.QRY_VALUE]
                    lower = query['sub_queries'][b][qs.QRY_LBOUND]
                    upper = query['sub_queries'][b][qs.QRY_UBOUND]
                except (KeyError, IndexError):
                    # Skip pairings where the sub-queries lack the
                    # needed equality value or range bounds.
                    continue
                query_value_sets.append({
                    'first_clause': value,
                    'first_clause_field':
                        query['sub_queries'][a][qs.QRY_FIELD],
                    'second_clause_lower': lower,
                    'second_clause_upper': upper,
                    'second_clause_field':
                        query['sub_queries'][b][qs.QRY_FIELD],
                    'r_lower': query['r_lower'],
                    'r_upper': query['r_upper'],
                    'range_type': query[qs.QRY_SUBCAT]})

    # Check that the right number of queries was generated.
    self.assertEqual(len(query_value_sets), 90, self.seed_msg)

    # Check queries against the 'db' to make sure they match within
    # a factor of two.
    working_queries = 0
    non_working_queries = []
    for q in query_value_sets:
        one_var = sv.sql_name_to_enum(q['first_clause_field'])
        two_var = sv.sql_name_to_enum(q['second_clause_field'])
        comp = lambda x, y, z: x >= y and x <= z
        count_match = len([
            x for x in rows
            if sv.VAR_CONVERTERS[one_var].to_agg_fmt(x[one_var]) ==
               q['first_clause'].upper() and
               comp(sv.VAR_CONVERTERS[two_var].to_agg_fmt(x[two_var]),
                    q['second_clause_lower'].upper(),
                    q['second_clause_upper'].upper())])
        msg = 'Query was\n' \
              'where: %s=%s AND %s BETWEEN %s AND %s\n' \
              'r_lower: %d\n' \
              'r_upper: %d\n' \
              'count_match: %d\n' \
              '\n' % (q['first_clause_field'], q['first_clause'],
                      q['second_clause_field'],
                      q['second_clause_lower'],
                      q['second_clause_upper'],
                      q['r_lower'], q['r_upper'], count_match)
        if count_match <= q[qs.QRY_URSS] * 2 and \
           count_match >= q[qs.QRY_LRSS] / 2:
            working_queries += 1
        else:
            non_working_queries.append(msg)
    fail_msg = ''
    for msg in non_working_queries[:3]:
        fail_msg += msg
    self.assertGreaterEqual(working_queries, 10, fail_msg)
def _generate_full_queries(self, dist, q):
    '''
    Generates queries of the form ./node1/node2/LEAF
    '''
    query_dicts = []
    query_count = 0
    for count in xrange(q['no_queries'] * OVER_GENERATION_RATIO):
        self.__count += 1
        query_count = count
        LOGGER.info('P11: Created %d out of %d queries' %
                    (self.__count, self.__total))
        # Total selectivity ratio, then the per-branch ratio: the
        # total is split across XML_DEPTH branches, and each branch
        # multiplies XML_DEPTH node probabilities, hence the
        # XML_DEPTH-th root. float() guards against Python 2 integer
        # division.
        r_lower_total = float(q[qs.QRY_LRSS]) / self.__db_size
        r_upper_total = float(q[qs.QRY_URSS]) / self.__db_size
        branch_r_lower = pow(r_lower_total / xg.XML_DEPTH,
                             1.0 / xg.XML_DEPTH)
        branch_r_upper = pow(r_upper_total / xg.XML_DEPTH,
                             1.0 / xg.XML_DEPTH)
        tags = []
        for level in xrange(xg.XML_DEPTH - 1):
            tags.append(dist.generate_node_pdf(level, branch_r_lower,
                                               branch_r_upper))
        tag_string = ''
        for tag in tags:
            tag_string += "/%s" % tag
        (field, value) = self._create_equality_leaf(dist,
                                                    branch_r_lower,
                                                    branch_r_upper)
        value = sv.VAR_CONVERTERS[
            sv.sql_name_to_enum(field)].to_csv(value)
        try:
            # Escape single quotes for SQL.
            value = value.replace('\'', '\'\'')
        except (TypeError, AttributeError):
            # Non-string values need no escaping.
            pass
        if field in ['foo', 'age', 'income']:
            where = "xml_value(xml,\'/xml%s/%s\',%s)" % \
                (tag_string, field, value)
        else:
            where = "xml_value(xml,\'/xml%s/%s\',\'%s\')" % \
                (tag_string, field, value)
        xpath = ['xml'] + tags
        xpath.append(field)
        qid = qids.query_id()
        if qid != qids.full_where_has_been_seen(qid, where):
            # Skip duplicates of previously generated where clauses.
            continue
        query_dicts.append({
            qs.QRY_ENUM: qs.CAT.P11_FULL,
            qs.QRY_QID: qid,
            qs.QRY_DBNUMRECORDS: self.__db_size,
            qs.QRY_DBRECORDSIZE: self.__row_width,
            qs.QRY_CAT: self.__cat,
            qs.QRY_SUBCAT: 'eq-full',
            qs.QRY_PERF: self.__perf,
            qs.QRY_WHERECLAUSE: where,
            qs.QRY_FIELD: sv.sql_info[sv.VARS.XML][0],
            qs.QRY_NEGATE: False,
            qs.QRY_FIELDTYPE: 'string',
            qs.QRY_LRSS: q[qs.QRY_LRSS],
            qs.QRY_URSS: q[qs.QRY_URSS],
            qs.QRY_VALUE: value,
            qs.QRY_XPATH: xpath})
    return aqb.XmlQueryBatch(
        query_dicts, query_count,
        max(int((query_count + 1) / OVER_GENERATION_RATIO), 1), True)
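# Worked example of the per-branch selectivity above (the numbers are
# hypothetical). The total ratio is split across XML_DEPTH branches,
# and a branch multiplies XML_DEPTH node probabilities, hence the
# XML_DEPTH-th root:
#     branch_r = (r_total / XML_DEPTH) ** (1.0 / XML_DEPTH)
xml_depth = 4
r_total = 100 / 1000.0                 # e.g. URSS = 100, db_size = 1000
branch_r = pow(r_total / xml_depth, 1.0 / xml_depth)
# branch_r is roughly 0.398: each node along a branch must keep about
# 40% of the records for the whole path to hit the target ratio.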
def setUp(self):
    self.seed = int(time.time())
    self.seed_msg = "Random seed used for this test: %s" % self.seed
    self.longMessage = True
    spar_random.seed(self.seed)

    # Set up initialization values.
    sub_cat = 'eq'
    self._dist1 = base_distribution.CompactIndependentDistribution()
    self._dist1.add('Letus', 1)
    self._dist1.add('arbey', 9)
    self._dist1.add('Amelia', 1)
    self._dist1.add('Anfrew', 9)
    self._dist1.add('Roberts', 1)
    self._dist1.add('Andreas', 9)
    self._dist1.add('Vacation', 1)
    self._dist1.add('Occulary', 9)
    self._dist1.add('Fuzzballs', 1)
    self._dist1.add('Divasmuch', 9)
    self._dist1.add('tenletters', 1)
    self._dist1.add('arehardtoo', 9)
    self._dist1.add('elevenseven', 1)
    self._dist1.add('harderthant', 9)
    self._dist2 = base_distribution.CompactIndependentDistribution()
    self._dist2.add('Smith', 1)
    self._dist2.add('Henry', 9)
    self._dist2.add('Roberts', 1)
    self._dist2.add('Andreas', 9)
    self._dist2.add('Vacation', 1)
    self._dist2.add('Occulary', 9)
    self._dist2.add('Fuzzballs', 1)
    self._dist2.add('Divasmuch', 9)
    self._dist2.add('tenletters', 1)
    self._dist2.add('arehardtoo', 9)
    self._dist2.add('elevenseven', 1)
    self._dist2.add('harderthant', 9)
    f = s.StringIO(
        '''A a b my spout, when you tip me over hear me a a b.
C cc e I'm a little teacup short and stout, here is my c cc e.
F fff h something about something about something of something f fff h.
I jjjj l sentence generation is hard and annoying when doing i jjjj l.
M nnnnn p lets try to include more original sentenctes what m nnnnn p.
Q rrrrrr s interesting trial of things and other things that q rrrrrr s.
U vvvvvvv y and need to include some other stuff to make u vvvvvvv y.''')
    self._dist3 = text_generator.TextGenerator((f, ))
    fields_non_notes = [sv.sql_name_to_enum('fname'),
                        sv.sql_name_to_enum('lname')]
    fields_notes = [sv.sql_name_to_enum('notes1')]
    dists = [self._dist1, self._dist2]
    other_fields = ['no_queries', 'r_lower', 'r_upper',
                    'keyword_len', 'type']
    other_cols_P6 = [[2, 1, 100, 5, 'initial-one'],
                     [2, 1, 100, 9, 'middle-one'],
                     [2, 1, 100, 5, 'final-one']]
    other_cols_P7 = [[2, 1, 200, 5, 'initial'],
                     [2, 1, 250, 5, 'both'],
                     [2, 1, 200, 5, 'final']]
    self.P6_non_notes_generator = wqg.WildcardQueryGenerator(
        'P6', '', ["LL"], dists, fields_non_notes, 1000, 100,
        other_fields, other_cols_P6)
    self.P7_non_notes_generator = wqg.WildcardQueryGenerator(
        'P7', '', ["LL"], dists, fields_non_notes, 1000, 100,
        other_fields, other_cols_P7)
    self.P7_notes_generator = wqg.WildcardQueryGenerator(
        'P7', '', ["LL"], [self._dist3], fields_notes, 1000, 100,
        other_fields, other_cols_P7)