def keyword_schema_weights(keyword, kwd_idx=-1):
    """
    for each schema term (entity or entity attribute), calculate the
    likelihood that the keyword matches it
    """
    entities = get_schema().entity_names.items()
    result = [(string_distance(keyword, entity_short), entity_long)
              for entity_long, entity_short in entities]
    # check synonyms
    entity_synonyms = get_schema().cms_synonyms['daskeys'].items()
    for entity_long, synonyms in entity_synonyms:
        for synonym in synonyms:
            result.append((string_distance(keyword, synonym), entity_long))
    # apply some simple patterns
    if kwd_idx == 0:
        if keyword == 'where':
            result.append((0.75, 'site.name'))
        if keyword == 'who':
            result.append((0.5, 'user.name'))
    result = [item for item in result if item[0] > 0]
    result.sort(key=lambda item: item[0], reverse=True)
    return result
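
# Below is a minimal, self-contained sketch of the weighting pattern used in
# keyword_schema_weights, with difflib.SequenceMatcher standing in for the
# real string_distance (an assumption; the actual metric may differ):
import difflib

def _demo_keyword_weights(keyword, schema_terms):
    """ score each schema term against a keyword, drop non-matches,
        and return candidates sorted by descending score """
    ratio = lambda a, b: difflib.SequenceMatcher(None, a, b).ratio()
    result = [(ratio(keyword, short), long_)
              for long_, short in schema_terms.items()]
    result = [item for item in result if item[0] > 0]
    result.sort(key=lambda item: item[0], reverse=True)
    return result

# e.g. _demo_keyword_weights('datasets', {'dataset': 'dataset',
#                                         'site.name': 'site'})
# ranks 'dataset' first, with a score close to 1.0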
def keyword_regexp_weights(keyword):
    """ evaluate keyword regexp matches """
    regexps = get_schema().compiled_input_regexps
    for re_compiled, constraint, apis in regexps:
        # do not allow '#' in dataset
        if '#' in keyword:
            apis = [api for api in apis if api['key'] != 'dataset']
        if not apis:
            continue
        score = 0
        # prefer non-empty constraints; anchored regexps are more specific,
        # so different types of regexps get different weights
        if re.search(re_compiled, keyword):
            if constraint.startswith('^') and constraint.endswith('$'):
                score = 0.7
            elif constraint.startswith('^') or constraint.endswith('$'):
                score = 0.6
            elif constraint != '':
                score = 0.5
        if score:
            for api in apis:
                yield score, api['entity_long']
    # append date match
    if regex.date_yyyymmdd_pattern.match(keyword):
        yield 0.95, 'date'
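
# Self-contained illustration of the constraint-scoring heuristic above:
# fully anchored patterns (^...$) are the most specific and score highest,
# half-anchored ones score lower, and unanchored non-empty ones lowest.
import re

def _demo_constraint_score(constraint, keyword):
    """ hypothetical stand-alone version of the scoring branch """
    if not constraint or not re.search(constraint, keyword):
        return 0
    if constraint.startswith('^') and constraint.endswith('$'):
        return 0.7
    if constraint.startswith('^') or constraint.endswith('$'):
        return 0.6
    return 0.5

# _demo_constraint_score(r'^/[\w/]+$', '/Zmm/run2/RECO')  -> 0.7
# _demo_constraint_score(r'\d{8}', '20120101')            -> 0.5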
def dasql_to_nl(dasql_tuple):
    """
    Return a natural-language representation of a generated DAS query,
    so users can see what it means.
    """
    # TODO: get rid of dasql_tuple, use a namedtuple or dict!?
    (result_type, short_input_params, result_projections,
     result_filters, result_operators) = dasql_tuple

    filters = ['%s=%s' % (f, v) for (f, v) in short_input_params]
    get_title = lambda field: \
        get_schema().get_result_field_title(result_type, field,
                                            technical=True, html=True)
    if result_filters:
        # TODO: add verbose name if any
        filters.extend('{0:s} {1:s} {2:s}'.format(get_title(field), op, val)
                       for (field, op, val) in result_filters)
    filters = ' <b>AND</b> '.join(filters)

    if result_projections:
        projections = ', '.join(str(get_title(field))
                                for field in result_projections)
        return '<b>find</b> {projections:s} ' \
               '<b>for each</b> {result_type:s} ' \
               '<b>where</b> {filters:s}'.format(projections=projections,
                                                 result_type=result_type,
                                                 filters=filters)
    return '<b>find</b> {result_type:s} <b>where</b> {filters:s}'.format(
        result_type=result_type, filters=filters)
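
# Worked example (hypothetical inputs; assumes get_result_field_title simply
# returns the field name itself): for
#   dasql_tuple = ('dataset', [('site', 'T1_*')], ['dataset.nevents'],
#                  [('dataset.nevents', '>', '10')], [])
# dasql_to_nl would produce roughly:
#   <b>find</b> dataset.nevents <b>for each</b> dataset
#   <b>where</b> site=T1_* <b>AND</b> dataset.nevents > 10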
def __init__(self, cms_rep, render_template):
    self.cms_rep = cms_rep
    self.dasmgr = self.cms_rep.dasmgr
    self.entity_names = self._build_short_daskeys(self.dasmgr)
    # schema adapter from kws
    # TODO: get_field_list_for_entity_by_pk could be moved to DAS Core or...
    self.schema_adapter = get_schema(dascore=self.dasmgr)
    self.render_template = render_template
def add_full_fieldmatch(kwd, matches):
    """ check for a full match to an attribute, e.g. dataset.nevents """
    if '.' in kwd:
        match = get_schema().check_result_field_match(kwd)
        if match:
            entity, field = match
            result = {'field': field,
                      'len': 1,
                      'tokens_required': [kwd],
                      'score': 20.0}
            matches[entity].append(result)
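
# Usage sketch (assumption: "matches" is a defaultdict(list) keyed by result
# entity, as in the callers below):
# from collections import defaultdict
# matches = defaultdict(list)
# add_full_fieldmatch('dataset.nevents', matches)
# # on a schema match, matches['dataset'] now holds one high-scored entry:
# # [{'field': <field info>, 'len': 1,
# #   'tokens_required': ['dataset.nevents'], 'score': 20.0}]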
def manual_tests():
    """ manual tests """
    from DAS.keywordsearch.metadata.schema_adapter_factory import get_schema
    from DAS.core.das_core import DASCore
    schema_adapter = get_schema(DASCore(multitask=False))
    fields_by_entity = schema_adapter.list_result_fields()
    ir_matcher = SimpleIREntityAttributeMatcher(fields_by_entity)

    def print_results(*args, **kwargs):
        """ run search and print results - used for testing """
        ir_matcher.search_index(*args, **kwargs)

    if False:  # disabled cases, kept for reference
        print_results(
            keywords=u'files of Zmm with number of events more than 10',
            result_type=u'dataset')
        print_results(keywords=u'number events', result_type=u'dataset')
        print_results(keywords=u'number evented', result_type=u'dataset')
        print_results(keywords=u'dataset.nevents', result_type=u'dataset')
        print_results(keywords=u'dataset.numevents', result_type=u'dataset')
        # block.replica.subscribed vs block.replica.custodial
        # (the deepest name in here is the most important)
        print_results(keywords=u'replica fraction', result_type=u'block')
        print_results(keywords=u'replica fraction', result_type=u'site')
        print_results(keywords=u'custodial replica', result_type=u'block')
        print_results(keywords=u'replica_fraction', result_type=u'site')
    print('=========================================================')
    print_results(keywords=u'number', result_type=u'dataset')
    print_results(keywords=u'of', result_type=u'dataset')
    print_results(keywords=u'events', result_type=u'dataset')
    print_results(keywords=u'number of', result_type=u'dataset')
    print_results(keywords=u'of events', result_type=u'dataset')
    print_results(keywords=u'Number OF Events', result_type=u'dataset')
    print('Q: dataset_fraction')
    print_results(keywords=u'dataset_fraction', result_type=u'site')
    print('Q: dataset fraction')
    print_results(keywords=u'dataset fraction', result_type=u'site')
    print('Q: dataset part')
    print_results(keywords=u'dataset part', result_type=u'site')
    print('============================================')
    print('Q: file')
    print_results(keywords=u'file in', result_type='file', limit=4)
    print('============================================')
    print('Q: file in')
    print_results(keywords=u'file in', result_type='file', limit=4)
def __init__(self, dascore):
    self.schema = get_schema(dascore)
    # import and initialize the ranker
    from DAS.extensions import fast_recursive_ranker
    self.ranker = fast_recursive_ranker
    self.ranker.initialize_ranker(self.schema)
    # build and load the whoosh index (listing fields in service outputs)
    fields = self.schema.list_result_fields()
    self.multi_kwd_searcher = MultiKwdAttributeMatcher(fields)
    # initialize the value trackers (primary_dataset, release, etc)
    init_value_trackers()
def need_res_fields_bootsrap():
    """
    return whether the list of entity attributes is available;
    if not, it needs to be bootstrapped
    """
    dascore = DASCore(multitask=False)
    schema_adapter = schema_adapter_factory.get_schema(dascore)
    try:
        field_list = schema_adapter.list_result_fields()
        if not field_list:
            return True
        ir_entity_attributes.SimpleIREntityAttributeMatcher(field_list)
    except Exception as exc:
        print(exc)
        return True
    return False
def append_subquery_matches(self, keywords, matches):
    """ get matches to individual and nearby keywords (non-phrase) """
    # check for full-name matches to an attribute, e.g. dataset.nevents
    for kwd in keywords:
        add_full_fieldmatch(kwd, matches)

    fields_by_entity = get_schema().list_result_fields()
    str_len = len(keywords)
    max_len = min(len(keywords), MAX_TOKEN_COMBINATION_LEN)
    for length in range(1, max_len + 1):
        for start in range(0, str_len - length + 1):
            chunk = keywords[start:start + length]
            # exclude phrases with "a b c" (these were processed earlier)
            if any(c for c in chunk if ' ' in c):
                continue
            # only the last term in the chunk may contain an operator
            if any(test_operator_containment(kw) for kw in chunk[:-1]):
                continue
            if DEBUG:
                print('chunk:', chunk)
                print('len=', length, '; start=', start, 'chunk:', chunk)
            s_chunk = ' '.join(get_keyword_without_operator(kw)
                               for kw in chunk)
            results = self.fields_idx.search_index(
                kwds=s_chunk, limit=CHUNK_N_TOKEN_COMBINATION_RESULTS)
            max_score = results and results[0]['score']
            for result in results:
                result['len'] = len(result['keywords_matched'])
                entity = result['result_type']
                if not check_validity(result, fields_by_entity):
                    continue
                result['field'] = fields_by_entity[entity][result['field']]
                result['tokens_required'] = chunk
                if USE_IR_SCORE_NORMALIZATION_LOCAL:
                    result['score'] /= max_score
                matches[entity].append(result)
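
# Stand-alone illustration of the token-window enumeration used above: every
# contiguous keyword chunk up to MAX_TOKEN_COMBINATION_LEN tokens is tried
# against the IR index (the helper name here is hypothetical):
def _demo_chunks(keywords, max_combination_len=3):
    """ yield all contiguous chunks up to the given length """
    max_len = min(len(keywords), max_combination_len)
    for length in range(1, max_len + 1):
        for start in range(0, len(keywords) - length + 1):
            yield keywords[start:start + length]

# list(_demo_chunks(['number', 'of', 'events']))
# -> [['number'], ['of'], ['events'],
#     ['number', 'of'], ['of', 'events'],
#     ['number', 'of', 'events']]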
def get_phrase_matches(self, keywords):
    """ get phrase matches from the IR index """
    fields_by_entity = get_schema().list_result_fields()
    # first filter out the phrases (we won't combine them with anything)
    phrase_kwds = [kw for kw in keywords if ' ' in kw]
    matches = defaultdict(list)
    for kwd in phrase_kwds:
        # remove operators, e.g. "number of events">10 => number of events
        phrase = get_keyword_without_operator(kwd)
        # get a ranked list of matches
        results = self.fields_idx.search_index(
            kwds=phrase, limit=CHUNK_N_PHRASE_RESULTS)
        max_score = results and results[0]['score']
        for result in results:
            result['len'] = len(result['keywords_matched'])
            entity = result['result_type']
            if not check_validity(result, fields_by_entity):
                continue
            # TODO: this shall be done at the presentation level
            result['field'] = fields_by_entity[entity][result['field']]
            result['tokens_required'] = [kwd]
            # penalize terms that have multiple matches
            result['score'] *= W_PHRASE
            if USE_IR_SCORE_NORMALIZATION_LOCAL:
                result['score'] /= max_score
            matches[entity].append(result)
    return matches
def result_to_dasql(result, frmt='text', shorten_html=True,
                    max_value_len=UI_MAX_DISPLAYED_VALUE_LEN):
    """
    return the proposed query as DASQL in one of these formats:
    - text: standard DASQL
    - html: colorized DASQL with long values shortened
      (if shorten_html is specified)
    """
    patterns = DASQL_PATTERNS[frmt]

    def tmpl(name, params=None):
        """ get a pattern, format it with params if any,
            and apply an escape function if needed """
        # a helper function to map the values of a dict
        # TODO: in Py2.7: {k: f(v) for k, v in my_dictionary.items()}
        map_dict_values = lambda f, my_dict: dict(
            (k, f(v)) for k, v in my_dict.items())
        if frmt == 'html':
            # shorten the value if it is longer than max_value_len
            if isinstance(params, dict) and 'value' in params and shorten_html:
                value = params['value']
                if len(value) > max_value_len:
                    params['value'] = shorten_value(value, max_value_len)
                params['field'] = fescape(params['field'])
            else:
                # for html, make sure to escape the inputs
                if isinstance(params, (tuple, list)):
                    params = tuple(fescape(param) for param in params)
                elif isinstance(params, dict):
                    params = map_dict_values(fescape, params)
                else:
                    params = params and fescape(params)
        pattern = patterns[name]
        if params is not None:
            return pattern % params
        return pattern

    if isinstance(result, dict):
        score = result['score']
        result_type = result['result_type']
        input_params = result['input_values']
        projections_filters = result['result_filters']
        trace = result['trace']
    else:
        (score, result_type, input_params,
         projections_filters, trace) = result

    # short entity names
    s_result_type = get_schema().entity_names.get(result_type, result_type)
    s_input_params = [(get_schema().entity_names.get(field, field), value)
                      for (field, value) in input_params]
    s_input_params.sort(key=lambda item: item[0])
    s_query = tmpl('RESULT_TYPE', s_result_type) + ' ' + \
        ' '.join(tmpl('INPUT_FIELD_AND_VALUE',
                      {'field': field, 'value': value})
                 for (field, value) in s_input_params)

    result_projections = [p for p in projections_filters
                          if not isinstance(p, tuple)]
    result_filters = [p for p in projections_filters
                      if isinstance(p, tuple)]
    if result_projections or result_filters:
        if DEBUG:
            print('selections before:', result_projections)
        result_projections = list(result_projections)
        # automatically add wildcard fields to selections (if any),
        # so they will be displayed in the results
        for field, value in input_params:
            if '*' in value and field not in result_projections:
                result_projections.append(field)
        # add formatted projections
        result_grep = [tmpl('PROJECTION', prj)
                       for prj in result_projections]
        # add filters to grep
        s_result_filters = [tmpl('RESULT_FILTER_OP_VALUE',
                                 {'field': field, 'op': op, 'value': val})
                            for (field, op, val) in result_filters]
        result_grep.extend(s_result_filters)
        s_query += tmpl('GREP') + ', '.join(result_grep)
        if DEBUG:
            print('projections after:', result_projections)
            print('filters after:', result_filters)

    return {'result': s_query,
            'query': s_query,
            'trace': trace,
            'score': score,
            'entity': s_result_type,
            'das_ql_tuple': (s_result_type, s_input_params,
                             result_projections, result_filters, [])}
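
# Self-contained sketch of the pattern-template mechanism assumed by tmpl()
# above. DASQL_PATTERNS itself is defined elsewhere; the keys and '%'-style
# patterns below are illustrative only:
_DEMO_PATTERNS = {
    'RESULT_TYPE': '%s',
    'INPUT_FIELD_AND_VALUE': '%(field)s=%(value)s',
    'GREP': ' | grep ',
}

def _demo_tmpl(name, params=None):
    """ look up a pattern and %-format it with params, if given """
    pattern = _DEMO_PATTERNS[name]
    if params is not None:
        return pattern % params
    return pattern

# _demo_tmpl('RESULT_TYPE', 'dataset') + ' ' + \
#     _demo_tmpl('INPUT_FIELD_AND_VALUE',
#                {'field': 'site', 'value': 'T1_*'})
# -> 'dataset site=T1_*'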