def get_cleaned_options(self, kwargs):
    '''
    Separate URL keyword arguments into their functional category.

    Incoming kwargs are separated into one of 4 categories, depending on how
    the argument controls the pipeline:

        * ``control_kwargs`` - These are arguments that control the pipeline
          execution (**raw**, **rawquery**, etc)
        * ``es_kwargs`` - These are arguments that get passed directly to the
          Elasticsearch client during query
        * ``esqb_kwargs`` - These are arguments that go to the Elasticsearch
          query builder (**fields**, **size**, etc)
        * ``transform_kwargs`` - These are arguments that go to the
          Elasticsearch result transformer (**jsonld**, **dotfield**, etc)

    The option definitions for each category are read from the attribute of
    the same name on ``self`` (a mapping of option name -> settings dict);
    a missing attribute is treated as an empty mapping.

    :param kwargs: mapping of incoming argument names to values.
    :return: a ``dotdict`` with one ``dotdict`` per category. An option is
        stored when a truthy value was supplied for it or when its settings
        define a non-``None`` ``default``.
    '''
    options = dotdict()

    for kwarg_category in ["control_kwargs", "es_kwargs", "esqb_kwargs", "transform_kwargs"]:
        options.setdefault(kwarg_category, dotdict())
        category_options = options.get(kwarg_category)
        for option, settings in getattr(self, kwarg_category, {}).items():
            # Look the default up with .get(): an option that was supplied by
            # the caller but has no 'default' entry must not raise KeyError.
            default = settings.get('default', None)
            if kwargs.get(option, None) or default is not None:
                category_options.setdefault(option, kwargs.get(option, default))
            # check here for userquery kwargs
            if re.match(self.web_settings.USERQUERY_KWARG_REGEX, option) and kwarg_category == "esqb_kwargs":
                options.esqb_kwargs.setdefault('userquery_kwargs', dotdict())
                options.esqb_kwargs.userquery_kwargs[
                    self.web_settings.USERQUERY_KWARG_TRANSFORM(option)] = kwargs.get(option)
    return options
async def execute_pipeline(self, *args, **kwargs):
    """
    Build an ES query from the request's graph query, run it and finish.

    The graph query is read from ``self.args_json``. When the graph query
    reports that it can be reversed, the reversed form is OR-ed into the
    same ES query so both directions are answered by a single request.

    Raises ``BadRequest`` for ``GraphObjectError`` and ``HTTPError`` for
    any other exception raised while building or running the query.
    """
    try:
        graph_query = GraphQuery.from_dict(self.args_json)
        combined = self._to_es_query(graph_query)

        if graph_query.can_reverse():
            graph_query.reverse()  # reversed in place, then converted again
            combined = combined | self._to_es_query(graph_query)

        # one combined query keeps parameters such as size meaningful
        search = AsyncSearch().query(combined)
        raw_result = await self.pipeline.execute(search, dotdict())
        res = self.pipeline.transform(raw_result, dotdict())
        # TODO additional transformation, like double reversal in result.

    except GraphObjectError as exc:
        raise BadRequest(reason=str(exc))

    except Exception as exc:
        raise HTTPError(str(exc))

    else:
        self.finish(res)
def _get_cleaned_metadata_options(self, kwargs):
    """
    Process options for /metadata query.

    Every incoming argument is discarded (``kwargs`` is emptied in place)
    and an empty ``dotdict`` is returned. Override this method to accept
    arguments on the metadata endpoint.
    """
    kwargs.clear()
    return dotdict()
def _get_cleaned_common_options(self, kwargs):
    '''
    Process the options shared by every query type (/query or annotation).

    Known control arguments are popped out of ``kwargs`` into a ``dotdict``;
    ``self._get_options`` is then given the chance to add more. Finally any
    key not listed in ``self._allowed_options`` is removed from the
    (sort-parsed) kwargs.
    '''
    options = dotdict()
    for flag in ('raw', 'rawquery', 'fetch_all'):
        setattr(options, flag, kwargs.pop(flag, False))
    options.host = kwargs.pop('host', biothing_settings.ga_tracker_url)
    options.jsonld = kwargs.pop('jsonld', False)

    dotfield_arg = kwargs.pop('dotfield', False)
    options.dotfield = dotfield_arg not in [False, 'false']

    # When dotfield was not requested and none of the requested "fields"
    # contains a dot, dotfield is switched on anyway: there is nothing to
    # parse in that case.
    if not options.dotfield:
        requested = kwargs.get('fields')
        has_dotted_field = bool(requested) and any(
            name.find('.') != -1 for name in requested)
        if not has_dotted_field:
            options.dotfield = True

    options = self._get_options(options, kwargs)

    scopes = kwargs.pop('scopes', None)
    if scopes:
        options.scopes = self._cleaned_scopes(scopes)

    kwargs = parse_sort_option(kwargs)
    present = set(kwargs)
    allowed = set(self._allowed_options)
    for key in present - allowed:
        logging.debug("removing param '%s' from query" % key)
        del kwargs[key]
    return options
def query_dataset(self, chrom, start, ref, alt, assembly, dataset):
    """
    Query one dataset for a variant and report whether it exists.

    Returns ``{'datasetId': dataset, 'exists': False}`` untouched when the
    dataset is not listed in ``self.pos_dbs``/``self.assembly_dbs``, when
    ``chrom``, ``start`` or ``alt`` is falsy, or when ``assembly`` is not a
    key of ``self.assembly_keys``. Otherwise the search result is handed to
    ``self.format_output`` if it reports a positive ``total``.
    """
    out = {'datasetId': dataset, 'exists': False}

    # verify information
    if dataset not in self.pos_dbs + self.assembly_dbs:
        return out
    if not (chrom and start and alt and assembly in self.assembly_keys):
        return out

    assembly = self.assembly_keys[assembly]  # get hg assembly notation

    # classify the variant; syntax e.g. "alternateBases": "DEL85689"
    prefix = alt[:3]
    if prefix in ('DEL', 'DUP'):
        q_type = prefix.lower()
        ref = ''
    elif not ref:
        q_type = 'ins'
        ref = ''
    else:
        q_type = 'snp'

    q = self.format_query_string(q_type, chrom, start, ref, alt, assembly, dataset)

    # perform query against the index configured for this assembly
    res = self.web_settings.es_client.search(
        index=self.web_settings.ES_INDICES[assembly],
        body={"query": {"query_string": {"query": q}}},
        _source=[dataset]
    )
    res = self.result_transform.transform(res, dotdict(dotfield=True))

    if res and res.get('total') > 0:
        out = self.format_output(res, out, q_type)
    return out
def query_dataset(self, chrom, start, ref, alt, assembly, dataset):
    """
    Look a variant up in one dataset and report whether it exists.

    The default answer ``{'datasetId': dataset, 'exists': False}`` is
    returned unless the dataset is known (``self.pos_dbs`` or
    ``self.assembly_dbs``), ``chrom``/``start``/``alt`` are all truthy and
    ``assembly`` is a key of ``self.assembly_keys``. In that case the hg19
    index is searched and a positive ``total`` is passed on to
    ``self.format_output``.
    """
    out = {'datasetId': dataset, 'exists': False}

    known_dataset = dataset in self.pos_dbs + self.assembly_dbs
    if not known_dataset:
        return out

    complete = chrom and start and alt and assembly in self.assembly_keys
    if not complete:
        return out

    assembly = self.assembly_keys[assembly]  # get hg assembly notation

    q_type = 'snp'
    if alt[:3] == 'DEL':
        # syntax: "alternateBases": "DEL85689"
        q_type, ref = 'del', ''
    elif alt[:3] == 'DUP':
        # "alternateBases": "DUP85689"
        q_type, ref = 'dup', ''
    elif not ref:
        q_type, ref = 'ins', ''

    q = self.format_query_string(q_type, chrom, start, ref, alt, assembly, dataset)

    # for now always search against hg19 index...
    hg19_index = '_'.join([self.web_settings.ES_INDEX_BASE, 'hg19'])
    res = self.web_settings.es_client.search(
        index=hg19_index,
        doc_type=self.web_settings.ES_DOC_TYPE,
        body={"query": {"query_string": {"query": q}}},
        _source=[dataset])

    _transformer = ESResultTransformer(options=dotdict({'dotfield': True}),
                                       host=self.request.host)
    res = _transformer.clean_query_GET_response(res)

    if res and res.get('total') > 0:
        out = self.format_output(res, out, q_type)
    return out
def initialize(self, web_settings):
    """
    Initialise the handler and cache the hg19 index mapping on it.

    After the parent initialisation, the mapping of the ``<base>_hg19``
    index is fetched. ``self.m`` receives the ``properties`` of the first
    index in the returned mapping; ``self.meta`` receives the mapping after
    it has been cleaned by an ``ESResultTransformer``.
    """
    super(BeaconInfoHandler, self).initialize(web_settings)

    settings = self.web_settings
    hg19_index = '_'.join([settings.ES_INDEX_BASE, 'hg19'])
    _meta = settings.es_client.indices.get_mapping(
        index=hg19_index, doc_type=settings.ES_DOC_TYPE)

    first_index = list(_meta)[0]
    doc_mappings = _meta[first_index]['mappings']
    self.m = doc_mappings[settings.ES_DOC_TYPE]['properties']

    self.meta = ESResultTransformer(
        options=dotdict(), host=self.request.host
    ).clean_metadata_response(_meta)
def _get_cleaned_metadata_options(self, kwargs):
    """
    Process options for the metadata query.

    ``assembly`` (lower-cased) and ``chromosome`` are popped from ``kwargs``;
    an assembly that is not in ``myvariant_settings.supported_assemblies``
    falls back to ``myvariant_settings.default_assembly``. Every remaining
    argument is discarded by emptying ``kwargs`` in place.
    """
    options = dotdict()

    requested = kwargs.pop('assembly', myvariant_settings.default_assembly).lower()
    if requested in myvariant_settings.supported_assemblies:
        options.assembly = requested
    else:
        options.assembly = myvariant_settings.default_assembly

    options.chromosome = kwargs.pop('chromosome', False)

    kwargs.clear()
    return options
def initialize(self, web_settings):
    """
    Run the parent initialisation, then load hg19 mapping metadata.

    Sets ``self.m`` to the ``properties`` section of the first index found
    in the mapping response, and ``self.meta`` to that response as cleaned
    by ``ESResultTransformer.clean_metadata_response``.
    """
    super(BeaconInfoHandler, self).initialize(web_settings)

    index_name = '_'.join([self.web_settings.ES_INDEX_BASE, 'hg19'])
    doc_type = self.web_settings.ES_DOC_TYPE
    _meta = self.web_settings.es_client.indices.get_mapping(
        index=index_name, doc_type=doc_type)

    # the response is keyed by concrete index name; use the first entry
    index_entry = _meta[list(_meta)[0]]
    self.m = index_entry['mappings'][self.web_settings.ES_DOC_TYPE]['properties']

    cleaner = ESResultTransformer(options=dotdict(), host=self.request.host)
    self.meta = cleaner.clean_metadata_response(_meta)
def transform(self, response, options):
    """
    Transform the query response to a user-friendly structure.
    Mainly deconstruct the elasticsearch response structure and
    hand over to transform_doc to apply the options below.

    Options:
        dotfield: flatten a dictionary using dotfield notation
        _sorted: sort keys alaphabetically in ascending order
        always_list: ensure the fields specified are lists or wrapped in a list
        allow_null: ensure the fields specified are present in the result,
                    the fields may be provided as type None or [].
        biothing_type: result document type to apply customized transformation.
                    for example, add license field basing on document type's metadata.

        # only related to multiqueries
        template: base dict for every result, for example: {"success": true}
        templates: a different base for every result, replaces the setting above
        template_hit: a dict to update every positive hit result, default: {"found": true}
        template_miss: a dict to update every query with no hit, default: {"found": false}

    Args:
        response: a list of responses (multi-query) or a single response
            dict. A dict response is modified in place.
        options: a dotdict, or anything ``dotdict()`` accepts. The template
            options are popped from it for list responses.

    Returns:
        list: for a list response, one entry per hit plus one entry per
            query without hits (empty entries are filtered out).
        dict: for a dict response, the same dict after collapsing.
        An empty dict for any other response type.
    """
    if not isinstance(options, dotdict):
        options = dotdict(options)

    if isinstance(response, list):
        responses_ = []
        template = options.pop('template', {})
        templates = options.pop('templates', [template] * len(response))
        template_hit = options.pop('template_hit', dict(found=True))
        template_miss = options.pop('template_miss', dict(found=False))
        responses = [self.transform(res, options) for res in response]
        for res_, res in zip(templates, responses):
            if not res.get('hits'):
                # Work on a copy: the default templates list repeats ONE
                # dict object, so updating it in place would leak the miss
                # markers into every later result and into the caller's
                # template.
                miss_ = dict(res_)
                miss_.update(template_miss)
                responses_.append(miss_)
            else:
                for hit in res['hits']:
                    hit_ = dict(res_)
                    hit_.update(template_hit)
                    hit_.update(hit)
                    responses_.append(hit_)
        return list(filter(None, responses_))

    if isinstance(response, dict):
        response.update(response.pop('hits', {}))  # collapse one level
        response.pop('_shards')
        response.pop('timed_out')
        if 'hits' in response:
            for hit in response['hits']:
                hit.update(hit.pop('_source', {}))  # collapse one level
                self.transform_doc(hit, options)
        if 'aggregations' in response:
            self.transform_aggs(response['aggregations'])
            response['facets'] = response.pop('aggregations')
            response['hits'] = response.pop('hits')  # order
        return response

    return {}
def _to_es_query(self, graph_query):
    """
    Takes a GraphQuery object and return an ES query.

    The graph query dict is passed through the result transformer's
    ``option_dotfield`` and every (key, value) pair then becomes one query
    term with the key as its scope; list values contribute one term per
    element. The terms are handed to the query builder's
    ``default_match_query``.
    """
    assert isinstance(graph_query, GraphQuery)

    q = graph_query.to_dict()
    self.pipeline.result_transform.option_dotfield(q, dotdict())

    terms, scopes = [], []
    for field, value in q.items():
        values = value if isinstance(value, list) else [value]
        terms.extend(values)
        scopes.extend([field] * len(values))

    search = self.pipeline.query_builder.default_match_query(
        terms, scopes, dotdict())
    return search.query._proxied
def build_graph_query(self, q, reverse=False, **options):
    """
    Build a search for graph query ``q``.

    When ``reverse`` is set and ``q`` reports itself reversible, a reversed
    deep copy of ``q`` is OR-ed into the query (``q`` itself is left
    untouched). A falsy query yields a bare ``Search``. The extra
    ``options`` are applied through ``self.apply_extras``.
    """
    query = self._build_graph_query(q)

    if reverse and q.reversible():
        mirrored = deepcopy(q)
        mirrored.reverse()
        query = query | self._build_graph_query(mirrored)

    if query:
        search = Search().query(query)
    else:
        search = Search()

    return self.apply_extras(search, dotdict(options))
def build(self, q=None, **options):
    """
    Build a query according to q and options.
    This is the public method called by API handlers.

    Regarding scopes:
        scopes: [str] nonempty, match query.
        scopes: NoneType, or [], no scope, so query string query.

    Additionally support these options:
        explain: include es scoring information
        userquery: customized function to interpret q

    * additional keywords are passed through as es keywords
        for example: 'explain', 'version' ...

    * multi-search is supported when q is a list. all queries
        are built individually and then sent in one request.

    Raises ValueError when the underlying builder raises
    IllegalOperation, and RawQueryInterrupt (carrying the query dict)
    when the ``rawquery`` option is set.
    """
    options = dotdict(options)

    # a scroll id bypasses all query building stages
    if options.scroll_id:
        return ESScrollID(options.scroll_id)

    # fetch_all conflicts with explicit sorting and sizing
    if options.fetch_all:
        for conflicting in ('sort', 'size'):
            options.pop(conflicting, None)

    try:
        if not isinstance(q, list):  # str, int ...
            search = self._build_one(q, options)
        else:
            search = MultiSearch()
            for single in q:
                search = search.add(self._build_one(single, options))

    except IllegalOperation as exc:
        raise ValueError(str(exc))  # ex. sorting by -_score

    if options.get('rawquery'):
        raise RawQueryInterrupt(search.to_dict())

    return search
def transform(self, response, options):
    """
    Transform the query result.

    A list response (multi-query) is flattened into one entry per hit plus
    one entry per query that produced no hit, each built on top of the
    matching template. A dict response is collapsed in place: the outer
    ``hits`` level and each hit's ``_source`` are merged upwards, every
    (path, obj) pair yielded by ``self.traverse(hit)`` is passed to
    ``transform_hit`` and to the ``allow_null`` / ``always_list`` /
    ``_sorted`` handlers when those options are set, and ``dotfield`` is
    applied per hit.

    Options used for list responses (popped from ``options``):
        template: base dict for every result
        templates: one base dict per result, replaces ``template``
        template_hit: merged into every hit, default {"found": True}
        template_miss: merged into every miss, default {"found": False}

    Returns a list for list input, the modified dict for dict input and an
    empty dict for anything else.
    """
    if not isinstance(options, dotdict):
        options = dotdict(options)

    if isinstance(response, list):
        responses_ = []
        template = options.pop('template', {})
        templates = options.pop('templates', [template] * len(response))
        template_hit = options.pop('template_hit', dict(found=True))
        template_miss = options.pop('template_miss', dict(found=False))
        responses = [self.transform(res, options) for res in response]
        for res_, res in zip(templates, responses):
            if not res.get('hits'):
                # Copy before updating: the default templates list holds
                # the same dict object at every position, so an in-place
                # update would carry the miss markers over to later
                # results and alter the caller's template.
                miss_ = dict(res_)
                miss_.update(template_miss)
                responses_.append(miss_)
            else:
                for hit in res['hits']:
                    hit_ = dict(res_)
                    hit_.update(template_hit)
                    hit_.update(hit)
                    responses_.append(hit_)
        return list(filter(None, responses_))

    if isinstance(response, dict):
        response.update(response.pop('hits', {}))  # collapse one level
        response.pop('_shards')
        response.pop('timed_out')
        if 'hits' in response:
            for hit in response['hits']:
                hit.update(hit.pop('_source', {}))  # collapse one level
                for path, obj in self.traverse(hit):
                    self.transform_hit(path, obj, options)
                    if options.allow_null:
                        self.option_allow_null(path, obj, options.allow_null)
                    if options.always_list:
                        self.option_always_list(path, obj, options.always_list)
                    if options._sorted:
                        self.option_sorted(path, obj)
                if options.dotfield:
                    self.option_dotfield(hit, options)
        if 'aggregations' in response:
            self.transform_aggregations(response['aggregations'])
            response['facets'] = response.pop('aggregations')
            response['hits'] = response.pop('hits')  # order
        return response

    return {}
async def include_children(self, res, options):  # modify in-place
    """
    Make additional queries to get the children field content.

    A list (msearch result) is handled by processing each element in turn.
    For a single result, one lineage query is executed per hit and stored
    under the hit's ``children`` key. A ``KeyError`` raised anywhere while
    processing a single result ends the processing silently.
    """
    if isinstance(res, list):
        # msearch result
        for single in res:
            await self.include_children(single, options)
    else:
        # single query
        try:
            for hit in res['hits']['hits']:
                lineage_query = MytaxonQueryBuilder.build_lineage_query(
                    hit['_id'], options)
                hit['children'] = await super().execute(lineage_query, dotdict())
        except KeyError:
            pass
def query_dataset(self, chrom, start, ref, alt, assembly, dataset):
    """
    Search one dataset for a variant and report whether it exists.

    Nothing is queried, and ``{'datasetId': dataset, 'exists': False}`` is
    returned, unless the dataset is listed in ``self.pos_dbs`` or
    ``self.assembly_dbs``, ``chrom``/``start``/``alt`` are truthy and
    ``assembly`` is a key of ``self.assembly_keys``. Otherwise the index
    named ``<ES_INDEX_BASE>_<assembly>`` is searched and a result with a
    positive ``total`` is passed to ``self.format_output``.
    """
    out = {'datasetId': dataset, 'exists': False}

    if dataset not in self.pos_dbs + self.assembly_dbs:
        return out
    if not (chrom and start and alt and assembly in self.assembly_keys):
        return out

    assembly = self.assembly_keys[assembly]  # get hg assembly notation

    # syntax: "alternateBases": "DEL85689" / "DUP85689"
    kind = alt[:3]
    if kind == 'DEL' or kind == 'DUP':
        q_type = kind.lower()
        ref = ''
    elif not ref:
        q_type = 'ins'
        ref = ''
    else:
        q_type = 'snp'

    q = self.format_query_string(q_type, chrom, start, ref, alt, assembly, dataset)

    # perform query against the index of the requested assembly
    settings = self.web_settings
    res = settings.es_client.search(
        index='_'.join([settings.ES_INDEX_BASE, assembly]),
        doc_type=settings.ES_DOC_TYPE,
        body={"query": {"query_string": {"query": q}}},
        _source=[dataset])

    transformer_options = dotdict({'dotfield': True, 'assembly': assembly})
    _transformer = ESResultTransformer(
        options=transformer_options,
        host=self.request.host,
        source_metadata=self.web_settings.source_metadata())
    res = _transformer.clean_query_GET_response(res)

    if res and res.get('total') > 0:
        out = self.format_output(res, out, q_type)
    return out
def _get_cleaned_common_options(self, kwargs):
    """
    Process the options common to every query type (/query or annotation).

    The control arguments are popped out of ``kwargs`` into a ``dotdict``,
    ``self._get_options`` may add more, and keys that are not listed in
    ``self._allowed_options`` are removed from the sort-parsed kwargs.
    """
    options = dotdict()
    for flag in ("raw", "rawquery", "fetch_all"):
        setattr(options, flag, kwargs.pop(flag, False))
    options.host = kwargs.pop("host", biothing_settings.ga_tracker_url)
    for flag in ("jsonld", "dotfield"):
        setattr(options, flag, kwargs.pop(flag, False))

    # override to add more options
    options = self._get_options(options, kwargs)

    scopes = kwargs.pop("scopes", None)
    if scopes:
        options.scopes = self._cleaned_scopes(scopes)

    kwargs = parse_sort_option(kwargs)
    unexpected = set(kwargs) - set(self._allowed_options)
    for key in unexpected:
        logging.debug("removing param '%s' from query" % key)
        del kwargs[key]
    return options
def _get_cleaned_query_options(self, kwargs):
    """
    Common helper for processing fields, kwargs and other options passed
    to ESQueryBuilder.

    Control arguments are popped into a ``dotdict``; cleaned ``fields`` are
    written back to kwargs as ``_source``; keys not in
    ``self._allowed_options`` are dropped; the remaining kwargs are attached
    to the result as ``options.kwargs``.
    """
    options = dotdict()
    for flag in ('raw', 'rawquery', 'fetch_all'):
        setattr(options, flag, kwargs.pop(flag, False))
    options.host = kwargs.pop('host', self._settings.ga_tracker_url)
    options = self._get_options(options, kwargs)

    scopes = kwargs.pop('scopes', None)
    if scopes:
        options.scopes = self._cleaned_scopes(scopes)

    fields = kwargs.pop('fields', None)
    cleaned_fields = self._cleaned_fields(fields) if fields else fields
    if cleaned_fields:
        kwargs["_source"] = cleaned_fields

    kwargs = self._parse_sort_option(kwargs)
    unexpected = set(kwargs) - set(self._allowed_options)
    for key in unexpected:
        del kwargs[key]

    options.kwargs = kwargs
    return options
def _build_graph_query(self, graph_query):
    """
    Takes a GraphQuery object and return an ES Query object.

    Every (key, value) pair yielded by ``traverse`` over the query dict
    becomes one match term scoped to its key; list values contribute one
    term per element.
    """
    assert isinstance(graph_query, GraphQuery)

    terms, scopes = [], []
    for field, value in traverse(graph_query.to_dict(), True):
        values = value if isinstance(value, list) else [value]
        terms.extend(values)
        scopes.extend([field] * len(values))

    # query proxy object does not support OR operator, thus using _proxied
    search = self._build_match_query(terms, scopes, dotdict())
    return search.query._proxied
def parse(self, method, reqargs):
    """
    Parse a HTTP request, represented by its method and args,
    with this OptionSet and return an attribute dictionary.

    Options without a ``group`` land at the top level of the result;
    grouped options are stored under each of their groups. Values parsed
    as ``None`` are left out, and every group named in ``self.groups`` is
    guaranteed to exist in the result.
    """
    options = self.optset.get(method, self.optset["*"])
    result = defaultdict(dict)  # to accomodate groups

    for keyword, option in options.items():
        try:
            val = option.parse(reqargs)
        except OptionError as err:
            err.info.setdefault("keyword", keyword)
            err.info["alias"] = option.get("alias")
            err.simplify()  # remove empty fields
            raise err  # with helpful info

        if val is None:
            continue

        # TODO: build a new ds for returned result
        if 'group' not in option:
            result[keyword] = val  # top level keywords
            continue

        group = option['group']
        # a single group name, or else assumed to be an iterable of names
        targets = [group] if isinstance(group, str) else group
        for _group in targets:
            result[_group][keyword] = val

    # make sure all named groups exist
    for group in self.groups:
        result.setdefault(group, {})

    return dotdict(result)
def parse(self, method, args, path_args, path_kwargs):
    """
    Resolve the option values for one request.

    The settings registered under ``'*'`` are applied first (only when no
    method restriction is configured or ``method`` is among
    ``self._methods``), then overridden by the settings registered for
    ``method`` itself. Each resulting setting is parsed against the request
    inputs; ``None`` values are discarded, grouped values are stored under
    each of their groups, and every group in ``self._groups`` is guaranteed
    to exist in the returned ``dotdict``.
    """
    # expand * to kwarg_methods setting
    rules = []
    if not self._methods or method in self._methods:
        rules.extend(self._options['*'].items())
    rules.extend(self._options[method].items())

    # method precedence: specific > * (later pairs overwrite earlier ones)
    options = dict(rules)

    result = defaultdict(dict)

    # setting + inputs -> arg value
    for keyword, setting in options.items():
        val = OptionArg(keyword, setting).parse(args, path_args, path_kwargs)
        if val is None:
            continue  # discard no default value

        if 'group' not in setting:
            result[keyword] = val  # top level keywords
            continue

        group = setting['group']
        # a single group name, or else assumed to be an iterable of names
        targets = [group] if isinstance(group, str) else group
        for _group in targets:
            result[_group][keyword] = val

    # make sure all named groups exist
    for group in self._groups:
        result.setdefault(group, {})

    return dotdict(result)
def __init__(self, client, options=None):
    """
    Store the client and the options on the instance.

    :param client: stored unchanged as ``self.client``.
    :param options: stored as ``self.options``. When omitted, each instance
        receives its own fresh ``dotdict`` (a default created in the
        signature would be one object shared by every instance).
    """
    self.client = client
    self.options = dotdict() if options is None else options
def transform(self, response, **options):
    """
    Transform the query response to a user-friendly structure.
    Mainly deconstruct the elasticsearch response structure and
    hand over to transform_doc to apply the options below.

    Options:
        # generic transformations for dictionaries
        # ------------------------------------------
        dotfield: flatten a dictionary using dotfield notation
        _sorted: sort keys alaphabetically in ascending order
        always_list: ensure the fields specified are lists or wrapped in a list
        allow_null: ensure the fields specified are present in the result,
                    the fields may be provided as type None or [].

        # additional multisearch result transformations
        # ------------------------------------------------
        template: base dict for every result, for example: {"success": true}
        templates: a different base for every result, replaces the setting above
        template_hit: a dict to update every positive hit result, default: {"found": true}
        template_miss: a dict to update every query with no hit, default: {"found": false}

        # document format and content management
        # ---------------------------------------
        biothing_type: result document type to apply customized transformation.
                    for example, add license field basing on document type's metadata.
        one: return the individual document if there's only one hit. ignore this setting
            if there are multiple hits. return None if there is no hit. this option is
            not effective when aggregation results are also returned in the same query.
        native: bool, if the returned result is in python primitive types.
        version: bool, if _version field is kept.
        score: bool, if _score field is kept.

    Returns a list for a list response and, for a dict response, the
    transformed response (or a single document, a list of documents or
    None when ``one`` is set).

    Raises TypeError when the response is neither a list nor a dict.
    """
    options = dotdict(options)

    if isinstance(response, list):
        responses_ = []
        options.pop('one', None)  # ignore
        template = options.pop('template', {})
        templates = options.pop('templates', [template] * len(response))
        template_hit = options.pop('template_hit', dict(found=True))
        template_miss = options.pop('template_miss', dict(found=False))
        responses = [self.transform(res, **options) for res in response]
        for tpl, res in zip(templates, responses):
            for _res in res if isinstance(res, list) else [res]:
                assert isinstance(_res, dict)
                # a non-empty result without a "hits" key is presented
                # as a single positive entry
                if _res and 'hits' not in _res:
                    hit_ = dict(tpl)
                    hit_.update(template_hit)
                    hit_.update(_res)
                    responses_.append(hit_)
                    continue
                if not _res or not _res['hits']:
                    # Copy before updating: the default templates list
                    # repeats one dict object, so an in-place update would
                    # carry the miss markers into later results and alter
                    # the caller's template.
                    miss_ = dict(tpl)
                    miss_.update(template_miss)
                    responses_.append(miss_)
                    continue
                for hit in _res['hits']:
                    hit_ = dict(tpl)
                    hit_.update(template_hit)
                    hit_.update(hit)
                    responses_.append(hit_)
        return list(filter(None, responses_))

    if isinstance(response, dict):
        response = self._Hits(response)
        response.collapse('hits')
        response.exclude(('_shards', '_node', 'timed_out'))
        response.wrap('hits', self._Doc)

        for hit in response['hits']:
            hit.collapse('_source')
            # 'sort' is introduced when sorting
            hit.exclude(('_index', '_type', 'sort'))
            self._transform_hit(hit, options)

        if options.get('native', True):
            response['hits'] = [hit.data for hit in response['hits']]
            response = response.data

        if 'aggregations' in response:
            self.transform_aggs(response['aggregations'])
            response['facets'] = response.pop('aggregations')
            hits = response.pop('hits')  # move key order
            if hits:  # hide "hits" field when size=0
                response['hits'] = hits

        elif options.get('one'):
            # prefer one-level presentation
            # or structures as simple as possible
            if len(response['hits']) == 1:
                response = response['hits'][0]
            elif len(response['hits']) == 0:
                response = None
            else:  # show a list of docs
                response = response['hits']

        return response

    raise TypeError()