def _children_query(self, ids, has_gene=True, include_self=False, raw=False):
    """Find children taxids of the given taxid(s) by querying the 'lineage' field.

    Args:
        ids: a single taxid (str or int), a one-element sequence, or a
            sequence of taxids.
        has_gene: if True, restrict hits to documents with ``has_gene:true``.
        include_self: if True, keep/add the queried taxid itself in the result.
        raw: single-id form only — return the raw ES response untouched.

    Returns:
        dict mapping each queried taxid to a sorted list of children taxids,
        capped at ``self.max_taxid_count``; ``{}`` for unexpected input or a
        mismatched msearch response.
    """
    # Single-id path: one plain search request.
    if is_str(ids) or isinstance(ids, int) or (is_seq(ids) and len(ids) == 1):
        _ids = ids if is_str(ids) or isinstance(ids, int) else ids[0]
        _qstring = "lineage:{} AND has_gene:true".format(_ids) if has_gene else "lineage:{}".format(_ids)
        res = self.options.es_client.search(
            body={"query": {"query_string": {"query": _qstring}}},
            index=self.options.index,
            doc_type=self.options.doc_type,
            fields='_id',
            size=self.max_taxid_count)
        if raw:
            return res
        # NOTE(review): ES '_id' hits are strings; when _ids is an int the
        # `x['_id'] != _ids` self-exclusion presumably never matches and the
        # lineage query itself is relied on — TODO confirm.
        taxid_li = [int(x['_id']) for x in res['hits']['hits'] if x['_id'] != _ids or include_self]
        taxid_li += ([_ids] if include_self and _ids not in taxid_li else [])
        return {_ids: sorted(taxid_li)[:self.max_taxid_count]}
    # Multi-id path: one msearch with a header+query line pair per taxid.
    elif is_seq(ids):
        qs = '\n'.join(['{{}}\n{{"size": {}, "_source": ["_id"], "query": {{"query_string":{{"query": "lineage:{} AND has_gene:true"}}}}}}'.format(self.max_taxid_count, taxid) if has_gene else '{{}}\n{{"size": {}, "_source": ["_id"], "query":{{"query_string":{{"query":"lineage:{}"}}}}}}'.format(self.max_taxid_count, taxid) for taxid in ids])
        res = self.options.es_client.msearch(
            body=qs,
            index=self.options.index,
            doc_type=self.options.doc_type)
        # bail out if the multi-search response doesn't line up with the input
        if 'responses' not in res or len(res['responses']) != len(ids):
            return {}
        _ret = {}
        # responses come back in the same order as the submitted queries
        for (taxid, response) in zip(ids, res['responses']):
            _ret.setdefault(taxid, []).extend([h['_id'] for h in response['hits']['hits'] if h['_id'] != taxid or include_self])
        for taxid in _ret.keys():
            # dedupe, cast to int, optionally add the queried taxid, cap the size
            _ret[taxid] = sorted([int(x) for x in list(set(_ret[taxid]))] + ([int(taxid)] if include_self and taxid not in _ret[taxid] else []))[:self.max_taxid_count]
        return _ret
    else:
        return {}
def build_id_query(self, bid, scopes=None):
    """Build an ES query body matching *bid* against one or more scope fields.

    Args:
        bid: the id/term to look up.
        scopes: a single field name (str), a sequence of field names, or
            None (defaults to '_id').

    Returns:
        dict: ``{"query": ...}`` merged with ``self._query_options``.

    Raises:
        ValueError: if *scopes* is neither a string nor a sequence.
    """
    _default_scopes = '_id'
    scopes = scopes or _default_scopes
    if is_str(scopes):
        # single field -> simple match query
        _query = {
            "match": {
                scopes: {
                    "query": "{}".format(bid),
                    "operator": "and"
                }
            }
        }
    elif is_seq(scopes):
        # multiple fields -> multi_match query
        _query = {
            "multi_match": {
                "query": "{}".format(bid),
                "fields": scopes,
                "operator": "and"
            }
        }
    else:
        # BUG FIX: str.format was called on a '%s' placeholder, so the
        # offending type was never interpolated into the message
        raise ValueError('"scopes" cannot be "{}" type'.format(type(scopes)))
    _q = {"query": _query}
    self._query_options.pop("query", None)  # avoid "query" being overwritten by self._query_options
    _q.update(self._query_options)
    return _q
def return_raw_query_json(self, query):
    '''Return valid JSON if `rawquery` option is selected.
    This is necessary as queries can span multiple lines (POST)'''
    payload = query.get('body', {'GET': query.get('bid')})
    # a multi-line string body (e.g. msearch) must be wrapped to stay valid JSON
    is_multiline = is_str(payload) and len(payload.split('\n')) > 1
    self.return_json({'body': payload} if is_multiline else payload)
def transfer_debug(self, key, other):
    """Copy the debug entry for *key* from *other* into this IDStruct object."""
    # keys are normalized to lower case throughout the debug structure
    normalized = key.lower() if is_str(key) else key
    self.debug[normalized] = other.get_debug(normalized)
def return_raw_query_json(self, query, status_code=200, _format='json'):
    '''Return valid JSON if `rawquery` option is selected.
    This is necessary as queries can span multiple lines (POST)'''
    payload = query.get('body', {'GET': query.get('bid')})
    # a multi-line string body (e.g. msearch) must be wrapped to stay valid JSON
    is_multiline = is_str(payload) and len(payload.split('\n')) > 1
    body = {'body': payload} if is_multiline else payload
    self.return_object(body, status_code=status_code, _format=_format)
def get_changes(self, source_col, use_parallel=True):
    """Diff *source_col* (object or collection name) against the target
    collection and return the change set, tagged with source name/timestamp."""
    target_col = self._target_col
    # resolve a collection name to the actual collection object
    if is_str(source_col):
        source_col = self._db[source_col]
    src_backend = GeneDocMongoDBBackend(source_col)
    tgt_backend = GeneDocMongoDBBackend(target_col)
    changes = diff_collections(tgt_backend, src_backend,
                               use_parallel=use_parallel, step=self.step)
    if changes:
        changes['source'] = source_col.name
        changes['timestamp'] = _get_timestamp(source_col.name)
    return changes
def validate_src(self, collection, return_false=False,
                 return_none=False, return_true=False,
                 verbose=False, flag_invalid=False, generator=False):
    '''Validate hgvs ids from a src collection.

    Args:
        collection: a collection object, or the name (str) of a collection
            in the src db.
        return_false/return_none/return_true: collect the ids whose
            validation result matches, under the corresponding key of the
            returned dict.
        verbose: passed through to ``self.validate_hgvs``.
        flag_invalid: if True, mark invalid documents in the collection
            with ``unmatched_ref="True"``.
        generator: currently unused; kept for interface compatibility.

    Returns:
        dict with a 'summary' count dict, plus one list per requested
        True/False/None bucket.
    '''
    return_dict = {
        False: return_false,
        True: return_true,
        None: return_none
    }
    # read in the collection from mongodb
    if is_str(collection):
        src = get_src_db()
        _coll = src[collection]
    else:
        _coll = collection
    cursor = doc_feeder(_coll, step=10000)
    out = {}
    print_only = not (return_false or return_none or return_true)
    if not print_only:
        # output dictionary, three keys: 'false','true','none'
        for k in return_dict:
            if return_dict[k]:
                out[k] = []
    # initialize the count
    cnt_d = {True: 0, False: 0, None: 0}  # cnt_d
    # validate each item in the cursor
    for item in cursor:
        _id = item['_id']
        valid = self.validate_hgvs(_id, verbose=verbose)
        if valid == False and flag_invalid:
            # BUG FIX: flag the resolved collection object (_coll), not the
            # raw `collection` argument, which may be a plain string name
            _coll.update({"_id": _id}, {'$set': {
                "unmatched_ref": "True"
            }})
        cnt_d[valid] += 1
        if return_dict[valid]:
            out[valid].append(_id)
    # print out counts
    print("\n# of VALID HGVS IDs:\t{0}".format(cnt_d[True]))
    print("# of INVALID HGVS IDs:\t{0}".format(cnt_d[False]))
    print("# of HGVS IDs skipped:\t {0}".format(cnt_d[None]))
    out['summary'] = cnt_d
    return out
def get_debug(self, key):
    """Return the debug info stored for *key*, or a placeholder string."""
    # list keys are unhashable; report the type instead
    if isinstance(key, list):
        return 'type(list)'
    # keys are normalized to lower case throughout the debug structure
    if is_str(key):
        key = key.lower()
    try:
        return self.debug[key]
    except KeyError:
        return 'not-available'
def _cleaned_fields(self, fields):
    """return a cleaned fields parameter.
        should be either None (return all fields) or a list fields.
    """
    if not fields:
        return self._default_fields
    if is_str(fields):
        if fields.lower() == "all":
            return None  # all fields will be returned
        return [part.strip() for part in fields.split(",")]
    # already a list (or similar) -> pass through unchanged
    return fields
def _alias_input_args(self, args):
    """Fill in canonical argument names in *args* from any configured aliases."""
    aliases = {arg: setting['alias']
               for (arg, setting) in self.kwarg_settings.items()
               if 'alias' in setting}
    for (target, src) in aliases.items():
        if is_str(src) and src in args:
            # single alias
            args.setdefault(target, args[src])
        elif is_seq(src):
            # several candidate aliases: first one present in args wins
            for candidate in src:
                if candidate in args:
                    args.setdefault(target, args[candidate])
                    break
    return args
def set_debug(self, left, label, right):
    """Record a (left -> right) debug entry, optionally tagged with *label*."""
    # keys/values are normalized to lower case when they are strings
    if is_str(left):
        left = left.lower()
    if is_str(right):
        right = right.lower()
    # remove duplicates in the debug structure
    # - duplicates in the structure itself are handled elsewhere
    if isinstance(right, list):
        right = list(set(right))
        # if there is only one element in the list, collapse
        if len(right) == 1:
            right = right.pop()
    # capture the label if it is used
    if label:
        right = (label, right)
    # always rebind a fresh list: entries may be aliased across objects
    # (see transfer_debug), so in-place append would be unsafe
    try:
        existing = self.debug[left]
    except KeyError:
        existing = [left]  # a brand-new entry is seeded with its own key
    self.debug[left] = existing + [right]
def report(self, struct, drep, orig_struct=None):
    """Update the count/min/max stats in drep[self.key] with this value."""
    # sized values (strings, dicts, lists) are reported by their length,
    # everything else by its own value
    val = len(struct) if is_str(struct) or isinstance(struct, (dict, list)) else struct
    stats = drep[self.key]
    stats["_count"] += 1
    if val is None:
        stats["_none"] += 1
        return
    if val < stats["_min"]:
        stats["_min"] = val
    if val > stats["_max"]:
        stats["_max"] = val
def post(self, mapt, mode, clean):
    """Recursively post-process a type map: expand '__vals' into stats in
    'deepstats' mode and optionally strip internal '__'-prefixed keys."""
    if isinstance(mapt, dict):
        # iterate over a snapshot since keys may be popped during the loop
        for key in list(mapt.keys()):
            internal = is_str(key) and key.startswith("__")
            if not internal:
                self.post(mapt[key], mode, clean)
                continue
            if key == "__vals" and mode == "deepstats":
                vals = mapt["__vals"]
                # stdev needs at least two data points
                if len(vals) > 1:
                    mapt["_stdev"] = statistics.stdev(vals)
                    mapt["_median"] = statistics.median(vals)
                    mapt["_mean"] = statistics.mean(vals)
            if clean:
                mapt.pop(key)
    elif isinstance(mapt, list):
        for item in mapt:
            self.post(item, mode, clean)
def _cleaned_scopes(self, scopes):
    """return a cleaned scopes parameter.
        should be either a string or a list of scope fields.
    """
    if not scopes:
        return None
    if is_str(scopes):
        # comma-separated string -> list of trimmed field names
        scopes = [part.strip() for part in scopes.split(",")]
    if not is_seq(scopes):
        return None
    # drop empty entries; collapse a single-field list to the field itself
    cleaned = [field for field in scopes if field]
    return cleaned[0] if len(cleaned) == 1 else cleaned
def _configure_by_kwargs(self, **kwargs):
    """Build self.settings_str, a Python settings-module source string, from
    keyword arguments.

    APP_LIST values are rendered as a list of (regex, handler) tuples;
    ES_QUERY_BUILDER / ES_QUERY / ES_RESULT_TRANSFORMER values are emitted
    unquoted (class names); string values are quoted; everything else is
    emitted via its repr-like str formatting.
    """
    self.settings_str = """from biothings.www.settings.default import *\nfrom biothings.www.api.es.handlers import *\n"""
    for (k, v) in kwargs.items():
        if k == 'APP_LIST':
            self.settings_str += '{k}=['.format(k=k)
            for (reg, handler_str) in v:
                self.settings_str += "(r'{reg}', {handler}),".format(
                    reg=reg, handler=handler_str)
            self.settings_str += ']\n'
        elif k in [
                'ES_QUERY_BUILDER', 'ES_QUERY', 'ES_RESULT_TRANSFORMER'
        ]:
            # BUG FIX: the literal '{k}={v}\n' was appended without .format(),
            # so the key/value were never interpolated
            self.settings_str += '{k}={v}\n'.format(k=k, v=v)
        elif is_str(v):
            self.settings_str += '{k}="{v}"\n'.format(k=k, v=v)
        else:
            # BUG FIX: same missing .format() call as above
            self.settings_str += '{k}={v}\n'.format(k=k, v=v)
def validate_src(self, collection, return_false=False,
                 return_none=False, return_true=False,
                 verbose=False, flag_invalid=False, generator=False):
    '''Validate hgvs ids from a src collection.

    Args:
        collection: a collection object, or the name (str) of a collection
            in the src db.
        return_false/return_none/return_true: collect the ids whose
            validation result matches, under the corresponding key of the
            returned dict.
        verbose: passed through to ``self.validate_hgvs``.
        flag_invalid: if True, mark invalid documents in the collection
            with ``unmatched_ref="True"``.
        generator: currently unused; kept for interface compatibility.

    Returns:
        dict with a 'summary' count dict, plus one list per requested
        True/False/None bucket.
    '''
    return_dict = {
        False: return_false,
        True: return_true,
        None: return_none
    }
    # read in the collection from mongodb
    if is_str(collection):
        src = get_src_db()
        _coll = src[collection]
    else:
        _coll = collection
    cursor = doc_feeder(_coll, step=10000)
    out = {}
    print_only = not (return_false or return_none or return_true)
    if not print_only:
        # output dictionary, three keys: 'false','true','none'
        for k in return_dict:
            if return_dict[k]:
                out[k] = []
    # initialize the count
    cnt_d = {True: 0, False: 0, None: 0}  # cnt_d
    # validate each item in the cursor
    for item in cursor:
        _id = item['_id']
        valid = self.validate_hgvs(_id, verbose=verbose)
        if valid == False and flag_invalid:
            # BUG FIX: flag the resolved collection object (_coll), not the
            # raw `collection` argument, which may be a plain string name
            _coll.update({"_id": _id}, {'$set': {"unmatched_ref": "True"}})
        cnt_d[valid] += 1
        if return_dict[valid]:
            out[valid].append(_id)
    # print out counts
    print("\n# of VALID HGVS IDs:\t{0}".format(cnt_d[True]))
    print("# of INVALID HGVS IDs:\t{0}".format(cnt_d[False]))
    print("# of HGVS IDs skipped:\t {0}".format(cnt_d[None]))
    out['summary'] = cnt_d
    return out
def _get_cleaned_timestamp(self, timestamp):
    """Normalize *timestamp* to a datetime; strings are parsed as YYYYMMDD."""
    parsed = datetime.strptime(timestamp, '%Y%m%d') if is_str(timestamp) else timestamp
    assert isinstance(parsed, datetime)
    return parsed
def inspect(struct, key=None, mapt=None, mode="type", level=0, logger=logging):
    """Explore struct and report types contained in it.

    Recursively walks *struct* and records the types encountered into a
    nested "type map" dict keyed by field names and Python type objects.

    Args:
        struct: the data structure to explore.
        key: name of the field currently being explored (internal, passed
            down during recursion).
        mapt: if not None, will complete that type map with passed struct.
            This is useful when iterating over a dataset of similar data,
            trying to find a good type summary contained in that dataset.
        mode: see inspect_docs() documentation.
        level: is for internal purposes, mostly debugging.
        logger: logger object (unused in this body).

    Returns:
        The (possibly newly-created) type map dict.
    """
    mode_inst = get_mode_layer(mode)
    # init recording structure if none were passed
    if mapt is None:
        mapt = {}
    # if type(struct) == dict: # TODO: remove this line
    if isinstance(struct, dict):
        # was this struct already explored before ? was it a list for that previous doc ?
        # then we have to pretend here it's also a list even if not, because we want to
        # report the list structure
        for k in struct:
            if mapt and list in mapt:  # and key == k:
                already_explored_as_list = True
            else:
                already_explored_as_list = False
            # NOTE(review): the list-merging branch below is disabled
            # (`if False`), so already_explored_as_list is currently unused
            if False:  # already_explored_as_list: # TODO: check this
                mapt[list].setdefault(k, {})
                typ = inspect(struct[k], key=k, mapt=mapt[list][k], mode=mode, level=level + 1)
                mapt[list].update({k: typ})
            else:
                mapt.setdefault(k, {})
                typ = inspect(struct[k], key=k, mapt=mapt[k], mode=mode, level=level + 1)
        if mode_inst:
            mapt.setdefault(mode_inst.key, copy.deepcopy(mode_inst.template[mode_inst.key]))
            mode_inst.report(1, mapt, struct)
    elif type(struct) == list:
        mapl = {}
        for e in struct:
            typ = inspect(e, key=key, mapt=mapl, mode=mode, level=level + 1)
            mapl.update(typ)
        if mode_inst:
            # here we just report that one document had a list
            mapl.update(copy.deepcopy(mode_inst.template))
            mode_inst.report(struct, mapl)
        # if mapt exist, it means it's been explored previously but not as a list,
        # instead of mixing dict and list types, we want to normalize so we merge the previous
        # struct into that current list
        if mapt and list in mapt:
            mapt[list] = merge_record(mapt[list], mapl, mode)
        else:
            mapt.setdefault(list, {})
            mapt[list].update(mapl)
    # elif is_scalar(struct) or type(struct) == datetime: # TODO: remove this line
    elif is_scalar(struct) or isinstance(struct, datetime):
        typ = type(struct)
        if mode == "type":
            mapt[typ] = {}
        elif mode == "mapping":
            # some type precedence processing...
            # splittable string ?
            if is_str(struct) and len(re.split(" +", struct.strip())) > 1:
                mapt[splitstr] = {}
            elif typ == bson.int64.Int64:
                mapt[int] = {}
            # we know struct is a scalar. NaN and Inf can't be indexed on ES,
            # need to catch those
            elif isinstance(struct, float) and math.isnan(struct):
                mapt[nan] = {}
            elif isinstance(struct, float) and math.isinf(struct):
                mapt[inf] = {}
            else:
                mapt[typ] = {}
            # splitstr > str
            if str in mapt and splitstr in mapt:
                mapt.pop(str)
            # float > int
            # TODO: could this be moved to es.generate_es_mapping ?
            if int in mapt and float in mapt:
                mapt.pop(int)
        else:
            mapt.setdefault(typ, copy.deepcopy(mode_inst.template))
            mode_inst.report(struct, mapt[typ])
    else:
        raise TypeError("Can't analyze type %s (data was: %s)" % (type(struct), struct))
    return mapt
def generate_json_schema(dmap):
    """Convert an inspect()-style type map into a JSON-schema-like dict.

    Keys of *dmap* are either document field names (str), Python type
    objects (``list`` means "array", scalar types translate through
    *scalarmap*), or None.

    Returns:
        dict: a schema fragment with "type" and, where applicable,
        "properties" / "items".
    """
    scalarmap = {
        str: "string",
        int: "integer",
        float: "number",
        bool: "boolean",
        bson.int64.Int64: "number",
        None: "null",
    }

    def merge_type(typ1, typ2):
        # Merge two schema "type" values into one deduplicated list.
        if isinstance(typ1, list):
            if isinstance(typ2, list):
                typ1.extend(typ2)
            else:
                typ1.append(typ2)
        elif isinstance(typ2, list):
            # BUG FIX: was `[typ1] + typ1`, which duplicated typ1 and
            # silently dropped typ2
            typ1 = [typ1] + typ2
        else:
            typ1 = [typ1, typ2]
        return list(set(typ1))

    schema = {}
    if isinstance(dmap, dict):
        for k in dmap:
            if is_str(k):
                # field name -> object property
                esch = generate_json_schema(dmap[k])
                if schema:
                    if schema["type"] == "object":
                        # we just complete 'properties', key already defined previously
                        pass
                    elif schema["type"] == "array":
                        if not schema.get("properties"):
                            schema["properties"] = {}
                        schema["type"] = merge_type(schema["type"], "object")
                    elif isinstance(schema["type"], list):
                        assert set(schema["type"]) == {"object", "array"}
                    else:
                        raise Exception("Previous schema type not expected: %s" % schema["type"])
                else:
                    schema = {"type": "object", "properties": {}}
                schema["properties"][k] = esch
            elif isinstance(k, type):
                if k == list:
                    if schema:
                        # already defined for this key, mixed types
                        schema.update({"items": {}})
                        schema["type"] = merge_type(schema["type"], "array")
                    else:
                        schema = {"type": "array", "items": {}}
                    # compute once (previously computed twice with the first
                    # result discarded)
                    schema["items"] = generate_json_schema(dmap[k])
                else:
                    # scalar type
                    if schema:
                        schema["type"] = merge_type(schema["type"], scalarmap[k])
                    else:
                        schema = {"type": scalarmap[k]}
            elif k is None:
                schema = {"type": None}
            else:
                raise Exception("no not here, k: %s" % k)
    return schema