Ejemplo n.º 1
0
    def _children_query(self, ids, has_gene=True, include_self=False, raw=False):
        """Look up the taxonomy ids whose ``lineage`` matches the given id(s).

        Args:
            ids: a single taxid (string or int) or a sequence of taxids.
            has_gene: when true, ``AND has_gene:true`` is added to each query.
            include_self: when true, every queried taxid is part of its own
                result list; otherwise it is filtered out of the hits.
            raw: only honoured for a single taxid; the search response is
                returned untouched.

        Returns:
            A dict mapping each queried taxid (exactly as it was passed in) to
            a sorted list of int taxids truncated to ``self.max_taxid_count``.
            The raw search response when ``raw`` is set on a single-taxid
            query. ``{}`` when ``ids`` is an empty sequence, has an unsupported
            type, or the multi-search response does not line up with the
            request.
        """
        is_scalar_id = is_str(ids) or isinstance(ids, int)
        if is_scalar_id or (is_seq(ids) and len(ids) == 1):
            _ids = ids if is_scalar_id else ids[0]
            _qstring = "lineage:{}".format(_ids)
            if has_gene:
                _qstring += " AND has_gene:true"
            res = self.options.es_client.search(
                body={"query": {"query_string": {"query": _qstring}}},
                index=self.options.index, doc_type=self.options.doc_type,
                fields='_id', size=self.max_taxid_count)

            if raw:
                return res

            # The queried id may be an int or a string while hit ids need int()
            # below, so both sides are compared as strings and the result only
            # ever holds ints (mixed int/str lists cannot be sorted on Python 3).
            self_id = str(_ids)
            found = {int(hit['_id']) for hit in res['hits']['hits']
                     if include_self or str(hit['_id']) != self_id}
            if include_self:
                found.add(int(_ids))
            return {_ids: sorted(found)[:self.max_taxid_count]}
        elif is_seq(ids):
            if len(ids) == 0:
                # nothing to ask for; avoid sending an empty msearch body
                return {}
            # one header line ("{}") plus one body line per taxid
            gene_tpl = '{{}}\n{{"size": {}, "_source": ["_id"], "query": {{"query_string":{{"query": "lineage:{} AND has_gene:true"}}}}}}'
            plain_tpl = '{{}}\n{{"size": {}, "_source": ["_id"], "query":{{"query_string":{{"query":"lineage:{}"}}}}}}'
            template = gene_tpl if has_gene else plain_tpl
            qs = '\n'.join(template.format(self.max_taxid_count, taxid) for taxid in ids)
            res = self.options.es_client.msearch(body=qs, index=self.options.index, doc_type=self.options.doc_type)
            if 'responses' not in res or len(res['responses']) != len(ids):
                return {}

            # responses are matched to the queried ids by position
            collected = {}
            for (taxid, response) in zip(ids, res['responses']):
                self_id = str(taxid)
                found = collected.setdefault(taxid, set())
                found.update(int(h['_id']) for h in response['hits']['hits']
                             if include_self or str(h['_id']) != self_id)
                if include_self:
                    found.add(int(taxid))
            return {taxid: sorted(found)[:self.max_taxid_count]
                    for (taxid, found) in collected.items()}
        else:
            return {}
Ejemplo n.º 2
0
 def build_id_query(self, bid, scopes=None):
     """Build the query body that matches ``bid`` against one or more fields.

     Args:
         bid: the id to look up; it is rendered with ``"{}".format`` first.
         scopes: a single field name (string) or a sequence of field names.
             A falsy value falls back to ``'_id'``.

     Returns:
         A dict whose ``"query"`` entry is a ``match`` query (single field) or
         a ``multi_match`` query (several fields), completed with the entries
         of ``self._query_options``.

     Raises:
         ValueError: if ``scopes`` is neither a string nor a sequence.

     Side effect: a ``"query"`` key, if present, is removed from
     ``self._query_options``.
     """
     _default_scopes = '_id'
     scopes = scopes or _default_scopes
     if is_str(scopes):
         _query = {
             "match": {
                 scopes: {
                     "query": "{}".format(bid),
                     "operator": "and"
                 }
             }
         }
     elif is_seq(scopes):
         _query = {
             "multi_match": {
                 "query": "{}".format(bid),
                 "fields": scopes,
                 "operator": "and"
             }
         }
     else:
         # str.format only substitutes "{}" placeholders; a "%s" would be
         # emitted verbatim and the offending type would never be shown.
         raise ValueError('"scopes" cannot be "{}" type'.format(type(scopes)))
     _q = {"query": _query}
     self._query_options.pop("query", None)    # avoid "query" be overwritten by self.query_options
     _q.update(self._query_options)
     return _q
Ejemplo n.º 3
0
 def return_raw_query_json(self, query):
     '''Send the raw query back as valid JSON when `rawquery` is selected.

     The payload is ``query['body']`` or, when absent, ``{'GET': <bid>}``.
     A string payload spanning several lines (POST queries can) is wrapped
     in a dict under ``'body'``; anything else is returned unchanged.'''
     payload = query.get('body', {'GET': query.get('bid')})
     spans_lines = is_str(payload) and len(payload.split('\n')) > 1
     self.return_json({'body': payload} if spans_lines else payload)
Ejemplo n.º 4
0
 def transfer_debug(self, key, other):
     """
     Copy the debug information of one key from another IDStruct object.

     String keys are lower-cased before being used for both the lookup
     in ``other`` and the entry stored in ``self.debug``.
     """
     lookup = key.lower() if is_str(key) else key
     self.debug[lookup] = other.get_debug(lookup)
Ejemplo n.º 5
0
 def return_raw_query_json(self, query, status_code=200, _format='json'):
     '''Send the raw query back as valid JSON when `rawquery` is selected.

     The payload is ``query['body']`` or, when absent, ``{'GET': <bid>}``.
     A string payload spanning several lines (POST queries can) is wrapped
     in a dict under ``'body'``. ``status_code`` and ``_format`` are handed
     to ``self.return_object`` as given.'''
     payload = query.get('body', {'GET': query.get('bid')})
     if is_str(payload) and len(payload.split('\n')) > 1:
         payload = {'body': payload}
     self.return_object(payload, status_code=status_code, _format=_format)
Ejemplo n.º 6
0
    def get_changes(self, source_col, use_parallel=True):
        """Diff the target collection against a source collection.

        ``source_col`` is either a collection object or a collection name,
        which is looked up in ``self._db``. When the diff is non-empty it is
        tagged with the source collection name and the timestamp derived
        from that name before being returned.
        """
        target_col = self._target_col
        if is_str(source_col):
            source_col = self._db[source_col]

        src = GeneDocMongoDBBackend(source_col)
        target = GeneDocMongoDBBackend(target_col)
        changes = diff_collections(target, src, use_parallel=use_parallel, step=self.step)
        if not changes:
            return changes

        source_name = source_col.name
        changes['source'] = source_name
        changes['timestamp'] = _get_timestamp(source_name)
        return changes
Ejemplo n.º 7
0
    def validate_src(self,
                     collection,
                     return_false=False,
                     return_none=False,
                     return_true=False,
                     verbose=False,
                     flag_invalid=False,
                     generator=False):
        '''Validate hgvs ids from a src collection.

        Args:
            collection: a collection object, or a collection name that is
                looked up in the database returned by ``get_src_db()``.
            return_false: collect the ids whose validation result is False.
            return_none: collect the ids whose validation result is None.
            return_true: collect the ids whose validation result is True.
            verbose: forwarded to ``self.validate_hgvs``.
            flag_invalid: when set, each document whose id validates to False
                gets ``unmatched_ref`` set to ``"True"`` in the collection.
            generator: not used by this method.

        Returns:
            A dict with a ``'summary'`` entry (counts keyed by True, False and
            None) plus, for every requested outcome, the list of matching ids
            keyed by that outcome. The counts are also printed.
        '''

        return_dict = {
            False: return_false,
            True: return_true,
            None: return_none
        }

        # resolve the collection object when only its name was given
        if is_str(collection):
            _coll = get_src_db()[collection]
        else:
            _coll = collection
        cursor = doc_feeder(_coll, step=10000)

        # one (initially empty) id list per requested outcome
        out = {outcome: [] for (outcome, wanted) in return_dict.items() if wanted}

        cnt_d = {True: 0, False: 0, None: 0}
        for item in cursor:
            _id = item['_id']
            valid = self.validate_hgvs(_id, verbose=verbose)
            if valid == False and flag_invalid:
                # Update through the resolved collection object: `collection`
                # may be a plain name, which has no update() method.
                _coll.update({"_id": _id},
                             {'$set': {
                                 "unmatched_ref": "True"
                             }})
            cnt_d[valid] += 1
            if return_dict[valid]:
                out[valid].append(_id)

        # print out counts
        print("\n# of VALID HGVS IDs:\t{0}".format(cnt_d[True]))
        print("# of INVALID HGVS IDs:\t{0}".format(cnt_d[False]))
        print("# of HGVS IDs skipped:\t {0}".format(cnt_d[None]))

        out['summary'] = cnt_d
        return out
Ejemplo n.º 8
0
 def get_debug(self, key):
     """Return the debug information recorded for ``key``.

     String keys are lower-cased first. A list key yields the placeholder
     ``'type(list)'`` and a key with no record yields ``'not-available'``.
     """
     lookup = key.lower() if is_str(key) else key
     if isinstance(lookup, list):
         return 'type(list)'
     try:
         info = self.debug[lookup]
     except KeyError:
         info = 'not-available'
     return info
Ejemplo n.º 9
0
 def _cleaned_fields(self, fields):
     """Normalise the fields parameter.

         A falsy value gives ``self._default_fields``; the string "all"
         (any case) gives None, meaning every field; any other string is
         split on commas into a list of stripped names; non-string values
         are returned untouched.
     """
     if not fields:
         return self._default_fields
     if not is_str(fields):
         return fields
     if fields.lower() == "all":
         # None means: all fields will be returned
         return None
     return [name.strip() for name in fields.split(",")]
Ejemplo n.º 10
0
 def _alias_input_args(self, args):
     """Copy values passed under an alias to their canonical argument name.

     For every entry of ``self.kwarg_settings`` that declares an ``'alias'``
     (one name or a sequence of names), the first alias found in ``args``
     supplies the value of the canonical name, unless that name is already
     present. ``args`` is modified in place and returned.
     """
     for (target, setting) in self.kwarg_settings.items():
         if 'alias' not in setting:
             continue
         src = setting['alias']
         if is_str(src) and src in args:
             args.setdefault(target, args[src])
         elif is_seq(src):
             for candidate in src:
                 if candidate not in args:
                     continue
                 args.setdefault(target, args[candidate])
                 break
     return args
Ejemplo n.º 11
0
 def set_debug(self, left, label, right):
     """Record a debug value ``right`` (optionally labelled) under key ``left``."""
     # string keys and values are stored lower-cased
     left = left.lower() if is_str(left) else left
     right = right.lower() if is_str(right) else right
     # drop duplicates inside a list value (duplicates in the structure
     # itself are handled elsewhere); a one-element list collapses to
     # that element
     if isinstance(right, list):
         unique = list(set(right))
         right = unique[0] if len(unique) == 1 else unique
     # keep the label next to the value when one is given
     if label:
         right = (label, right)
     try:
         recorded = self.debug[left]
     except KeyError:
         # first record for this key: the list starts with the key itself
         self.debug[left] = [left, right]
     else:
         self.debug[left] = recorded + [right]
Ejemplo n.º 12
0
 def report(self, struct, drep, orig_struct=None):
     """Update the count / none / min / max statistics in ``drep[self.key]``.

     Strings, dicts and lists are measured by their length; any other
     value is used as is. ``orig_struct`` is not used here.
     """
     has_length = is_str(struct) or isinstance(struct, (dict, list))
     val = len(struct) if has_length else struct
     stats = drep[self.key]
     stats["_count"] += 1
     if val is None:
         stats["_none"] += 1
         return
     if val < stats["_min"]:
         stats["_min"] = val
     if val > stats["_max"]:
         stats["_max"] = val
Ejemplo n.º 13
0
    def get_changes(self, source_col, use_parallel=True):
        """Return the differences between the target and a source collection.

        A string ``source_col`` is treated as a collection name and resolved
        through ``self._db``. A non-empty result also carries the source
        collection name and the timestamp computed from that name.
        """
        target_col = self._target_col
        if is_str(source_col):
            source_col = self._db[source_col]

        src = GeneDocMongoDBBackend(source_col)
        target = GeneDocMongoDBBackend(target_col)
        diff_options = {"use_parallel": use_parallel, "step": self.step}
        changes = diff_collections(target, src, **diff_options)
        if not changes:
            return changes

        name = source_col.name
        changes['source'] = name
        changes['timestamp'] = _get_timestamp(name)
        return changes
Ejemplo n.º 14
0
 def post(self, mapt, mode, clean):
     """Walk ``mapt`` recursively and post-process its ``__``-prefixed keys.

     In "deepstats" mode a ``__vals`` entry with more than one value gives
     ``_stdev``, ``_median`` and ``_mean`` entries. When ``clean`` is true
     the ``__``-prefixed keys are removed. Everything happens in place.
     """
     if isinstance(mapt, list):
         for element in mapt:
             self.post(element, mode, clean)
         return
     if not isinstance(mapt, dict):
         return
     # iterate over a snapshot of the keys: the dict is modified below
     for k in list(mapt.keys()):
         if not (is_str(k) and k.startswith("__")):
             self.post(mapt[k], mode, clean)
             continue
         if k == "__vals" and mode == "deepstats" and len(mapt["__vals"]) > 1:
             vals = mapt["__vals"]
             mapt["_stdev"] = statistics.stdev(vals)
             mapt["_median"] = statistics.median(vals)
             mapt["_mean"] = statistics.mean(vals)
         if clean:
             mapt.pop(k)
Ejemplo n.º 15
0
 def _cleaned_scopes(self, scopes):
     """Normalise the scopes parameter.

         A comma-separated string is split into stripped names. Empty
         entries of a sequence are dropped, and a sequence left with a
         single entry collapses to that entry. A falsy value, or one that
         is neither a string nor a sequence, gives None.
     """
     if not scopes:
         return None
     if is_str(scopes):
         scopes = [part.strip() for part in scopes.split(",")]
     if not is_seq(scopes):
         return None
     kept = [scope for scope in scopes if scope]
     return kept[0] if len(kept) == 1 else kept
Ejemplo n.º 16
0
 def _configure_by_kwargs(self, **kwargs):
     self.settings_str = """from biothings.www.settings.default import *\nfrom biothings.www.api.es.handlers import *\n"""
     for (k, v) in kwargs.items():
         if k == 'APP_LIST':
             self.settings_str += '{k}=['.format(k=k)
             for (reg, handler_str) in v:
                 self.settings_str += "(r'{reg}', {handler}),".format(
                     reg=reg, handler=handler_str)
             self.settings_str += ']\n'
         elif k in [
                 'ES_QUERY_BUILDER', 'ES_QUERY', 'ES_RESULT_TRANSFORMER'
         ]:
             self.settings_str += '{k}={v}\n'
         elif is_str(v):
             self.settings_str += '{k}="{v}"\n'.format(k=k, v=v)
         else:
             self.settings_str += '{k}={v}\n'
Ejemplo n.º 17
0
    def validate_src(self, collection, return_false=False,
                     return_none=False, return_true=False, verbose=False, flag_invalid=False, generator=False):
        '''Validate hgvs ids from a src collection.

        Args:
            collection: a collection object, or a collection name that is
                looked up in the database returned by ``get_src_db()``.
            return_false: collect the ids whose validation result is False.
            return_none: collect the ids whose validation result is None.
            return_true: collect the ids whose validation result is True.
            verbose: forwarded to ``self.validate_hgvs``.
            flag_invalid: when set, each document whose id validates to False
                gets ``unmatched_ref`` set to ``"True"`` in the collection.
            generator: not used by this method.

        Returns:
            A dict with a ``'summary'`` entry (counts keyed by True, False and
            None) plus, for every requested outcome, the list of matching ids
            keyed by that outcome. The counts are also printed.
        '''

        return_dict = {
            False: return_false,
            True: return_true,
            None: return_none
        }

        # resolve the collection object when only its name was given
        if is_str(collection):
            _coll = get_src_db()[collection]
        else:
            _coll = collection
        cursor = doc_feeder(_coll, step=10000)

        # one (initially empty) id list per requested outcome
        out = {outcome: [] for (outcome, wanted) in return_dict.items() if wanted}

        cnt_d = {True: 0, False: 0, None: 0}
        for item in cursor:
            _id = item['_id']
            valid = self.validate_hgvs(_id, verbose=verbose)
            if valid == False and flag_invalid:
                # Update through the resolved collection object: `collection`
                # may be a plain name, which has no update() method.
                _coll.update({"_id": _id}, {'$set': {"unmatched_ref": "True"}})
            cnt_d[valid] += 1
            if return_dict[valid]:
                out[valid].append(_id)

        # print out counts
        print("\n# of VALID HGVS IDs:\t{0}".format(cnt_d[True]))
        print("# of INVALID HGVS IDs:\t{0}".format(cnt_d[False]))
        print("# of HGVS IDs skipped:\t {0}".format(cnt_d[None]))

        out['summary'] = cnt_d
        return out
Ejemplo n.º 18
0
 def _get_cleaned_timestamp(self, timestamp):
     """Return ``timestamp`` as a datetime.

     A string is parsed with the ``%Y%m%d`` format; any other value must
     already be a datetime instance.
     """
     cleaned = datetime.strptime(timestamp, '%Y%m%d') if is_str(timestamp) else timestamp
     assert isinstance(cleaned, datetime)
     return cleaned
Ejemplo n.º 19
0
 def _get_cleaned_timestamp(self, timestamp):
     """Convert a ``%Y%m%d`` string into a datetime; pass datetimes through.

     The result is asserted to be a datetime instance.
     """
     if not is_str(timestamp):
         parsed = timestamp
     else:
         parsed = datetime.strptime(timestamp, '%Y%m%d')
     assert isinstance(parsed, datetime)
     return parsed
Ejemplo n.º 20
0
def inspect(struct, key=None, mapt=None, mode="type", level=0, logger=logging):
    """
    Explore struct and report types contained in it.

    Args:
        struct: is the data structure to explore
        key: key under which struct is stored in its parent dict, if any; it is
             only forwarded to the recursive calls
        mapt: if not None, will complete that type map with passed struct. This is useful
              when iterating over a dataset of similar data, trying to find a good type summary
              contained in that dataset.
        mode: see inspect_docs() documentation
        level: is for internal purposes, mostly debugging
        logger: not referenced in the body of this function

    Returns:
        The type map: `mapt` itself (updated in place) when one was passed,
        a new dict otherwise.

    Raises:
        TypeError: if struct is not a dict, a list, a datetime, or a value
                   accepted by is_scalar().
    """

    # presumably None for the modes handled inline below ("type", "mapping") -- TODO confirm
    mode_inst = get_mode_layer(mode)

    # init recording structure if none were passed
    if mapt is None:
        mapt = {}

    # if type(struct) == dict:    # TODO: remove this line
    if isinstance(struct, dict):
        # was this struct already explored before ? was it a list for that previous doc ?
        # then we have to pretend here it's also a list even if not, because we want to
        # report the list structure
        for k in struct:
            # NOTE: this flag is computed but never read, the branch that
            # would use it is disabled by the "if False" below
            if mapt and list in mapt:  # and key == k:
                already_explored_as_list = True
            else:
                already_explored_as_list = False
            if False:  # already_explored_as_list:      # TODO: check this
                mapt[list].setdefault(k, {})
                typ = inspect(struct[k],
                              key=k,
                              mapt=mapt[list][k],
                              mode=mode,
                              level=level + 1)
                mapt[list].update({k: typ})
            else:
                # the sub-map stored under k is completed in place by the
                # recursive call, its return value is not used afterwards
                mapt.setdefault(k, {})
                typ = inspect(struct[k],
                              key=k,
                              mapt=mapt[k],
                              mode=mode,
                              level=level + 1)

        if mode_inst:
            # make sure the mode's own entry exists, then report this dict
            # (report() receives 1 here, not the dict itself)
            mapt.setdefault(mode_inst.key,
                            copy.deepcopy(mode_inst.template[mode_inst.key]))
            mode_inst.report(1, mapt, struct)
    elif type(struct) == list:

        # all elements of the list are accumulated into one shared map
        mapl = {}
        for e in struct:
            # inspect() returns the map it was given, so typ is mapl itself
            typ = inspect(e, key=key, mapt=mapl, mode=mode, level=level + 1)
            mapl.update(typ)
        if mode_inst:
            # here we just report that one document had a list
            mapl.update(copy.deepcopy(mode_inst.template))
            mode_inst.report(struct, mapl)
        # if mapt exist, it means it's been explored previously but not as a list,
        # instead of mixing dict and list types, we want to normalize so we merge the previous
        # struct into that current list
        if mapt and list in mapt:
            mapt[list] = merge_record(mapt[list], mapl, mode)
        else:
            mapt.setdefault(list, {})
            mapt[list].update(mapl)
    # elif is_scalar(struct) or type(struct) == datetime:   # TODO: remove this line
    elif is_scalar(struct) or isinstance(struct, datetime):
        typ = type(struct)
        if mode == "type":
            # plain type mode: only record that this type was seen
            mapt[typ] = {}
        elif mode == "mapping":
            # some type precedence processing...
            # splittable string ?
            if is_str(struct) and len(re.split(" +", struct.strip())) > 1:
                mapt[splitstr] = {}
            elif typ == bson.int64.Int64:
                mapt[int] = {}
            # we know struct is a scalar. NaN and Inf can't be indexed on ES,
            # need to catch those
            elif isinstance(struct, float) and math.isnan(struct):
                mapt[nan] = {}
            elif isinstance(struct, float) and math.isinf(struct):
                mapt[inf] = {}
            else:
                mapt[typ] = {}
            # splitstr > str
            if str in mapt and splitstr in mapt:
                mapt.pop(str)
            # float > int
            # TODO: could this be moved to es.generate_es_mapping ?
            if int in mapt and float in mapt:
                mapt.pop(int)
        else:
            # any other mode: keep one copy of the mode template per type and
            # let the mode layer update it with this value
            mapt.setdefault(typ, copy.deepcopy(mode_inst.template))
            mode_inst.report(struct, mapt[typ])
    else:
        raise TypeError("Can't analyze type %s (data was: %s)" %
                        (type(struct), struct))

    return mapt
Ejemplo n.º 21
0
def generate_json_schema(dmap):
    """Convert a type map into a JSON-schema-like dict.

    Args:
        dmap: a dict whose keys are field names (strings), types (``list``
            or one of the scalar types known to ``scalarmap``) or None.
            Any non-dict value gives an empty schema.

    Returns:
        A dict with a ``"type"`` entry (one type name, or a list of names
        when several kinds were found) and, depending on the keys found,
        ``"properties"`` (field names) and/or ``"items"`` (``list`` key).

    Raises:
        Exception: for a key that is not a string, a type or None, or when a
            field name follows a scalar type for the same map.
        KeyError: for a type key missing from ``scalarmap``.
        AssertionError: when a field name follows a merged type list that is
            not exactly ``{"object", "array"}``.
    """

    scalarmap = {
        str: "string",
        int: "integer",
        float: "number",
        bool: "boolean",
        bson.int64.Int64: "number",
        None: "null",
    }

    def merge_type(typ1, typ2):
        """Return the de-duplicated type names of typ1 and typ2.

        Each argument is either one type name or a list of type names.
        """
        merged = list(typ1) if isinstance(typ1, list) else [typ1]
        if isinstance(typ2, list):
            merged.extend(typ2)
        else:
            merged.append(typ2)
        return list(set(merged))

    schema = {}

    if isinstance(dmap, dict):
        for k in dmap:
            if is_str(k):
                # field name: becomes a property of an object schema
                esch = generate_json_schema(dmap[k])
                if schema:
                    if schema["type"] == "object":
                        # we just complete 'properties', key already defined previously
                        pass
                    elif schema["type"] == "array":
                        if not schema.get("properties"):
                            schema["properties"] = {}
                            schema["type"] = merge_type(schema["type"], "object")
                    elif isinstance(schema["type"], list):
                        assert set(schema["type"]) == {"object", "array"}
                    else:
                        raise Exception("Previous schema type not expected: %s" % schema["type"])

                else:
                    schema = {"type": "object", "properties": {}}
                schema["properties"][k] = esch
            elif isinstance(k, type):
                if k == list:
                    if schema:
                        # already defined for this key, mixed types
                        schema["type"] = merge_type(schema["type"], "array")
                    else:
                        schema = {"type": "array"}
                    # the schema of the list elements is computed only once
                    schema["items"] = generate_json_schema(dmap[k])
                else:
                    if schema:
                        schema["type"] = merge_type(schema["type"], scalarmap[k])
                    else:
                        schema = {"type": scalarmap[k]}
            elif k is None:
                schema = {"type": None}
            else:
                raise Exception("no not here, k: %s" % k)

    return schema