Example #1
def _generate_features(obj_ver_key, obj_data):
    d = obj_data['data']
    if not d.get('features'):
        logger.info(f'Genome {obj_ver_key} has no features')
        return
    verts = []
    edges = []
    wsid = obj_data['info'][6]
    objid = obj_data['info'][0]
    ver = obj_data['info'][4]
    # might want to do this in smaller batches if memory pressure is an issue
    for f in d['features']:
        feature_key = _clean_key(f'{obj_ver_key}_{f["id"]}')
        verts.append({
            '_key': feature_key,
            'workspace_id': wsid,
            'object_id': objid,
            'version': ver,
            'feature_id': f['id']
        })
        edges.append({
            '_key': feature_key,  # make a unique key so overwrites work
            '_from': f'{_OBJ_VER_COLL}/{obj_ver_key}',
            '_to': f'{_WS_FEAT_COLL}/{feature_key}'
        })
    logger.info(f'Saving {len(verts)} features for genome {obj_ver_key}')
    # A failure in either save below could leave the db in a partially written state.
    # The options are 1) rollback, 2) retry, or 3) leave it as is.
    # Rollback is effectively impossible, since an error here implies the RE API isn't reachable.
    # Retry is doable, but should probably be implemented much higher in the stack.
    # So option 3 for now: reindexing will overwrite and fix the data.
    _save(_WS_FEAT_COLL, verts)
    _save(_WS_FEAT_EDGE_COLL, edges)
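
The _clean_key helper used above isn't shown. ArangoDB restricts which characters may appear in a document _key, so a minimal sketch (the exact allowed-character set here is an assumption, not the project's actual rule) could look like:

import re

# Hypothetical sketch of the _clean_key helper: replace any character that is
# not safe for an ArangoDB document _key with an underscore.
_KEY_UNSAFE = re.compile(r"[^A-Za-z0-9_\-:.@()+,=;$!*'%]")

def _clean_key(key: str) -> str:
    return _KEY_UNSAFE.sub('_', key)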
Example #2
def process_sample_set(obj_ver_key: str, obj_data: dict) -> None:
    """
    obj_ver_key: object version key
    obj_data: object data
    """
    # term_bank dictionary for storing arango document information about
    # already encountered terms. mapping of ontology_term -> arango "_id" field
    term_bank: Dict[str, str] = {}
    edges: List[dict] = []
    # iterate per sample
    for sample_info in obj_data['data']['samples']:
        # retrieve the sample metadata
        sample = _get_sample(sample_info)
        sample_version_uuid = _get_sample_version_uuid(sample)
        # term_bank object and edges list passed by reference
        # find terms we know are ontology terms
        _generate_link_information(sample, sample_version_uuid, edges,
                                   term_bank)
    # add a creation timestamp to each edge link (the same value for all edges);
    # allow 20 ms to transport & save each edge
    created_timestamp = _now_epoch_ms() + 20 * len(edges)
    for e in edges:
        e['created'] = created_timestamp
    logger.info(f'Writing {len(edges)} sample -> ontology edges '
                f'for samples in SampleSet {obj_ver_key}')
    # save link in bulk operation
    _save(SAMPLE_ONTOLOGY_COLL, edges)
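
The _now_epoch_ms helper is used here (and in Example #4) for edge timestamps. Example #3 computes int(time.time() * 1000) inline, so the helper presumably amounts to:

import time

# Presumed implementation of _now_epoch_ms, matching the inline
# int(time.time() * 1000) pattern used in Example #3.
def _now_epoch_ms() -> int:
    return int(time.time() * 1000)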
Example #3
def _generate_taxon_edge(obj_ver_key, obj_data):
    if 'taxon_ref' not in obj_data['data']:
        logger.info('No taxon_ref in object; skipping')
        return
    ws_client = WorkspaceClient(url=config()['kbase_endpoint'],
                                token=config()['ws_token'])
    result = ws_client.admin_req('getObjects', {
        'objects': [{'ref': obj_data['data']['taxon_ref']}]
    })
    taxonomy_id = result['data'][0]['data']['taxonomy_id']
    adb_resp = _stored_query('ncbi_fetch_taxon', {
        'id': str(taxonomy_id),
        'ts': int(time.time() * 1000),
    })
    adb_results = adb_resp['results']
    if not adb_results:
        logger.info(f'No taxonomy node in database for id {taxonomy_id}')
        return
    tax_key = adb_results[0]['_key']
    # Create an edge from the ws_object_ver to the taxon
    from_id = f"{_OBJ_VER_COLL}/{obj_ver_key}"
    to_id = f"{_TAX_VER_COLL}/{tax_key}"
    logger.info(f'Creating taxon edge from {from_id} to {to_id}')
    _save(_TAX_EDGE_COLL, [{
        '_from': from_id,
        '_to': to_id,
        'assigned_by': '_system'
    }])
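
The _save helper is also not shown; the project presumably writes through its relation engine API. As a rough stand-in, a direct bulk import with python-arango supports the same on_duplicate option seen in Example #4 (the database name and credentials below are placeholders):

from arango import ArangoClient

# Rough stand-in for the _save helper; the real code writes via the RE API.
_db = ArangoClient(hosts='http://localhost:8529').db(
    'relation_engine', username='user', password='password')

def _save(collection: str, docs: list, on_duplicate: str = 'update') -> None:
    # import_bulk sends all documents in one request; on_duplicate controls
    # whether existing keys are updated, replaced, ignored, or raise an error.
    _db.collection(collection).import_bulk(docs, on_duplicate=on_duplicate)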
Example #4
def _generate_GO_links(obj_ver_key, obj_data):
    d = obj_data['data']
    if not d.get('features'):
        # the 'no features' case was already logged in _generate_features
        return
    f_to_go = {}
    for f in d['features']:
        # this works for Genome-8.2 to 10.0 in production
        if _ONTOLOGY_TERMS in f and _ONTOLOGY_GO_KEY in f[_ONTOLOGY_TERMS]:
            f_to_go[f['id']] = f[_ONTOLOGY_TERMS][_ONTOLOGY_GO_KEY].keys()
    terms_set = {i for items in f_to_go.values() for i in items}  # flatten
    query_time = _now_epoch_ms()
    # might want to do this in smaller batches if memory pressure is an issue
    resolved_terms = _resolve_GO_terms(terms_set, query_time)
    edges = []
    for f, go_ids in f_to_go.items():
        for g in go_ids:
            if g not in resolved_terms:
                logger.info(f"Couldn't resolve GO term {g} in Genome {obj_ver_key} feature {f}")
            else:
                featurekey = _clean_key(f'{obj_ver_key}_{f}')
                edges.append({
                    '_key': f'{featurekey}::{resolved_terms[g]}::kbase_RE_indexer',
                    '_from': f'{_WS_FEAT_COLL}/{featurekey}',
                    '_to': f'{_GO_TERM_COLL}/{resolved_terms[g]}',
                    'source': 'kbase_RE_indexer',
                    'expired': _MAX_ADB_INTEGER
                })
    created_time = _now_epoch_ms() + 20 * len(edges)  # allow 20 ms to transport & save each edge
    for e in edges:
        e['created'] = created_time
    logger.info(f'Writing {len(edges)} feature -> GO edges for genome {obj_ver_key}')
    _save(_WS_FEAT_TO_GO_COLL, edges, on_duplicate='ignore')
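
How these generators are wired together isn't shown in the examples. A minimal dispatch sketch, assuming the caller already has the object info/data and routes by type string (the type names and call order below are assumptions), might be:

# Hypothetical dispatch glue, not part of the examples above.
def _index_object(obj_ver_key: str, obj_data: dict) -> None:
    obj_type = obj_data['info'][2]  # e.g. 'KBaseGenomes.Genome-10.0'
    if obj_type.startswith('KBaseGenomes.Genome'):
        _generate_taxon_edge(obj_ver_key, obj_data)
        _generate_features(obj_ver_key, obj_data)
        _generate_GO_links(obj_ver_key, obj_data)
    elif obj_type.startswith('KBaseSets.SampleSet'):
        process_sample_set(obj_ver_key, obj_data)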