def fidsToSequences(fidlist, config):
    ''' Given a list of feature IDs, returns a dictionary from FID to its amino acid sequence.

        The unique feature IDs are sent to the CDMI in batches. When a batch fails
        with an HTTPError the batch size is halved (never below 1) and the same
        position in the list is retried.

        @note Features with no amino acid sequence are discarded.
        @param fidlist List of feature IDs
        @param config Dictionary of configuration variables
        @return Dictionary keyed by feature ID of amino acid sequence for feature
    '''
    cdmi = CDMI_API(config["cdmi_url"])

    # Duplicate IDs would only inflate the requests.
    fidlist = list(set(fidlist))
    total = len(fidlist)

    start = 0
    increment = 5000
    seqs = {}
    while start < total:
        try:
            # Slicing past the end of the list is safe, so no explicit clamp is needed.
            ps = cdmi.fids_to_protein_sequences(fidlist[start:start + increment])
        except HTTPError as e:
            if increment > 1:
                # Floor division: the batch size is used as a slice bound and with
                # %d below, so it must stay an integer under true division as well.
                increment = increment // 2
            # NOTE(review): once the batch size reaches 1 a batch that keeps failing
            # is retried without limit -- confirm this is intended.
            sys.stderr.write("caught '%s' error, increment is now %d\n" %(e.reason, increment))
            continue
        seqs.update(ps)
        # Move to next sub-list
        start += increment
    return seqs
def genelist2fs(gl):
    """Build a feature-set dictionary for the IDs in ``gl``.

    The IDs are first translated by ids2cds(); each translated ID that
    get_entity_Feature returns a record for becomes one entry under
    ``"elements"``. When a record has no function, the function reported for
    the ID that cds2locus() maps it to is used instead, if there is one.

    :param gl: list of ID strings (also joined into the description text)
    :return: dict with the keys ``"description"`` and ``"elements"``
    """
    qid2cds = ids2cds(gl)
    feature_set = {
        "description": "Feature set generated by " + ",".join(gl),
        "elements": {},
    }

    entity_api = CDMI_EntityAPI(URLS.cdmi)
    cdmi_api = CDMI_API(URLS.cdmi)

    cds_ids = qid2cds.values()
    cds2l = cds2locus(cds_ids)
    locus_functions = cdmi_api.fids_to_functions(cds2l.values())
    wanted_fields = ['feature_type', 'source_id', 'sequence_length', 'function', 'alias']
    features = entity_api.get_entity_Feature(cds_ids, wanted_fields)

    elements = feature_set['elements']
    for cds_id in cds_ids:
        if cds_id not in features:
            continue
        record = features[cds_id]
        # Fall back to the function of the mapped ID when the feature has none.
        if not record['function'] and cds2l[cds_id] in locus_functions:
            record['function'] = locus_functions[cds2l[cds_id]]
        elements[cds_id] = {
            "data": {
                'type': record['feature_type'],
                'id': cds_id,
                'dna_sequence_length': int(record['sequence_length']),
                'function': record['function'],
                'aliases': record['alias'],
            }
        }
    return feature_set
def subsystemFids(count, config):
    ''' Query the CDMI for a list of feature IDs in the subsystems.

        Subsystem IDs are first collected page by page, then sent to
        subsystems_to_fids in batches. When a batch fails with an HTTPError the
        batch size is halved (never below 1) and the same position is retried.

        @param count Number of entities to retrieve in each function call
        @param config Dictionary of configuration variables
        @return List of subsystem feature IDs (unique, in no particular order)
    '''
    cdmi = CDMI_API(config["cdmi_url"])
    cdmi_entity = CDMI_EntityAPI(config["cdmi_url"])

    # Page through the Subsystem entities; a page shorter than count is the last one.
    ssdict = dict()
    start = 0
    while True:
        subdict = cdmi_entity.all_entities_Subsystem(start, count, ["id"])
        ssdict.update(subdict)
        start += count
        if len(subdict) < count:
            break
    ssids = getFieldFromEntity(ssdict, "id")
    sys.stderr.write('Found %d subsystems\n' %(len(ssids)))

    # Now lets get a list of FIDs within those subsystems
    # Break the complete list into smaller sub-lists to avoid timeouts
    total = len(ssids)
    start = 0
    increment = 10
    ssfids = set()  # a set, so the result is unique without a final pass
    while start < total:
        try:
            ssfiddict = cdmi.subsystems_to_fids(ssids[start:start + increment], [])
        except HTTPError as e:
            if increment > 1:
                # Floor division: the batch size is used as a slice bound and with
                # %d below, so it must stay an integer under true division as well.
                increment = increment // 2
            # NOTE(review): once the batch size reaches 1 a batch that keeps failing
            # is retried without limit -- confirm this is intended.
            sys.stderr.write("caught '%s' error, increment is now %d\n" %(e.reason, increment))
            continue
        for key in ssfiddict:
            for ssfid in ssfiddict[key]:
                for arr in ssfiddict[key][ssfid]:
                    # The feature IDs are taken from the second item of each entry.
                    if len(arr) > 1:
                        ssfids.update(arr[1])
        # Move to next sub-list
        start += increment
    return list(ssfids)
def ids2cds(ql):
    """Translate each ID in ``ql`` and return the mapping as a dictionary.

    IDs containing ``'kb|g.'`` are used as they are; all other IDs are first
    translated with ``source_ids_to_fids``. An ID whose (translated) form
    contains ``'locus'`` or ``'mRNA'`` is then replaced by the last entry that
    ``longest_cds_from_locus`` / ``longest_cds_from_mrna`` lists for it. Any ID
    matching none of these rules maps to itself or to its translated form.

    :param ql: iterable of ID strings (iterated twice)
    :return: dict keyed by the IDs of ``ql``
    :raises KeyError: if an ID-map result has no entry for a requested ID
    """
    cdmic = CDMI_API(URLS.cdmi)
    idm = IdMap(URLS.idmap)

    rd = {}
    eids = []
    lids = set()
    mids = set()
    for gid in ql:
        rd[gid] = gid  # default: the ID maps to itself
        if 'kb|g.' in gid:
            if 'locus' in gid:
                lids.add(gid)
            elif 'mRNA' in gid:
                mids.add(gid)
        else:
            eids.append(gid)

    sid2fids = cdmic.source_ids_to_fids(eids)
    for sid in sid2fids:
        for fid in sid2fids[sid]:
            # When several feature IDs are listed, the last one wins.
            rd[sid] = fid
            if 'locus' in fid:
                lids.add(fid)
            elif 'mRNA' in fid:
                mids.add(fid)

    lidmap = {}
    if lids:
        lidmap = idm.longest_cds_from_locus(list(lids))
    midmap = {}
    if mids:
        # The mRNA result belongs in midmap. It used to be assigned to lidmap,
        # which discarded the locus map and left midmap empty.
        midmap = idm.longest_cds_from_mrna(list(mids))

    for gid in ql:
        # kb IDs are looked up directly, external IDs through their translation.
        key = gid if 'kb|g.' in gid else rd[gid]
        if 'locus' in key:
            for k in lidmap[key]:
                rd[gid] = k
        elif 'mRNA' in key:
            for k in midmap[key]:
                rd[gid] = k
    return rd
def getGenomefeatures(ref,auth):
    """Write one tab-separated line per CDS feature of genome ``ref``.

    Only the first location of every feature is used. With ``loc`` being that
    location, the columns are: ``loc[0]``, ``loc[1]``,
    ``int(loc[1]) + int(loc[3])``, ``loc[2]`` and the feature ID.
    NOTE(review): presumably loc is (contig, begin, strand, length) -- confirm
    against the CDMI fids_to_locations documentation.

    :param ref: genome ID passed to genomes_to_fids
    :param auth: not used by this function
    :return: StringIO.StringIO buffer holding the generated lines
    """
    cdmic = CDMI_API(OTHERURLS.cdmi)
    fids_by_genome = cdmic.genomes_to_fids([ref], ['CDS'])
    output = StringIO.StringIO()
    for fids in fids_by_genome.values():
        locations = cdmic.fids_to_locations(fids)
        for fid in locations.keys():
            first = locations[fid][0]
            begin = first[1]
            columns = [first[0], str(begin), str(int(begin) + int(first[3])), first[2], fid]
            print >>output, "\t".join(columns)
    return output
def getOtuGenomeDictionary(count, config):
    ''' Obtain a dictionary from OTU representatives to all genomes in the OTU.

        Only the first element of the value returned by getOtuGenomeIds is
        passed on to the otu_members call.

        @param count Number of entities to retrieve in each function call
        @param config Dictionary of configuration variables
        @return Dictionary keyed by OTU representative of list of OTU members
    '''
    client = CDMI_API(config["cdmi_url"])
    otu_ids = getOtuGenomeIds(count, config)
    return client.otu_members(otu_ids[0])
def getDlitFids(count, config):
    ''' Query the CDMI for a list of feature IDs with direct literature evidence (dlits).

        The lookup goes from publications through the Concerns relationship to
        protein sequences and from there through IsProteinFor to feature IDs.

        @param count Number of entities to retrieve in each function call
        @param config Dictionary of configuration variables
        @return List of literature feature IDs
    '''
    cdmi = CDMI_API(config["cdmi_url"])
    cdmi_entity = CDMI_EntityAPI(config["cdmi_url"])

    # Fetch the publications page by page; a short page marks the end.
    publications = dict()
    offset = 0
    while True:
        page = cdmi_entity.all_entities_Publication(offset, count, ["id"])
        publications.update(page)
        offset += count
        if len(page) < count:
            break

    pubids = getFieldFromEntity(publications, "id")
    sys.stderr.write("Found %d publication IDs\n" %(len(pubids)))

    concerns = cdmi_entity.get_relationship_Concerns(pubids, [], [], ["id"])
    pubseqs = getFieldFromRelationship(concerns, "id", "to")
    sys.stderr.write("Found %d protein sequences from publications\n" %(len(pubseqs)))

    protein_for = cdmi_entity.get_relationship_IsProteinFor(pubseqs, [], [], ["id"])
    return getFieldFromRelationship(protein_for, "id", "to")
def fidsToRoles(fidlist, config):
    ''' Given a list of feature IDs return a dictionary from FID to the list of roles the encoding gene
        performs and a dictionary from roles to the FIDs performing them.

        The feature IDs are sent to the CDMI in batches. When a batch fails with
        an HTTPError the batch size is halved (never below 1) and the same
        position in the list is retried.

        @param fidlist List of feature IDs
        @param config Dictionary of configuration variables
        @return Dictionary keyed by feature ID of list of roles encoding gene performs, dictionary
                keyed by role of list of feature IDs performing the role
    '''
    cdmi = CDMI_API(config["cdmi_url"])
    cdmi_entity = CDMI_EntityAPI(config["cdmi_url"])

    # Break the complete list into smaller sub-lists to avoid timeouts
    total = len(fidlist)
    start = 0
    increment = 1000
    fid2roles = {}
    role2fids = {}
    while start < total:
        try:
            roledict = cdmi_entity.get_relationship_HasFunctional(
                fidlist[start:start + increment], [], [], ["id"])
        except HTTPError as e:
            if increment > 1:
                # Floor division: the batch size is used as a slice bound and with
                # %d below, so it must stay an integer under true division as well.
                increment = increment // 2
            # NOTE(review): once the batch size reaches 1 a batch that keeps failing
            # is retried without limit -- confirm this is intended.
            sys.stderr.write("caught '%s' error, increment is now %d\n" %(e.reason, increment))
            continue
        # Both lists are extracted from the same relationship result and are
        # paired up position by position.
        flist = getFieldFromRelationship(roledict, "from_link", "rel")
        rolelist = getFieldFromRelationship(roledict, "id", "to")
        for fid, role in zip(flist, rolelist):
            # We have to use sets here because a bug(?) in get_relationship_HasFunctional allows
            # multiple identical links between fids and roles.
            # See for example what happens when you call it on g.9647.peg.2332
            fid2roles.setdefault(fid, set()).add(role)
            role2fids.setdefault(role, set()).add(fid)
        # Move to next sub-list
        start += increment

    # Convert back to lists to not break other functions.
    for fid in fid2roles:
        fid2roles[fid] = list(fid2roles[fid])
    for role in role2fids:
        role2fids[role] = list(role2fids[role])
    return fid2roles, role2fids
def genelist2fs(gl):
    """Build a feature-set dictionary for the IDs in ``gl``.

    The IDs are first translated by ids2cds(); each translated ID that
    get_entity_Feature returns a record for becomes one entry under
    ``"elements"``. When a record has no function, the function reported for
    the ID that cds2locus() maps it to is used instead, if there is one.

    :param gl: list of ID strings (also joined into the description text)
    :return: dict with the keys ``"description"`` and ``"elements"``
    """
    qid2cds = ids2cds(gl)
    feature_set = {
        "description": "Feature set generated by " + ",".join(gl),
        "elements": {},
    }

    entity_api = CDMI_EntityAPI(URLS.cdmi)
    cdmi_api = CDMI_API(URLS.cdmi)

    cds_ids = qid2cds.values()
    cds2l = cds2locus(cds_ids)
    locus_functions = cdmi_api.fids_to_functions(cds2l.values())
    wanted_fields = ['feature_type', 'source_id', 'sequence_length', 'function', 'alias']
    features = entity_api.get_entity_Feature(cds_ids, wanted_fields)

    elements = feature_set['elements']
    for cds_id in cds_ids:
        if cds_id not in features:
            continue
        record = features[cds_id]
        # Fall back to the function of the mapped ID when the feature has none.
        if not record['function'] and cds2l[cds_id] in locus_functions:
            record['function'] = locus_functions[cds2l[cds_id]]
        elements[cds_id] = {
            "data": {
                'type': record['feature_type'],
                'id': cds_id,
                'dna_sequence_length': int(record['sequence_length']),
                'function': record['function'],
                'aliases': record['alias'],
            }
        }
    return feature_set
def go_anno_net(meth, net_obj_id=None):
    """Add Gene Ontology annotation to network gene nodes

    :param net_obj_id: Network object id
    :type net_obj_id: kbtypes.KBaseNetworks.Network
    :return: Workspace id
    :rtype: kbtypes.Unicode
    :output_widget: ValueListWidget
    """
    # NOTE(review): the docstring fields above look machine-readable, so their
    # wording is left untouched.
    meth.stages = 5
    meth.advance("Prepare annotation service")

    # The network is loaded from, and saved back to, the current workspace.
    source_ws = meth.workspace_id
    target_ws = meth.workspace_id

    meth.advance("Load network object")
    workspace = Workspace2(token=meth.token, wsid=source_ws)
    ontology = Ontology(url=URLS.ontology)
    network = workspace.get(net_obj_id)
    node_index = Node(network['nodes'], network['edges'])

    id_server = IDServerAPI(URLS.ids)
    cdmi = CDMI_API(URLS.cdmi)
    cdmi_entity = CDMI_EntityAPI(URLS.cdmi)  # created but not used below

    def _is_gene_id(node_id):
        # CDS and locus IDs always qualify; any other ID qualifies unless it
        # carries one of the cluster / 'ps.' markers.
        if 'CDS' in node_id or 'locus' in node_id:
            return True
        return not ('clst' in node_id
                    or node_id.startswith('cluster')
                    or 'ps.' in node_id)

    gids = [node_id for node_id in sorted(node_index.ugids.keys()) if _is_gene_id(node_id)]

    meth.advance("Get relationships from central data model")
    # External IDs are fetched through kb_id2ext_id (third argument 100).
    eids = kb_id2ext_id(id_server, gids, 100)
    gids2cds = ids2cds(gids)
    cds_ids = gids2cds.values()
    cds2l = cds2locus(cds_ids)
    # The IDs the CDS features map to; original locus ids in gids are ignored.
    locus_ids = cds2l.values()

    meth.advance("Annotate ({:d} nodes, {:d} edges)".format(
        len(network['nodes']), len(network['edges'])))
    # GO terms are requested with empty domain and evidence-code lists.
    go_terms = ontology.get_goidlist(cds_ids, [], [])
    go_annotation = ()  # the get_go_annotation lookup is disabled
    locus_functions = cdmi.fids_to_functions(locus_ids)
    cds_functions = cdmi.fids_to_functions(cds_ids)
    annotate_nodes(network,
                   ots=go_terms,
                   oan=go_annotation,
                   funcs=locus_functions,
                   funcs_org=cds_functions,
                   eids=eids,
                   gids2cds=gids2cds,
                   cds2l=cds2l)

    meth.advance("Save annotated object to workspace {}".format(target_ws))
    annotated_name = net_obj_id + ".ano"
    annotated = {
        'type': 'KBaseNetworks.Network',
        'data': network,
        'name': annotated_name,
        'meta': {'original': net_obj_id},
    }
    workspace.save_objects({'workspace': target_ws, 'objects': [annotated]})

    return _workspace_output(annotated_name)
def featureset_go_anal(meth, feature_set_id=None, p_value=0.05, ec='IEA', domain='biological_process', out_id=None):
    """This method annotate GO terms and execute GO enrichment test

    :param feature_set_id: FeatureSet workspace object id
    :type feature_set_id: kbtypes.KBaseSearch.FeatureSet
    :param p_value: p-value cutoff
    :type p_value: kbtypes.Unicode
    :param ec: Evidence code list (comma separated, IEA,ISS,IDA,IEP,IPI,RCA ..)
    :type ec:kbtypes.Unicode
    :param domain: Domain list (comma separated, biological_process,molecular_function,cellular_component)
    :type domain: kbtypes.Unicode
    :param out_id: Output FeatureSet object identifier
    :type out_id: kbtypes.KBaseSearch.FeatureSet
    :return: New workspace object
    :rtype: kbtypes.Unicode
    :output_widget: GeneTableWidget
    """
    # NOTE(review): the docstring fields above look machine-readable, so their
    # wording is left untouched.
    #
    # Summary: the feature set is annotated in place with GO metadata and saved
    # as out_id; one additional FeatureSet named out_id + "_to_" + <GO id> is
    # saved for every enriched GO term within the p-value cutoff. The returned
    # JSON string holds a table with one row per such term.
    meth.stages = 4
    meth.advance("Prepare Enrichment Test")

    oc = Ontology(url=URLS.ontology)
    ws = Workspace2(token=meth.token, wsid=meth.workspace_id)
    fs = ws.get(feature_set_id)
    elements = fs['elements']
    qid2cds = ids2cds(elements.keys())
    cds2l = cds2locus(qid2cds.values())
    cdmic = CDMI_API(URLS.cdmi)
    lfunc = cdmic.fids_to_functions(cds2l.values())

    meth.advance("Annotate GO Term")
    # Blanks are dropped so that "IEA, ISS" and "IEA,ISS" are equivalent.
    ec = ec.replace(" ", "")
    domain = domain.replace(" ", "")
    ec_list = ec.split(',')
    domain_list = domain.split(',')
    ots = oc.get_goidlist(list(set(qid2cds.values())), domain_list, ec_list)

    go2cds = {}  # GO id -> set of translated feature IDs annotated with it
    for gid in elements:
        lid = qid2cds[gid]
        element = elements[gid]
        # Fill in a missing function from the ID that cds2locus maps to.
        if 'data' in element and not element['data']['function']:
            element['data']['function'] = lfunc[cds2l[lid]]
        metadata = element.setdefault('metadata', {})
        if lid not in ots:
            continue
        for go in ots[lid]:
            go2cds.setdefault(go, set()).add(lid)
            for i, goen in enumerate(ots[lid][go]):
                for ext in ("domain", "ec", "desc"):
                    metadata["go.{}.{:d}.{}".format(go, i, ext)] = goen[ext]

    meth.advance("Execute Enrichment Test")
    enr_list = oc.get_go_enrichment(list(set(qid2cds.values())), domain_list, ec_list,
                                    'hypergeometric', 'GO')
    enr_list = sorted(enr_list, key=itemgetter('pvalue'))
    header = ["GO ID", "Description", "Domain", "p-value", "FeatureSet ID (# genes)"]
    fields = []
    objects = []
    go_enr_smry = ""
    for i, goen in enumerate(enr_list):
        if goen['pvalue'] > float(p_value):
            continue
        members = go2cds[goen['goID']]
        cfs = genelist2fs(list(members))
        goid = goen['goID'].replace(":", "")
        fields.append([
            goen['goID'],
            goen['goDesc'][0],
            goen['goDesc'][1],
            "{:12.10f}".format(goen['pvalue']),
            "{}_to_{} ({})".format(out_id, goid, len(members)),
        ])
        objects.append({
            'type': 'KBaseSearch.FeatureSet',
            'data': cfs,
            'name': out_id + "_to_" + goid,
            'meta': {
                'original': feature_set_id,
                'domain': domain,
                'ec': ec,
                'GO_ID': goen['goID'],
            },
        })
        # Only the three best-ranked terms can enter the summary text.
        if i < 3:
            go_enr_smry += goen['goID'] + "(" + "{:6.4f}".format(goen['pvalue']) + ")" + goen['goDesc'][0] + "\n"

    data = {'table': [header] + fields}
    meth.advance("Saving output to Workspace")

    objects.append({
        'type': 'KBaseSearch.FeatureSet',
        'data': fs,
        'name': out_id,
        'meta': {'original': feature_set_id, 'enr_summary': go_enr_smry},
    })
    ws.save_objects({'workspace': meth.workspace_id, 'objects': objects})
    return json.dumps(data)
def go_anno_net(meth, net_obj_id=None):
    """Add Gene Ontology annotation to network gene nodes

    :param net_obj_id: Network object id
    :type net_obj_id: kbtypes.KBaseNetworks.Network
    :return: Workspace id
    :rtype: kbtypes.Unicode
    :output_widget: ValueListWidget
    """
    # NOTE(review): the docstring fields above look machine-readable, so their
    # wording is left untouched.
    meth.stages = 5
    meth.advance("Prepare annotation service")

    # The network is loaded from, and saved back to, the current workspace.
    source_ws = meth.workspace_id
    target_ws = meth.workspace_id

    meth.advance("Load network object")
    workspace = Workspace2(token=meth.token, wsid=source_ws)
    ontology = Ontology(url=URLS.ontology)
    network = workspace.get(net_obj_id)
    node_index = Node(network['nodes'], network['edges'])

    id_server = IDServerAPI(URLS.ids)
    cdmi = CDMI_API(URLS.cdmi)
    cdmi_entity = CDMI_EntityAPI(URLS.cdmi)  # created but not used below

    def _is_gene_id(node_id):
        # CDS and locus IDs always qualify; any other ID qualifies unless it
        # carries one of the cluster / 'ps.' markers.
        if 'CDS' in node_id or 'locus' in node_id:
            return True
        return not ('clst' in node_id
                    or node_id.startswith('cluster')
                    or 'ps.' in node_id)

    gids = [node_id for node_id in sorted(node_index.ugids.keys()) if _is_gene_id(node_id)]

    meth.advance("Get relationships from central data model")
    # External IDs are fetched through kb_id2ext_id (third argument 100).
    eids = kb_id2ext_id(id_server, gids, 100)
    gids2cds = ids2cds(gids)
    cds_ids = gids2cds.values()
    cds2l = cds2locus(cds_ids)
    # The IDs the CDS features map to; original locus ids in gids are ignored.
    locus_ids = cds2l.values()

    meth.advance("Annotate ({:d} nodes, {:d} edges)".format(
        len(network['nodes']), len(network['edges'])))
    # GO terms are requested with empty domain and evidence-code lists.
    go_terms = ontology.get_goidlist(cds_ids, [], [])
    go_annotation = ()  # the get_go_annotation lookup is disabled
    locus_functions = cdmi.fids_to_functions(locus_ids)
    cds_functions = cdmi.fids_to_functions(cds_ids)
    annotate_nodes(network,
                   ots=go_terms,
                   oan=go_annotation,
                   funcs=locus_functions,
                   funcs_org=cds_functions,
                   eids=eids,
                   gids2cds=gids2cds,
                   cds2l=cds2l)

    meth.advance("Save annotated object to workspace {}".format(target_ws))
    annotated_name = net_obj_id + ".ano"
    annotated = {
        'type': 'KBaseNetworks.Network',
        'data': network,
        'name': annotated_name,
        'meta': {'original': net_obj_id},
    }
    workspace.save_objects({'workspace': target_ws, 'objects': [annotated]})

    return _workspace_output(annotated_name)
def featureset_go_anal(meth, feature_set_id=None, p_value=0.05, ec='IEA', domain='biological_process', out_id=None):
    """This method annotate GO terms and execute GO enrichment test

    :param feature_set_id: FeatureSet workspace object id
    :type feature_set_id: kbtypes.KBaseSearch.FeatureSet
    :param p_value: p-value cutoff
    :type p_value: kbtypes.Unicode
    :param ec: Evidence code list (comma separated, IEA,ISS,IDA,IEP,IPI,RCA ..)
    :type ec:kbtypes.Unicode
    :param domain: Domain list (comma separated, biological_process,molecular_function,cellular_component)
    :type domain: kbtypes.Unicode
    :param out_id: Output FeatureSet object identifier
    :type out_id: kbtypes.KBaseSearch.FeatureSet
    :return: New workspace object
    :rtype: kbtypes.Unicode
    :output_widget: GeneTableWidget
    """
    # NOTE(review): the docstring fields above look machine-readable, so their
    # wording is left untouched.
    #
    # Summary: the feature set is annotated in place with GO metadata and saved
    # as out_id; one additional FeatureSet named out_id + "_to_" + <GO id> is
    # saved for every enriched GO term within the p-value cutoff. The returned
    # JSON string holds a table with one row per such term.
    meth.stages = 4
    meth.advance("Prepare Enrichment Test")

    oc = Ontology(url=URLS.ontology)
    ws = Workspace2(token=meth.token, wsid=meth.workspace_id)
    fs = ws.get(feature_set_id)
    elements = fs['elements']
    qid2cds = ids2cds(elements.keys())
    cds2l = cds2locus(qid2cds.values())
    cdmic = CDMI_API(URLS.cdmi)
    lfunc = cdmic.fids_to_functions(cds2l.values())

    meth.advance("Annotate GO Term")
    # Blanks are dropped so that "IEA, ISS" and "IEA,ISS" are equivalent.
    ec = ec.replace(" ", "")
    domain = domain.replace(" ", "")
    ec_list = ec.split(',')
    domain_list = domain.split(',')
    ots = oc.get_goidlist(list(set(qid2cds.values())), domain_list, ec_list)

    go2cds = {}  # GO id -> set of translated feature IDs annotated with it
    for gid in elements:
        lid = qid2cds[gid]
        element = elements[gid]
        # Fill in a missing function from the ID that cds2locus maps to.
        if 'data' in element and not element['data']['function']:
            element['data']['function'] = lfunc[cds2l[lid]]
        metadata = element.setdefault('metadata', {})
        if lid not in ots:
            continue
        for go in ots[lid]:
            go2cds.setdefault(go, set()).add(lid)
            for i, goen in enumerate(ots[lid][go]):
                for ext in ("domain", "ec", "desc"):
                    metadata["go.{}.{:d}.{}".format(go, i, ext)] = goen[ext]

    meth.advance("Execute Enrichment Test")
    enr_list = oc.get_go_enrichment(list(set(qid2cds.values())), domain_list, ec_list,
                                    'hypergeometric', 'GO')
    enr_list = sorted(enr_list, key=itemgetter('pvalue'))
    header = ["GO ID", "Description", "Domain", "p-value", "FeatureSet ID (# genes)"]
    fields = []
    objects = []
    go_enr_smry = ""
    for i, goen in enumerate(enr_list):
        if goen['pvalue'] > float(p_value):
            continue
        members = go2cds[goen['goID']]
        cfs = genelist2fs(list(members))
        goid = goen['goID'].replace(":", "")
        fields.append([
            goen['goID'],
            goen['goDesc'][0],
            goen['goDesc'][1],
            "{:12.10f}".format(goen['pvalue']),
            "{}_to_{} ({})".format(out_id, goid, len(members)),
        ])
        objects.append({
            'type': 'KBaseSearch.FeatureSet',
            'data': cfs,
            'name': out_id + "_to_" + goid,
            'meta': {
                'original': feature_set_id,
                'domain': domain,
                'ec': ec,
                'GO_ID': goen['goID'],
            },
        })
        # Only the three best-ranked terms can enter the summary text.
        if i < 3:
            go_enr_smry += goen['goID'] + "(" + "{:6.4f}".format(goen['pvalue']) + ")" + goen['goDesc'][0] + "\n"

    data = {'table': [header] + fields}
    meth.advance("Saving output to Workspace")

    objects.append({
        'type': 'KBaseSearch.FeatureSet',
        'data': fs,
        'name': out_id,
        'meta': {'original': feature_set_id, 'enr_summary': go_enr_smry},
    })
    ws.save_objects({'workspace': meth.workspace_id, 'objects': objects})
    return json.dumps(data)