def getGenomeNeighborhoodsAndRoles(genomes, config):
    ''' Given a list of genomes, return the located features of those genomes ordered
        by contig and start position, together with the roles linked to each feature.

        @param genomes List of genomes (passed to genomesToPegs; presumably genome IDs)
        @param config Dictionary of configuration variables (the "cdmi_url" entry is used)
        @return List of (contig ID, feature ID, begin, dir) tuples sorted by contig ID and then
                by begin, dictionary keyed by feature ID of list of roles linked to the feature
    '''
    cdmi_entity = CDMI_EntityAPI(config["cdmi_url"])
    pegs = genomesToPegs(genomes)

    # Get contigs
    fidlocdict = cdmi_entity.get_relationship_IsLocatedIn(pegs, [], ["begin", "dir"], ["id"])
    fids = getFieldFromRelationship(fidlocdict, "from_link", "rel")
    begins = getFieldFromRelationship(fidlocdict, "begin", "rel")
    dirs = getFieldFromRelationship(fidlocdict, "dir", "rel")
    cids = getFieldFromRelationship(fidlocdict, "id", "to")

    # begin is converted to an integer so that the sort below is numeric.
    tuplist = [(cid, fid, int(begin), direction)
               for cid, fid, begin, direction in zip(cids, fids, begins, dirs)]

    # Sort by contig first, then by start location.
    tuplist.sort(key=operator.itemgetter(0, 2))

    # Now lets get the role for all of these IDs
    # Note that a single protein can have multiple roles.
    roledict = cdmi_entity.get_relationship_HasFunctional(fids, [], [], ["id"])
    fids = getFieldFromRelationship(roledict, "from_link", "rel")
    roles = getFieldFromRelationship(roledict, "id", "to")

    # Roles are appended in the order returned; repeated links are kept as-is.
    fidToRoles = {}
    for fid, role in zip(fids, roles):
        fidToRoles.setdefault(fid, []).append(role)

    return tuplist, fidToRoles
def fidsToRoles(fidlist, config):
    ''' Given a list of feature IDs return a dictionary from FID to the list of roles the encoding gene
        performs and a dictionary from roles to the FIDs performing them.

        The feature IDs are queried in sub-lists (initially 1000 at a time). When a query fails with
        an HTTPError the sub-list size is halved and the same position is retried.

        @param fidlist List of feature IDs
        @param config Dictionary of configuration variables (the "cdmi_url" entry is used)
        @return Dictionary keyed by feature ID of list of roles encoding gene performs, dictionary
                keyed by role of list of feature IDs performing the role
        @raise HTTPError if a query still fails once the sub-list size has been reduced to one
    '''
    cdmi_entity = CDMI_EntityAPI(config["cdmi_url"])

    # Break the complete list into smaller sub-lists to avoid timeouts
    start = 0
    increment = 1000
    total = len(fidlist)
    fidToRoleSet = {}
    roleToFidSet = {}
    while start < total:
        try:
            roledict = cdmi_entity.get_relationship_HasFunctional(fidlist[start:start + increment], [], [], ["id"])
        except HTTPError as e:
            # The sub-list cannot get any smaller, so retrying would never terminate.
            if increment <= 1:
                raise
            # Floor division keeps the increment an integer (it is used as a slice bound).
            increment = increment // 2
            sys.stderr.write("caught '%s' error, increment is now %d\n" %(e.reason, increment))
            continue

        flist = getFieldFromRelationship(roledict, "from_link", "rel")
        rolelist = getFieldFromRelationship(roledict, "id", "to")
        for fid, role in zip(flist, rolelist):
            # We have to use sets here because a bug(?) in get_relationship_HasFunctional allows multiple identical
            # links between fids and roles.
            # See for example what happens when you call it on g.9647.peg.2332
            fidToRoleSet.setdefault(fid, set()).add(role)
            roleToFidSet.setdefault(role, set()).add(fid)

        # Move to next sub-list
        start += increment

    # Convert back to lists to not break other functions.
    fidToRoleList = {}
    for fid in fidToRoleSet:
        fidToRoleList[fid] = list(fidToRoleSet[fid])
    roleToFidList = {}
    for role in roleToFidSet:
        roleToFidList[role] = list(roleToFidSet[role])
    return fidToRoleList, roleToFidList