Ejemplo n.º 1
0
def filterFidsByOtus(fidlist, otus, config):
    '''
    Obsolete (believed to be unused).

    Keep only the feature IDs (FIDs) whose owning organism is one of the
    given representative organism IDs (OTUs).

    @param fidlist List of feature IDs to filter
    @param otus Collection of representative organism IDs (OTUs)
    @param config Dictionary of configuration variables (reads "cdmi_url")
    @return List of feature IDs whose owning organism is found in otus
    '''

    entityApi = CDMI_EntityAPI(config["cdmi_url"])

    # Ask the CDMI which organism owns each feature ID. A feature ID for which
    # no organism is found is not wanted in the result anyway.
    ownership = entityApi.get_relationship_IsOwnedBy(fidlist, [], [], ["id"])
    ownedFids = getFieldFromRelationship(ownership, "from_link", "rel")
    owners = getFieldFromRelationship(ownership, "id", "to")

    # The two lists are walked in parallel by position: entry N of ownedFids
    # is kept when entry N of owners is one of the OTUs.
    return [ownedFids[position]
            for position, owner in enumerate(owners)
            if owner in otus]
Ejemplo n.º 2
0
def filterFidsByOtusOptimized(featureIdList, rolesToFids, otuRepsToMembers, config):
    ''' Filter feature IDs by OTU (optimized version).

        To minimize the amount of redundancy in the list of target proteins, filter
        the feature IDs so there is at most one protein from each OTU for each
        functional role.

        For each OTU and each role, a feature ID owned by the OTU representative
        is preferred. If there is none, the last feature ID in the role's list
        that is owned by a member of the OTU is kept instead.

        @param featureIdList List of unfiltered feature IDs
        @param rolesToFids Dictionary keyed by role of list of feature IDs
        @param otuRepsToMembers Dictionary keyed by OTU representative to list of OTU members
        @param config Dictionary of configuration variables (reads "cdmi_url")
        @return Dictionary keyed by feature ID of list of roles, dictionary keyed by role
            of list of feature IDs. Feature IDs and roles with nothing kept are omitted.
    '''

    cdmi_entity = CDMI_EntityAPI(config["cdmi_url"])

    # Identify the organism belonging to each feature ID.
    # If this fails to find an organism we don't want it anyway...
    fidToOrganism = dict() # Map feature IDs to organisms

    # Break the complete list into smaller sub-lists to avoid timeouts.
    numFeatures = len(featureIdList)
    start = 0
    increment = 100000
    while start < numFeatures:
        try:
            ownedBy = cdmi_entity.get_relationship_IsOwnedBy(
                featureIdList[start:start + increment], [], ['from_link'], ['id'])
        except HTTPError as e:
            # Retry the same starting point with a sub-list half the size.
            # Floor division keeps the increment an integer, so it stays usable
            # as a slice bound (true division would make it a float).
            # NOTE(review): once the increment reaches 1, a persistent error is
            # retried indefinitely.
            if increment > 1:
                increment = increment // 2
            sys.stderr.write("caught '%s' error, increment is now %d\n" %(e.reason, increment))
            continue

        # Extract the feature ID from from_link and the organism from id,
        # and record the pairs in the dictionary.
        fidList = getFieldFromRelationship(ownedBy, "from_link", "rel")
        organismList = getFieldFromRelationship(ownedBy, "id", "to")
        fidToOrganism.update(zip(fidList, organismList))

        start += increment

    # Add all possible keys to the dictionaries and initialize the value.
    # Then we don't have to check if the key exists in the main loop below.
    keptFidsToRoles = dict()
    for fid in featureIdList:
        keptFidsToRoles[fid] = list()
    keptRolesToFids = dict()
    for role in rolesToFids:
        keptRolesToFids[role] = list()

    # Find the feature ID (protein) from each OTU for each functional role.
    otuCounter = 0
    for otuRepresentative in otuRepsToMembers:
        # This loop takes a very long time so print a message every so often
        # to track progress.
        otuCounter += 1
        if otuCounter % 10 == 0:
            sys.stderr.write('Processed %d OTUs at %s\n' %(otuCounter, now()))

        # Look up the members once per OTU instead of once per feature ID.
        otuMembers = otuRepsToMembers[otuRepresentative]

        # Check every functional role.
        for role in rolesToFids:
            keepFid = None
            for fid in rolesToFids[role]:
                # This can happen due to MOL issues
                if fid not in fidToOrganism:
                    continue
                organism = fidToOrganism[fid]

                # If the organism is the representative we keep it and go to the next role
                if organism == otuRepresentative:
                    keepFid = fid
                    break

                # Otherwise remember a feature ID from another member of the OTU (whichever
                # one is seen last) in case there are no examples of the role in the
                # representative organism, but keep looking for the representative.
                if organism in otuMembers:
                    keepFid = fid

            # Add to the dictionaries if we are keeping the feature ID.
            if keepFid is not None:
                keptFidsToRoles[keepFid].append(role)
                keptRolesToFids[role].append(keepFid)

    # Look for any empty lists and remove them. The keys are collected first
    # because a dictionary cannot be resized while it is being iterated.
    emptyFids = [fid for fid in keptFidsToRoles if len(keptFidsToRoles[fid]) == 0]
    for fid in emptyFids:
        del keptFidsToRoles[fid]
    emptyRoles = [role for role in keptRolesToFids if len(keptRolesToFids[role]) == 0]
    for role in emptyRoles:
        del keptRolesToFids[role]

    return keptFidsToRoles, keptRolesToFids
Ejemplo n.º 3
0
def filterFidsByOtusBetter(fidsToRoles, rolesToFids, oturepsToMembers, config):
    '''Attempt to do a more intelligent filtering of FIDs by OTU.

    Given all FIDs attached to a role in the unfiltered set we do the following:

    Initialize KEEP
    For each OTU and each role:
       If role is found in the representative, add to KEEP and continue;
       Otherwise, iterate over other genomes.
           If role is found in one other genome, add to KEEP and continue;

    This process should make our calculation less sensitive to the choice of OTUs...

    @param fidsToRoles Dictionary keyed by feature ID (only the keys are used here)
    @param rolesToFids Dictionary keyed by role of list of feature IDs
    @param oturepsToMembers Dictionary keyed by OTU representative to list of OTU members
    @param config Dictionary of configuration variables (reads "cdmi_url")
    @return Dictionary keyed by feature ID of list of roles, dictionary keyed by role
        of list of feature IDs, and list of roles for which no feature ID was kept
    '''

    cdmi_entity = CDMI_EntityAPI(config["cdmi_url"])

    # Identify the organism belonging to each fid
    # If this fails to find an organism we don't want it anyway...
    # A real list is needed because the feature IDs are sliced below
    # (a dictionary key view cannot be sliced).
    allFids = list(fidsToRoles)
    orgdict = []

    # Break the complete list into smaller sub-lists to avoid timeouts
    # NOTE(review): filterFidsByOtusOptimized requests ['from_link'] as the
    # relationship fields while this call requests none, yet "from_link" is
    # read from the result below - confirm the empty list is intended.
    start = 0
    increment = 5000
    while start < len(allFids):
        try:
            od = cdmi_entity.get_relationship_IsOwnedBy(
                allFids[start:start + increment], [], [], ["id"])
        except HTTPError as e:
            # Retry the same starting point with a sub-list half the size.
            # Floor division keeps the increment an integer, so it stays usable
            # as a slice bound (true division would make it a float).
            # NOTE(review): once the increment reaches 1, a persistent error is
            # retried indefinitely.
            if increment > 1:
                increment = increment // 2
            sys.stderr.write("caught '%s' error, increment is now %d\n" %(e.reason, increment))
            continue
        orgdict.extend(od)
        start += increment

    # Map each feature ID to the organism that owns it.
    ownedFids = getFieldFromRelationship(orgdict, "from_link", "rel")
    orglist = getFieldFromRelationship(orgdict, "id", "to")
    fidToOrg = dict(zip(ownedFids, orglist))

    keptFidsToRoles = {}
    keptRolesToFids = {}

    # For each OTU
    for oturep in oturepsToMembers:
        # Look up the members once per OTU instead of once per feature ID.
        members = oturepsToMembers[oturep]
        # for each role
        for role in rolesToFids:
            keepFid = None
            for fid in rolesToFids[role]:
                # This can happen due to MOL issues
                if fid not in fidToOrg:
                    continue
                org = fidToOrg[fid]
                # If the organism is the representative we keep it and go to the next role
                if org == oturep:
                    keepFid = fid
                    break
                # Otherwise remember a feature ID from another member of the OTU (whichever
                # one is seen last) in case there are no examples of the role in the
                # representative organism, but keep looking for the representative.
                if org in members:
                    keepFid = fid
            if keepFid is not None:
                keptFidsToRoles.setdefault(keepFid, []).append(role)
                keptRolesToFids.setdefault(role, []).append(keepFid)

    # If the OTUs are comprehensive this should be empty.
    missingRoles = list(set(rolesToFids) - set(keptRolesToFids))

    return keptFidsToRoles, keptRolesToFids, missingRoles