def writeStatusFile(self, status):
    ''' Write new status value to the status file.

        The file named by self.StatusFiles['status_file'] is overwritten with
        two lines: the status value, then "updated at" followed by the value
        returned by now().

        @param status New status value
        @return Nothing
    '''

    # Use a context manager so the file is closed even if formatting or
    # writing the status raises an exception.
    with open(self.StatusFiles['status_file'], 'w') as fid:
        fid.write("%s\nupdated at %s\n" %(status, now()))
    def writeStatusFile(self, status):
        ''' Write new status value to the status file.

            The file named by self.StatusFiles['status_file'] is overwritten
            with two lines: the status value, then "updated at" followed by
            the value returned by now().

            @param status New status value
            @return Nothing
        '''

        # Use a context manager so the file is closed even if formatting or
        # writing the status raises an exception.
        with open(self.StatusFiles['status_file'], 'w') as fid:
            fid.write("%s\nupdated at %s\n" % (status, now()))
Esempio n. 3
0
def filterFidsByOtusOptimized(featureIdList, rolesToFids, otuRepsToMembers, config):
    ''' Filter feature IDs by OTU (optimized version).

        To minimize the amount of redundancy in the list of target proteins, filter
        the feature IDs so there is at most one protein from each OTU for each
        functional role.

        For every (OTU, role) pair a feature ID owned by the OTU representative is
        preferred. If the representative has none, the last feature ID in the
        role's list that is owned by a member of the OTU is used instead.
        Progress messages are written to stderr.

        @param featureIdList List of unfiltered feature IDs
        @param rolesToFids Dictionary keyed by role of list of feature IDs
        @param otuRepsToMembers Dictionary keyed by OTU representative to list of OTU members
        @param config Dictionary of configuration variables (the "cdmi_url" key is used)
        @return Dictionary keyed by feature ID of list of roles, dictionary keyed by role
            of list of feature IDs (entries with empty lists are omitted from both)
    '''

    cdmi_entity = CDMI_EntityAPI(config["cdmi_url"])

    # Identify the organism belonging to each feature ID.
    # If this fails to find an organism we don't want it anyway...
    fidToOrganism = dict() # Map feature IDs to organisms

    # Break the complete list into smaller sub-lists to avoid timeouts.
    # 'start' is the offset of the first feature ID not yet processed.
    numFids = len(featureIdList)
    start = 0
    increment = 100000
    while start < numFids:
        try:
            ownedBy = cdmi_entity.get_relationship_IsOwnedBy(featureIdList[start:start + increment], [], ['from_link'], ['id'])
        except HTTPError as e:
            # Retry the same offset with a smaller sub-list. Floor division keeps
            # the increment an integer (true division would produce a float on
            # Python 3, which is not a valid slice index).
            # NOTE(review): once the increment reaches 1 a persistent error is
            # retried indefinitely -- confirm that is the intended behavior.
            if increment > 1:
                increment = increment // 2
            sys.stderr.write("caught '%s' error, increment is now %d\n" %(e.reason, increment))
            continue

        # Pair up the feature ID taken from each relationship's from_link with the
        # organism taken from its id (presumably the two lists are parallel and of
        # equal length -- verify against getFieldFromRelationship).
        fidList = getFieldFromRelationship(ownedBy, "from_link", "rel")
        organismList = getFieldFromRelationship(ownedBy, "id", "to")
        fidToOrganism.update(zip(fidList, organismList))

        start += increment

    # Add all possible keys to the dictionaries and initialize the value.
    # Then we don't have to check if the key exists in the main loop below.
    keptFidsToRoles = {fid: list() for fid in featureIdList}
    keptRolesToFids = {role: list() for role in rolesToFids}

    # Find the feature ID (protein) from each OTU for each functional role.
    for otuCounter, otuRepresentative in enumerate(otuRepsToMembers, 1):
        # This loop takes a very long time so print a message every so often
        # to track progress.
        if otuCounter % 10 == 0:
            sys.stderr.write('Processed %d OTUs at %s\n' %(otuCounter, now()))

        # The member list is the same for every role, so look it up only once.
        otuMembers = otuRepsToMembers[otuRepresentative]

        # Check every functional role.
        for role in rolesToFids:
            keepFid = None
            for fid in rolesToFids[role]:
                # This can happen due to MOL issues
                if fid not in fidToOrganism:
                    continue
                organism = fidToOrganism[fid]

                # If the organism is the representative we keep it and go to the next role
                if organism == otuRepresentative:
                    keepFid = fid
                    break

                # Otherwise remember a feature ID from another member of the OTU in
                # case the representative has no example of the role, but keep
                # looking (no attention is paid to WHICH member is picked; the last
                # match in the list wins).
                if organism in otuMembers:
                    keepFid = fid

            # Add to the dictionaries if we are keeping the feature ID.
            if keepFid is not None:
                keptFidsToRoles[keepFid].append(role)
                keptRolesToFids[role].append(keepFid)

    # Drop the entries that never received a value.
    keptFidsToRoles = {fid: roles for fid, roles in keptFidsToRoles.items() if roles}
    keptRolesToFids = {role: fids for role, fids in keptRolesToFids.items() if fids}

    return keptFidsToRoles, keptRolesToFids
def generate_data(dataParser, config, force):
    ''' Generate the static database files.

        Runs the build steps in order: OTU genome IDs, subsystem feature IDs,
        literature-supported feature IDs, the merged feature ID list, feature ID
        to role mappings, OTU-filtered feature IDs, the FASTA file and search
        database, complex to role mappings, and reaction to complex mappings.
        Each result is saved through dataParser and progress is reported on stderr.

        @param dataParser Object providing the DataFiles dictionary and the write methods
        @param config Dictionary of configuration variables
        @param force When true, remove all existing static database files first
        @return Nothing
    '''

    def log(message):
        # All progress output goes to stderr.
        sys.stderr.write(message)

    # A forced rebuild starts from a clean slate.
    if force:
        log("Removing all static database files...")
        for staleFile in dataParser.DataFiles.values():
            safeRemove(staleFile)
        log("done\n")

    log("Generating static database files in '%s'...\n" % config["data_folder_path"])
    log("Central data model server is at %s\n\n" % config['cdmi_url'])

    # Step 1: representative OTU genome IDs.
    log("Getting list of representative OTU genome IDs at %s\n" % now())
    log("Saving list to file '%s'\nDownloading from cdmi server...\n" % dataParser.DataFiles['otu_id_file'])
    otuIds, prokaryoteOtuIds = getOtuGenomeIds(1000, config) # Data build V2 size is 1274
    dataParser.writeOtuData(otuIds, prokaryoteOtuIds)
    log("Found %d OTU genome IDs of which %d are from prokaryotes\nDone at %s\n\n" % (len(otuIds), len(prokaryoteOtuIds), now()))
    del otuIds, prokaryoteOtuIds

    # Step 2: subsystem feature IDs (FIDs). The functional annotations in the
    # SEED subsystems are manually curated from multiple sources of information.
    log("Getting list of subsystem feature IDs at %s\n" % now())
    log("Saving list to file '%s'\nDownloading from cdmi server...\n" % dataParser.DataFiles['subsystem_fid_file'])
    subsystemFidList = subsystemFids(1000, config) # Data build V2 size is 2057
    dataParser.writeSubsystemFids(subsystemFidList)
    log("Found %d subsystem feature IDs\nDone at %s\n\n" % (len(subsystemFidList), now()))

    # Step 3: direct literature-supported feature IDs. Including them greatly
    # expands the number of roles for which there are representatives.
    log("Getting list of direct literature-supported feature IDs at %s\n" % now())
    log("Saving list to file '%s'\nDownloading from cdmi server...\n" % dataParser.DataFiles['dlit_fid_file'])
    dlitFidList = getDlitFids(5000, config) # Data build V2 size is 12469
    dataParser.writeDlitFids(dlitFidList)
    log("Found %d literature-supported feature IDs\nDone at %s\n\n" % (len(dlitFidList), now()))

    # Step 4: merge the two lists (dropping duplicates) BEFORE filtering.
    # Merging after filtering is possible too, but could introduce the same
    # kinds of biases as not filtering the subsystems at all.
    log("Merging lists of subsystem and literature feature IDs at %s\n" % now())
    log("Saving list to file '%s'\nGenerating file...\n" % dataParser.DataFiles['concatenated_fid_file'])
    mergedFids = list(set(subsystemFidList + dlitFidList))
    dataParser.writeAllFids(mergedFids)
    log("Stored %d feature IDs in combined list\nDone at %s\n\n" % (len(mergedFids), now()))
    del subsystemFidList, dlitFidList

    # Step 5: roles for every feature ID in the merged list.
    log("Getting roles for all feature IDs at %s\n" % now())
    log("Saving mapping of feature ID to roles to file '%s'\nDownloading from cdmi server...\n" % dataParser.DataFiles['concatenated_fid_role_file'])
    mergedFidsToRoles, mergedRolesToFids = fidsToRoles(mergedFids, config)
    dataParser.writeAllFidRoles(mergedFidsToRoles)
    log("Stored %d feature ID to roles mappings\nDone at %s\n\n" % (len(mergedFidsToRoles), now()))
    del mergedFidsToRoles

    # Step 6: map each OTU representative genome ID to all genomes in its OTU.
    log("Getting mapping of OTU representative genome IDs to all genomes in OTU at %s\n" % now())
    log("Downloading from cdmi server...\n")
    otuMembership = getOtuGenomeDictionary(1000, config) # Data build V2 size is 1274
    log("Found %d representative OTU genome IDs\nDone at %s\n\n" % (len(otuMembership), now()))

    # Step 7: keep only one feature ID from each OTU for each functional role.
    # Unlike the neighborhood analysis, this is not restricted to prokaryotes.
    log("Filtering list of feature IDs so there is one protein from each OTU for each functional role at %s\n" % now())
    log("Saving list of filtered feature IDs in file '%s'\nQuerying cdmi server...\n" % dataParser.DataFiles['subsystem_otu_fid_roles_file'])
    filteredFidsToRoles, filteredRolesToFids = filterFidsByOtusOptimized(mergedFids, mergedRolesToFids, otuMembership, config)
    dataParser.writeFilteredOtuRoles(filteredFidsToRoles)
    log("Stored %d feature ID to role mappings\nDone at %s\n\n" % (len(filteredFidsToRoles), now()))
    del mergedFids, filteredRolesToFids, otuMembership

    # Step 8: FASTA file of the filtered feature IDs plus the search database.
    log("Getting amino acid sequences for filtered feature IDs at %s\n" % now())
    log("Downloading from cdmi server...\n")
    sequencesByFid = fidsToSequences(filteredFidsToRoles.keys(), config)
    log("Writing amino acid sequences to FASTA file '%s'\nGenerating file and making search database...\n" % dataParser.DataFiles['subsystem_otu_fasta_file'])
    dataParser.writeSubsystemFasta(sequencesByFid)
    dataParser.buildSearchDatabase()
    log("Done at %s\n\n" % now())
    del filteredFidsToRoles, sequencesByFid

    # Step 9: complex to roles mapping, needed to go from annotation likelihoods
    # to reaction likelihoods. This direction is the easier one because all the
    # roles in a complex are needed to get the probability of that complex.
    log("Getting mapping of complex to roles at %s\n" % now())
    log("Saving complex to roles mapping in file '%s'\nDownloading from cdmi server...\n" % dataParser.DataFiles['complexes_roles_file'])
    rolesByComplex, complexesByRole = complexRoleLinks(1000, config) # Data build V2 size is 2369
    dataParser.writeComplexRoles(rolesByComplex)
    log("Stored %d complex to roles mappings\nDone at %s\n\n" % (len(rolesByComplex), now()))
    del rolesByComplex, complexesByRole

    # Step 10: reaction to complexes mapping. This direction is the easier one
    # since multiple complexes get filtered down to a single reaction.
    log("Getting mapping of reaction to complexes at %s\n" % now())
    log("Saving reaction to complexes mapping in file '%s'\nDownloading from cdmi server...\n" % dataParser.DataFiles['reaction_complexes_file'])
    complexesByReaction, reactionsByComplex = reactionComplexLinks(5000, config) # Data build V2 size is 33733
    dataParser.writeReactionComplex(complexesByReaction)
    log("Stored %d reaction to complexes mappings\nDone at %s\n\n" % (len(complexesByReaction), now()))
    del complexesByReaction, reactionsByComplex

    log("Done generating static database files\n")