예제 #1
0
    def addHeadersAndFastaName(self, fasta: str, gcf: str = None, acc: str = None):
        """Hack for the migration to the v2 format: update an entry of the genome collection to add its complete fasta headers and its fasta name.

        NOTE(review): the body visible here stops right after the genome lookup; ``fasta_name``
        and ``genome_entity`` are computed but never used, so the update step described above
        does not appear in this block. Confirm whether the end of the method is missing.

        :param fasta: Path to fasta file
        :type fasta: str
        :param gcf: gcf assembly accession
        :type gcf: str
        :param acc: accession number
        :type acc: str
        """
        # File name only: last component of the "/"-separated path.
        fasta_name = fasta.split("/")[-1]
        try :
            fasta_md5 = fastaHash(fasta)
        except FileNotFoundError:
            # A missing fasta file is logged and the method gives up (returns None).
            logging.error(f"Can't add your entry because fasta file is not found.")
            return       

        try:
            # Look up the genome entry from the fasta hash and the optional accessions.
            genome_entity = self.genomedb.get(fasta_md5, gcf, acc)
        except error.DuplicateError as e: 
            logging.error(f"Can't add your entry because DuplicateError in genome database \nReason : \n{e}")
            return
        except error.ConsistencyError as e: 
            logging.error(f"Can't add your entry because ConsistencyError in genome database \nReason : \n{e}")
            return
예제 #2
0
 def addBlast(self, fastaList):
     """Add the sequences of fasta files to the blast database.

     Every record read from a fasta file is added under a header built as
     ``>{genome id}|{fasta header}``, where the genome id is the ``_id`` of the entry
     found in the genome database for the hash of the fasta file. The blast database
     is closed once all files have been processed.

     :param fastaList: List of paths to fasta files
     :type fastaList: List[str]
     :raises error.NoGenomeEntity: If a fasta file has no entry in the genome database
     """
     for zFasta in fastaList:
         fasta_md5 = fastaHash(zFasta)
         genomElem = self.genomedb.get(fasta_md5)
         if not genomElem:
             # Fail with an explicit error instead of an AttributeError on
             # `genomElem._id` when the genome is unknown.
             raise error.NoGenomeEntity(zFasta)
         for header, seq, _id in zFastaReader(zFasta):
             # NOTE(review): str.replace() matches a literal substring, not a regex, so this
             # only strips the literal text "/^>//" and never a leading ">". Left unchanged
             # on purpose so that headers stay identical to the ones built by removeFromBlast.
             _header = f">{genomElem._id}|{header.replace(r'/^>//', '')}"
             self.blastdb.add(_header, seq)
     self.blastdb.close()
예제 #3
0
 def removeFromBlast(self, fastaList: List[str]):
     """Delete from the blast database every sequence found in the given fasta files.

     The blast database is switched to remove mode, then each record of each fasta file
     is removed under the header ``>{genome id}|{fasta header}``. Processing stops at the
     first fasta file that has no entry in the genome database.

     :param fastaList: List of paths to fasta files
     :type fastaList: List[str]
     """
     logging.info("Remove from Blast database")
     self.blastdb.set_remove_mode(True)

     for fasta_path in fastaList:
         genome = self.genomedb.get(fastaHash(fasta_path))
         if not genome:
             # NOTE(review): this early exit skips the close() call at the end of the method.
             logging.error(f"{fasta_path} is not stored in genome database")
             return

         for record in zFastaReader(fasta_path):
             header, seq, _ = record
             genome_id = genome._id
             # NOTE(review): str.replace() works on a literal substring, not a regex, so
             # only the literal text "/^>//" would be removed from the header.
             cleaned_header = header.replace(r'/^>//', '')
             self.blastdb.remove(f">{genome_id}|{cleaned_header}", seq)

     self.blastdb.close()
예제 #4
0
 def addFastaMotif(self, fastaFile, batchSize):
     """Compute the sgRNA motifs of a fasta file and insert them in the database by batches.

     :param fastaFile: Path to fasta file
     :param batchSize: Maximum number of sgRNA keys sent to ``volDocAdd`` in one call
     :raises error.NoGenomeEntity: If the fasta file has no entry in the genome database
     :return: Tuple ``(sgRNA_data, uuid, r)`` where ``uuid`` is the genome entity id and ``r``
         is the value returned by the last ``volDocAdd`` call, or None when nothing was
         inserted (no mapping rules, or no motif found)
     """
     fasta_md5 = fastaHash(fastaFile)
     genomeEntity = self.getGenomeEntity(fasta_md5)
     if not genomeEntity:
         raise error.NoGenomeEntity(fastaFile)
     uuid = genomeEntity._id
     sgRNA_data = sgRNAfastaSearch(fastaFile, uuid)
     allKeys = list(sgRNA_data.keys())
     if not self.wrapper.hasKeyMappingRules:
         # logging.warn is a deprecated alias of logging.warning
         logging.warning(f"databaseManager::addFastaMotif:Without mapping rules, {len(allKeys)} computed sgRNA motifs will not be inserted into database")
         return (sgRNA_data, uuid, None)

     logging.info(f"databaseManager::addFastaMotif:Slicing \"{uuid}\" to volDocAdd its {len(allKeys)} genomic sgRNA motif")
     # Stays None when there is no motif at all: the loop below is then never entered,
     # and the return statement must not hit an unbound local.
     r = None
     for i in range(0, len(allKeys), batchSize):
         # Slicing already clamps at the end of the list, so the last batch is simply shorter.
         d = {k: sgRNA_data[k] for k in allKeys[i:i + batchSize]}
         logging.info(f"databaseManager::addFastaMotif:Attempting to volDocAdd {len(d)} sets sgRNA keys")
         r = self.wrapper.volDocAdd(d)
     return (sgRNA_data, uuid, r)
예제 #5
0
    def removeGenomeFromGenomeAndTaxon(self, fasta: str, name: str, taxid: int = None, gcf: str = None, acc: str = None):
        """Delete a genome from the genome collection and detach it from its taxon.

        When the genome was the only one of its taxon, the taxon is deleted as well.
        Otherwise the taxon is stored again with the last genome left in its collection
        as current version.

        :param fasta: Path to fasta file
        :type fasta: str
        :param name: Name of the taxon
        :type name: str
        :param taxid: NCBI taxid, defaults to None
        :type taxid: int, optional
        :param gcf: Gcf assembly accession number, defaults to None
        :type gcf: str, optional
        :param acc: Accession number, defaults to None
        :type acc: str, optional
        :raises error.ConsistencyError: If the taxon of the genome is missing from the taxon database, if its taxid or name differs from the given ones, or if the genome is not listed in the genome collection of the taxon
        :return: Deleted genome id, or None when the fasta file is not found, the genome lookup fails or the genome is unknown
        :rtype: str
        """
        summary = f"= Remove genome\nfasta: {fasta}\n name : {name}\n taxid : {taxid}\n gcf : {gcf}\n acc : {acc}"
        logging.info(summary)

        try:
            digest = fastaHash(fasta)
        except FileNotFoundError:
            logging.error("Can't remove your entry because fasta file is not found.")
            return

        try:
            genome_entity = self.genomedb.get(digest, gcf, acc)
        except error.DuplicateError as dup_err:
            logging.error(f"Can't remove your entry because of DuplicateError\nreason: {dup_err}")
            return
        except error.ConsistencyError as cons_err:
            logging.error(f"Can't remove your entry because of ConsistencyError\nreason: {cons_err}")
            return

        if not genome_entity:
            logging.error("Genome doesn't exist in genome database")
            return

        logging.info(f"Genome : {genome_entity._id}")
        logging.info(f"Corresponding taxon is {genome_entity.taxon}")

        # Consistency checks between the genome entry and its taxon, in this order:
        # taxon exists, taxid matches, name matches, genome is linked to the taxon.
        taxon_entity = self.taxondb.getFromID(genome_entity.taxon)
        if not taxon_entity:
            raise error.ConsistencyError(f"Associated taxon {genome_entity.taxon} doesn't exist in taxon database")

        if taxon_entity.taxid != taxid:
            raise error.ConsistencyError(f"Database taxon taxid {taxon_entity.taxid} doesn't correspond to your taxid {taxid}")

        if taxon_entity.name != name:
            raise error.ConsistencyError(f"Database taxon name {taxon_entity.name} doesn't correspond to your name {name}")

        if genome_entity._id not in taxon_entity.genomeColl:
            raise error.ConsistencyError(f"Genome {genome_entity._id} is not linked with its taxon {taxon_entity._id}")

        taxon_entity.genomeColl.remove(genome_entity._id)

        if taxon_entity.genomeColl:
            # Other genomes remain: the last one of the collection becomes the current version.
            logging.info(f"Delete this version of Taxon (name : {taxon_entity.name}, taxid : {taxon_entity.taxid}). Current version become the previous one.")
            taxon_entity.current = taxon_entity.genomeColl[-1]
            genome_entity.remove()
            taxon_entity.store()
        else:
            # The taxon has no genome left: remove both documents.
            logging.info(f"Your genome was the only version of the taxon (name : {taxon_entity.name}, taxid : {taxon_entity.taxid}). Taxon will be deleted.")
            genome_entity.remove()
            taxon_entity.remove()

        return genome_entity._id
예제 #6
0
    def addGenome(self, fasta: str, name: str, taxid: int = None, gcf: str = None, acc: str = None):
        """
        Take informations about genome and corresponding taxon and insert them in databases.

        The genome entry is looked up from the hash of the fasta file. An existing entry
        without headers or fasta name (old version) is updated, a missing entry is created.
        The taxon entry is looked up from its name and taxid and created when missing.
        Both entities are then linked with ``self.bind`` and stored.

        Every handled error (fasta file not found, DuplicateError, ConsistencyError,
        FastaHeaderConflict, NotAvailableKeys, LinkError, VersionError) is logged and
        makes the method return None without storing anything.

        :param fasta: Path to fasta file
        :param name: Taxon name
        :param taxid: Taxid if available
        :param gcf: GCF accession for assembly if available
        :param acc: accession number if available

        :type fasta: path
        :type name: str
        :type taxid: int
        :type gcf: str
        :type acc: str

        :return: Value returned by ``store()`` on the genome entity, or None on a handled error
        """
        logging.info(f"Add genome\nfasta : {fasta}\nname : {name}\ntaxid : {taxid}\ngcf: {gcf}\nacc: {acc}")
        fasta_name = fasta.split("/")[-1]
        try:
            fasta_md5 = fastaHash(fasta)
        except FileNotFoundError:
            logging.error("Can't add your entry because fasta file is not found.")
            return

        try:
            genome_entity = self.genomedb.get(fasta_md5, gcf, acc)
        except error.DuplicateError as e:
            logging.error(f"Can't add your entry because DuplicateError in genome database \nReason : \n{e}")
            return
        except error.ConsistencyError as e:
            logging.error(f"Can't add your entry because ConsistencyError in genome database \nReason : \n{e}")
            return

        # An existing entry without headers or fasta_name is an old version that has to be
        # updated. (Temporary hack until all is updated)
        is_old_version = bool(genome_entity) and (not genome_entity.headers or not genome_entity.fasta_name)

        # The fasta file only has to be processed when the entry is updated or created.
        if is_old_version or not genome_entity:
            try:
                size, headers = self._proceed_fasta(fasta)
            except error.FastaHeaderConflict as e:
                logging.error(f"Can't add your entry because FastaHeaderConflict\n{e}")
                return

            if is_old_version:
                # logging.warn is a deprecated alias of logging.warning
                logging.warning("Your genome entry already exists but as old version (no headers and no fasta_name), the entry will be updated")
                try:
                    genome_entity.update(headers=headers, fasta_name=fasta_name)
                except error.NotAvailableKeys as e:
                    logging.error(f"Can't update your entry because NotAvailableKeys\n{e}")
                    return
            else:
                genome_entity = self.genomedb.createNewGenome(fasta_md5, size, headers, fasta_name, gcf, acc)

        try:
            taxon_entity = self.taxondb.get(name, taxid)
        except error.DuplicateError as e:
            logging.error(f"Can't add your entry because DuplicateError in taxon database \nReason : \n{e}")
            return
        except error.ConsistencyError as e:
            logging.error(f"Can't add your entry because ConsistencyError in taxon database \nReason : \n{e}")
            return

        if not taxon_entity:
            taxon_entity = self.taxondb.createNewTaxon(name, taxid)

        try:
            self.bind(genome_entity, taxon_entity)
        except error.LinkError as e:
            logging.error(f"Can't add your entry because LinkError\nReason : \n{e}")
            return
        except error.VersionError as e:
            logging.error(f"Can't add your entry because VersionError\nReason : \n{e}")
            return

        # Genome is stored first, then the taxon; only the genome status is returned.
        genome_return_status = genome_entity.store()
        taxon_entity.store()

        return genome_return_status