def addHeadersAndFastaName(self, fasta, gcf = None, acc = None):
    """Helper created for the hack update to the v2 format.

    Meant to update entries of the genome collection so that they carry the
    complete fasta headers and the fasta name.

    NOTE(review): the body visible here only hashes the fasta file and looks
    up the matching genome entity; nothing is updated or stored afterwards.
    Confirm whether the implementation is complete.

    :param fasta: Path to fasta file
    :type fasta: str
    :param gcf: gcf assembly accession
    :type gcf: str
    :param acc: accession number
    :type acc: str
    :return: None in every case
    """
    # Last path component; computed but not used further in this body.
    fasta_name = fasta.rsplit("/", 1)[-1]

    try:
        checksum = fastaHash(fasta)
    except FileNotFoundError:
        logging.error(f"Can't add your entry because fasta file is not found.")
        return

    try:
        genome_entity = self.genomedb.get(checksum, gcf, acc)
    except error.DuplicateError as exc:
        logging.error(f"Can't add your entry because DuplicateError in genome database \nReason : \n{exc}")
        return
    except error.ConsistencyError as exc:
        logging.error(f"Can't add your entry because ConsistencyError in genome database \nReason : \n{exc}")
        return
def addBlast(self, fastaList):
    """Add the sequences of fasta files to the blast database.

    Every record is added under a header of the form
    ``>{genome id}|{record header}``, where the genome id is the ``_id`` of
    the genome entity found from the fasta checksum. The blast database is
    closed once all files have been processed.

    :param fastaList: Paths to fasta files
    :type fastaList: List[str]
    :raises error.NoGenomeEntity: If a fasta file has no entry in the genome database
    """
    for zFasta in fastaList:
        fasta_md5 = fastaHash(zFasta)
        genomElem = self.genomedb.get(fasta_md5)
        if not genomElem:
            # Fail with an explicit domain error instead of an AttributeError
            # on `genomElem._id` further down.
            raise error.NoGenomeEntity(zFasta)

        for header, seq, _id in zFastaReader(zFasta):
            # NOTE(review): str.replace matches its first argument literally
            # (it is not a regex), so this only removes the exact text
            # '/^>//'. Kept unchanged so headers stay identical to the ones
            # rebuilt by removeFromBlast.
            _header = f">{genomElem._id}|{header.replace(r'/^>//', '')}"
            self.blastdb.add(_header, seq)

    self.blastdb.close()
def removeFromBlast(self, fastaList: List[str]):
    """Remove the records of fasta files from the blast database.

    The blast database is switched to remove mode, then every record of every
    fasta file is removed using the header ``>{genome id}|{record header}``.
    If a fasta file has no entry in the genome database, an error is logged
    and the method returns immediately.

    NOTE(review): on that early return ``self.blastdb.close()`` is not
    reached, although earlier files may already have been processed -
    confirm this is intended.

    :param fastaList: List of paths to fasta files
    :type fastaList: List[str]
    """
    logging.info("Remove from Blast database")
    self.blastdb.set_remove_mode(True)

    for fasta_path in fastaList:
        genome_entity = self.genomedb.get(fastaHash(fasta_path))
        if not genome_entity:
            logging.error(f"{fasta_path} is not stored in genome database")
            return

        for record_header, sequence, _record_id in zFastaReader(fasta_path):
            # The replace pattern is matched literally (str.replace is not a regex).
            blast_header = f">{genome_entity._id}|{record_header.replace(r'/^>//', '')}"
            self.blastdb.remove(blast_header, sequence)

    self.blastdb.close()
def addFastaMotif(self, fastaFile, batchSize):
    """Compute the sgRNA motifs of a fasta file and insert them in batches.

    The genome entity is looked up from the fasta checksum; its ``_id`` is
    passed to ``sgRNAfastaSearch`` together with the fasta path. When the
    wrapper has no key mapping rules, nothing is inserted.

    :param fastaFile: Path to fasta file
    :type fastaFile: str
    :param batchSize: Maximum number of sgRNA keys sent per ``volDocAdd`` call
    :type batchSize: int
    :raises error.NoGenomeEntity: If no genome entity matches the fasta checksum
    :return: Tuple ``(sgRNA_data, uuid, r)`` where ``r`` is the result of the
        last ``volDocAdd`` call, or None when nothing was inserted (no mapping
        rules, or no motif computed)
    :rtype: tuple
    """
    fasta_md5 = fastaHash(fastaFile)
    genomeEntity = self.getGenomeEntity(fasta_md5)
    if not genomeEntity:
        raise error.NoGenomeEntity(fastaFile)

    uuid = genomeEntity._id
    sgRNA_data = sgRNAfastaSearch(fastaFile, uuid)
    allKeys = list(sgRNA_data.keys())

    if not self.wrapper.hasKeyMappingRules:
        logging.warning(f"databaseManager::addFastaMotif:Without mapping rules, {len(allKeys)} computed sgRNA motifs will not be inserted into database")
        return (sgRNA_data, uuid, None)

    logging.info(f"databaseManager::addFastaMotif:Slicing \"{uuid}\" to volDocAdd its {len(allKeys)} genomic sgRNA motif")

    # Stays None when there is no key to insert; without this default the
    # final return raised UnboundLocalError for an empty motif set.
    r = None
    for i in range(0, len(allKeys), batchSize):
        # Slicing clamps at the end of the list, so the last batch may be shorter.
        d = {k: sgRNA_data[k] for k in allKeys[i:i + batchSize]}
        logging.info(f"databaseManager::addFastaMotif:Attempting to volDocAdd {len(d)} sets sgRNA keys")
        r = self.wrapper.volDocAdd(d)

    return (sgRNA_data, uuid, r)
def removeGenomeFromGenomeAndTaxon(self, fasta: str, name: str, taxid: int = None, gcf: str = None, acc: str = None):
    """Delete a genome from the genome collection and detach it from its taxon.

    When the genome was the only one linked to its taxon, the taxon is deleted
    as well. Otherwise the taxon is kept: its current genome becomes the last
    remaining element of its genome collection and the taxon is stored again.

    Lookup failures (missing fasta file, DuplicateError or ConsistencyError
    from the genome database, unknown genome) are logged and make the method
    return None.

    :param fasta: Path to fasta file
    :type fasta: str
    :param name: Name of the taxon, must match the stored taxon name
    :type name: str
    :param taxid: NCBI taxid, must match the stored taxon taxid, defaults to None
    :type taxid: int, optional
    :param gcf: Gcf assembly accession number, defaults to None
    :type gcf: str, optional
    :param acc: Accession number, defaults to None
    :type acc: str, optional
    :raises error.ConsistencyError: If the genome's taxon is missing from the
        taxon database, if its taxid or name differ from the given ones, or if
        the genome is not listed in the taxon's genome collection
    :return: Deleted genome id, or None when the genome could not be looked up
    :rtype: str
    """
    logging.info(f"= Remove genome\nfasta: {fasta}\n name : {name}\n taxid : {taxid}\n gcf : {gcf}\n acc : {acc}")

    try:
        checksum = fastaHash(fasta)
    except FileNotFoundError:
        logging.error(f"Can't remove your entry because fasta file is not found.")
        return

    try:
        genome_entity = self.genomedb.get(checksum, gcf, acc)
    except error.DuplicateError as exc:
        logging.error(f"Can't remove your entry because of DuplicateError\nreason: {exc}")
        return
    except error.ConsistencyError as exc:
        logging.error(f"Can't remove your entry because of ConsistencyError\nreason: {exc}")
        return

    if not genome_entity:
        logging.error(f"Genome doesn't exist in genome database")
        return

    logging.info(f"Genome : {genome_entity._id}")
    logging.info(f"Corresponding taxon is {genome_entity.taxon}")

    # Cross-check the stored taxon against what the caller provided.
    taxon_entity = self.taxondb.getFromID(genome_entity.taxon)
    if not taxon_entity:
        raise error.ConsistencyError(f"Associated taxon {genome_entity.taxon} doesn't exist in taxon database")
    if taxon_entity.taxid != taxid:
        raise error.ConsistencyError(f"Database taxon taxid {taxon_entity.taxid} doesn't correspond to your taxid {taxid}")
    if taxon_entity.name != name:
        raise error.ConsistencyError(f"Database taxon name {taxon_entity.name} doesn't correspond to your name {name}")
    if genome_entity._id not in taxon_entity.genomeColl:
        raise error.ConsistencyError(f"Genome {genome_entity._id} is not linked with its taxon {taxon_entity._id}")

    taxon_entity.genomeColl.remove(genome_entity._id)

    if taxon_entity.genomeColl:
        # Other genomes remain: keep the taxon and point it to the last one.
        logging.info(f"Delete this version of Taxon (name : {taxon_entity.name}, taxid : {taxon_entity.taxid}). Current version become the previous one.")
        taxon_entity.current = taxon_entity.genomeColl[-1]
        genome_entity.remove()
        taxon_entity.store()
    else:
        # No genome left for this taxon: delete both documents.
        logging.info(f"Your genome was the only version of the taxon (name : {taxon_entity.name}, taxid : {taxon_entity.taxid}). Taxon will be deleted.")
        genome_entity.remove()
        taxon_entity.remove()

    return genome_entity._id
def addGenome(self, fasta: str, name: str, taxid: int = None, gcf: str = None, acc: str = None):
    """Insert a genome and its corresponding taxon into the databases.

    The genome is looked up from the fasta checksum (plus gcf / acc). If it
    does not exist, it is created. If it exists but lacks headers or fasta
    name (old-format entry), it is updated with them. The taxon is fetched or
    created, bound to the genome, and both entities are stored.

    Every handled failure (missing fasta file, DuplicateError,
    ConsistencyError, FastaHeaderConflict, NotAvailableKeys, LinkError,
    VersionError) is logged and makes the method return None.

    :param fasta: Path to fasta file
    :type fasta: str
    :param name: Taxon name
    :type name: str
    :param taxid: Taxid if available
    :type taxid: int
    :param gcf: GCF accession for assembly if available
    :type gcf: str
    :param acc: Accession number if available
    :type acc: str
    :return: Return value of ``genome_entity.store()``, or None on a logged failure
    """
    logging.info(f"Add genome\nfasta : {fasta}\nname : {name}\ntaxid : {taxid}\ngcf: {gcf}\nacc: {acc}")

    fasta_name = fasta.split("/")[-1]

    try:
        fasta_md5 = fastaHash(fasta)
    except FileNotFoundError:
        logging.error("Can't add your entry because fasta file is not found.")
        return

    try:
        genome_entity = self.genomedb.get(fasta_md5, gcf, acc)
    except error.DuplicateError as e:
        logging.error(f"Can't add your entry because DuplicateError in genome database \nReason : \n{e}")
        return
    except error.ConsistencyError as e:
        logging.error(f"Can't add your entry because ConsistencyError in genome database \nReason : \n{e}")
        return

    # An existing entry without headers or fasta_name is an old-version entry
    # that must be updated (temporary hack until all is updated).
    is_old_version = bool(genome_entity) and (not genome_entity.headers or not genome_entity.fasta_name)

    # The fasta only has to be parsed when the entry is created or upgraded.
    if is_old_version or not genome_entity:
        try:
            size, headers = self._proceed_fasta(fasta)
        except error.FastaHeaderConflict as e:
            logging.error(f"Can't add your entry because FastaHeaderConflict\n{e}")
            return

        if is_old_version:
            logging.warning("Your genome entry already exists but as old version (no headers and no fasta_name), the entry will be updated")
            try:
                genome_entity.update(headers = headers, fasta_name = fasta_name)
            except error.NotAvailableKeys as e:
                logging.error(f"Can't update your entry because NotAvailableKeys\n{e}")
                return
        else:
            genome_entity = self.genomedb.createNewGenome(fasta_md5, size, headers, fasta_name, gcf, acc)

    try:
        taxon_entity = self.taxondb.get(name, taxid)
    except error.DuplicateError as e:
        logging.error(f"Can't add your entry because DuplicateError in taxon database \nReason : \n{e}")
        return
    except error.ConsistencyError as e:
        logging.error(f"Can't add your entry because ConsistencyError in taxon database \nReason : \n{e}")
        return

    if not taxon_entity:
        taxon_entity = self.taxondb.createNewTaxon(name, taxid)

    try:
        self.bind(genome_entity, taxon_entity)
    except error.LinkError as e:
        logging.error(f"Can't add your entry because LinkError\nReason : \n{e}")
        return
    except error.VersionError as e:
        logging.error(f"Can't add your entry because VersionError\nReason : \n{e}")
        return

    # Genome is stored first, then the taxon; only the genome status is returned.
    genome_return_status = genome_entity.store()
    taxon_entity.store()
    return genome_return_status