def getGeneSequencesFromFastas(pangenome, fasta_file):
    """
        Reads the fasta files listed in a tab-separated list file and attaches the matching
        dna sequence to every gene and RNA of the pangenome.

        Each line of the list file must hold at least two tab-separated columns: the organism
        name, then the path to that organism's fasta file (compressed or not).
        Sets pangenome.status["geneSequences"] to "Computed" once every sequence is attached.
        Exits the program with status 1 if a line of the list file has no tabulation.

        :param pangenome: the pangenome whose genes and RNAs will receive their dna sequence.
        :param fasta_file: path to the tab-separated file listing organisms and their fasta files.
        :raises Exception: if an organism of the pangenome has no fasta listed in the file.
        :raises KeyError: if a contig read from the annotations is missing from the organism's fasta.
    """
    fastaDict = {}
    # the list file is opened in a 'with' block so that its handle is always released.
    with read_compressed_or_not(fasta_file) as fasta_list:
        for line in fasta_list:
            elements = [el.strip() for el in line.split("\t")]
            if len(elements) <= 1:
                logging.getLogger().error(
                    "No tabulation separator found in organisms file")
                exit(1)
            org = pangenome.addOrganism(elements[0])
            with read_compressed_or_not(elements[1]) as currFastaFile:
                fastaDict[org] = read_fasta(org, currFastaFile)

    # computed once, after the loop above, since addOrganism() may have extended the pangenome.
    organisms = set(pangenome.organisms)
    missing_organisms = organisms - set(fastaDict)
    if missing_organisms:
        raise Exception(
            f"Not all of your pangenome's organisms are present within the provided fasta file. {len(missing_organisms)} are missing (out of {len(organisms)})."
        )

    for org in pangenome.organisms:
        contigSequences = fastaDict[org]  # present for every organism, checked just above.
        for contig in org.contigs:
            try:
                for gene in contig.genes:
                    gene.add_dna(
                        get_dna_sequence(contigSequences[contig.name], gene))
                for rna in contig.RNAs:
                    # the sequence is extracted with the coordinates of the rna itself
                    # (not with those of the last gene that was processed).
                    rna.add_dna(
                        get_dna_sequence(contigSequences[contig.name], rna))
            except KeyError as err:
                msg = f"Fasta file for organism {org.name} did not have the contig {contig.name} that was read from the annotation file. "
                msg += f"The provided contigs in the fasta were : {', '.join(contigSequences)}."
                # the original lookup failure is kept as the cause, to ease debugging.
                raise KeyError(msg) from err
    pangenome.status["geneSequences"] = "Computed"
def read_org_gff(organism, gff_file_path, circular_contigs, getSeq, pseudo=False):
    """
        Parses a gff annotation file and builds the corresponding Organism, along with its
        contigs, genes (CDS features) and RNAs (features whose type contains 'RNA').

        If the gff holds a '##FASTA' section and getSeq is True, the dna sequence of every
        gene and RNA is extracted from it. If getSeq is False, parsing stops at that section.

        :param organism: the name given to the created Organism.
        :param gff_file_path: path to the gff file (compressed or not).
        :param circular_contigs: collection of the names of the contigs that are circular.
        :param getSeq: whether the sequences of the '##FASTA' section should be read.
        :param pseudo: whether the CDS flagged as pseudogenes are kept as genes.
        :return: org, hasFasta: the filled organism, and whether a fasta section was read.
        :rtype: tuple
    """
    (GFF_seqname, _, GFF_type, GFF_start, GFF_end, _, GFF_strand, _,
     GFF_attribute) = range(0, 9)
    # missing values : source, score, frame. They are unused.

    def getGffAttributes(gff_fields):
        """
            Parses the gff attribute's line and outputs the attributes in a dict structure.
            Keys are upper-cased, so lookups in the returned dict must use upper-case names.

            :param gff_fields: a gff line stored as a list. Each element of the list is a column of the gff.
            :type list:
            :return: attributes:
            :rtype: dict
        """
        attributes = {}
        for att in gff_fields[GFF_attribute].strip().split(';'):
            # only the first '=' separates the key from the value: values holding a '='
            # themselves are kept whole rather than being discarded.
            key, separator, value = att.strip().partition('=')
            if not separator:
                continue  # we assume that it is a strange, but useless field for our analysis
            attributes[key.upper()] = value
        return attributes

    def getIDAttribute(attributes):
        """
            Gets the ID of the element from which the provided attributes were extracted.
            Logs an error and exits the program with status 1 if no ID is found.

            :param attribute:
            :type dict:
            :return: ElementID:
            :rtype: string
        """
        ElementID = attributes.get("ID")
        if not ElementID:
            logging.getLogger().error(
                "Each CDS type of the gff files must own a unique ID attribute. Not the case for file: "
                + gff_file_path)
            exit(1)
        return ElementID

    hasFasta = False
    fastaLines = []  # lines are collected and joined once, rather than concatenated one by one.
    org = Organism(organism)
    contig = None  # no current contig until a sequence-region directive or a feature is read.
    geneCounter = 0
    rnaCounter = 0
    with read_compressed_or_not(gff_file_path) as gff_file:
        for line in gff_file:
            if hasFasta:
                fastaLines.append(line)
                continue
            if line.startswith('##'):
                if line.startswith('FASTA', 2):
                    if not getSeq:  # if getting the sequences is useless...
                        break
                    hasFasta = True
                elif line.startswith('sequence-region', 2):
                    fields = line.split()
                    contig = org.getOrAddContig(fields[1],
                                                fields[1] in circular_contigs)
                continue
            if line.startswith('#') or not line.strip():
                # other comments (such as the special refseq '#!' lines for versionning softs,
                # assemblies and annotations) and blank lines hold no feature.
                continue

            gff_fields = [el.strip() for el in line.split('\t')]
            feature_type = gff_fields[GFF_type]
            seqname = gff_fields[GFF_seqname]

            if feature_type == 'region':
                if seqname in circular_contigs:
                    # the flag is set on the contig named by this line, which is not
                    # necessarily the last contig that was used.
                    if contig is None or contig.name != seqname:
                        contig = org.getOrAddContig(seqname, True)
                    contig.is_circular = True
            elif feature_type == 'CDS' or "RNA" in feature_type:
                attributes = getGffAttributes(gff_fields)
                # if there is a 'PROTEIN_ID' attribute, it's where the ncbi stores the actual gene ids, so we use that.
                geneID = attributes.get("PROTEIN_ID")
                if geneID is None:
                    # if its not found, we get the one under the 'ID' field which must exist (otherwise not a gff3 compliant file)
                    geneID = getIDAttribute(attributes)
                name = attributes.get('NAME', attributes.get('GENE', ""))
                # attribute keys were upper-cased by getGffAttributes, so the flags are looked up in upper case.
                pseudogene = "PSEUDO" in attributes or "PSEUDOGENE" in attributes
                product = attributes.get('PRODUCT', "")
                genetic_code = attributes.get("TRANSL_TABLE", "11")

                if contig is None or contig.name != seqname:
                    # get the current contig
                    contig = org.getOrAddContig(seqname,
                                                seqname in circular_contigs)

                if feature_type == "CDS" and (pseudo or not pseudogene):
                    gene = Gene(org.name + "_CDS_" + str(geneCounter).zfill(4))
                    # here contig is filled in order, so position is the number of genes already stored in the contig.
                    gene.fill_annotations(start=int(gff_fields[GFF_start]),
                                          stop=int(gff_fields[GFF_end]),
                                          strand=gff_fields[GFF_strand],
                                          geneType=feature_type,
                                          position=len(contig.genes),
                                          name=name,
                                          product=product,
                                          genetic_code=genetic_code,
                                          local_identifier=geneID)
                    gene.fill_parents(org, contig)
                    contig.addGene(gene)
                    geneCounter += 1
                elif "RNA" in feature_type:
                    # RNAs have their own counter, so they get their own '_RNA_' infix:
                    # with '_CDS_' their identifiers would collide with those of the genes.
                    rna = RNA(org.name + "_RNA_" + str(rnaCounter).zfill(4))
                    rna.fill_annotations(start=int(gff_fields[GFF_start]),
                                         stop=int(gff_fields[GFF_end]),
                                         strand=gff_fields[GFF_strand],
                                         geneType=feature_type,
                                         name=name,
                                         product=product,
                                         local_identifier=geneID)
                    rna.fill_parents(org, contig)
                    contig.addRNA(rna)
                    rnaCounter += 1

    ### GET THE FASTA SEQUENCES OF THE GENES
    if hasFasta and fastaLines:
        # read_fasta is given the same newline-split lines as if the section had been read as a single string.
        contigSequences = read_fasta(org, "".join(fastaLines).split('\n'))
        for contig in org.contigs:
            for gene in contig.genes:
                gene.add_dna(
                    get_dna_sequence(contigSequences[contig.name], gene))
            for rna in contig.RNAs:
                rna.add_dna(get_dna_sequence(contigSequences[contig.name], rna))
    return org, hasFasta