def create_gene(org, contig, geneCounter, rnaCounter, ID, dbxref, start, stop, strand, gene_type,
                position = None, gene_name = "", product = "", genetic_code = 11, protein_id = ""):
    """Create a Gene (for a CDS) or an RNA (for anything else) and attach it to contig.

    The new object is named from org.name and a zero-padded counter
    (geneCounter for CDS, rnaCounter otherwise), while ID is only kept as the
    local identifier of CDS features.

    When an entry of dbxref contains 'MaGe', the second ':'-separated field of
    the first such entry replaces ID, and the previous ID is used as gene name
    if no name was provided.
    """
    mage_ref = next((ref for ref in dbxref if 'MaGe' in ref), None)
    if mage_ref is not None:
        if gene_name == "":
            gene_name = ID
        ID = mage_ref.split(':')[1]

    if gene_type == "CDS":
        if ID == "":
            # on rare occasions, there are no 'locus_tag' from downloaded .gbk file.
            # So we use the protein_id field instead. (which is not supposed to be
            # unique, but was when cases like this were encountered)
            ID = protein_id
        cds_name = org.name + "_CDS_" + str(geneCounter).zfill(4)
        newGene = Gene(cds_name)
        newGene.fill_annotations(
            start=start,
            stop=stop,
            strand=strand,
            geneType=gene_type,
            position=position,
            name=gene_name,
            product=product,
            genetic_code=genetic_code,
            local_identifier=ID,
        )
        contig.addGene(newGene)
    else:
        # everything that is not a CDS is stored as an RNA
        rna_name = org.name + "_RNA_" + str(rnaCounter).zfill(4)
        newGene = RNA(rna_name)
        newGene.fill_annotations(
            start=start,
            stop=stop,
            strand=strand,
            geneType=gene_type,
            name=gene_name,
            product=product,
        )
        contig.addRNA(newGene)

    newGene.fill_parents(org, contig)
def create_gene(org, contig, ID, dbxref, start, stop, strand, gene_type,
                position = None, gene_name = "", product = "", genetic_code = 11):
    """Create a Gene (for a CDS) or an RNA (for anything else) named ID and attach it to contig.

    When an entry of dbxref contains 'MaGe', the second ':'-separated field of
    the first such entry replaces ID, and the previous ID is used as gene name
    if no name was provided.
    """
    mage_ref = next((ref for ref in dbxref if 'MaGe' in ref), None)
    if mage_ref is not None:
        if gene_name == "":
            gene_name = ID
        ID = mage_ref.split(':')[1]

    # annotations shared by both kinds of features
    shared = dict(
        start=start,
        stop=stop,
        strand=strand,
        geneType=gene_type,
        name=gene_name,
        product=product,
    )
    if gene_type == "CDS":
        newGene = Gene(ID)
        newGene.fill_annotations(position=position, genetic_code=genetic_code, **shared)
        contig.addGene(newGene)
    else:
        # everything that is not a CDS is stored as an RNA
        newGene = RNA(ID)
        newGene.fill_annotations(**shared)
        contig.addRNA(newGene)

    newGene.fill_parents(org, contig)
def l_genes(o_org, o_contig):
    """Build a small gene set for testing.

    Returns a list of 8 genes that all belong to o_contig and o_org. Each gene
    is 30 long on the "+" strand, starts 35 after the previous one (the first
    starts at 10), and owns its own GeneFamily partitioned as "c-cloud".
    """
    identifiers = ("toto", "tata", "titi", "tutu", "lolo", "lala", "lili", "lulu")
    created = []
    for rank, identifier in enumerate(identifiers):
        begin = 10 + 35 * rank
        o_gene = Gene(identifier)
        o_gene.fill_annotations(begin, begin + 30, "+", position=rank)
        o_gene.fill_parents(o_org, o_contig)
        o_contig.addGene(o_gene)
        o_gene.family = GeneFamily(rank, identifier)
        o_gene.family.addPartition("c-cloud")
        created.append(o_gene)
    return created
def test_mkBitArray_with_org(o_family):
    """A family holding one gene of one organism gets a bitarray with only that organism's bit set."""
    org_key = "organism"
    member = Gene(33)
    member.fill_parents(org_key, None)
    o_family.addGene(member)

    for bit_position in (1, 3, 7, 12):
        # the index maps the organism to the position of its bit
        o_family.mkBitarray({org_key: bit_position})
        expected = 1 << bit_position
        assert o_family.bitarray == expected
def _make_gene_pair(org, gene_id1, gene_id2):
    """Return two genes of the same organism; each one is added to a family of its own."""

    def _gene_in_own_family(identifier):
        # the gene has an organism but no contig
        new_gene = Gene(identifier)
        new_gene.fill_parents(org, None)
        GeneFamily(identifier, identifier).addGene(new_gene)
        return new_gene

    first = _gene_in_own_family(gene_id1)
    second = _gene_in_own_family(gene_id2)
    return first, second
def l_glist(o_org, o_contig):
    """Build a small testing context: 4 genes on the same contig of the same organism.

    Each gene is 30 long on the "+" strand, starts 35 after the previous one
    (the first starts at 10), and has its own identifier stored as its family.
    """
    created = []
    for rank, identifier in enumerate(("toto", "tata", "titi", "tutu")):
        begin = 10 + 35 * rank
        o_gene = Gene(identifier)
        o_gene.fill_annotations(begin, begin + 30, "+", position=rank)
        o_gene.fill_parents(o_org, o_contig)
        o_contig.addGene(o_gene)
        o_gene.family = identifier
        created.append(o_gene)
    return created
def _make_gene_pair(org, gene_id1, gene_id2):
    """Create 2 genes from org; each gene belongs to its own family."""
    pair = []
    for identifier in (gene_id1, gene_id2):
        member = Gene(identifier)
        member.fill_parents(org, None)  # organism only, no contig
        own_family = GeneFamily(identifier, identifier)
        own_family.addGene(member)
        pair.append(member)
    first, second = pair
    return first, second
def test_cstr():
    """An Edge built from two genes of one family and one organism exposes families and the gene pair."""
    source_gene = Gene('source')
    target_gene = Gene('target')
    endpoints = (source_gene, target_gene)

    # organism and contig are both None for the two genes.
    for endpoint in endpoints:
        endpoint.fill_parents(None, None)

    # a single family, identified by None, holds the two genes.
    shared_family = GeneFamily(None, None)
    for endpoint in endpoints:
        shared_family.addGene(endpoint)

    o_edge = Edge(source_gene, target_gene)

    assert isinstance(o_edge, Edge)
    assert o_edge.source == source_gene.family
    assert o_edge.target == target_gene.family
    # the gene pair is registered under the (None) organism
    assert dict(o_edge.organisms) == {None: [(source_gene, target_gene)]}
def test_cstr_error():
    """Edge construction must fail without families or with genes from different organisms."""
    source_gene = Gene('source')
    target_gene = Gene('target')

    # no gene has a family yet
    with pytest.raises(Exception):
        Edge(source_gene, target_gene)

    # only the source gene has a family: both are required
    lone_family = GeneFamily(None, None)
    lone_family.addGene(source_gene)
    with pytest.raises(Exception):
        Edge(source_gene, target_gene)

    # both genes have a family now, but their organisms differ
    lone_family.addGene(target_gene)
    source_gene.fill_parents("", None)
    target_gene.fill_parents(None, None)
    with pytest.raises(Exception):
        Edge(source_gene, target_gene)
def filled_families():
    """Return (families, genes): two parallel lists of randomly chosen length.

    Between 3 and 10 families are created. Each family holds exactly one gene,
    and the gene and its family share the same integer identifier. Genes have
    neither organism nor contig.
    """
    families, genes = [], []
    for identifier in range(randint(3, 10)):
        member = Gene(identifier)
        member.fill_parents(None, None)
        owner = GeneFamily(identifier, identifier)
        owner.addGene(member)
        genes.append(member)
        families.append(owner)
    return families, genes
def readOrganism(pangenome, orgName, contigDict, circularContigs, link=False):
    """Rebuild one organism from stored annotation rows and add it to the pangenome.

    :param pangenome: receives the organism through addOrganism(); when link is
        True its getGene() is used to fetch the already existing genes.
    :param orgName: name given to the created Organism.
    :param contigDict: maps a contig name to an iterable of rows. Each row is
        indexed by field name ("ID", "type", "start", "stop", "strand",
        "position", "genetic_code", "name", "product", "is_fragment" and
        optionally "local"); textual fields are decoded with .decode().
    :param circularContigs: maps a contig name to its is_circular flag.
    :param link: if True the genes are looked up in pangenome instead of being
        created.
    :raises Exception: if a row type is neither "CDS" nor contains "RNA".
    """
    org = Organism(orgName)
    for contigName, geneList in contigDict.items():
        contig = org.getOrAddContig(contigName, is_circular=circularContigs[contigName])
        for row in geneList:
            # The type is needed in both modes (it decides where the feature is
            # stored), so it is decoded before branching on `link`.
            gene_type = row["type"].decode()
            is_cds = gene_type == "CDS"
            if not is_cds and "RNA" not in gene_type:
                # fail before any object is created or modified
                raise Exception(
                    f"A strange type '{gene_type}', which we do not know what to do with, was met."
                )

            geneID = row["ID"].decode()
            if link:
                # if the gene families are already computed/loaded the gene exists.
                gene = pangenome.getGene(geneID)
            elif is_cds:
                gene = Gene(geneID)
            else:
                gene = RNA(geneID)

            try:
                local = row["local"].decode()
            except ValueError:
                # NOTE(review): presumably rows without a "local" field raise
                # ValueError on access; confirm against the row type in use.
                local = ""

            gene.fill_annotations(start=row["start"],
                                  stop=row["stop"],
                                  strand=row["strand"].decode(),
                                  geneType=gene_type,
                                  position=row["position"],
                                  genetic_code=row["genetic_code"],
                                  name=row["name"].decode(),
                                  product=row["product"].decode(),
                                  local_identifier=local)
            gene.is_fragment = row["is_fragment"]
            gene.fill_parents(org, contig)

            if is_cds:
                contig.addGene(gene)
            else:
                contig.addRNA(gene)
    pangenome.addOrganism(org)
def read_org_gff(organism, gff_file_path, circular_contigs, getSeq, pseudo=False):
    """Parse a GFF file into an Organism holding its contigs, genes and RNAs.

    :param organism: name given to the created Organism.
    :param gff_file_path: path of the (possibly compressed) GFF file.
    :param circular_contigs: container of the names of the circular contigs.
    :param getSeq: if False, parsing stops at the '##FASTA' directive; if True,
        the embedded FASTA section is read and used to fill the dna of every
        gene and RNA.
    :param pseudo: if True, CDS flagged as pseudogenes are kept.
    :return: (org, hasFasta), hasFasta being True when a '##FASTA' section was
        read.
    """
    (GFF_seqname, _, GFF_type, GFF_start, GFF_end, _, GFF_strand, _,
     GFF_attribute) = range(0, 9)  # missing values : source, score, frame. They are unused.

    def getGffAttributes(gff_fields):
        """Parse the attribute column of a gff line into a dict.

        :param gff_fields: a gff line stored as a list, one element per column.
        :return: the attributes, with upper-cased keys.
        """
        attributes = {}
        for att in gff_fields[GFF_attribute].strip().split(';'):
            if len(att) == 0:
                continue
            try:
                (key, value) = att.strip().split('=')
            except ValueError:
                continue  # we assume that it is a strange, but useless field for our analysis
            attributes[key.upper()] = value
        return attributes

    def getIDAttribute(attributes):
        """Return the 'ID' attribute, or log an error and exit when it is missing or empty.

        :param attributes: dict as returned by getGffAttributes.
        :return: the element ID.
        """
        ElementID = attributes.get("ID")
        if not ElementID:
            logging.getLogger().error(
                "Each CDS type of the gff files must own a unique ID attribute. Not the case for file: "
                + gff_file_path)
            exit(1)
        return ElementID

    hasFasta = False
    fastaLines = []  # joined once at the end instead of growing a string line by line
    org = Organism(organism)
    geneCounter = 0
    rnaCounter = 0
    contig = None  # stays None until a '##sequence-region' directive or a feature names a contig
    with read_compressed_or_not(gff_file_path) as gff_file:
        for line in gff_file:
            if hasFasta:
                fastaLines.append(line)
                continue
            elif line.startswith('##', 0, 2):
                if line.startswith('FASTA', 2, 7):
                    if not getSeq:  # if getting the sequences is useless...
                        break
                    hasFasta = True
                elif line.startswith('sequence-region', 2, 17):
                    fields = [el.strip() for el in line.split()]
                    contig = org.getOrAddContig(fields[1], fields[1] in circular_contigs)
                continue
            elif line.startswith('#') or not line.strip():
                # comment lines (including the special refseq '#!' lines for versionning
                # softs, assemblies and annotations) and blank lines carry no feature.
                continue

            gff_fields = [el.strip() for el in line.split('\t')]
            attributes = getGffAttributes(gff_fields)
            feature_type = gff_fields[GFF_type]
            seqname = gff_fields[GFF_seqname]

            if feature_type == 'region':
                if seqname in circular_contigs:
                    # flag the contig the region line is about, not whichever was read last
                    if contig is None or contig.name != seqname:
                        contig = org.getOrAddContig(seqname)
                    contig.is_circular = True
            elif feature_type == 'CDS' or "RNA" in feature_type:
                # if there is a 'PROTEIN_ID' attribute, it's where the ncbi stores the
                # actual gene ids, so we use that.
                geneID = attributes.get("PROTEIN_ID")
                if geneID is None:
                    # if its not found, we get the one under the 'ID' field which must
                    # exist (otherwise not a gff3 compliant file)
                    geneID = getIDAttribute(attributes)

                name = attributes.pop('NAME', None)
                if name is None:
                    name = attributes.pop('GENE', "")

                # keys were upper-cased by getGffAttributes, so the check must be too
                pseudogene = "PSEUDO" in attributes or "PSEUDOGENE" in attributes

                product = attributes.pop('PRODUCT', "")
                genetic_code = attributes.pop("TRANSL_TABLE", "11")

                if contig is None or contig.name != seqname:
                    contig = org.getOrAddContig(seqname)  # get the current contig

                if feature_type == "CDS" and (not pseudogene or pseudo):
                    gene = Gene(org.name + "_CDS_" + str(geneCounter).zfill(4))
                    # here contig is filled in order, so position is the number of genes
                    # already stored in the contig.
                    gene.fill_annotations(start=int(gff_fields[GFF_start]),
                                          stop=int(gff_fields[GFF_end]),
                                          strand=gff_fields[GFF_strand],
                                          geneType=feature_type,
                                          position=len(contig.genes),
                                          name=name,
                                          product=product,
                                          genetic_code=genetic_code,
                                          local_identifier=geneID)
                    gene.fill_parents(org, contig)
                    contig.addGene(gene)
                    geneCounter += 1
                elif "RNA" in feature_type:
                    # '_RNA_' prefix: with '_CDS_' the RNA ids would collide with gene ids,
                    # both counters starting at 0.
                    rna = RNA(org.name + "_RNA_" + str(rnaCounter).zfill(4))
                    rna.fill_annotations(start=int(gff_fields[GFF_start]),
                                         stop=int(gff_fields[GFF_end]),
                                         strand=gff_fields[GFF_strand],
                                         geneType=feature_type,
                                         name=name,
                                         product=product,
                                         local_identifier=geneID)
                    rna.fill_parents(org, contig)
                    contig.addRNA(rna)
                    rnaCounter += 1

    ### GET THE FASTA SEQUENCES OF THE GENES
    fastaString = "".join(fastaLines)
    if hasFasta and fastaString != "":
        contigSequences = read_fasta(org, fastaString.split('\n'))
        for contig in org.contigs:
            for gene in contig.genes:
                gene.add_dna(get_dna_sequence(contigSequences[contig.name], gene))
            for rna in contig.RNAs:
                rna.add_dna(get_dna_sequence(contigSequences[contig.name], rna))
    return org, hasFasta