def intersectcirc(self, circ_file, modified_gtf_file, strand=True):
    """Map circle start/end intervals to the ids of the exons they hit.

    Runs the pybedtools equivalent of
    ``intersectBed -a circ_file -b modified_gtf_file -wa -wb -loj``
    (with ``s=True`` added when ``strand`` is true) and gathers, for every
    circle interval, the ``custom_exon_id`` attribute of each reported exon.

    Args:
        circ_file: Path of the circle start/end file (per the original
            note, the output of ``print_start_end_file``).
        modified_gtf_file: Path of the GTF whose attribute column carries
            a ``custom_exon_id`` entry.
        strand: When true, ``s=True`` is passed to the intersect call.

    Returns:
        dict mapping ``HTSeq.GenomicInterval`` (columns 0, 1, 2 and 5 of
        the joined record) to the set of ``custom_exon_id`` values found
        for it. Records whose attribute column is ``'.'`` contribute
        nothing, so such intervals may be absent from the dict.
    """
    # Keep the keyword order of the original calls: wa, wb, loj, [s], nonamecheck.
    options = {'wa': True, 'wb': True, 'loj': True}
    if strand:
        options['s'] = True
    options['nonamecheck'] = True

    circ = pybedtools.BedTool(circ_file)
    gtf = pybedtools.BedTool(modified_gtf_file)
    hits = circ.intersect(gtf, **options)

    # Circle start or end interval as key, set of custom_exon_id as value.
    circExons = {}
    for record in hits:
        fields = str(record).split('\t')
        attributes = fields[14]  # GTF attribute column of the joined record

        # '.' in the attribute column: nothing to collect for this record
        # (presumably the -loj placeholder for "no overlap" -- see bedtools docs).
        if attributes.strip('\n') == '.':
            continue

        interval = HTSeq.GenomicInterval(
            fields[0], int(fields[1]), int(fields[2]), fields[5])
        exon_id = HTSeq.parse_GFF_attribute_string(attributes)['custom_exon_id']
        circExons.setdefault(interval, set()).add(exon_id)

    return circExons
def loadFromFile( cls, org, inputgff="/mnt/c/ownCloud/data/miRExplore/obodir/mm10_primary_assembly_and_lncRNA.gtf" ):
    """Build a ``GeneNeighbourDB`` from the ``gene`` records of a GTF file.

    Every line whose third column is ``gene`` and whose attributes contain
    a ``gene_id`` is registered twice: its ``(start, end, geneID)`` span is
    added to ``ret.chr2neighbours[chromosome]`` via ``addi`` and its
    position tuple is stored in ``ret.id2pos``. A version suffix (everything
    from the first ``.``) is removed from the gene id first.

    Blank lines, lines starting with ``#`` and lines with fewer than three
    whitespace-separated tokens are skipped instead of raising IndexError.

    Args:
        org: Passed unchanged to the ``GeneNeighbourDB`` constructor.
        inputgff: Path of the annotation file to read.

    Returns:
        The populated ``GeneNeighbourDB`` instance.

    Raises:
        OSError: If ``inputgff`` cannot be opened.
        ValueError: If a start/end column of a gene line is not an integer
            (HTSeq may also raise it for a malformed attribute string).
    """
    ret = GeneNeighbourDB(org)
    addedGenes = 0

    with open(inputgff, 'r') as fin:
        for line in fin:
            fields = line.strip().split()

            # Header/comment lines (e.g. "#!genome-build X") and blank lines
            # have no feature column; indexing them would raise IndexError.
            if len(fields) < 3 or fields[0].startswith('#'):
                continue

            chrom = fields[0]
            feature = fields[2]
            if feature != 'gene':
                continue

            # The whitespace split above also broke up the attribute column;
            # glue its pieces back together before handing it to HTSeq.
            attrStr = "\t".join(fields[8:])
            allAttr = HTSeq.parse_GFF_attribute_string(attrStr=attrStr)

            geneID = allAttr.get('gene_id', None)
            if geneID is None:
                continue

            origID = geneID
            # Drop everything from the first "." on (version suffix).
            geneID = geneID.split(".", 1)[0]

            # Duplicates are reported but still (re-)registered below,
            # so the last occurrence wins in id2pos.
            if geneID in ret.id2pos:
                print("Duplicate gene ID", geneID, origID)

            start = int(fields[3])
            end = int(fields[4])
            strand = fields[6]

            ret.chr2neighbours[chrom].addi(start, end, geneID)
            ret.id2pos[geneID] = (chrom, start, end, strand)
            addedGenes += 1

            # Progress report every 10000 registered genes.
            if addedGenes % 10000 == 0:
                print("Added Genes", addedGenes)

    print(addedGenes)
    return ret
def searchGeneName(self, annotationstring):
    """Return the gene name(s) found in a collapsed GFF/GTF attribute string.

    ``annotationstring`` may hold several attribute strings joined by ``,``
    (as produced by ``bedtools groupby``). Each piece is parsed with
    ``HTSeq.parse_GFF_attribute_string`` and the first available of
    ``gene_name``, ``gene``, ``gene_id``, ``transcript_id`` is taken; if
    none is present the piece yields ``'N/A'``. When parsing a piece fails,
    ``self.searchGeneName1`` is used for that piece instead.

    Args:
        annotationstring: Attribute string(s), or ``'.'`` for "no annotation".

    Returns:
        ``'N/A'`` for ``'.'``; otherwise the distinct names joined by ``,``
        in sorted order (so the result is reproducible between runs).
        ``'N/A'`` is dropped whenever at least one other name was found.
    """
    if annotationstring == '.':
        return 'N/A'

    # Lookup priority: Ensembl GTF name, GFF "gene", then the id fallbacks.
    name_keys = ('gene_name', 'gene', 'gene_id', 'transcript_id')

    collect = set()
    for annotation in annotationstring.split(','):
        try:
            attr = HTSeq.parse_GFF_attribute_string(annotation)
        except Exception:
            # Best effort: anything HTSeq cannot parse goes to the fallback
            # parser. Exception (not a bare except) so that KeyboardInterrupt
            # and SystemExit still propagate.
            gene = self.searchGeneName1(annotation)
        else:
            for key in name_keys:
                if key in attr:
                    gene = attr[key]
                    break
            else:
                gene = 'N/A'
        collect.add(gene)

    # Collapse all genes together; 'N/A' only survives when it is alone.
    if len(collect) > 1:
        collect.discard('N/A')

    return ','.join(sorted(collect))
def searchGeneName(self,annotationstring):
    """Return the gene name(s) found in a collapsed GFF/GTF attribute string.

    ``annotationstring`` may hold several attribute strings joined by ``,``
    (as produced by ``bedtools groupby``). Each piece is parsed with
    ``HTSeq.parse_GFF_attribute_string`` and the first available of
    ``gene_name``, ``gene``, ``gene_id``, ``transcript_id`` is taken; if
    none is present the piece yields ``'N/A'``. When parsing a piece fails,
    ``self.searchGeneName1`` is used for that piece instead.

    Args:
        annotationstring: Attribute string(s), or ``'.'`` for "no annotation".

    Returns:
        ``'N/A'`` for ``'.'``; otherwise the distinct names joined by ``,``
        in sorted order (so the result is reproducible between runs).
        ``'N/A'`` is dropped whenever at least one other name was found.
    """
    if annotationstring == '.':
        return 'N/A'

    # Lookup priority: Ensembl GTF name, GFF "gene", then the id fallbacks.
    name_keys = ('gene_name', 'gene', 'gene_id', 'transcript_id')

    collect = set()
    for annotation in annotationstring.split(','):
        try:
            attr = HTSeq.parse_GFF_attribute_string(annotation)
        except Exception:
            # Best effort: anything HTSeq cannot parse goes to the fallback
            # parser. Exception (not a bare except) so that KeyboardInterrupt
            # and SystemExit still propagate.
            gene = self.searchGeneName1(annotation)
        else:
            for key in name_keys:
                if key in attr:
                    gene = attr[key]
                    break
            else:
                gene = 'N/A'
        collect.add(gene)

    # Collapse all genes together; 'N/A' only survives when it is alone.
    if len(collect) > 1:
        collect.discard('N/A')

    return ','.join(sorted(collect))
def intersectcirc(self, circ_file, modified_gtf_file):
    """Map circle start/end intervals to the ids of the exons they hit.

    Runs the pybedtools equivalent of
    ``intersectBed -a circ_file -b modified_gtf_file -wa -wb -loj`` and
    gathers, for every circle interval, the ``custom_exon_id`` attribute of
    each reported exon.

    Args:
        circ_file: Path of the circle start/end file (per the original
            note, the output of ``print_start_end_file``).
        modified_gtf_file: Path of the GTF whose attribute column carries
            a ``custom_exon_id`` entry.

    Returns:
        dict mapping ``HTSeq.GenomicInterval`` (columns 0, 1, 2 and 9 of
        the joined record) to the set of ``custom_exon_id`` values found
        for it. Records whose attribute column is ``'.'`` contribute
        nothing, so such intervals may be absent from the dict.
    """
    import pybedtools

    circ = pybedtools.BedTool(circ_file)
    gtf = pybedtools.BedTool(modified_gtf_file)
    hits = circ.intersect(gtf, wa=True, wb=True, loj=True)

    # Circle start or end interval as key, set of custom_exon_id as value.
    circExons = {}
    for record in hits:
        fields = str(record).split('\t')
        attributes = fields[11]  # GTF attribute column of the joined record

        # '.' in the attribute column: nothing to collect for this record
        # (presumably the -loj placeholder for "no overlap" -- see bedtools docs).
        if attributes.strip('\n') == '.':
            continue

        interval = HTSeq.GenomicInterval(
            fields[0], int(fields[1]), int(fields[2]), fields[9])
        exon_id = HTSeq.parse_GFF_attribute_string(attributes)['custom_exon_id']
        circExons.setdefault(interval, set()).add(exon_id)

    return circExons