Esempio n. 1
0
 def intersectcirc(self, circ_file, modified_gtf_file, strand=True):
     """Intersect circle start/end intervals with the exon annotation.

     Command-line equivalent (per the original note):
     intersectBed -a start.bed -b <exon_id.dedup.gtf> -wa -wb -loj

     circ_file: input for the ``-a`` side, handed to pybedtools.BedTool;
         per the original note this is the result file of
         print_start_end_file.
     modified_gtf_file: input for the ``-b`` side, handed to
         pybedtools.BedTool; its attribute strings must provide
         ``custom_exon_id``.
     strand: when true, ``s=True`` is added to the intersect options.

     Returns a dict whose keys are HTSeq.GenomicInterval objects built
     from fields 0, 1, 2 and 5 of each output row, and whose values are
     the sets of ``custom_exon_id`` parsed from field 14 of the rows
     sharing that interval.
     """
     circ = pybedtools.BedTool(circ_file)
     gtf = pybedtools.BedTool(modified_gtf_file)

     # Build the options once; insertion order mirrors the keyword order
     # of the explicit calls (s sits between loj and nonamecheck).
     options = {'wa': True, 'wb': True, 'loj': True}
     if strand:
         options['s'] = True
     options['nonamecheck'] = True
     hits = circ.intersect(gtf, **options)

     # circle start/end interval -> set of custom_exon_id
     circExons = {}
     for row in hits:
         fields = str(row).split('\t')
         attributes = fields[14]
         # '.' in the attribute field is presumably the -loj placeholder
         # for "no overlapping feature"; such rows contribute nothing.
         if attributes.strip('\n') == '.':
             continue
         interval = HTSeq.GenomicInterval(fields[0], int(fields[1]),
                                          int(fields[2]), fields[5])
         exon_id = HTSeq.parse_GFF_attribute_string(
             attributes)['custom_exon_id']
         circExons.setdefault(interval, set()).add(exon_id)
     return circExons
Esempio n. 2
0
    def loadFromFile(
        cls,
        org,
        inputgff="/mnt/c/ownCloud/data/miRExplore/obodir/mm10_primary_assembly_and_lncRNA.gtf"
    ):
        """Load the ``gene`` records of a GTF/GFF file into a GeneNeighbourDB.

        org: passed unchanged to the GeneNeighbourDB constructor.
        inputgff: path of the annotation file to read.

        Every record whose third column is ``gene`` and whose attributes
        contain ``gene_id`` is registered under that ID, with everything
        from the first '.' onwards (a version suffix, presumably) removed.
        Blank lines, '#' comment/header lines and lines with fewer than
        three columns are skipped instead of raising IndexError. A
        repeated gene ID is reported on stdout and then stored again,
        replacing its previous ``id2pos`` entry.

        Returns the populated GeneNeighbourDB.
        """
        ret = GeneNeighbourDB(org)

        with open(inputgff, 'r') as fin:

            addedGenes = 0

            for rawLine in fin:

                stripped = rawLine.strip()

                # Header/comment lines (e.g. "##format: gtf") and blank
                # lines are not records; indexing into them would fail.
                if not stripped or stripped.startswith('#'):
                    continue

                fields = stripped.split()

                if len(fields) < 3:
                    continue

                chrom = fields[0]
                featureType = fields[2]

                if featureType != 'gene':
                    continue

                # The line was split on any whitespace, so the attribute
                # column (field 9 onwards) is re-assembled before parsing.
                attrStr = "\t".join(fields[8:])
                allAttr = HTSeq.parse_GFF_attribute_string(attrStr=attrStr)

                geneID = allAttr.get('gene_id', None)

                if geneID is None:
                    continue

                origID = geneID

                if "." in geneID:
                    geneID = geneID[:geneID.index(".")]

                if geneID in ret.id2pos:
                    print("Duplicate gene ID", geneID, origID)

                start = int(fields[3])
                end = int(fields[4])

                strand = fields[6]

                # chr2neighbours is presumably a per-chromosome interval
                # container keyed by sequence name -- verify against
                # GeneNeighbourDB.
                ret.chr2neighbours[chrom].addi(start, end, geneID)
                ret.id2pos[geneID] = (chrom, start, end, strand)

                addedGenes += 1

                # Periodic progress report for large annotation files.
                if addedGenes % 10000 == 0:
                    print("Added Genes", addedGenes)

            print(addedGenes)

        return ret
Esempio n. 3
0
    def searchGeneName(self, annotationstring):
        """Return the gene name(s) found in a collapsed annotation string.

        annotationstring: one or more GFF/GTF attribute strings joined by
            ',' (as collapsed by bedtools groupby), or '.' when there is
            no annotation.

        Returns 'N/A' for '.'. Otherwise returns the distinct names joined
        by ',' in sorted order, so the result is reproducible. For each
        attribute string the first available of gene_name, gene, gene_id
        and transcript_id is used, and 'N/A' if none is present. When the
        attribute string cannot be parsed, self.searchGeneName1 supplies
        the name. 'N/A' is dropped whenever at least one other name was
        found.
        """
        if annotationstring == '.':
            return 'N/A'

        # Lookup preference: gene_name (Ensembl GTF), gene (GFF), then IDs.
        preferred_keys = ('gene_name', 'gene', 'gene_id', 'transcript_id')

        collect = set()
        for annotation in annotationstring.split(','):
            try:
                attr = HTSeq.parse_GFF_attribute_string(annotation)
            except Exception:
                # Parsing failed: use the fallback lookup. Catching
                # Exception rather than everything lets KeyboardInterrupt
                # and SystemExit propagate.
                gene = self.searchGeneName1(annotation)
            else:
                gene = next(
                    (attr[key] for key in preferred_keys if key in attr),
                    'N/A')
            collect.add(gene)

        # Collapse all genes together; 'N/A' is only kept on its own.
        if len(collect) > 1:
            collect.discard('N/A')

        return ','.join(sorted(collect))
Esempio n. 4
0
 def searchGeneName(self, annotationstring):
     """Return the gene name(s) found in a collapsed annotation string.

     annotationstring: one or more GFF/GTF attribute strings joined by
         ',' (as collapsed by bedtools groupby), or '.' when there is
         no annotation.

     Returns 'N/A' for '.'. Otherwise returns the distinct names joined
     by ',' in sorted order, so the result is reproducible. For each
     attribute string the first available of gene_name, gene, gene_id
     and transcript_id is used, and 'N/A' if none is present. When the
     attribute string cannot be parsed, self.searchGeneName1 supplies
     the name. 'N/A' is dropped whenever at least one other name was
     found.
     """
     if annotationstring == '.':
         return 'N/A'

     # Lookup preference: gene_name (Ensembl GTF), gene (GFF), then IDs.
     preferred_keys = ('gene_name', 'gene', 'gene_id', 'transcript_id')

     collect = set()
     for annotation in annotationstring.split(','):
         try:
             attr = HTSeq.parse_GFF_attribute_string(annotation)
         except Exception:
             # Parsing failed: use the fallback lookup. Catching
             # Exception rather than everything lets KeyboardInterrupt
             # and SystemExit propagate.
             gene = self.searchGeneName1(annotation)
         else:
             gene = next(
                 (attr[key] for key in preferred_keys if key in attr),
                 'N/A')
         collect.add(gene)

     # Collapse all genes together; 'N/A' is only kept on its own.
     if len(collect) > 1:
         collect.discard('N/A')

     return ','.join(sorted(collect))
Esempio n. 5
0
	def intersectcirc(self, circ_file, modified_gtf_file):
		"""Intersect circle start/end intervals with the exon annotation.

		Command-line equivalent (per the original note):
		intersectBed -a start.bed -b <exon_id.dedup.gtf> -wa -wb -loj

		circ_file: input for the ``-a`` side, handed to pybedtools.BedTool;
			per the original note this is the result file of
			print_start_end_file.
		modified_gtf_file: input for the ``-b`` side, handed to
			pybedtools.BedTool; its attribute strings must provide
			``custom_exon_id``.

		Returns a dict whose keys are HTSeq.GenomicInterval objects built
		from fields 0, 1, 2 and 9 of each output row, and whose values are
		the sets of ``custom_exon_id`` parsed from field 11 of the rows
		sharing that interval.
		"""
		import pybedtools

		circles = pybedtools.BedTool(circ_file)
		annotation = pybedtools.BedTool(modified_gtf_file)
		overlaps = circles.intersect(annotation, wa=True, wb=True, loj=True)

		# circle start/end interval -> set of custom_exon_id
		circExons = {}
		for record in overlaps:
			columns = str(record).split('\t')
			attributes = columns[11]
			# '.' in the attribute field is presumably the -loj placeholder
			# for "no overlapping feature"; such rows contribute nothing.
			if attributes.strip('\n') == '.':
				continue
			interval = HTSeq.GenomicInterval(
				columns[0], int(columns[1]), int(columns[2]), columns[9])
			exon_id = HTSeq.parse_GFF_attribute_string(attributes)['custom_exon_id']
			circExons.setdefault(interval, set()).add(exon_id)
		return circExons