Example #1
0
	def read_reference(self,file_ref):
		"""Parse a GTF reference file and index its features per chromosome.

		The file is read line by line. Lines starting with '#' and blank
		lines are skipped. Each remaining line is split on tabs; the
		chromosome name (column 1) is lower-cased and every "chr" substring
		is removed from it. Lines are grouped by the gene_id found at the
		beginning of the attribute column (column 9); lines without such a
		leading gene_id are grouped under the empty string.

		Side effects:
		  * self.features_found is flagged (value 1) for every feature
		    seen, including the ones derived from "gene", "start_codon",
		    "stop_codon" and "exon" lines.
		  * self.feature_priorities receives a None entry for every
		    feature of self.features_found it does not list yet.
		  * A warning is written on stderr for every feature of
		    self.features_found whose flag is still 0 after parsing.
		  * Progress is reported through self.log_already_completed()
		    and on stdout.
		  * When Join_with_gtf.debug is true, reading stops at the first
		    data line whose normalized chromosome is not '1'.

		Parameters:
		  file_ref -- path of the GTF file to read.

		Returns:
		  A tuple (features, gene_boundaries). Both are dictionaries keyed
		  by normalized chromosome name whose values are IntervalTree
		  objects. Every inserted Interval carries a value of the form
		  [feature_name, gene_id]; gene_boundaries only holds "gene"
		  intervals.

		Exits (sys.exit) when a "gene" line has a strand other than
		'+' or '-'.
		"""
		print("Read reference ...")

		#File size is only used to report reading progress
		file_size=os.stat(file_ref).st_size

		#Compiled once instead of being looked up for every line
		gene_id_pattern=re.compile('^gene_id "([^"]+)".*$')
		strand_codes={"-":-1,"+":1}

		genes={}
		no_line=0
		current_position=0

		#Parse GTF file. The 'with' block closes the file even if parsing
		#aborts (sys.exit on a bad strand, int() failure, ...)
		with open(file_ref) as in_reference:
			for gtf_line in in_reference:
				no_line+=1
				current_position+=len(gtf_line)

				if no_line % 100000 == 0 :
					self.log_already_completed(
						"{0} lines read from reference".format(no_line),
						file_size,current_position)

				#Skip comment lines, and blank lines which would otherwise
				#fail with an IndexError when columns are accessed
				if gtf_line.startswith("#") or not gtf_line.strip() :
					continue

				elmts=gtf_line.rstrip('\r\n').split('\t')
				gene_chr=elmts[0].lower().replace("chr","")
				start=int(elmts[3])
				end=int(elmts[4])

				if Join_with_gtf.debug and gene_chr != '1' :
					break

				feature=elmts[2]

				me=gene_id_pattern.match(elmts[8])
				#Features not related to a gene_id share the "" identifier
				gene_id=me.group(1) if me else ""

				if feature == "gene":
					raw_strand=elmts[6]
					if raw_strand not in strand_codes :
						sys.exit("Unexpected strand value on line #{0} of file '{1}' : '{2}'. Exiting".format(no_line,file_ref,raw_strand))
					strand=strand_codes[raw_strand]

					gene=genes.get(gene_id)
					if gene is None :
						gene=Gene(gene_id,gene_chr,start,end,strand)
						genes[gene_id]=gene
					else :
						#Gene already created by one of its sub-features
						gene.set_location(gene_chr,start,end)
						gene.set_strand(strand)

					#Gene start and end are defined on this line, therefore
					#promoter, tss and tts can be computed
					for derived in ("promoter","tss","tts","gene") :
						self.features_found[derived]=1
					gene.gene_model_has_been_defined()

				else :
					gene=genes.get(gene_id)
					if gene is None :
						gene=Gene(gene_id,gene_chr)
						genes[gene_id]=gene

					if feature == "start_codon" :
						self.features_found["utr5"]=1
					elif feature == "stop_codon" :
						self.features_found["utr3"]=1
					elif feature == "exon" :
						self.features_found["exon"]=1
						self.features_found["intron"]=1
					else :
						self.features_found[feature.lower()]=1
					gene.add_feature(feature,start,end)

		print("\n\t{0} lines read from reference in total.".format(no_line))

		#Check that all features listed in configuration file has been found at least once
		for feature in self.features_found :
			if self.features_found[feature.lower()] == 0 :
				sys.stderr.write(("Warning : feature named '{0}' found in 'feature_priorities' parameter. "+
						  "This feature has never been found in reference file '{1}'.\n").format(
							feature, file_ref
				))

		#Complete feature_priorities with the features found in the gtf file
		#but not requested by user. Otherwise, ordering overlapping features
		#with self.feature_priorities[ ovlp.value[0] ] would raise an exception.
		for feature in self.features_found :
			if feature.lower() not in self.feature_priorities :
				self.feature_priorities[feature.lower()]=None

		#Define downstream/upstream boundaries
		promoter_downstream=self.theme.get_parameter("promoter_downstream")
		promoter_upstream=self.theme.get_parameter("promoter_upstream")
		tss_downstream=self.theme.get_parameter("tss_downstream")
		tss_upstream=self.theme.get_parameter("tss_upstream")
		tts_downstream=self.theme.get_parameter("tts_downstream")
		tts_upstream=self.theme.get_parameter("tts_upstream")

		#Initialize dictionaries
		features={}
		gene_boundaries={}

		#Build gene model
		print("Build gene model ...")
		no_gene=0
		total_genes=len(genes)
		for gene_id, gene in genes.items() :

			(gene_chr,gene_start,gene_end)=gene.get_coordinates()

			no_gene+=1

			if no_gene % 1000 == 0 :
				self.log_already_completed("{0} genes treated".format(no_gene),total_genes,no_gene)

			#Both dictionaries are always filled together, so a single
			#membership test is enough
			if gene_chr not in features :
				features[gene_chr]=IntervalTree()
				gene_boundaries[gene_chr]=IntervalTree()

			chr_features=features[gene_chr]

			if gene.gene_model_is_defined() :
				gene_boundaries[gene_chr].insert_interval( Interval(gene_start,gene_end, value=["gene",gene_id] ) )

				#For genes whose strand is not 1, the upstream and downstream
				#sizes are passed in swapped order to the Gene helpers
				#(presumably because they expect genomic orientation -- to
				#be confirmed in Gene)
				forward=(gene.strand == 1)

				#Promoter
				if forward :
					(start,end)=gene.get_promoter(promoter_upstream,promoter_downstream)
				else :
					(start,end)=gene.get_promoter(promoter_downstream,promoter_upstream)
				chr_features.insert_interval( Interval(start,end, value=["promoter",gene_id] ) )

				#5' UTR (only when the gene provides one)
				(start,end)=gene.get_utr5()
				if start is not None:
					chr_features.insert_interval( Interval(start,end, value=["utr5",gene_id] ) )

				#TSS
				if forward :
					(start,end)=gene.get_tss(tss_upstream,tss_downstream)
				else :
					(start,end)=gene.get_tss(tss_downstream,tss_upstream)
				chr_features.insert_interval( Interval(start,end, value=["tss",gene_id] ) )

				#Intron / Exon
				(intron_coords,exon_coords)=gene.get_introns_exons()

				for (start,end) in exon_coords :
					chr_features.insert_interval( Interval(start,end, value=["exon",gene_id] ) )

				for (start,end) in intron_coords :
					chr_features.insert_interval( Interval(start,end, value=["intron",gene_id] ) )

				#TTS
				if forward :
					(start,end)=gene.get_tts(tts_upstream,tts_downstream)
				else :
					(start,end)=gene.get_tts(tts_downstream,tts_upstream)
				chr_features.insert_interval( Interval(start,end, value=["tts",gene_id] ) )

				#3' UTR (only when the gene provides one)
				(start,end)=gene.get_utr3()
				if start is not None:
					chr_features.insert_interval( Interval(start,end, value=["utr3",gene_id] ) )

			#Other features are indexed even when the gene model is not defined
			for (start,end,other_feature) in gene.get_other_features() :
				chr_features.insert_interval( Interval(start,end, value=[other_feature,gene_id] ) )

		print("\n\t{0} genes treated in total.".format(no_gene))
		return (features,gene_boundaries)
Example #2
0
    def read_reference(self, file_ref):
        """Parse a GTF reference file and index its features per chromosome.

        The file is read line by line. Lines starting with '#' and blank
        lines are skipped. Each remaining line is split on tabs; the
        chromosome name (column 1) is lower-cased and every "chr" substring
        is removed from it. Lines are grouped by the gene_id found at the
        beginning of the attribute column (column 9); lines without such a
        leading gene_id are grouped under the empty string.

        Side effects:
          * self.features_found is flagged (value 1) for every feature
            seen, including the ones derived from "gene", "start_codon",
            "stop_codon" and "exon" lines.
          * self.feature_priorities receives a None entry for every
            feature of self.features_found it does not list yet.
          * A warning is written on stderr for every feature of
            self.features_found whose flag is still 0 after parsing.
          * Progress is reported through self.log_already_completed()
            and on stdout.
          * When Join_with_gtf.debug is true, reading stops at the first
            data line whose normalized chromosome is not '1'.

        Parameters:
          file_ref -- path of the GTF file to read.

        Returns:
          A tuple (features, gene_boundaries). Both are dictionaries keyed
          by normalized chromosome name whose values are IntervalTree
          objects. Every inserted Interval carries a value of the form
          [feature_name, gene_id]; gene_boundaries only holds "gene"
          intervals.

        Exits (sys.exit) when a "gene" line has a strand other than
        '+' or '-'.
        """
        print("Read reference ...")

        # File size is only used to report reading progress.
        file_size = os.stat(file_ref).st_size

        # Compiled once instead of being looked up for every line.
        gene_id_pattern = re.compile('^gene_id "([^"]+)".*$')
        strand_codes = {"-": -1, "+": 1}

        genes = {}
        no_line = 0
        current_position = 0

        # Parse GTF file. The 'with' block closes the file even if parsing
        # aborts (sys.exit on a bad strand, int() failure, ...).
        with open(file_ref) as in_reference:
            for gtf_line in in_reference:
                no_line += 1
                current_position += len(gtf_line)

                if no_line % 100000 == 0:
                    self.log_already_completed(
                        "{0} lines read from reference".format(no_line),
                        file_size, current_position)

                # Skip comment lines, and blank lines which would otherwise
                # fail with an IndexError when columns are accessed.
                if gtf_line.startswith("#") or not gtf_line.strip():
                    continue

                elmts = gtf_line.rstrip('\r\n').split('\t')
                gene_chr = elmts[0].lower().replace("chr", "")
                start = int(elmts[3])
                end = int(elmts[4])

                if Join_with_gtf.debug and gene_chr != '1':
                    break

                feature = elmts[2]

                me = gene_id_pattern.match(elmts[8])
                # Features not related to a gene_id share the "" identifier.
                gene_id = me.group(1) if me else ""

                if feature == "gene":
                    raw_strand = elmts[6]
                    if raw_strand not in strand_codes:
                        sys.exit(
                            "Unexpected strand value on line #{0} of file '{1}' : '{2}'. Exiting"
                            .format(no_line, file_ref, raw_strand))
                    strand = strand_codes[raw_strand]

                    gene = genes.get(gene_id)
                    if gene is None:
                        gene = Gene(gene_id, gene_chr, start, end, strand)
                        genes[gene_id] = gene
                    else:
                        # Gene already created by one of its sub-features.
                        gene.set_location(gene_chr, start, end)
                        gene.set_strand(strand)

                    # Gene start and end are defined on this line, therefore
                    # promoter, tss and tts can be computed.
                    for derived in ("promoter", "tss", "tts", "gene"):
                        self.features_found[derived] = 1
                    gene.gene_model_has_been_defined()

                else:
                    gene = genes.get(gene_id)
                    if gene is None:
                        gene = Gene(gene_id, gene_chr)
                        genes[gene_id] = gene

                    if feature == "start_codon":
                        self.features_found["utr5"] = 1
                    elif feature == "stop_codon":
                        self.features_found["utr3"] = 1
                    elif feature == "exon":
                        self.features_found["exon"] = 1
                        self.features_found["intron"] = 1
                    else:
                        self.features_found[feature.lower()] = 1
                    gene.add_feature(feature, start, end)

        print("\n\t{0} lines read from reference in total.".format(no_line))

        # Check that all features listed in configuration file has been
        # found at least once.
        for feature in self.features_found:
            if self.features_found[feature.lower()] == 0:
                sys.stderr.write((
                    "Warning : feature named '{0}' found in 'feature_priorities' parameter. "
                    +
                    "This feature has never been found in reference file '{1}'.\n"
                ).format(feature, file_ref))

        # Complete feature_priorities with the features found in the gtf
        # file but not requested by user. Otherwise, ordering overlapping
        # features with self.feature_priorities[ ovlp.value[0] ] would raise
        # an exception.
        for feature in self.features_found:
            if feature.lower() not in self.feature_priorities:
                self.feature_priorities[feature.lower()] = None

        # Define downstream/upstream boundaries.
        promoter_downstream = self.theme.get_parameter("promoter_downstream")
        promoter_upstream = self.theme.get_parameter("promoter_upstream")
        tss_downstream = self.theme.get_parameter("tss_downstream")
        tss_upstream = self.theme.get_parameter("tss_upstream")
        tts_downstream = self.theme.get_parameter("tts_downstream")
        tts_upstream = self.theme.get_parameter("tts_upstream")

        # Initialize dictionaries.
        features = {}
        gene_boundaries = {}

        # Build gene model.
        print("Build gene model ...")
        no_gene = 0
        total_genes = len(genes)
        for gene_id, gene in genes.items():

            (gene_chr, gene_start, gene_end) = gene.get_coordinates()

            no_gene += 1

            if no_gene % 1000 == 0:
                self.log_already_completed("{0} genes treated".format(no_gene),
                                           total_genes, no_gene)

            # Both dictionaries are always filled together, so a single
            # membership test is enough.
            if gene_chr not in features:
                features[gene_chr] = IntervalTree()
                gene_boundaries[gene_chr] = IntervalTree()

            chr_features = features[gene_chr]

            if gene.gene_model_is_defined():
                gene_boundaries[gene_chr].insert_interval(
                    Interval(gene_start, gene_end, value=["gene", gene_id]))

                # For genes whose strand is not 1, the upstream and
                # downstream sizes are passed in swapped order to the Gene
                # helpers (presumably because they expect genomic
                # orientation -- to be confirmed in Gene).
                forward = (gene.strand == 1)

                # Promoter
                if forward:
                    (start, end) = gene.get_promoter(promoter_upstream,
                                                     promoter_downstream)
                else:
                    (start, end) = gene.get_promoter(promoter_downstream,
                                                     promoter_upstream)
                chr_features.insert_interval(
                    Interval(start, end, value=["promoter", gene_id]))

                # 5' UTR (only when the gene provides one)
                (start, end) = gene.get_utr5()
                if start is not None:
                    chr_features.insert_interval(
                        Interval(start, end, value=["utr5", gene_id]))

                # TSS
                if forward:
                    (start, end) = gene.get_tss(tss_upstream, tss_downstream)
                else:
                    (start, end) = gene.get_tss(tss_downstream, tss_upstream)
                chr_features.insert_interval(
                    Interval(start, end, value=["tss", gene_id]))

                # Intron / Exon
                (intron_coords, exon_coords) = gene.get_introns_exons()

                for (start, end) in exon_coords:
                    chr_features.insert_interval(
                        Interval(start, end, value=["exon", gene_id]))

                for (start, end) in intron_coords:
                    chr_features.insert_interval(
                        Interval(start, end, value=["intron", gene_id]))

                # TTS
                if forward:
                    (start, end) = gene.get_tts(tts_upstream, tts_downstream)
                else:
                    (start, end) = gene.get_tts(tts_downstream, tts_upstream)
                chr_features.insert_interval(
                    Interval(start, end, value=["tts", gene_id]))

                # 3' UTR (only when the gene provides one)
                (start, end) = gene.get_utr3()
                if start is not None:
                    chr_features.insert_interval(
                        Interval(start, end, value=["utr3", gene_id]))

            # Other features are indexed even when the gene model is not
            # defined.
            for (start, end, other_feature) in gene.get_other_features():
                chr_features.insert_interval(
                    Interval(start, end, value=[other_feature, gene_id]))

        print("\n\t{0} genes treated in total.".format(no_gene))
        return (features, gene_boundaries)