def read_reference(self, file_ref):
    """Parse a GTF reference file and build per-chromosome interval trees.

    The file is read record by record and every record is attached to a
    ``Gene`` object keyed by its ``gene_id`` (records whose attribute
    column does not start with ``gene_id "..."`` are grouped under the
    empty id).  The collected genes are then turned into intervals
    (promoter, TSS, TTS, UTRs, exons, introns and any other feature) that
    are stored in one ``IntervalTree`` per chromosome.

    Side effects: flags every feature type seen in ``self.features_found``,
    adds a ``None`` entry to ``self.feature_priorities`` for features that
    have no priority yet, writes a warning on stderr for each feature still
    flagged 0, and reports progress through ``self.log_already_completed``.

    :param file_ref: path to the GTF reference file.
    :return: ``(features, gene_boundaries)``, two dictionaries keyed by
        chromosome name (lower-cased, every "chr" substring removed).
        Intervals in ``features`` carry ``[feature_name, gene_id]`` and
        intervals in ``gene_boundaries`` carry ``["gene", gene_id]``.
    :raises SystemExit: when a record has fewer than 9 columns, or when a
        "gene" record has a strand other than "+" or "-".
    """
    # Single-argument print(...) gives the same output under the Python 2
    # print statement and the Python 3 print function.
    print("Read reference ...")
    # The file size is only used to report reading progress.
    file_size = os.stat(file_ref).st_size
    # Compiled once instead of once per record; the pattern is unchanged.
    gene_id_pattern = re.compile('^gene_id "([^"]+)".*$')

    genes = {}
    no_line = 0
    current_position = 0
    # Parse GTF file.  The context manager closes the file even when the
    # loop is left through sys.exit(), the debug break or an exception.
    with open(file_ref) as in_reference:
        for gtf_line in in_reference:
            no_line += 1
            current_position += len(gtf_line)
            if no_line % 100000 == 0:
                self.log_already_completed(
                    "{0} lines read from reference".format(no_line),
                    file_size, current_position)

            gtf_line = gtf_line.rstrip('\r\n')
            # Skip comments, and blank lines which used to end in an
            # IndexError when the columns were accessed.
            if gtf_line.startswith("#") or not gtf_line.strip():
                continue

            elmts = gtf_line.split('\t')
            if len(elmts) < 9:
                sys.exit(
                    ("Unexpected number of columns on line #{0} of file "
                     "'{1}' : {2} found, 9 expected. Exiting").format(
                         no_line, file_ref, len(elmts)))

            gene_chr = elmts[0].lower().replace("chr", "")
            start = int(elmts[3])
            end = int(elmts[4])
            # In debug mode, reading stops at the first record that is not
            # on chromosome 1.
            if Join_with_gtf.debug and gene_chr != '1':
                break

            feature = elmts[2]
            match = gene_id_pattern.match(elmts[8])
            # Feature not related to a gene_id: fall back to the empty id.
            gene_id = match.group(1) if match else ""

            if feature == "gene":
                strand = elmts[6]
                if strand == "-":
                    strand = -1
                elif strand == '+':
                    strand = 1
                else:
                    sys.exit(
                        ("Unexpected strand value on line #{0} of file "
                         "'{1}' : '{2}'. "
                         "Exiting").format(no_line, file_ref, strand))

                if gene_id not in genes:
                    gene = Gene(gene_id, gene_chr, start, end, strand)
                    genes[gene_id] = gene
                else:
                    # The gene was first met through one of its features:
                    # complete it with its location and strand.
                    gene = genes[gene_id]
                    gene.set_location(gene_chr, start, end)
                    gene.set_strand(strand)

                # Gene start and end are defined in this line, therefore
                # promoter, tss and tts can be computed.
                self.features_found["promoter"] = 1
                self.features_found["tss"] = 1
                self.features_found["tts"] = 1
                self.features_found["gene"] = 1
                gene.gene_model_has_been_defined()
            else:
                if gene_id not in genes:
                    gene = Gene(gene_id, gene_chr)
                    genes[gene_id] = gene
                else:
                    gene = genes[gene_id]

                # Codons are flagged as the UTR they allow to compute and
                # exons also make introns available.
                if feature == "start_codon":
                    self.features_found["utr5"] = 1
                elif feature == "stop_codon":
                    self.features_found["utr3"] = 1
                elif feature == "exon":
                    self.features_found["exon"] = 1
                    self.features_found["intron"] = 1
                else:
                    self.features_found[feature.lower()] = 1
                gene.add_feature(feature, start, end)

    print("\n\t{0} lines read from reference in total.".format(no_line))

    # Check that all features listed in configuration file have been found
    # at least once.
    for feature in self.features_found:
        if self.features_found[feature.lower()] == 0:
            sys.stderr.write((
                "Warning : feature named '{0}' found in 'feature_priorities' parameter. "
                "This feature has never been found in reference file '{1}'.\n"
            ).format(feature, file_ref))

    # Complete feature_priorities with the features found in the GTF file
    # but not requested by the user.  Otherwise ordering the features that
    # overlap a given region, e.g.
    #   sorted(overlaps, key=lambda ovlp: self.feature_priorities[ ovlp.value[0] ])
    # would raise an exception.
    for feature in self.features_found:
        if feature.lower() not in self.feature_priorities:
            self.feature_priorities[feature.lower()] = None

    # Define downstream/upstream boundaries
    promoter_downstream = self.theme.get_parameter("promoter_downstream")
    promoter_upstream = self.theme.get_parameter("promoter_upstream")
    tss_downstream = self.theme.get_parameter("tss_downstream")
    tss_upstream = self.theme.get_parameter("tss_upstream")
    tts_downstream = self.theme.get_parameter("tts_downstream")
    tts_upstream = self.theme.get_parameter("tts_upstream")

    # Initialize dictionaries
    features = {}
    gene_boundaries = {}

    # Build gene model
    print("Build gene model ...")
    no_gene = 0
    for gene_id, gene in genes.items():
        (gene_chr, gene_start, gene_end) = gene.get_coordinates()
        no_gene += 1
        if no_gene % 1000 == 0:
            self.log_already_completed(
                "{0} genes treated".format(no_gene), len(genes), no_gene)

        # Both dictionaries always get their tree at the same time, so one
        # membership test is enough for the two of them.
        if gene_chr not in features:
            features[gene_chr] = IntervalTree()
            gene_boundaries[gene_chr] = IntervalTree()

        if gene.gene_model_is_defined():
            gene_boundaries[gene_chr].insert_interval(
                Interval(gene_start, gene_end, value=["gene", gene_id]))

            # Promoter (the two distances are swapped when the strand is
            # not 1)
            if gene.strand == 1:
                (start, end) = gene.get_promoter(promoter_upstream,
                                                 promoter_downstream)
            else:
                (start, end) = gene.get_promoter(promoter_downstream,
                                                 promoter_upstream)
            features[gene_chr].insert_interval(
                Interval(start, end, value=["promoter", gene_id]))

            # 5' UTR
            (start, end) = gene.get_utr5()
            if start is not None:
                features[gene_chr].insert_interval(
                    Interval(start, end, value=["utr5", gene_id]))

            # TSS
            if gene.strand == 1:
                (start, end) = gene.get_tss(tss_upstream, tss_downstream)
            else:
                (start, end) = gene.get_tss(tss_downstream, tss_upstream)
            features[gene_chr].insert_interval(
                Interval(start, end, value=["tss", gene_id]))

            # Intron / Exon
            (intron_coords, exon_coords) = gene.get_introns_exons()
            for (start, end) in exon_coords:
                features[gene_chr].insert_interval(
                    Interval(start, end, value=["exon", gene_id]))
            for (start, end) in intron_coords:
                features[gene_chr].insert_interval(
                    Interval(start, end, value=["intron", gene_id]))

            # TTS
            if gene.strand == 1:
                (start, end) = gene.get_tts(tts_upstream, tts_downstream)
            else:
                (start, end) = gene.get_tts(tts_downstream, tts_upstream)
            features[gene_chr].insert_interval(
                Interval(start, end, value=["tts", gene_id]))

            # 3' UTR
            (start, end) = gene.get_utr3()
            if start is not None:
                features[gene_chr].insert_interval(
                    Interval(start, end, value=["utr3", gene_id]))

            # Other features
            # NOTE(review): assumed to belong to the gene-model branch like
            # the sections above - confirm against the original layout.
            for (start, end, feature_name) in gene.get_other_features():
                features[gene_chr].insert_interval(
                    Interval(start, end, value=[feature_name, gene_id]))

    print("\n\t{0} genes treated in total.".format(no_gene))
    return (features, gene_boundaries)
def read_reference(self, file_ref):
    """Parse a GTF reference file and build per-chromosome interval trees.

    The file is read record by record and every record is attached to a
    ``Gene`` object keyed by its ``gene_id`` (records whose attribute
    column does not start with ``gene_id "..."`` are grouped under the
    empty id).  The collected genes are then turned into intervals
    (promoter, TSS, TTS, UTRs, exons, introns and any other feature) that
    are stored in one ``IntervalTree`` per chromosome.

    Side effects: flags every feature type seen in ``self.features_found``,
    adds a ``None`` entry to ``self.feature_priorities`` for features that
    have no priority yet, writes a warning on stderr for each feature still
    flagged 0, and reports progress through ``self.log_already_completed``.

    :param file_ref: path to the GTF reference file.
    :return: ``(features, gene_boundaries)``, two dictionaries keyed by
        chromosome name (lower-cased, every "chr" substring removed).
        Intervals in ``features`` carry ``[feature_name, gene_id]`` and
        intervals in ``gene_boundaries`` carry ``["gene", gene_id]``.
    :raises SystemExit: when a record has fewer than 9 columns, or when a
        "gene" record has a strand other than "+" or "-".
    """
    # Single-argument print(...) gives the same output under the Python 2
    # print statement and the Python 3 print function.
    print("Read reference ...")
    # The file size is only used to report reading progress.
    file_size = os.stat(file_ref).st_size
    # Compiled once instead of once per record; the pattern is unchanged.
    gene_id_pattern = re.compile('^gene_id "([^"]+)".*$')

    genes = {}
    no_line = 0
    current_position = 0
    # Parse GTF file.  The context manager closes the file even when the
    # loop is left through sys.exit(), the debug break or an exception.
    with open(file_ref) as in_reference:
        for gtf_line in in_reference:
            no_line += 1
            current_position += len(gtf_line)
            if no_line % 100000 == 0:
                self.log_already_completed(
                    "{0} lines read from reference".format(no_line),
                    file_size, current_position)

            gtf_line = gtf_line.rstrip('\r\n')
            # Skip comments, and blank lines which used to end in an
            # IndexError when the columns were accessed.
            if gtf_line.startswith("#") or not gtf_line.strip():
                continue

            elmts = gtf_line.split('\t')
            if len(elmts) < 9:
                sys.exit(
                    ("Unexpected number of columns on line #{0} of file "
                     "'{1}' : {2} found, 9 expected. Exiting").format(
                         no_line, file_ref, len(elmts)))

            gene_chr = elmts[0].lower().replace("chr", "")
            start = int(elmts[3])
            end = int(elmts[4])
            # In debug mode, reading stops at the first record that is not
            # on chromosome 1.
            if Join_with_gtf.debug and gene_chr != '1':
                break

            feature = elmts[2]
            match = gene_id_pattern.match(elmts[8])
            # Feature not related to a gene_id: fall back to the empty id.
            gene_id = match.group(1) if match else ""

            if feature == "gene":
                strand = elmts[6]
                if strand == "-":
                    strand = -1
                elif strand == '+':
                    strand = 1
                else:
                    sys.exit(
                        ("Unexpected strand value on line #{0} of file "
                         "'{1}' : '{2}'. "
                         "Exiting").format(no_line, file_ref, strand))

                if gene_id not in genes:
                    gene = Gene(gene_id, gene_chr, start, end, strand)
                    genes[gene_id] = gene
                else:
                    # The gene was first met through one of its features:
                    # complete it with its location and strand.
                    gene = genes[gene_id]
                    gene.set_location(gene_chr, start, end)
                    gene.set_strand(strand)

                # Gene start and end are defined in this line, therefore
                # promoter, tss and tts can be computed.
                self.features_found["promoter"] = 1
                self.features_found["tss"] = 1
                self.features_found["tts"] = 1
                self.features_found["gene"] = 1
                gene.gene_model_has_been_defined()
            else:
                if gene_id not in genes:
                    gene = Gene(gene_id, gene_chr)
                    genes[gene_id] = gene
                else:
                    gene = genes[gene_id]

                # Codons are flagged as the UTR they allow to compute and
                # exons also make introns available.
                if feature == "start_codon":
                    self.features_found["utr5"] = 1
                elif feature == "stop_codon":
                    self.features_found["utr3"] = 1
                elif feature == "exon":
                    self.features_found["exon"] = 1
                    self.features_found["intron"] = 1
                else:
                    self.features_found[feature.lower()] = 1
                gene.add_feature(feature, start, end)

    print("\n\t{0} lines read from reference in total.".format(no_line))

    # Check that all features listed in configuration file have been found
    # at least once.
    for feature in self.features_found:
        if self.features_found[feature.lower()] == 0:
            sys.stderr.write((
                "Warning : feature named '{0}' found in 'feature_priorities' parameter. "
                "This feature has never been found in reference file '{1}'.\n"
            ).format(feature, file_ref))

    # Complete feature_priorities with the features found in the GTF file
    # but not requested by the user.  Otherwise ordering the features that
    # overlap a given region, e.g.
    #   sorted(overlaps, key=lambda ovlp: self.feature_priorities[ ovlp.value[0] ])
    # would raise an exception.
    for feature in self.features_found:
        if feature.lower() not in self.feature_priorities:
            self.feature_priorities[feature.lower()] = None

    # Define downstream/upstream boundaries
    promoter_downstream = self.theme.get_parameter("promoter_downstream")
    promoter_upstream = self.theme.get_parameter("promoter_upstream")
    tss_downstream = self.theme.get_parameter("tss_downstream")
    tss_upstream = self.theme.get_parameter("tss_upstream")
    tts_downstream = self.theme.get_parameter("tts_downstream")
    tts_upstream = self.theme.get_parameter("tts_upstream")

    # Initialize dictionaries
    features = {}
    gene_boundaries = {}

    # Build gene model
    print("Build gene model ...")
    no_gene = 0
    for gene_id, gene in genes.items():
        (gene_chr, gene_start, gene_end) = gene.get_coordinates()
        no_gene += 1
        if no_gene % 1000 == 0:
            self.log_already_completed(
                "{0} genes treated".format(no_gene), len(genes), no_gene)

        # Both dictionaries always get their tree at the same time, so one
        # membership test is enough for the two of them.
        if gene_chr not in features:
            features[gene_chr] = IntervalTree()
            gene_boundaries[gene_chr] = IntervalTree()

        if gene.gene_model_is_defined():
            gene_boundaries[gene_chr].insert_interval(
                Interval(gene_start, gene_end, value=["gene", gene_id]))

            # Promoter (the two distances are swapped when the strand is
            # not 1)
            if gene.strand == 1:
                (start, end) = gene.get_promoter(promoter_upstream,
                                                 promoter_downstream)
            else:
                (start, end) = gene.get_promoter(promoter_downstream,
                                                 promoter_upstream)
            features[gene_chr].insert_interval(
                Interval(start, end, value=["promoter", gene_id]))

            # 5' UTR
            (start, end) = gene.get_utr5()
            if start is not None:
                features[gene_chr].insert_interval(
                    Interval(start, end, value=["utr5", gene_id]))

            # TSS
            if gene.strand == 1:
                (start, end) = gene.get_tss(tss_upstream, tss_downstream)
            else:
                (start, end) = gene.get_tss(tss_downstream, tss_upstream)
            features[gene_chr].insert_interval(
                Interval(start, end, value=["tss", gene_id]))

            # Intron / Exon
            (intron_coords, exon_coords) = gene.get_introns_exons()
            for (start, end) in exon_coords:
                features[gene_chr].insert_interval(
                    Interval(start, end, value=["exon", gene_id]))
            for (start, end) in intron_coords:
                features[gene_chr].insert_interval(
                    Interval(start, end, value=["intron", gene_id]))

            # TTS
            if gene.strand == 1:
                (start, end) = gene.get_tts(tts_upstream, tts_downstream)
            else:
                (start, end) = gene.get_tts(tts_downstream, tts_upstream)
            features[gene_chr].insert_interval(
                Interval(start, end, value=["tts", gene_id]))

            # 3' UTR
            (start, end) = gene.get_utr3()
            if start is not None:
                features[gene_chr].insert_interval(
                    Interval(start, end, value=["utr3", gene_id]))

            # Other features
            # NOTE(review): assumed to belong to the gene-model branch like
            # the sections above - confirm against the original layout.
            for (start, end, feature_name) in gene.get_other_features():
                features[gene_chr].insert_interval(
                    Interval(start, end, value=[feature_name, gene_id]))

    print("\n\t{0} genes treated in total.".format(no_gene))
    return (features, gene_boundaries)