def get_features(self, cluster_number):
    """Return the (left, insert, right) features for a paired cluster.

    The left member is reported on the "+" strand and the right member on
    the "-" strand; each carries its read count as the feature score. The
    insert feature spans the region between the two members and records
    ``self.gap`` in its ``length`` attribute.

    :param cluster_number: number substituted into the feature names.
    :returns: tuple ``(left_feature, insert_feature, right_feature)``.
    """
    # Work on copies so the strand assignment does not touch the originals.
    plus_iv = self.left.iv.copy()
    plus_iv.strand = "+"
    minus_iv = self.right.iv.copy()
    minus_iv.strand = "-"

    left_feature = HTSeq.GenomicFeature(
        "cluster_{0}_left".format(cluster_number), CLUSTER_GFF_TYPE, plus_iv)
    left_feature.score = self.left.count

    right_feature = HTSeq.GenomicFeature(
        "cluster_{0}_right".format(cluster_number), CLUSTER_GFF_TYPE, minus_iv)
    right_feature.score = self.right.count

    if self.gap == 0:
        # With no gap, the insert interval is widened by one base on each
        # side of the junction.
        gap_start, gap_end = plus_iv.end - 1, minus_iv.start + 1
    else:
        gap_start, gap_end = plus_iv.end, minus_iv.start
    between_iv = HTSeq.GenomicInterval(plus_iv.chrom, gap_start, gap_end, ".")

    insert_name = "cluster_{0}_insert".format(cluster_number)
    insert_feature = HTSeq.GenomicFeature(insert_name, GAP_GFF_TYPE, between_iv)
    insert_feature.attr = {"ID": insert_name, "length": self.gap}

    return (left_feature, insert_feature, right_feature)
def get_features(self, cluster_number):
    """Return the GFF features describing a singleton cluster.

    The singleton itself is always reported, with its read count as the
    score. When the insertion side is known, a one-base junction feature
    adjacent to the singleton is reported as well:

    * ``RIGHT``: singleton on "+", junction at ``[end, end + 1)``
    * ``LEFT``:  singleton on "-", junction at ``[start - 1, start)``
    * otherwise: singleton on ".", no junction feature

    :param cluster_number: number substituted into the feature names.
    :returns: list with the singleton feature, followed by the junction
        feature when one could be placed.
    """
    # Copy so that assigning the strand does not modify self.singleton.iv.
    singleton_iv = self.singleton.iv.copy()
    insert_iv = None

    side = self.singleton.insertion_side
    if side == RIGHT:
        singleton_iv.strand = "+"
        insert_iv = HTSeq.GenomicInterval(
            singleton_iv.chrom, singleton_iv.end, singleton_iv.end + 1, ".")
    elif side == LEFT:
        singleton_iv.strand = "-"
        insert_iv = HTSeq.GenomicInterval(
            singleton_iv.chrom, singleton_iv.start - 1, singleton_iv.start, ".")
    else:
        singleton_iv.strand = "."

    singleton_feature = HTSeq.GenomicFeature(
        "cluster_{0}_singleton".format(cluster_number),
        CLUSTER_GFF_TYPE, singleton_iv)
    singleton_feature.score = self.singleton.count
    feature_list = [singleton_feature]

    # Identity test: avoids going through the interval's comparison operator
    # just to detect "no junction".
    if insert_iv is not None:
        feature_list.append(HTSeq.GenomicFeature(
            "cluster_{0}_junction".format(cluster_number),
            SINGLETON_INSERT_GFF_TYPE, insert_iv))
    return feature_list
def collapseSortedGF(ll):
    """Collapse runs of overlapping features into single features.

    Walks ``ll`` in order. A feature that overlaps the feature currently
    being built is folded into it with ``gfMerge``; otherwise the current
    feature is emitted and a new one is started from a copy of the incoming
    feature (name, type, interval and attributes), so the input features are
    never modified.

    :param ll: iterable of features; the name suggests it is expected to be
        sorted so that overlapping features are adjacent (not checked here).
    :returns: list of collapsed features; an empty list for empty input.
    """
    l_out = []
    gf_new = None

    for feature in ll:
        if gf_new is not None and feature.iv.overlaps(gf_new.iv):
            # Presumably gfMerge extends gf_new in place - confirm in gfMerge.
            gfMerge(gf_new, feature)
            continue

        # No overlap (or first feature): flush the finished feature, if any,
        # and start a fresh one from a copy.
        if gf_new is not None:
            l_out.append(gf_new)
        gf_new = HTSeq.GenomicFeature(feature.name, feature.type,
                                      feature.iv.copy())
        gf_new.attr = feature.attr.copy()

    # Emit the last feature under construction. Guarding on None keeps an
    # empty input from producing [None].
    if gf_new is not None:
        l_out.append(gf_new)
    return l_out
def create_peak_gtf(path, exp_design_name, technique, bed_name):
    """Convert a tab-separated peak table into a GTF file.

    Reads ``<prefix>_Peaks.txt`` and writes ``<prefix>.gtf`` next to it, one
    ``exon`` line per peak with a ``gene_id`` attribute appended. The prefix
    lives under ``path + '/PeakDetection/Peaks'`` when ``technique`` is
    ``''`` or ``'All'``, and under ``path + '/PeakDetection/<technique>/'``
    (with the technique also embedded in the file name) otherwise.

    The input table must provide the columns ``chromo_peak``, ``begin_peak``,
    ``end_peak`` and ``WindowId``.

    :param path: project root directory.
    :param exp_design_name: experiment-design name used in the file names.
    :param technique: technique name, or ``''`` / ``'All'`` for the combined
        peak set.
    :param bed_name: name used in the file names.
    :return: None
    """
    if technique in ('', 'All'):
        PATH_PEAKS = path + '/PeakDetection/Peaks'
        prefix = PATH_PEAKS + '/' + exp_design_name + '_' + bed_name
    else:
        PATH_PEAKS = path + '/PeakDetection/' + technique + '/'
        prefix = (PATH_PEAKS + '/' + exp_design_name + '_' + technique +
                  '_' + bed_name)
    peak_filename = prefix + '_Peaks.txt'
    gtf_filename = prefix + '.gtf'

    # The input is opened first so that a missing peak table does not leave
    # an empty or truncated GTF behind. newline='' is what the csv module
    # asks for and replaces the 'rU' mode, which Python 3.11 rejects.
    with open(peak_filename, 'r', newline='') as peak_file, \
            open(gtf_filename, 'w') as gtf_file:
        for row in csv.DictReader(peak_file, delimiter='\t'):
            peak = HTSeq.GenomicInterval(row['chromo_peak'],
                                         int(row['begin_peak']),
                                         int(row['end_peak']), ".")
            peak_id = row['WindowId']
            feature = HTSeq.GenomicFeature(peak_id, 'exon', peak)
            gtf_file.write(feature.get_gff_line().strip() +
                           '; gene_id "' + peak_id + '"' + '\n')
def parseGtf(sz_file, d_g):
    """Load the exon records of a GTF file into ``d_g``, grouped by transcript.

    For every ``exon`` record the transcript id becomes the record's name,
    the strand is reset to ".", and the record is appended to
    ``d_g[tid]['exons']``. ``d_g[tid]['feature']`` holds a "gene" feature
    whose interval is grown to span all exons seen for that transcript.

    :param sz_file: GTF file handed to ``HTSeq.GFF_Reader``.
    :param d_g: dict updated in place; maps transcript id to
        ``{'exons': [...], 'feature': GenomicFeature}``.
    :returns: None
    """
    for record in HTSeq.GFF_Reader(sz_file):
        if record.type != "exon":
            continue

        tid = record.attr['transcript_id']
        record.name = tid
        entry = d_g.setdefault(tid, {'exons': [], 'feature': None})

        # Strand is discarded before the record is stored.
        record.iv.strand = "."
        entry['exons'].append(record)

        span = entry['feature']
        if span is None:
            # First exon of this transcript: seed the spanning feature from
            # copies so later updates do not touch the exon record.
            span = HTSeq.GenomicFeature(record.name, "gene", record.iv.copy())
            span.attr = record.attr.copy()
            entry['feature'] = span
        else:
            # Widen the spanning interval to include this exon.
            span.iv.start = min(span.iv.start, record.iv.start)
            span.iv.end = max(span.iv.end, record.iv.end)
def _parse_gtf_features(self, transcript_id, gtf_path='chr1.gtf'):
    """Collect the GTF records of one transcript into ``self.feature_set``.

    The GTF is read with pandas, filtered with ``self._filter_enst`` applied
    to the attribute column, and every remaining row becomes an
    ``HTSeq.GenomicFeature`` named ``transcript_id``.

    :param transcript_id: transcript identifier passed to
        ``self._filter_enst`` and used as the feature name.
    :param gtf_path: GTF file to read; defaults to the previously hard-coded
        ``'chr1.gtf'``.
    :returns: None; the result is stored in ``self.feature_set``.
    """
    self.feature_set = set()
    names = ['chrom', 'source', 'feature', 'start', 'end', 'score', 'strand',
             'frame', 'attribute']
    gtf_data = pd.read_table(gtf_path, sep='\t', comment='#', names=names)
    # Function-call form is valid under both Python 2 and Python 3.
    print('done reading')

    # Assumes _filter_enst returns a boolean per row - TODO confirm.
    transcript_mask = gtf_data['attribute'].apply(self._filter_enst,
                                                  args=(transcript_id,))
    # .loc with a boolean mask replaces .ix, which pandas 1.0 removed.
    gtf_data = gtf_data.loc[transcript_mask]

    for _, row in gtf_data.iterrows():
        # NOTE(review): start/end are passed exactly as read from the file;
        # confirm whether a 1-based -> 0-based conversion is intended here.
        self.feature_set.add(
            HTSeq.GenomicFeature(
                transcript_id, row.feature,
                HTSeq.GenomicInterval(row.chrom, row.start, row.end,
                                      row.strand)))
if aggregateGenes == False: check_set = set() for geneID, transcript_id in s: check_set.add(geneID) if (len(check_set) > 1): continue else: aggregate_id = gene_id # Take one of the gene IDs, find the others via gene sets, and # form the aggregate ID from all of them else: assert set(gene_id for gene_id, transcript_id in s) <= gene_sets[gene_id] aggregate_id = '+'.join(gene_sets[gene_id]) # Make the feature and store it in 'aggregates' f = HTSeq.GenomicFeature(aggregate_id, "exonic_part", iv) f.source = os.path.basename(sys.argv[0]) # f.source = "camara" f.attr = {} f.attr['gene_id'] = aggregate_id transcript_set = set((transcript_id for gene_id, transcript_id in s)) f.attr['transcripts'] = '+'.join(transcript_set) aggregates[aggregate_id].append(f) # Step 4: For each aggregate, number the exonic parts aggregate_features = [] for l in aggregates.values(): for i in range(len(l) - 1): assert l[i].name == l[i + 1].name, str(l[i + 1]) + " has wrong name" assert l[i].iv.end <= l[i + 1].iv.start, str(
#Annotate promoters promoters = [] for iv, s in promoter_part.steps(): if len(s) == 0: continue if iv.strand == "+": new_iv = HTSeq.GenomicInterval(iv.chrom, iv.start - 2000, iv.start, iv.strand) else: new_iv = HTSeq.GenomicInterval(iv.chrom, iv.end, iv.end + 2000, iv.strand) for g_id in s: gene_id = g_id promoter = HTSeq.GenomicFeature(gene_id, "promoter", new_iv) promoter.attr = {'gene_id': gene_id} promoters.append(promoter) promoters.sort(key=lambda promoter: (promoter.iv.chrom, promoter.iv.start)) fout = open(promoter_file, "w") for promoter in promoters: fout.write(promoter.get_gff_line()) fout.close() #Annotate introns introns = [] for iv, s in intron_part.steps(): if len(s) == 0: continue iv = HTSeq.GenomicInterval(iv.chrom, iv.start, iv.end, iv.strand) for g_id in s:
def run(args):
    """Build an annotation from a GTF file and write it out as GFF lines.

    Reads ``args.inputfile`` and derives, per gene: the gene region, the
    transcript / CDS / UTR regions, exon and intron bins, constitutive exon
    and intron bins (bins shared by every transcript of the gene), and the
    junctions between constitutive exon and intron bins. All features are
    written to ``os.path.join(args.outdir, args.annofile)``.

    NOTE(review): the body uses ``xrange`` and indexes ``dict.keys()`` /
    ``dict.values()``, which only work on Python 2. The ``filter`` results
    are also handed to ``find_bad_genes`` and iterated again afterwards,
    which presumably needs list semantics - confirm before porting.
    """
    exons = collections.defaultdict(
        lambda: HTSeq.GenomicArrayOfSets("auto", stranded=True))
    gene_region = {}
    gene_region_length = collections.Counter()
    transcript_region = collections.defaultdict(lambda: dict())
    start_codon_region = collections.defaultdict(lambda: dict())
    stop_codon_region = collections.defaultdict(lambda: dict())
    CDS_region = collections.defaultdict(lambda: dict())
    five_UTR_region = collections.defaultdict(lambda: dict())
    three_UTR_region = collections.defaultdict(lambda: dict())

    # Read features from the input GTF file.
    gtffile = HTSeq.GFF_Reader(args.inputfile, end_included=True)
    # Keep only features whose chromosome name ends in "chr" + alphanumerics.
    gtffile = filter(
        lambda feature: re.search(r'chr[a-zA-Z0-9]+$', feature.iv.chrom),
        gtffile)
    bad_gene_list = find_bad_genes(gtffile)
    logging.info(
        "Removing genes with exons in different chromosomes or strands (%i discarded)"
        % len(bad_gene_list))
    gtffile = filter(
        lambda feature: feature.attr['gene_id'] not in bad_gene_list,
        gtffile)

    # Per gene: mark exon coverage with transcript ids and remember the
    # start/stop codon interval of every transcript.
    for feature in gtffile:
        if feature.type == "exon":
            gene_id = feature.attr["gene_id"]
            exons[gene_id][feature.iv] += feature.attr["transcript_id"]
            extend_transcript_region(feature, transcript_region)
        elif feature.type == "start_codon":
            gene_id = feature.attr["gene_id"]
            transcript_id = feature.attr["transcript_id"]
            start_codon_region[gene_id][transcript_id] = feature.iv
        elif feature.type == "stop_codon":
            gene_id = feature.attr["gene_id"]
            transcript_id = feature.attr["transcript_id"]
            stop_codon_region[gene_id][transcript_id] = feature.iv

    gene_region = find_gene_region(transcript_region)
    gene_region_length = find_gene_region_length(gene_region,
                                                 transcript_region)
    (CDS_region, five_UTR_region,
     three_UTR_region) = find_CDS_and_UTR_region(start_codon_region,
                                                 stop_codon_region,
                                                 transcript_region)

    # Introns: every step inside a transcript's region that is not covered
    # by one of that transcript's exons is marked with the transcript id.
    introns = collections.defaultdict(
        lambda: HTSeq.GenomicArrayOfSets("auto", stranded=True))
    for gene_id in transcript_region.keys():
        for transcript_id in transcript_region[gene_id].keys():
            transcript_iv = transcript_region[gene_id][transcript_id]
            for iv, step_set in exons[gene_id][transcript_iv].steps():
                if transcript_id not in step_set:
                    introns[gene_id][iv] += transcript_id

    # gene_exons_bins redefines the exons in one gene. All exons in the gene region are split into exon bins.
    # Each exon bin is a feature (feature type is "exonic_region"), which has attributes: "gene_id" and "transcripts".
    # One exon bin is possibly shared by multiple transcripts.
    gene_exons_bins = collections.defaultdict(lambda: list())
    for gene_id in gene_region.keys():
        gene_iv = gene_region[gene_id]
        for iv, step_set in exons[gene_id][gene_iv].steps():
            transcript_list = list(step_set)
            if len(transcript_list) != 0:
                feature = HTSeq.GenomicFeature(gene_id, "exonic_region", iv)
                feature.source = "IR_annotation"
                feature.attr = {}
                feature.attr["gene_id"] = gene_id
                feature.attr["transcripts"] = "+".join(transcript_list)
                gene_exons_bins[gene_id].append(feature)
        # Minus-strand genes are reversed so that the numbering below runs
        # from the highest coordinate to the lowest.
        if gene_iv.strand == "-":
            gene_exons_bins[gene_id] = gene_exons_bins[gene_id][::-1]

    # Number the exon bins with attribute "exonic_region_number" starting from "001".
    for exons_bins_list in gene_exons_bins.values():
        for i in xrange(len(exons_bins_list)):
            exons_bins_list[i].attr["exonic_region_number"] = "%03d" % (i + 1)

    # gene_introns_bins redefines the introns in one gene. All introns in the gene region are split into intron bins.
    # Each intron bin is a feature (feature type is "intronic_region"), which has attributes: "gene_id" and "transcripts".
    # One intron bin is possibly shared by multiple transcripts. If it isn't shared by one transcript, this intron bin
    # either overlaps with the exonic region of that transcript or lies outside of the whole region of that transcript.
    gene_introns_bins = collections.defaultdict(lambda: list())
    for gene_id in gene_region.keys():
        gene_iv = gene_region[gene_id]
        for iv, step_set in introns[gene_id][gene_iv].steps():
            transcript_list = list(step_set)
            if len(transcript_list) != 0:
                feature = HTSeq.GenomicFeature(gene_id, "intronic_region", iv)
                feature.source = "IR_annotation"
                feature.attr = {}
                feature.attr["gene_id"] = gene_id
                feature.attr["transcripts"] = "+".join(transcript_list)
                gene_introns_bins[gene_id].append(feature)
        if gene_iv.strand == "-":
            gene_introns_bins[gene_id] = gene_introns_bins[gene_id][::-1]

    # Number the intron bins with attribute "intronic_region_number" starting from "001".
    for introns_bins_list in gene_introns_bins.values():
        for i in xrange(len(introns_bins_list)):
            introns_bins_list[i].attr["intronic_region_number"] = "%03d" % (i + 1)

    # gene_constitutive_exons_bins defines the exon bins that are shared by all the transcripts in one gene.
    # Each constitutive exon bin is a feature (feature type is "constitutive_exonic_region"), which has attribute: "gene_id".
    logging.info("Generating constitutive exonic region (CER) annotation")
    gene_constitutive_exons_bins = collections.defaultdict(lambda: list())
    gene_constitutive_exons_start_d = collections.defaultdict(lambda: set())
    gene_constitutive_exons_end_d = collections.defaultdict(lambda: set())
    gene_constitutive_exons_length = collections.Counter()
    gene_constitutive_exons_number = collections.Counter()
    for gene_id in gene_region.keys():
        transcripts_in_gene = len(transcript_region[gene_id])
        gene_iv = gene_region[gene_id]
        for iv, step_set in exons[gene_id][gene_iv].steps():
            transcript_list = list(step_set)
            # A bin is constitutive when every transcript of the gene covers it.
            if len(transcript_list) == transcripts_in_gene:
                feature = HTSeq.GenomicFeature(gene_id,
                                               "constitutive_exonic_region",
                                               iv)
                feature.source = "IR_annotation"
                feature.attr = {}
                feature.attr["gene_id"] = gene_id
                gene_constitutive_exons_bins[gene_id].append(feature)
                # Boundary positions are collected for the junction search below.
                gene_constitutive_exons_start_d[gene_id].add(
                    feature.iv.start_d_as_pos)
                gene_constitutive_exons_end_d[gene_id].add(
                    feature.iv.end_d_as_pos)
                gene_constitutive_exons_length[gene_id] += feature.iv.length
                gene_constitutive_exons_number[gene_id] += 1
        if gene_iv.strand == "-":
            gene_constitutive_exons_bins[
                gene_id] = gene_constitutive_exons_bins[gene_id][::-1]

    for constitutive_exons_bins_list in gene_constitutive_exons_bins.values():
        for i in xrange(len(constitutive_exons_bins_list)):
            constitutive_exons_bins_list[i].attr[
                "constitutive_exonic_region_number"] = "%03d" % (i + 1)

    # gene_constitutive_introns_bins defines the intron bins that are shared by all the transcripts in one gene.
    # Each constitutive intron bin is a feature (feature type is "constitutive_intronic_region"), which has attribute: "gene_id".
    # For those intron bins in single transcript gene, if the intron bin is in 5' UTR, it will have attribute: "five_UTR_constitutive_intron";
    # If the intron bin is in 3' UTR, it will have attribute: "three_UTR_constitutive_intron".
    # Didn't define "five_UTR_constitutive_intron" or "three_UTR_constitutive_intron" for intron bins in multiple transcripts gene yet.
    logging.info("Generating constitutive intronic region (CIR) annotation")
    gene_constitutive_introns_bins = collections.defaultdict(lambda: list())
    gene_constitutive_introns_start_d = collections.defaultdict(lambda: set())
    gene_constitutive_introns_end_d = collections.defaultdict(lambda: set())
    gene_constitutive_introns_length = collections.Counter()
    gene_constitutive_introns_number = collections.Counter()
    for gene_id in gene_region.keys():
        transcripts_in_gene = len(transcript_region[gene_id])
        gene_iv = gene_region[gene_id]
        exist_UTR_regions = False
        # UTR intervals are only computed for single-transcript genes that
        # have a start codon recorded.
        if transcripts_in_gene == 1 and gene_id in start_codon_region.keys():
            assert len(start_codon_region[gene_id]) == len(
                stop_codon_region[gene_id]) == 1
            transcript_id = start_codon_region[gene_id].keys()[0]
            start_codon_region_iv = start_codon_region[gene_id].values()[0]
            stop_codon_region_iv = stop_codon_region[gene_id].values()[0]
            (five_UTR_region_iv, three_UTR_region_iv) = find_UTR_region_iv(
                start_codon_region_iv, stop_codon_region_iv,
                transcript_region[gene_id][transcript_id])
            exist_UTR_regions = True
        for iv, step_set in introns[gene_id][gene_iv].steps():
            transcript_list = list(step_set)
            if len(transcript_list) == transcripts_in_gene:
                feature = HTSeq.GenomicFeature(gene_id,
                                               "constitutive_intronic_region",
                                               iv)
                feature.source = "IR_annotation"
                feature.attr = {}
                feature.attr["gene_id"] = gene_id
                if exist_UTR_regions == True:
                    if feature.iv.is_contained_in(five_UTR_region_iv):
                        feature.attr[
                            "five_UTR_constitutive_intron"] = "five_UTR_constitutive_intron"
                    elif feature.iv.is_contained_in(three_UTR_region_iv):
                        feature.attr[
                            "three_UTR_constitutive_intron"] = "three_UTR_constitutive_intron"
                gene_constitutive_introns_bins[gene_id].append(feature)
                gene_constitutive_introns_start_d[gene_id].add(
                    feature.iv.start_d_as_pos)
                gene_constitutive_introns_end_d[gene_id].add(
                    feature.iv.end_d_as_pos)
                gene_constitutive_introns_length[gene_id] += feature.iv.length
                gene_constitutive_introns_number[gene_id] += 1
        if gene_iv.strand == "-":
            gene_constitutive_introns_bins[
                gene_id] = gene_constitutive_introns_bins[gene_id][::-1]

    # Number the constitutive intron bins and collect, per gene, the numbers
    # of those flagged as lying in the 5' or 3' UTR.
    five_UTR_constitutive_introns = collections.defaultdict(lambda: list())
    three_UTR_constitutive_introns = collections.defaultdict(lambda: list())
    for constitutive_introns_bins_list in gene_constitutive_introns_bins.values(
    ):
        for i in xrange(len(constitutive_introns_bins_list)):
            gene_id = constitutive_introns_bins_list[i].attr["gene_id"]
            constitutive_intronic_region_number = constitutive_introns_bins_list[
                i].attr["constitutive_intronic_region_number"] = "%03d" % (i + 1)
            if "five_UTR_constitutive_intron" in constitutive_introns_bins_list[
                    i].attr.keys():
                five_UTR_constitutive_introns[gene_id].append(
                    constitutive_intronic_region_number)
            elif "three_UTR_constitutive_intron" in constitutive_introns_bins_list[
                    i].attr.keys():
                three_UTR_constitutive_introns[gene_id].append(
                    constitutive_intronic_region_number)

    # gene_constitutive_junction defines the junction positions that join a constitutive exon bin and a constitutive intron bin in one gene.
    # Each constitutive junction is a feature (feature type is "constitutive_junction"), which has attributes: "gene_id", "constitutive_junction_type", "upstream", "downstream".
    # attr["constitutive_junction_type"] can be the value: "5'_splice_junction", which means the upstream of the junction position is a constitutive exon bin,
    # and the downstream of the junction position is a constitutive intron bin. In this case, attr["upstream"] will be like "constitutive_exonic_region_number 002" (shows exactly which
    # constitutive exon bin in the upstream), and similarly for attr["downstream"].
    # On the other hand, attr["constitutive_junction_type"] can be the value: "3'_splice_junction"
    logging.info("Generating constitutive junctions (CJ) annotation")
    gene_constitutive_junction = collections.defaultdict(lambda: list())
    for gene_id in gene_constitutive_exons_start_d.keys():
        if gene_id in gene_constitutive_introns_start_d.keys():
            # Exon -> intron: positions that are both a constitutive exon end
            # and a constitutive intron start.
            gene_constitutive_junction_from_exon_to_intron_set = gene_constitutive_exons_end_d[
                gene_id] & gene_constitutive_introns_start_d[gene_id]
            for gene_constitutive_junction_pos in gene_constitutive_junction_from_exon_to_intron_set:
                feature = HTSeq.GenomicFeature(gene_id,
                                               "constitutive_junction",
                                               gene_constitutive_junction_pos)
                feature.source = "IR_annotation"
                feature.attr = {}
                feature.attr["gene_id"] = gene_id
                from_region_number = find_region_number(
                    gene_constitutive_junction_pos,
                    gene_constitutive_exons_bins[gene_id], "end_d_as_pos")
                feature.attr[
                    "upstream"] = "constitutive_exonic_region_number " + from_region_number
                to_region_number, index = find_region_number_and_index(
                    gene_constitutive_junction_pos,
                    gene_constitutive_introns_bins[gene_id], "start_d_as_pos")
                feature.attr[
                    "downstream"] = "constitutive_intronic_region_number " + to_region_number
                feature.attr[
                    "constitutive_junction_type"] = "5'_splice_junction"
                # index locates the intron bin so it can be annotated with
                # the junction number further down.
                gene_constitutive_junction[gene_id].append((feature, index))
            # Intron -> exon: positions that are both a constitutive exon
            # start and a constitutive intron end.
            gene_constitutive_junction_from_intron_to_exon_set = gene_constitutive_exons_start_d[
                gene_id] & gene_constitutive_introns_end_d[gene_id]
            for gene_constitutive_junction_pos in gene_constitutive_junction_from_intron_to_exon_set:
                feature = HTSeq.GenomicFeature(gene_id,
                                               "constitutive_junction",
                                               gene_constitutive_junction_pos)
                feature.source = "IR_annotation"
                feature.attr = {}
                feature.attr["gene_id"] = gene_id
                from_region_number, index = find_region_number_and_index(
                    gene_constitutive_junction_pos,
                    gene_constitutive_introns_bins[gene_id], "end_d_as_pos")
                feature.attr[
                    "upstream"] = "constitutive_intronic_region_number " + from_region_number
                to_region_number = find_region_number(
                    gene_constitutive_junction_pos,
                    gene_constitutive_exons_bins[gene_id], "start_d_as_pos")
                feature.attr[
                    "downstream"] = "constitutive_exonic_region_number " + to_region_number
                feature.attr[
                    "constitutive_junction_type"] = "3'_splice_junction"
                gene_constitutive_junction[gene_id].append((feature, index))
            # Order junctions by ascending start on "+" and descending start
            # otherwise.
            if len(gene_constitutive_junction[gene_id]) > 0:
                gene_strand = gene_constitutive_junction[gene_id][0][0].iv.strand
                if gene_strand == "+":
                    gene_constitutive_junction[gene_id].sort(
                        key=lambda f: (f[0].iv.chrom, f[0].iv.start))
                else:
                    gene_constitutive_junction[gene_id].sort(
                        key=lambda f: (f[0].iv.chrom, -f[0].iv.start))

    # Number the junctions and record each number on the intron bin that the
    # junction touches.
    for gene_constitutive_junction_list in gene_constitutive_junction.values():
        for i in xrange(len(gene_constitutive_junction_list)):
            gene_constitutive_junction_list[i][0].attr[
                "constitutive_junction_number"] = "%03d" % (i + 1)
            feature = gene_constitutive_junction_list[i][0]
            gene_id = feature.attr["gene_id"]
            constitutive_junction_type = feature.attr[
                "constitutive_junction_type"]
            if constitutive_junction_type == "5'_splice_junction":
                index = gene_constitutive_junction_list[i][1]
                gene_constitutive_introns_bins[gene_id][index].attr[
                    "upstream_constitutive_junction_number"] = "%03d" % (i + 1)
            elif constitutive_junction_type == "3'_splice_junction":
                index = gene_constitutive_junction_list[i][1]
                gene_constitutive_introns_bins[gene_id][index].attr[
                    "downstream_constitutive_junction_number"] = "%03d" % (i + 1)

    # gene_region_features defines a feature for each gene that summarizes some key info. The feature iv is the gene region.
    # Feature type is "gene_region". Each feature has attributes: "gene_id", "transcripts_in_gene" (count how many transcripts in this gene),
    # "gene_region_length", "constitutive_exonic_region_length", "constitutive_intronic_region_length".
    # For single transcript gene, if any of constitutive intron bins in 5' UTR, it will have attribute "five_UTR_constitutive_introns"
    # (value would be like "001,002", list the region numbers); similarly for 3' UTR.
    gene_region_features = []
    for gene_id in gene_region.keys():
        iv = gene_region[gene_id]
        feature = HTSeq.GenomicFeature(gene_id, "gene_region", iv)
        feature.source = "IR_annotation"
        feature.attr = {}
        feature.attr["gene_id"] = gene_id
        feature.attr["transcripts_in_gene"] = len(transcript_region[gene_id])
        feature.attr["gene_region_length"] = gene_region_length[gene_id]
        feature.attr[
            "constitutive_exonic_region_length"] = gene_constitutive_exons_length[
                gene_id]
        feature.attr[
            "constitutive_intronic_region_length"] = gene_constitutive_introns_length[
                gene_id]
        feature.attr[
            "constitutive_exonic_region_number"] = gene_constitutive_exons_number[
                gene_id]
        feature.attr[
            "constitutive_intronic_region_number"] = gene_constitutive_introns_number[
                gene_id]
        if gene_id in five_UTR_constitutive_introns.keys():
            feature.attr["five_UTR_constitutive_introns"] = ",".join(
                five_UTR_constitutive_introns[gene_id])
        if gene_id in three_UTR_constitutive_introns.keys():
            feature.attr["three_UTR_constitutive_introns"] = ",".join(
                three_UTR_constitutive_introns[gene_id])
        gene_region_features.append(feature)
    # Output order: genes sorted by chromosome, then start.
    gene_region_features.sort(key=lambda f: (f.iv.chrom, f.iv.start))

    # transcript_region_features defines a feature for each transcript. The feature iv is the transcript region.
    # Feature type is "transcript_region". Each feature has attributes: "gene_id", "transcript_id".
    transcript_region_features = collections.defaultdict(lambda: list())
    for gene_id in transcript_region.keys():
        for transcript_id in transcript_region[gene_id].keys():
            iv = transcript_region[gene_id][transcript_id]
            feature = HTSeq.GenomicFeature(gene_id, "transcript_region", iv)
            feature.source = "IR_annotation"
            feature.attr = {}
            feature.attr["gene_id"] = gene_id
            feature.attr["transcript_id"] = transcript_id
            transcript_region_features[gene_id].append(feature)

    # The CDS and UTR regions get the same per-transcript feature layout.
    CDS_region_features = collections.defaultdict(lambda: list())
    for gene_id in CDS_region.keys():
        for transcript_id in CDS_region[gene_id].keys():
            iv = CDS_region[gene_id][transcript_id]
            feature = HTSeq.GenomicFeature(gene_id, "CDS_region", iv)
            feature.source = "IR_annotation"
            feature.attr = {}
            feature.attr["gene_id"] = gene_id
            feature.attr["transcript_id"] = transcript_id
            CDS_region_features[gene_id].append(feature)

    five_UTR_region_features = collections.defaultdict(lambda: list())
    for gene_id in five_UTR_region.keys():
        for transcript_id in five_UTR_region[gene_id].keys():
            iv = five_UTR_region[gene_id][transcript_id]
            feature = HTSeq.GenomicFeature(gene_id, "five_UTR_region", iv)
            feature.source = "IR_annotation"
            feature.attr = {}
            feature.attr["gene_id"] = gene_id
            feature.attr["transcript_id"] = transcript_id
            five_UTR_region_features[gene_id].append(feature)

    three_UTR_region_features = collections.defaultdict(lambda: list())
    for gene_id in three_UTR_region.keys():
        for transcript_id in three_UTR_region[gene_id].keys():
            iv = three_UTR_region[gene_id][transcript_id]
            feature = HTSeq.GenomicFeature(gene_id, "three_UTR_region", iv)
            feature.source = "IR_annotation"
            feature.attr = {}
            feature.attr["gene_id"] = gene_id
            feature.attr["transcript_id"] = transcript_id
            three_UTR_region_features[gene_id].append(feature)

    # Write all newly defined features into new gtf annotation file.
    # Each gene_region line is followed by that gene's other features.
    logging.info("Writing annotation to file: %s" %
                 os.path.join(args.outdir, args.annofile))
    f = open(os.path.join(args.outdir, args.annofile), "w")
    for gene_region_feature in gene_region_features:
        f.write(gene_region_feature.get_gff_line())
        gene_id = gene_region_feature.attr["gene_id"]
        for feature in transcript_region_features[gene_id]:
            f.write(feature.get_gff_line())
        for feature in CDS_region_features[gene_id]:
            f.write(feature.get_gff_line())
        for feature in five_UTR_region_features[gene_id]:
            f.write(feature.get_gff_line())
        for feature in three_UTR_region_features[gene_id]:
            f.write(feature.get_gff_line())
        for feature in gene_exons_bins[gene_id]:
            f.write(feature.get_gff_line())
        for feature in gene_introns_bins[gene_id]:
            f.write(feature.get_gff_line())
        for feature in gene_constitutive_exons_bins[gene_id]:
            f.write(feature.get_gff_line())
        for feature in gene_constitutive_introns_bins[gene_id]:
            f.write(feature.get_gff_line())
        # Junctions are stored as (feature, index) pairs; only the feature
        # is written.
        for feature in [
                item[0] for item in gene_constitutive_junction[gene_id]
        ]:
            f.write(feature.get_gff_line())
    f.close()
def _sliding_window_feature(feature, window_id, window):
    """Wrap ``window`` in a 'window' feature named after its source exon."""
    name = (feature.attr['transcript_name'] + '_' +
            feature.attr['exon_number'] + '_window_' +
            feature.attr['gene_type'] + '_' + str(window_id))
    return HTSeq.GenomicFeature(name, 'window', window)


def create_sliding_exon_window_GTF(windowSize):
    """Tile every exon of the GENCODE vM13 exon GTF with sliding windows.

    Reads ``PATH_ANNOT + "/gencodeVM13/gencode.vM13.annotation.exon.gtf"``
    and writes one ``window`` feature per tile to
    ``PATH_ANNOT + "/gencodeVM13/gencode.vM13.exon.slidingwindow.gtf"``.
    Windows advance by ``windowSize // 2`` in the direction of the exon's
    strand; a final window covers the remainder of the exon and is written
    only if it is longer than one base. Window ids restart at 1 per exon.

    :param windowSize: window length in bases; must be at least 2 so that
        the step between windows is non-zero.
    :raises ValueError: if ``windowSize`` is smaller than 2.
    """
    if windowSize < 2:
        # A step of zero would never advance and the loops below would not
        # terminate.
        raise ValueError("windowSize must be at least 2, got %r" %
                         (windowSize,))
    # Floor division keeps the step an integer under Python 3; true division
    # would turn every coordinate into a float.
    overlap = windowSize // 2

    gtf_file = HTSeq.GFF_Reader(
        PATH_ANNOT + "/gencodeVM13/gencode.vM13.annotation.exon.gtf",
        end_included=True)
    transcriptID = 1
    with open(PATH_ANNOT + "/gencodeVM13/gencode.vM13.exon.slidingwindow.gtf",
              "w") as slidingGTF:
        for feature in gtf_file:
            if feature.type != 'exon':
                continue
            interval = feature.iv

            # Progress report every 1000 exons.
            transcriptID += 1
            if transcriptID % 1000 == 0:
                print('Gene: ' + str(transcriptID) + '/ 100000')

            windowID = 1
            begin = interval.start_d
            if interval.strand == '+':
                end = begin + windowSize
                while end < interval.end_d:
                    window = HTSeq.GenomicInterval(interval.chrom, begin,
                                                   end + 1, interval.strand)
                    slidingGTF.write(_sliding_window_feature(
                        feature, windowID, window).get_gff_line())
                    windowID += 1
                    begin += overlap
                    end = begin + windowSize
                # Remainder of the exon, up to its directional end.
                window = HTSeq.GenomicInterval(interval.chrom, begin,
                                               interval.end_d,
                                               interval.strand)
            else:
                # start_d is the higher coordinate here, so windows are
                # stepped towards lower coordinates.
                end = begin - windowSize
                while end > interval.end_d:
                    window = HTSeq.GenomicInterval(interval.chrom, end,
                                                   begin + 1, interval.strand)
                    slidingGTF.write(_sliding_window_feature(
                        feature, windowID, window).get_gff_line())
                    windowID += 1
                    begin -= overlap
                    end = begin - windowSize
                window = HTSeq.GenomicInterval(interval.chrom,
                                               interval.end_d + 1, begin,
                                               interval.strand)

            if window.length > 1:
                slidingGTF.write(_sliding_window_feature(
                    feature, windowID, window).get_gff_line())
def main():
    """Flatten an Ensembl GTF annotation into DEXSeq exonic parts.

    Command line: ``[options] <in.gtf> <out.gff>``.

    Every exon is cut into non-overlapping "exonic_part" features. With
    ``--aggregate yes`` (default) genes that share bases on the same strand
    are merged into one aggregate gene; with ``no``, parts belonging to more
    than one gene are dropped. If ``--featurecountsgtf`` is given, a copy of
    the output with ``aggregate_gene`` -> ``gene`` and ``exonic_part`` ->
    ``exon`` is written to that path.

    Exits with status 1 on a wrong argument count or if HTSeq is missing.

    :raises ValueError: if one aggregate name occurs on two chromosomes or
        two strands.
    """
    optParser = optparse.OptionParser(
        usage="python %prog [options] <in.gtf> <out.gff>",
        # Each fragment ends in a space so the concatenated help text does
        # not run words together.
        description=(
            "Script to prepare annotation for DEXSeq. "
            "This script takes an annotation file in Ensembl GTF format "
            "and outputs a 'flattened' annotation file suitable for use "
            "with the count_in_exons.py script "
        ),
        epilog=(
            "Written by Simon Anders ([email protected]), European Molecular Biology "
            "Laboratory (EMBL). (c) 2010. Released under the terms of the GNU General "
            "Public License v3. Part of the 'DEXSeq' package. "
            "Modified by Vivek Bhardwaj (just a bit!) to write featurecounts gtf as an option. "
            "Modified by Jost Vrabic Koren to work with python 3. "
        )
    )
    optParser.add_option(
        "-r", "--aggregate", type="choice", dest="aggregate",
        choices=("no", "yes"), default="yes",
        help=(
            "'yes' or 'no'. Indicates whether two or more genes sharing an exon should be merged"
            " into an 'aggregate gene'. If 'no', the exons that can not be assiged to a single gene"
            " are ignored."
        )
    )
    # add option for featurecounts output
    optParser.add_option("-f", "--featurecountsgtf", type="string",
                         dest="fcgtf", action="store",
                         help="gtf file to write for featurecounts.")

    (opts, args) = optParser.parse_args()
    if len(args) != 2:
        sys.stderr.write("Script to prepare annotation for DEXSeq.\n\n")
        sys.stderr.write("Usage: python %s <in.gtf> <out.gff>\n\n" %
                         os.path.basename(sys.argv[0]))
        sys.stderr.write("This script takes an annotation file in Ensembl GTF format\n")
        sys.stderr.write("and outputs a 'flattened' annotation file suitable for use\n")
        sys.stderr.write("with the count_in_exons.py script.\n")
        sys.exit(1)

    # Imported here so a missing HTSeq gives a readable message, not a
    # traceback.
    try:
        import HTSeq
    except ImportError:
        sys.stderr.write("Could not import HTSeq. Please install the HTSeq Python framework\n")
        sys.stderr.write("available from http://www-huber.embl.de/users/anders/HTSeq\n")
        sys.exit(1)

    gtf_file = args[0]
    out_file = args[1]
    aggregateGenes = opts.aggregate == "yes"

    # Step 1: Store all exons with their gene and transcript ID
    # in a GenomicArrayOfSets
    exons = HTSeq.GenomicArrayOfSets("auto", stranded=True)
    for f in HTSeq.GFF_Reader(gtf_file):
        if f.type != "exon":
            continue
        f.attr['gene_id'] = f.attr['gene_id'].replace(":", "_")
        exons[f.iv] += (f.attr['gene_id'], f.attr['transcript_id'])

    # Step 2: Form sets of overlapping genes
    # 'gene_sets' maps every gene ID to the set of gene IDs it overlaps with
    # (sharing bases on the same strand), directly or transitively. All
    # members of one group point at the same set object. Each set forms an
    # 'aggregate gene'.
    if aggregateGenes:
        gene_sets = collections.defaultdict(set)
        for iv, s in exons.steps():
            # Union of the genes present in this step and everything they
            # were already grouped with.
            full_set = set()
            for gene_id, transcript_id in s:
                full_set.add(gene_id)
                full_set |= gene_sets[gene_id]
            # Re-point every member at the enlarged set so all of them learn
            # about their new partners.
            for gene_id in full_set:
                assert gene_sets[gene_id] <= full_set
                gene_sets[gene_id] = full_set

    # Step 3: Go through the steps again to get the exonic sections. Each
    # non-empty step becomes an 'exonic part', stored in 'aggregates' under
    # the ID of its aggregate gene.
    aggregates = collections.defaultdict(list)
    for iv, s in exons.steps():
        # Skip empty steps
        if not s:
            continue
        gene_id = list(s)[0][0]
        if aggregateGenes:
            # Take one of the gene IDs, find the others via gene_sets, and
            # form the aggregate ID from all of them. The join order follows
            # the set's iteration order.
            assert {g for g, _ in s} <= gene_sets[gene_id]
            aggregate_id = '+'.join(gene_sets[gene_id])
        else:
            # Without aggregation, ignore parts tied to more than one gene.
            if len({g for g, _ in s}) > 1:
                continue
            aggregate_id = gene_id

        # Make the feature and store it in 'aggregates'
        f = HTSeq.GenomicFeature(aggregate_id, "exonic_part", iv)
        f.source = os.path.basename(sys.argv[0])
        transcript_set = {transcript_id for _, transcript_id in s}
        f.attr = {
            'gene_id': aggregate_id,
            'transcripts': '+'.join(transcript_set),
        }
        aggregates[aggregate_id].append(f)

    # Step 4: For each aggregate, check consistency, number the exonic
    # parts and build the spanning 'aggregate_gene' feature.
    aggregate_features = []
    for l in aggregates.values():
        for i in range(len(l) - 1):
            assert l[i].name == l[i + 1].name, str(l[i + 1]) + " has wrong name"
            assert l[i].iv.end <= l[i + 1].iv.start, str(l[i + 1]) + " starts too early"
            if l[i].iv.chrom != l[i + 1].iv.chrom:
                raise ValueError(
                    "Same name found on two chromosomes: %s, %s"
                    % (str(l[i]), str(l[i + 1]))
                )
            if l[i].iv.strand != l[i + 1].iv.strand:
                raise ValueError(
                    "Same name found on two strands: %s, %s"
                    % (str(l[i]), str(l[i + 1]))
                )
        aggr_feat = HTSeq.GenomicFeature(
            l[0].name,
            "aggregate_gene",
            HTSeq.GenomicInterval(l[0].iv.chrom, l[0].iv.start,
                                  l[-1].iv.end, l[0].iv.strand)
        )
        aggr_feat.source = os.path.basename(sys.argv[0])
        aggr_feat.attr = {'gene_id': aggr_feat.name}
        for i, part in enumerate(l):
            part.attr['exonic_part_number'] = "%03d" % (i + 1)
        aggregate_features.append(aggr_feat)

    # Step 5: Sort the aggregates, then write everything out
    aggregate_features.sort(key=lambda f: (f.iv.chrom, f.iv.start))
    with open(out_file, "w") as fout:
        for aggr_feat in aggregate_features:
            fout.write(aggr_feat.get_gff_line())
            for f in aggregates[aggr_feat.name]:
                fout.write(f.get_gff_line())

    # Write the featureCounts variant if requested. The renaming is done in
    # Python rather than by passing user-supplied paths to a shell; the two
    # replacements match the former sed passes (and so also turn
    # 'exonic_part_number' into 'exon_number').
    if opts.fcgtf:
        with open(out_file) as src, open(opts.fcgtf, "w") as dst:
            for line in src:
                dst.write(line.replace("aggregate_gene", "gene")
                              .replace("exonic_part", "exon"))

    print("Done!")
transcript]['gene_id'].iloc[0]) genes = "+".join(genes) if genes not in gene_id.keys(): #if gene appears once f.attr['gene_id'] = genes gene_id[genes] = 1 else: #if gene appears more than once f.attr['gene_id'] = "__00".join([genes, str(gene_id[genes])]) gene_id[genes] += 1 transcipttogenes_id[transcript_id] = f.attr['gene_id'] else: f.attr['gene_id'] = transcipttogenes_id[transcript_id] f.name = f.attr['gene_id'] #Store f as GenomicFeature with new gene_id feat = HTSeq.GenomicFeature( f.name, f.type, HTSeq.GenomicInterval(f.iv.chrom, f.iv.start, f.iv.end, f.iv.strand)) feat.attr = {} feat.attr['gene_id'] = f.attr['gene_id'] if f.type == "exonic_part": feat.attr['transcripts'] = transcript_id feat.attr['exonic_part_number'] = f.attr['exonic_part_number'] all_feat.append(feat) ###Step 5: Sort the aggregates, then write everything out print("=====> WRITING RESULT TO FILE") fout = open(out_file, "w") for feat in all_feat: fout.write(feat.get_gff_line()) fout.close()
def workflow1(file1, test=None):
    """workflow1: only consider "exon" and "CDS"

    For every transcript returned by ``generate_transcripts`` this names the
    CDS/exon features and appends derived ``_intron``, ``_transcript``,
    ``_utr5``, ``_utr3`` and ``_promoter`` features to the transcript's
    feature list.  Afterwards the per-transcript reports are written, the
    shared file handlers are closed, and the regions between genes are
    written to ``<outdir>/intergenic.bed``.

    Relies on module-level state: ``args`` (``pl``, ``no_whole_report``),
    ``chr_lengths``, ``genes``, ``fhandlers`` and ``outdir``.

    :param file1: gtf filename
    :param test: forwarded to ``generate_transcripts``; when truthy the
        function also drops into ``pdb`` before the intergenic step
    :returns: None
    :rtype: None

    """
    transcripts = generate_transcripts(file1, test)
    ## * ######################### populate bed files: merge the range #########################
    for each_trans, feature_list in transcripts.iteritems():
        # each_trans are just tids, using id to maintain order
        temp = re.split("[:|]", each_trans)
        chr_gene_id = "%s:%s" % (temp[0], temp[2])
        feature_list.sort()  # sorted by start_d
        if feature_list['exon'] == [] and feature_list['CDS'] == []:
            logging.debug("BAD: {each_trans} was skipped in function workflow1 because of both CDS and exon are missing".format(
                **locals()))
            continue

        ordered_CDS = feature_list['CDS']
        if len(ordered_CDS) > 1:
            for i in range(1, len(ordered_CDS) + 1):
                ordered_CDS[i-1].name = "{each_trans}|CDS.{}".format(i, each_trans=each_trans)
        elif len(ordered_CDS) == 1:
            ordered_CDS[0].name = "%s|CDS" % each_trans

        ordered_exon = feature_list['exon']
        ## special branch with only CDS but no exon
        if len(ordered_exon) == 0:
            ordered_exon = ordered_CDS
        ## change exon name
        elif len(ordered_exon) > 1:
            for i in range(1, len(ordered_exon) + 1):
                ordered_exon[i-1].name = "{each_trans}|exon.{}".format(i, each_trans=each_trans)
        else:
            ordered_exon[0].name = "%s|exon" % each_trans

        ## normal branch, ordered_exon is main player
        chrom = ordered_exon[0].iv.chrom
        strand = ordered_exon[0].iv.strand
        biotype = feature_list.biotype()
        # add tag things here, just "coding", "non_coding" for now
        feature_list.tag = "coding" if ordered_CDS != [] else "non_coding"
        if biotype:
            biotype = "|%s" % ("_".join(biotype),)  # biotype should be unique
        else:
            biotype = ""

        ## _intron
        if len(ordered_exon) == 2:
            if strand == "+":
                intron = construct_iv(chrom, ordered_exon[0].iv.end, ordered_exon[1].iv.start, strand)
            else:
                intron = construct_iv(chrom, ordered_exon[1].iv.end, ordered_exon[0].iv.start, strand)
            feature_list.append(
                HTSeq.GenomicFeature("{each_trans}|intron".format(each_trans=each_trans), "_intron", intron))
        elif len(ordered_exon) > 2:
            for i in range(1, len(ordered_exon)):  # get all introns
                if strand == "+":
                    intron = construct_iv(chrom, ordered_exon[i-1].iv.end, ordered_exon[i].iv.start, strand)
                else:
                    intron = construct_iv(chrom, ordered_exon[i].iv.end, ordered_exon[i-1].iv.start, strand)
                feature_list.append(
                    HTSeq.GenomicFeature("{each_trans}|intron.{}".format(i, each_trans=each_trans), "_intron", intron))

        ## gene
        if strand == "+":
            gene = construct_iv(chrom, ordered_exon[0].iv.start, ordered_exon[-1].iv.end, strand)  # gene is an iv
        else:
            gene = construct_iv(chrom, ordered_exon[-1].iv.start, ordered_exon[0].iv.end, strand)
        feature_list.append(
            HTSeq.GenomicFeature("{each_trans}{biotype}".format(
                each_trans=each_trans, biotype=biotype), "_transcript", gene))

        if len(ordered_CDS):
            ## _utr5
            if ordered_exon[0].iv.start_d != ordered_CDS[0].iv.start_d:
                if strand == "+":
                    utr5 = construct_iv(chrom, ordered_exon[0].iv.start, ordered_CDS[0].iv.start, strand)  # temporary
                else:
                    if ordered_CDS[0].iv.end > ordered_exon[0].iv.end:
                        # CDS extends past the first exon: abandon the rest
                        # of this transcript (it is not recorded in `genes`).
                        continue
                    utr5 = construct_iv(chrom, ordered_CDS[0].iv.end, ordered_exon[0].iv.end, strand)
                prime5_exons = [x.iv for x in ordered_exon if x.iv.overlaps(utr5)]
                if len(prime5_exons) == 1:
                    feature_list.append(
                        HTSeq.GenomicFeature(
                            "{each_trans}|utr5".format(each_trans=each_trans), "_utr5", utr5))
                else:
                    for i in range(1, len(prime5_exons) + 1):
                        if i == len(prime5_exons):
                            # The last overlapping exon is clipped at the
                            # UTR boundary; earlier ones are used whole.
                            if strand == "+":
                                i_utr5 = construct_iv(chrom, prime5_exons[-1].start, utr5.end, strand)
                            else:
                                i_utr5 = construct_iv(chrom, utr5.start, prime5_exons[-1].end, strand)
                        else:
                            i_utr5 = prime5_exons[i-1]
                        feature_list.append(
                            HTSeq.GenomicFeature(
                                "{each_trans}|utr5.{}".format(i, each_trans=each_trans), "_utr5", i_utr5))

            ## stop_codon and utr3
            if ordered_CDS[-1].iv.end_d != ordered_exon[-1].iv.end_d:
                ## get utr3
                if feature_list["stop_codon"] != []:
                    if abs(ordered_CDS[-1].iv.end_d - ordered_exon[-1].iv.end_d) != 3:
                        # Leave room for the 3-base stop codon after the CDS.
                        if strand == "+":
                            utr3 = construct_iv(chrom, ordered_CDS[-1].iv.end + 3, ordered_exon[-1].iv.end, strand)
                        else:
                            utr3 = construct_iv(chrom, ordered_exon[-1].iv.start, ordered_CDS[-1].iv.start - 3, strand)
                    else:
                        utr3 = False  # this is important
                else:
                    if strand == "+":
                        if ordered_CDS[-1].iv.end > ordered_exon[-1].iv.end:
                            # Same bail-out as for utr5: skip the rest of
                            # this transcript.
                            continue
                        utr3 = construct_iv(chrom, ordered_CDS[-1].iv.end, ordered_exon[-1].iv.end, strand)
                    else:
                        utr3 = construct_iv(chrom, ordered_exon[-1].iv.start, ordered_CDS[-1].iv.start, strand)
                if utr3:  # there may be no utr3
                    prime3_exons = [x.iv for x in feature_list['exon'] if x.iv.overlaps(utr3)]
                    if len(prime3_exons) == 1:
                        feature_list.append(
                            HTSeq.GenomicFeature(
                                "{each_trans}|utr3".format(each_trans=each_trans), "_utr3", utr3))
                    else:
                        for i in range(1, len(prime3_exons) + 1):
                            if i == 1:
                                # The first overlapping exon is clipped at
                                # the UTR boundary; later ones are used whole.
                                if strand == "+":
                                    i_utr3 = construct_iv(chrom, utr3.start, prime3_exons[0].end, strand)
                                else:
                                    i_utr3 = construct_iv(chrom, prime3_exons[0].start, utr3.end, strand)
                            else:
                                i_utr3 = prime3_exons[i-1]
                            feature_list.append(
                                HTSeq.GenomicFeature(
                                    "{each_trans}|utr3.{}".format(i, each_trans=each_trans), "_utr3", i_utr3))

        ## promoter
        if strand == "+":
            ps = ordered_exon[0].iv.start - args.pl  # promoter start position
            if ps < 0:
                ps = 0
            promoter = construct_iv(chrom, ps, ordered_exon[0].iv.start, strand)
        else:  # if strand == "-"
            ps = ordered_exon[0].iv.end + args.pl
            # Clamp the promoter to the end of the chromosome.
            if ps > chr_lengths[chrom]:
                ps = chr_lengths[chrom]
            if ordered_exon[0].iv.end > ps:
                print ( 'skipping:%s, %s, %s, %s' % (chrom, ordered_exon[0].iv.end, ps, strand), file=sys.stderr )
                continue
            promoter = construct_iv(chrom, ordered_exon[0].iv.end, ps, strand)
        feature_list.append(
            HTSeq.GenomicFeature(
                "{each_trans}|promoter".format(each_trans=each_trans), "_promoter", promoter))

        ## record
        genes[chr_gene_id].append(gene)  # using chr_gene_id: "chr:gene_id"

    ## create folders
    ## write to file
    for each_trans, feature_list in transcripts.iteritems():
        feature_list.report()
        if not args.no_whole_report:
            feature_list.whole_report()

    ## close all fhandlers
    for x in fhandlers:
        try:
            x.close()
        except Exception:
            # Best effort only, but do not swallow KeyboardInterrupt/SystemExit.
            print("file handler %s cannot be closed, maybe you have closed it?" % x)

    if test:
        pdb.set_trace()

    ## * ################### try to output intergenic region ###################
    # 'with' makes sure the bed file is closed even if the assertion below
    # or a write fails part-way through.
    with open("%s/intergenic.bed" % outdir, "w") as intergenic_f:
        outer_genes = OrderedDict()
        for chr_gene_id, gs in genes.iteritems():
            chr_id = chr_gene_id.split(":")[0]
            gene_id = chr_gene_id.split(":")[1]
            if chr_id not in outer_genes:
                outer_genes[chr_id] = {}
            if len(gs) > 1:
                # combine multiple transcripts of the same gene
                gene = construct_iv(gs[0].chrom,
                                    min([x.start for x in gs]),
                                    max([x.end for x in gs]),
                                    gs[0].strand)
                outer_genes[chr_id][gene_id] = gene
            else:
                outer_genes[chr_id][gene_id] = gs[0]

        for chr_id in outer_genes:
            flag = 0        # end of the right-most gene seen so far
            former_g = ""   # gene before interval
            next_g = ""     # gene after interval
            for gene_id in sorted(outer_genes[chr_id].keys(),
                                  key=lambda x: outer_genes[chr_id][x].start):  # super-low efficiency
                chr_gene_id = "%s:%s" % (chr_id, gene_id)
                gene = outer_genes[chr_id][gene_id]
                e = gene.end
                s = gene.start
                assert s < e, (chr_gene_id, gene, s, e)
                if flag == s:  # gene starts exactly where the previous one ended
                    flag = e
                    former_g = chr_gene_id
                    continue
                elif flag < s:  # normal stuff
                    next_g = chr_gene_id
                    _id = former_g + "--" + next_g
                    inter_g = construct_iv(gene.chrom, flag, s, "+")
                    intergenic_f.write(str_iv2(inter_g, _id))
                    flag = e
                    former_g = chr_gene_id
                    continue
                elif flag >= e:
                    # Gene lies entirely inside the region already covered.
                    logging.debug("Former({}) may overlap with gene_id({})".format(former_g, gene_id))
                    continue
                elif flag < e:
                    # Partial overlap: just extend the covered region.
                    flag = e
                    former_g = chr_gene_id
                    continue
            # Trailing interval from the last gene to the chromosome end.
            next_g = ""
            _id = former_g + "--" + next_g
            logging.debug("{0}({1}) has start of {flag}, end of {2}".format(gene.chrom, chr_id, chr_lengths[chr_id], flag=flag))
            inter_g = construct_iv(gene.chrom, flag, chr_lengths[chr_id], "+")
            logging.debug("OUTPUT last interval, former is %s" % former_g)
            intergenic_f.write(str_iv2(inter_g, _id))
# Map from aggregate ID to the list of exonic_part features that belong to it.
aggregates = collections.defaultdict(list)
# 'i' is the running step index; it stays at -1 when there are no steps at all.
i = -1
for i, (iv, s) in enumerate(exons.steps()):
    # exon_omit_list is indexed by step number; a truthy entry drops the step
    if exon_omit_list[i]:
        continue
    # Any gene ID of the step will do as a key into gene_sets; the aggregate
    # ID is the '+'-joined content of that gene set.
    gene_id = next(iter(s))[0]
    assert {gid for gid, _ in s} <= gene_sets[gene_id]
    aggregate_id = '+'.join(gene_sets[gene_id])
    # Build the exonic_part feature and file it under its aggregate
    f = HTSeq.GenomicFeature(aggregate_id, "exonic_part", iv)
    f.source = os.path.basename(sys.argv[1])
    transcript_set = {tid for _, tid in s}
    f.attr = {
        'gene_id': aggregate_id,
        'transcripts': '+'.join(transcript_set),
    }
    aggregates[aggregate_id].append(f)

# Step 4: For each aggregate, number the exonic parts
aggregate_features = []
for l in aggregates.values():
    # The first part supplies the aggregate's identity and left boundary
    name = l[0].name
    chrom, strand, start = l[0].iv.chrom, l[0].iv.strand, l[0].iv.start
def create_sliding_gene_window_GTF(path_annot, annotation_file):
    '''
    Prepare window file in GTF for Peak detection.

    Reads ``<path_annot>/<annotation_file>.annotation.gene.gtf`` and writes
    ``<path_annot>/<annotation_file>.gene.slidingwindows.gtf``.  For every
    feature of the input, overlapping windows derived from ``WINDOW_SIZE``
    are emitted, each start shifted by half a window from the previous one,
    walking from the feature's directional start towards its directional
    end; a final, shorter window covers the remainder.

    :param path_annot: directory holding the annotation files
    :param annotation_file: base name (prefix) of the annotation files
    :return: None
    '''
    print('Create ' + annotation_file + '.gene.slidingwindows.gtf file for Peak detection')
    print('Every overlapping window of 100bp on every gene is calculated')
    # Floor division: on Python 3 '/' would yield a float and turn every
    # window coordinate into a float as well.
    overlap = WINDOW_SIZE // 2
    gtf_file = HTSeq.GFF_Reader(path_annot + '/' + annotation_file + '.annotation.gene.gtf', end_included=True)

    def window_line(feature, window_id, start, end):
        # Build the GFF line of one window feature on the gene's strand.
        window = HTSeq.GenomicInterval(feature.iv.chrom, start, end, feature.iv.strand)
        name = (feature.attr['gene_id'] + '_window_'
                + feature.attr['gene_type'] + '_' + str(window_id))
        return HTSeq.GenomicFeature(name, 'window', window).get_gff_line()

    transcriptID = 1
    with open(path_annot + '/' + annotation_file + '.gene.slidingwindows.gtf', "w") as slidingGTF:
        for feature in gtf_file:
            interval = feature.iv
            transcriptID += 1
            if transcriptID % 10000 == 0:
                print('Gene: ' + str(transcriptID) + '/ 100000')
            windowID = 1
            begin = interval.start_d
            if interval.strand == '+':
                # Walk left-to-right in half-window steps.
                end = begin + WINDOW_SIZE
                while end < interval.end_d:
                    slidingGTF.write(window_line(feature, windowID, begin, end + 1))
                    windowID += 1
                    begin += overlap
                    end = begin + WINDOW_SIZE
                # Last (possibly shorter) window up to the gene end.
                slidingGTF.write(window_line(feature, windowID, begin, interval.end_d))
            else:
                # Directional start is the right-most position: walk
                # right-to-left, so 'end' is the smaller coordinate here.
                end = begin - WINDOW_SIZE
                while end > interval.end_d:
                    slidingGTF.write(window_line(feature, windowID, end, begin + 1))
                    windowID += 1
                    begin -= overlap
                    end = begin - WINDOW_SIZE
                # Last (possibly shorter) window down to the gene end.
                slidingGTF.write(window_line(feature, windowID, interval.end_d + 1, begin))