Esempio n. 1
0
    def get_features(self, cluster_number):
        """Return the three GFF features describing this cluster.

        The result is the tuple ``(left_feature, insert_feature,
        right_feature)``.  The left flank is a copy of ``self.left.iv`` put
        on the "+" strand, the right flank a copy of ``self.right.iv`` put on
        the "-" strand; each carries its read count as ``score``.  The insert
        feature is unstranded and stores ``self.gap`` in its ``length``
        attribute.
        """
        # Flanks are built on copies so the stored intervals keep their strand.
        upstream_iv = self.left.iv.copy()
        upstream_iv.strand = "+"
        upstream = HTSeq.GenomicFeature(
            "cluster_{0}_left".format(cluster_number), CLUSTER_GFF_TYPE,
            upstream_iv)
        upstream.score = self.left.count

        downstream_iv = self.right.iv.copy()
        downstream_iv.strand = "-"
        downstream = HTSeq.GenomicFeature(
            "cluster_{0}_right".format(cluster_number), CLUSTER_GFF_TYPE,
            downstream_iv)
        downstream.score = self.right.count

        # With no gap the insert interval is widened by one position on each
        # side; otherwise it runs from the left end to the right start.
        if self.gap == 0:
            insert_start = upstream_iv.end - 1
            insert_end = downstream_iv.start + 1
        else:
            insert_start = upstream_iv.end
            insert_end = downstream_iv.start
        insert_iv = HTSeq.GenomicInterval(upstream_iv.chrom, insert_start,
                                          insert_end, ".")

        insert_id = "cluster_{0}_insert".format(cluster_number)
        insert = HTSeq.GenomicFeature(insert_id, GAP_GFF_TYPE, insert_iv)
        insert.attr = {"ID": insert_id, "length": self.gap}

        return (upstream, insert, downstream)
Esempio n. 2
0
    def get_features(self, cluster_number):
        """Return the GFF features for a singleton cluster as a list.

        The first element is always the singleton feature (a copy of
        ``self.singleton.iv`` with ``score`` set to the singleton's count).
        Its strand depends on ``self.singleton.insertion_side``: "+" for
        ``RIGHT``, "-" for ``LEFT`` and "." otherwise.  When the side is
        ``RIGHT`` or ``LEFT`` a second, unstranded one-position junction
        feature is appended: ``[end, end + 1)`` for ``RIGHT`` and
        ``[start - 1, start)`` for ``LEFT``.
        """
        singleton_iv = self.singleton.iv.copy()
        side = self.singleton.insertion_side

        insert_iv = None
        if side == RIGHT:
            singleton_iv.strand = "+"
            insert_iv = HTSeq.GenomicInterval(singleton_iv.chrom,
                                              singleton_iv.end,
                                              singleton_iv.end + 1, ".")
        elif side == LEFT:
            singleton_iv.strand = "-"
            insert_iv = HTSeq.GenomicInterval(singleton_iv.chrom,
                                              singleton_iv.start - 1,
                                              singleton_iv.start, ".")
        else:
            # Unknown insertion side: no junction feature is produced.
            singleton_iv.strand = "."

        singleton_feature = HTSeq.GenomicFeature(
            "cluster_{0}_singleton".format(cluster_number), CLUSTER_GFF_TYPE,
            singleton_iv)
        singleton_feature.score = self.singleton.count
        feature_list = [singleton_feature]

        # Identity test: "!= None" would go through the interval's rich
        # comparison instead of simply asking whether a junction was built.
        if insert_iv is not None:
            insert_feature = HTSeq.GenomicFeature(
                "cluster_{0}_junction".format(cluster_number),
                SINGLETON_INSERT_GFF_TYPE, insert_iv)
            feature_list.append(insert_feature)

        return feature_list
Esempio n. 3
0
def collapseSortedGF(ll):
    """Collapse consecutive overlapping features into single features.

    ``ll`` is iterated once, in order; each feature is compared against the
    feature currently being accumulated, so the input is expected to be
    sorted by position.  A feature that overlaps the accumulator is folded
    into it with ``gfMerge``; a feature that does not overlap closes the
    accumulator and starts a new one.

    Each accumulator is a fresh ``HTSeq.GenomicFeature`` built from the
    name, type, a copy of the interval and a copy of ``attr`` of the first
    feature of its run, so the merge target is never one of the input
    objects.

    :param ll: iterable of features with ``name``, ``type``, ``iv``, ``attr``
    :return: list of collapsed features; empty when ``ll`` is empty
    """

    def _start_run(src):
        # New accumulator seeded from the first feature of a run.
        dup = HTSeq.GenomicFeature(src.name, src.type, src.iv.copy())
        dup.attr = src.attr.copy()
        return dup

    l_out = []
    gf_new = None

    for feature in ll:
        if gf_new is None:
            gf_new = _start_run(feature)
        elif feature.iv.overlaps(gf_new.iv):
            # features overlap, merge them
            gfMerge(gf_new, feature)
        else:
            # no overlap: the current run is complete, begin the next one
            l_out.append(gf_new)
            gf_new = _start_run(feature)

    # Flush the last run.  Guarded so that an empty input yields [] rather
    # than a list containing None.
    if gf_new is not None:
        l_out.append(gf_new)

    return l_out
Esempio n. 4
0
def create_peak_gtf(path, exp_design_name, technique, bed_name):
    """
    Convert a tab-separated peak table into a GTF file of 'exon' features.

    The peak table is ``<prefix>_Peaks.txt`` and the output is
    ``<prefix>.gtf``, where ``<prefix>`` is
    ``path/PeakDetection/Peaks/<exp_design_name>_<bed_name>`` when
    ``technique`` is ``''`` or ``'All'``, and
    ``path/PeakDetection/<technique>//<exp_design_name>_<technique>_<bed_name>``
    otherwise.  Each row must provide the columns ``chromo_peak``,
    ``begin_peak``, ``end_peak`` and ``WindowId``; one unstranded feature
    named after ``WindowId`` is written per row, with a ``gene_id``
    attribute appended to the GFF line.

    :param path: root directory containing the ``PeakDetection`` folder
    :param exp_design_name: experiment design name used in the file names
    :param technique: technique sub-folder, or ``''`` / ``'All'``
    :param bed_name: name component shared by the input and output files
    :return: None
    """
    if technique == '' or technique == 'All':
        PATH_PEAKS = path + '/PeakDetection/Peaks'
        prefix = PATH_PEAKS + '/' + exp_design_name + '_' + bed_name
    else:
        PATH_PEAKS = path + '/PeakDetection/' + technique + '/'
        prefix = (PATH_PEAKS + '/' + exp_design_name + '_' + technique + '_' +
                  bed_name)
    peak_filename = prefix + '_Peaks.txt'
    gtf_filename = prefix + '.gtf'

    # The input is opened first so that a missing peak table raises before
    # the output file is created or truncated.  Plain 'r' is used because the
    # 'U' flag was removed in Python 3.11 and text mode already applies
    # universal newlines on Python 3.
    with open(peak_filename, 'r') as peak_file, \
            open(gtf_filename, 'w') as gtf_file:
        csv_peaks = csv.DictReader(peak_file, delimiter='\t')
        for row in csv_peaks:
            peak = HTSeq.GenomicInterval(row['chromo_peak'],
                                         int(row['begin_peak']),
                                         int(row['end_peak']), ".")
            peak_id = row['WindowId']
            feature = HTSeq.GenomicFeature(peak_id, 'exon', peak)
            gtf_file.write(feature.get_gff_line().strip() + '; gene_id \"' +
                           peak_id + '\"' + '\n')
Esempio n. 5
0
def parseGtf(sz_file, d_g):
    """Load the exon records of a GTF file into ``d_g``, keyed by transcript.

    For every feature of type "exon" read from ``sz_file``, the feature is
    renamed to its ``transcript_id``, its strand is reset to ".", and it is
    appended to ``d_g[transcript_id]['exons']``.
    ``d_g[transcript_id]['feature']`` holds a "gene"-typed feature whose
    interval starts as a copy of the first exon's interval and is widened to
    cover every later exon of the same transcript.

    The dictionary is updated in place; nothing is returned.
    """
    for record in HTSeq.GFF_Reader(sz_file):
        if record.type != "exon":
            continue

        tid = record.attr['transcript_id']
        record.name = tid

        # First exon seen for this transcript: create its bucket.
        if tid not in d_g:
            d_g[tid] = {'exons': [], 'feature': None}
        bucket = d_g[tid]

        # Strand is discarded before the exon is stored.
        record.iv.strand = "."
        bucket['exons'].append(record)

        if bucket['feature'] is None:
            bucket['feature'] = HTSeq.GenomicFeature(record.name, "gene",
                                                     record.iv.copy())
            bucket['feature'].attr = record.attr.copy()
        else:
            # Stretch the transcript-level interval over this exon.
            span = bucket['feature'].iv
            span.start = min(span.start, record.iv.start)
            span.end = max(span.end, record.iv.end)
Esempio n. 6
0
    def _parse_gtf_features(self, transcript_id):
        self.feature_set = set()

        names = ['chrom','source','feature','start','end','score','strand','frame', 'attribute']
        gtf_data = pd.read_table('chr1.gtf', sep='\t', comment='#', names=names)
        print 'done reading'
        transcript_series = gtf_data['attribute'].apply(self._filter_enst, args=(transcript_id,))
        gtf_data = gtf_data.ix[transcript_series]
        for r in gtf_data.iterrows():
            r = r[1] # iterrows is a tuple of (index, Series)
            self.feature_set.add(HTSeq.GenomicFeature(transcript_id, r.feature, HTSeq.GenomicInterval(r.chrom, r.start, r.end, r.strand)))
    if aggregateGenes == False:
        check_set = set()
        for geneID, transcript_id in s:
            check_set.add(geneID)
        if (len(check_set) > 1):
            continue
        else:
            aggregate_id = gene_id
    # Take one of the gene IDs, find the others via gene sets, and
    # form the aggregate ID from all of them
    else:
        assert set(gene_id
                   for gene_id, transcript_id in s) <= gene_sets[gene_id]
        aggregate_id = '+'.join(gene_sets[gene_id])
    # Make the feature and store it in 'aggregates'
    f = HTSeq.GenomicFeature(aggregate_id, "exonic_part", iv)
    f.source = os.path.basename(sys.argv[0])
    #   f.source = "camara"
    f.attr = {}
    f.attr['gene_id'] = aggregate_id
    transcript_set = set((transcript_id for gene_id, transcript_id in s))
    f.attr['transcripts'] = '+'.join(transcript_set)
    aggregates[aggregate_id].append(f)

# Step 4: For each aggregate, number the exonic parts

aggregate_features = []
for l in aggregates.values():
    for i in range(len(l) - 1):
        assert l[i].name == l[i + 1].name, str(l[i + 1]) + " has wrong name"
        assert l[i].iv.end <= l[i + 1].iv.start, str(
Esempio n. 8
0
#Annotate promoters
# For every non-empty step of `promoter_part`, build a 2000-position
# "promoter" feature: the window ending at iv.start on the "+" strand, and
# the window starting at iv.end otherwise.
promoters = []
for iv, s in promoter_part.steps():
    if len(s) == 0:
        continue
    if iv.strand == "+":
        new_iv = HTSeq.GenomicInterval(iv.chrom, iv.start - 2000, iv.start,
                                       iv.strand)
    else:
        new_iv = HTSeq.GenomicInterval(iv.chrom, iv.end, iv.end + 2000,
                                       iv.strand)

    # When the step holds several IDs, the one yielded last by iterating
    # `s` is the one used for the feature.
    for g_id in s:
        gene_id = g_id
    promoter = HTSeq.GenomicFeature(gene_id, "promoter", new_iv)
    promoter.attr = {'gene_id': gene_id}
    promoters.append(promoter)
promoters.sort(key=lambda promoter: (promoter.iv.chrom, promoter.iv.start))
# The context manager closes the file even if writing a line fails.
with open(promoter_file, "w") as fout:
    for promoter in promoters:
        fout.write(promoter.get_gff_line())

#Annotate introns
introns = []
for iv, s in intron_part.steps():
    if len(s) == 0:
        continue
    iv = HTSeq.GenomicInterval(iv.chrom, iv.start, iv.end, iv.strand)
    for g_id in s:
Esempio n. 9
0
def run(args):
    """Derive a per-gene region annotation from a GTF file and write it out.

    Features are read from ``args.inputfile``; only those whose chromosome
    name matches ``chr[a-zA-Z0-9]+$`` and whose ``gene_id`` is not reported by
    ``find_bad_genes`` are used.  From the exon, start_codon and stop_codon
    records the function builds, for every gene: a gene_region feature,
    transcript/CDS/UTR region features, exonic and intronic bins,
    constitutive exonic and intronic bins, and constitutive junctions.  All
    of them are written as GFF lines to
    ``os.path.join(args.outdir, args.annofile)``, grouped by gene.

    NOTE(review): the body uses ``xrange``, indexes the results of
    ``dict.keys()`` / ``dict.values()``, and reuses the result of ``filter``
    after handing it to ``find_bad_genes``, so it appears to target
    Python 2 - confirm before running it under Python 3.

    :param args: object providing the ``inputfile``, ``outdir`` and
        ``annofile`` attributes
    """
    exons = collections.defaultdict(
        lambda: HTSeq.GenomicArrayOfSets("auto", stranded=True))
    gene_region = {}
    gene_region_length = collections.Counter()
    transcript_region = collections.defaultdict(lambda: dict())
    start_codon_region = collections.defaultdict(lambda: dict())
    stop_codon_region = collections.defaultdict(lambda: dict())

    CDS_region = collections.defaultdict(lambda: dict())
    five_UTR_region = collections.defaultdict(lambda: dict())
    three_UTR_region = collections.defaultdict(lambda: dict())

    # Read features from the input GTF file.
    gtffile = HTSeq.GFF_Reader(args.inputfile, end_included=True)
    # Keep only features whose chromosome name ends in "chr" + alphanumerics.
    gtffile = filter(
        lambda feature: re.search(r'chr[a-zA-Z0-9]+$', feature.iv.chrom),
        gtffile)
    bad_gene_list = find_bad_genes(gtffile)
    logging.info(
        "Removing genes with exons in different chromosomes or strands (%i discarded)"
        % len(bad_gene_list))

    gtffile = filter(
        lambda feature: feature.attr['gene_id'] not in bad_gene_list, gtffile)
    # Collect exons per gene (tagged with their transcript id) and remember
    # the start/stop codon interval of every transcript.
    for feature in gtffile:
        if feature.type == "exon":
            gene_id = feature.attr["gene_id"]
            exons[gene_id][feature.iv] += feature.attr["transcript_id"]
            extend_transcript_region(feature, transcript_region)
        elif feature.type == "start_codon":
            gene_id = feature.attr["gene_id"]
            transcript_id = feature.attr["transcript_id"]
            start_codon_region[gene_id][transcript_id] = feature.iv
        elif feature.type == "stop_codon":
            gene_id = feature.attr["gene_id"]
            transcript_id = feature.attr["transcript_id"]
            stop_codon_region[gene_id][transcript_id] = feature.iv

    gene_region = find_gene_region(transcript_region)
    gene_region_length = find_gene_region_length(gene_region,
                                                 transcript_region)
    (CDS_region, five_UTR_region,
     three_UTR_region) = find_CDS_and_UTR_region(start_codon_region,
                                                 stop_codon_region,
                                                 transcript_region)

    # Within a transcript's region, every step of the gene's exon array that
    # does not contain that transcript is recorded as an intron of it.
    introns = collections.defaultdict(
        lambda: HTSeq.GenomicArrayOfSets("auto", stranded=True))
    for gene_id in transcript_region.keys():
        for transcript_id in transcript_region[gene_id].keys():
            transcript_iv = transcript_region[gene_id][transcript_id]
            for iv, step_set in exons[gene_id][transcript_iv].steps():
                if transcript_id not in step_set:
                    introns[gene_id][iv] += transcript_id

    # gene_exons_bins redefines the exons in one gene. All exons in the gene region are split into exon bins.
    # Each exon bin is a feature (feature type is "exonic_region"), which has attributes: "gene_id" and "transcripts".
    # One exon bin is possibly shared by multiple transcripts.
    gene_exons_bins = collections.defaultdict(lambda: list())

    for gene_id in gene_region.keys():
        gene_iv = gene_region[gene_id]
        for iv, step_set in exons[gene_id][gene_iv].steps():
            transcript_list = list(step_set)
            if len(transcript_list) != 0:
                feature = HTSeq.GenomicFeature(gene_id, "exonic_region", iv)
                feature.source = "IR_annotation"
                feature.attr = {}
                feature.attr["gene_id"] = gene_id
                feature.attr["transcripts"] = "+".join(transcript_list)
                gene_exons_bins[gene_id].append(feature)
        # Reverse the bin list of "-" strand genes so the numbering below
        # follows the reversed order.
        if gene_iv.strand == "-":
            gene_exons_bins[gene_id] = gene_exons_bins[gene_id][::-1]

    # Number the exon bins with attribute "exonic_region_number" starting from "001".
    for exons_bins_list in gene_exons_bins.values():
        for i in xrange(len(exons_bins_list)):
            exons_bins_list[i].attr["exonic_region_number"] = "%03d" % (i + 1)

    # gene_introns_bins redefines the introns in one gene. All introns in the gene region are split into intron bins.
    # Each intron bin is a feature (feature type is "intronic_region"), which has attributes: "gene_id" and "transcripts".
    # One intron bin is possibly shared by multiple transcripts. If it isn't shared by one transcript, this intron bin
    # either overlaps with the exonic region of that transcript or lies outside of the whole region of that transcript.
    gene_introns_bins = collections.defaultdict(lambda: list())

    for gene_id in gene_region.keys():
        gene_iv = gene_region[gene_id]
        for iv, step_set in introns[gene_id][gene_iv].steps():
            transcript_list = list(step_set)
            if len(transcript_list) != 0:
                feature = HTSeq.GenomicFeature(gene_id, "intronic_region", iv)
                feature.source = "IR_annotation"
                feature.attr = {}
                feature.attr["gene_id"] = gene_id
                feature.attr["transcripts"] = "+".join(transcript_list)
                gene_introns_bins[gene_id].append(feature)
        if gene_iv.strand == "-":
            gene_introns_bins[gene_id] = gene_introns_bins[gene_id][::-1]

    # Number the intron bins with attribute "intronic_region_number" starting from "001".
    for introns_bins_list in gene_introns_bins.values():
        for i in xrange(len(introns_bins_list)):
            introns_bins_list[i].attr["intronic_region_number"] = "%03d" % (i +
                                                                            1)

    # gene_constitutive_exons_bins defines that kind of exon bins that shared by all the transcripts in one gene.
    # Each constitutive exon bin is a feature (feature type is "constitutive_exonic_region"), which has attribute: "gene_id".
    logging.info("Generating constitutive exonic region (CER) annotation")

    gene_constitutive_exons_bins = collections.defaultdict(lambda: list())
    gene_constitutive_exons_start_d = collections.defaultdict(lambda: set())
    gene_constitutive_exons_end_d = collections.defaultdict(lambda: set())
    gene_constitutive_exons_length = collections.Counter()
    gene_constitutive_exons_number = collections.Counter()

    for gene_id in gene_region.keys():
        transcripts_in_gene = len(transcript_region[gene_id])
        gene_iv = gene_region[gene_id]
        for iv, step_set in exons[gene_id][gene_iv].steps():
            transcript_list = list(step_set)
            # A bin is constitutive when every transcript of the gene is in it.
            if len(transcript_list) == transcripts_in_gene:
                feature = HTSeq.GenomicFeature(gene_id,
                                               "constitutive_exonic_region",
                                               iv)
                feature.source = "IR_annotation"
                feature.attr = {}
                feature.attr["gene_id"] = gene_id
                gene_constitutive_exons_bins[gene_id].append(feature)
                gene_constitutive_exons_start_d[gene_id].add(
                    feature.iv.start_d_as_pos)
                gene_constitutive_exons_end_d[gene_id].add(
                    feature.iv.end_d_as_pos)
                gene_constitutive_exons_length[gene_id] += feature.iv.length
                gene_constitutive_exons_number[gene_id] += 1
        if gene_iv.strand == "-":
            gene_constitutive_exons_bins[
                gene_id] = gene_constitutive_exons_bins[gene_id][::-1]

    for constitutive_exons_bins_list in gene_constitutive_exons_bins.values():
        for i in xrange(len(constitutive_exons_bins_list)):
            constitutive_exons_bins_list[i].attr[
                "constitutive_exonic_region_number"] = "%03d" % (i + 1)

    # gene_constitutive_introns_bins defines that kind of intron bins that shared by all the transcripts in one gene.
    # Each constitutive intron bin is a feature (feature type is "constitutive_intronic_region"), which has attribute: "gene_id".
    # For those intron bins in single transcript gene, if the intron bin is in 5' UTR, it will have attribute: "five_UTR_constitutive_intron";
    # If the intron bin is in 3' UTR, it will have attribute: "three_UTR_constitutive_intron".
    # Didn't define "five_UTR_constitutive_intron" or "three_UTR_constitutive_intron" for intron bins in multiple transcripts gene yet.
    logging.info("Generating constitutive intronic region (CIR) annotation")

    gene_constitutive_introns_bins = collections.defaultdict(lambda: list())
    gene_constitutive_introns_start_d = collections.defaultdict(lambda: set())
    gene_constitutive_introns_end_d = collections.defaultdict(lambda: set())
    gene_constitutive_introns_length = collections.Counter()
    gene_constitutive_introns_number = collections.Counter()

    for gene_id in gene_region.keys():
        transcripts_in_gene = len(transcript_region[gene_id])
        gene_iv = gene_region[gene_id]
        exist_UTR_regions = False
        # UTR intervals are only computed for single-transcript genes that
        # have a start codon record.
        if transcripts_in_gene == 1 and gene_id in start_codon_region.keys():
            assert len(start_codon_region[gene_id]) == len(
                stop_codon_region[gene_id]) == 1
            # Indexing keys()/values() requires them to be lists.
            transcript_id = start_codon_region[gene_id].keys()[0]
            start_codon_region_iv = start_codon_region[gene_id].values()[0]
            stop_codon_region_iv = stop_codon_region[gene_id].values()[0]
            (five_UTR_region_iv, three_UTR_region_iv) = find_UTR_region_iv(
                start_codon_region_iv, stop_codon_region_iv,
                transcript_region[gene_id][transcript_id])
            exist_UTR_regions = True
        for iv, step_set in introns[gene_id][gene_iv].steps():
            transcript_list = list(step_set)
            if len(transcript_list) == transcripts_in_gene:
                feature = HTSeq.GenomicFeature(gene_id,
                                               "constitutive_intronic_region",
                                               iv)
                feature.source = "IR_annotation"
                feature.attr = {}
                feature.attr["gene_id"] = gene_id
                if exist_UTR_regions == True:
                    if feature.iv.is_contained_in(five_UTR_region_iv):
                        feature.attr[
                            "five_UTR_constitutive_intron"] = "five_UTR_constitutive_intron"
                    elif feature.iv.is_contained_in(three_UTR_region_iv):
                        feature.attr[
                            "three_UTR_constitutive_intron"] = "three_UTR_constitutive_intron"
                gene_constitutive_introns_bins[gene_id].append(feature)
                gene_constitutive_introns_start_d[gene_id].add(
                    feature.iv.start_d_as_pos)
                gene_constitutive_introns_end_d[gene_id].add(
                    feature.iv.end_d_as_pos)
                gene_constitutive_introns_length[gene_id] += feature.iv.length
                gene_constitutive_introns_number[gene_id] += 1
        if gene_iv.strand == "-":
            gene_constitutive_introns_bins[
                gene_id] = gene_constitutive_introns_bins[gene_id][::-1]

    five_UTR_constitutive_introns = collections.defaultdict(lambda: list())
    three_UTR_constitutive_introns = collections.defaultdict(lambda: list())

    # Number the constitutive intron bins and collect, per gene, the numbers
    # of those flagged as lying in the 5' or 3' UTR.
    for constitutive_introns_bins_list in gene_constitutive_introns_bins.values(
    ):
        for i in xrange(len(constitutive_introns_bins_list)):
            gene_id = constitutive_introns_bins_list[i].attr["gene_id"]
            constitutive_intronic_region_number = constitutive_introns_bins_list[
                i].attr["constitutive_intronic_region_number"] = "%03d" % (i +
                                                                           1)
            if "five_UTR_constitutive_intron" in constitutive_introns_bins_list[
                    i].attr.keys():
                five_UTR_constitutive_introns[gene_id].append(
                    constitutive_intronic_region_number)
            elif "three_UTR_constitutive_intron" in constitutive_introns_bins_list[
                    i].attr.keys():
                three_UTR_constitutive_introns[gene_id].append(
                    constitutive_intronic_region_number)

    # gene_constitutive_junction defines that kind of junction positions that join constitutive exon bin and constitutive intron bin in one gene.
    # Each constitutive junction is a feature (feature type is "constitutive_junction"), which has attributes: "gene_id", "constitutive_junction_type", "upstream", "downstream".
    # attr["constitutive_junction_type"] can be the value: "5'_splice_junction", which means the upstream of the junction position is a constitutive exon bin,
    # and the downstream of the junction position is a constitutive intron bin. In this case, attr["upstream"] will be like "constitutive_exonic_region_number 002" (shows exactly which
    # constitutive exon bin in the upstream), and similarly for attr["downstream"].
    # On the other hand, attr["constitutive_junction_type"] can be the value: "3'_splice_junction"
    logging.info("Generating constitutive junctions (CJ) annotation")

    gene_constitutive_junction = collections.defaultdict(lambda: list())
    for gene_id in gene_constitutive_exons_start_d.keys():
        if gene_id in gene_constitutive_introns_start_d.keys():
            # Positions that are both a constitutive exon end and a
            # constitutive intron start: exon -> intron junctions.
            gene_constitutive_junction_from_exon_to_intron_set = gene_constitutive_exons_end_d[
                gene_id] & gene_constitutive_introns_start_d[gene_id]
            for gene_constitutive_junction_pos in gene_constitutive_junction_from_exon_to_intron_set:
                feature = HTSeq.GenomicFeature(gene_id,
                                               "constitutive_junction",
                                               gene_constitutive_junction_pos)
                feature.source = "IR_annotation"
                feature.attr = {}
                feature.attr["gene_id"] = gene_id
                from_region_number = find_region_number(
                    gene_constitutive_junction_pos,
                    gene_constitutive_exons_bins[gene_id], "end_d_as_pos")
                feature.attr[
                    "upstream"] = "constitutive_exonic_region_number " + from_region_number
                to_region_number, index = find_region_number_and_index(
                    gene_constitutive_junction_pos,
                    gene_constitutive_introns_bins[gene_id], "start_d_as_pos")
                feature.attr[
                    "downstream"] = "constitutive_intronic_region_number " + to_region_number
                feature.attr[
                    "constitutive_junction_type"] = "5'_splice_junction"
                # `index` is stored alongside the feature; it is used below
                # to address the intron bin in gene_constitutive_introns_bins.
                gene_constitutive_junction[gene_id].append((feature, index))

            # Positions that are both a constitutive exon start and a
            # constitutive intron end: intron -> exon junctions.
            gene_constitutive_junction_from_intron_to_exon_set = gene_constitutive_exons_start_d[
                gene_id] & gene_constitutive_introns_end_d[gene_id]
            for gene_constitutive_junction_pos in gene_constitutive_junction_from_intron_to_exon_set:
                feature = HTSeq.GenomicFeature(gene_id,
                                               "constitutive_junction",
                                               gene_constitutive_junction_pos)
                feature.source = "IR_annotation"
                feature.attr = {}
                feature.attr["gene_id"] = gene_id
                from_region_number, index = find_region_number_and_index(
                    gene_constitutive_junction_pos,
                    gene_constitutive_introns_bins[gene_id], "end_d_as_pos")
                feature.attr[
                    "upstream"] = "constitutive_intronic_region_number " + from_region_number
                to_region_number = find_region_number(
                    gene_constitutive_junction_pos,
                    gene_constitutive_exons_bins[gene_id], "start_d_as_pos")
                feature.attr[
                    "downstream"] = "constitutive_exonic_region_number " + to_region_number
                feature.attr[
                    "constitutive_junction_type"] = "3'_splice_junction"
                gene_constitutive_junction[gene_id].append((feature, index))

        # Order the junctions by ascending start on "+" and by descending
        # start otherwise; the strand is taken from the first junction.
        if len(gene_constitutive_junction[gene_id]) > 0:
            gene_strand = gene_constitutive_junction[gene_id][0][0].iv.strand
            if gene_strand == "+":
                gene_constitutive_junction[gene_id].sort(
                    key=lambda f: (f[0].iv.chrom, f[0].iv.start))
            else:
                gene_constitutive_junction[gene_id].sort(
                    key=lambda f: (f[0].iv.chrom, -f[0].iv.start))

    # Number the junctions and write each junction's number back onto the
    # constitutive intron bin it borders.
    for gene_constitutive_junction_list in gene_constitutive_junction.values():
        for i in xrange(len(gene_constitutive_junction_list)):
            gene_constitutive_junction_list[i][0].attr[
                "constitutive_junction_number"] = "%03d" % (i + 1)

            feature = gene_constitutive_junction_list[i][0]
            gene_id = feature.attr["gene_id"]
            constitutive_junction_type = feature.attr[
                "constitutive_junction_type"]
            if constitutive_junction_type == "5'_splice_junction":
                index = gene_constitutive_junction_list[i][1]
                gene_constitutive_introns_bins[gene_id][index].attr[
                    "upstream_constitutive_junction_number"] = "%03d" % (i + 1)
            elif constitutive_junction_type == "3'_splice_junction":
                index = gene_constitutive_junction_list[i][1]
                gene_constitutive_introns_bins[gene_id][index].attr[
                    "downstream_constitutive_junction_number"] = "%03d" % (i +
                                                                           1)

    # gene_region_features defines a feature for each gene that summarize some key info. feature iv is the gene region.
    # Feature type is "gene_region". Each feature has attributes: "gene_id", "transcripts_in_gene" (count how many transcripts in this gene),
    # "gene_region_length", "constitutive_exonic_region_length", "constitutive_intronic_region_length".
    # For single transcript gene, if any of constitutive intron bins in 5' UTR, it will have attribute "five_UTR_constitutive_introns"
    # (value would be like "001,002", list the region numbers); similarly for 3' UTR.
    gene_region_features = []
    for gene_id in gene_region.keys():
        iv = gene_region[gene_id]
        feature = HTSeq.GenomicFeature(gene_id, "gene_region", iv)
        feature.source = "IR_annotation"
        feature.attr = {}
        feature.attr["gene_id"] = gene_id
        feature.attr["transcripts_in_gene"] = len(transcript_region[gene_id])
        feature.attr["gene_region_length"] = gene_region_length[gene_id]
        feature.attr[
            "constitutive_exonic_region_length"] = gene_constitutive_exons_length[
                gene_id]
        feature.attr[
            "constitutive_intronic_region_length"] = gene_constitutive_introns_length[
                gene_id]
        feature.attr[
            "constitutive_exonic_region_number"] = gene_constitutive_exons_number[
                gene_id]
        feature.attr[
            "constitutive_intronic_region_number"] = gene_constitutive_introns_number[
                gene_id]
        if gene_id in five_UTR_constitutive_introns.keys():
            feature.attr["five_UTR_constitutive_introns"] = ",".join(
                five_UTR_constitutive_introns[gene_id])
        if gene_id in three_UTR_constitutive_introns.keys():
            feature.attr["three_UTR_constitutive_introns"] = ",".join(
                three_UTR_constitutive_introns[gene_id])
        gene_region_features.append(feature)

    # Genes are written in (chromosome, start) order.
    gene_region_features.sort(key=lambda f: (f.iv.chrom, f.iv.start))

    # transcript_region_features defines a feature for each transcript. feature iv is the transcript region.
    # Feature type is "transcript_region". Each feature has attributes: "gene_id", "transcript_id".
    transcript_region_features = collections.defaultdict(lambda: list())
    for gene_id in transcript_region.keys():
        for transcript_id in transcript_region[gene_id].keys():
            iv = transcript_region[gene_id][transcript_id]
            feature = HTSeq.GenomicFeature(gene_id, "transcript_region", iv)
            feature.source = "IR_annotation"
            feature.attr = {}
            feature.attr["gene_id"] = gene_id
            feature.attr["transcript_id"] = transcript_id
            transcript_region_features[gene_id].append(feature)

    # Same layout as the transcript regions, for the CDS of each transcript.
    CDS_region_features = collections.defaultdict(lambda: list())
    for gene_id in CDS_region.keys():
        for transcript_id in CDS_region[gene_id].keys():
            iv = CDS_region[gene_id][transcript_id]
            feature = HTSeq.GenomicFeature(gene_id, "CDS_region", iv)
            feature.source = "IR_annotation"
            feature.attr = {}
            feature.attr["gene_id"] = gene_id
            feature.attr["transcript_id"] = transcript_id
            CDS_region_features[gene_id].append(feature)

    # Same layout, for the 5' UTR of each transcript.
    five_UTR_region_features = collections.defaultdict(lambda: list())
    for gene_id in five_UTR_region.keys():
        for transcript_id in five_UTR_region[gene_id].keys():
            iv = five_UTR_region[gene_id][transcript_id]
            feature = HTSeq.GenomicFeature(gene_id, "five_UTR_region", iv)
            feature.source = "IR_annotation"
            feature.attr = {}
            feature.attr["gene_id"] = gene_id
            feature.attr["transcript_id"] = transcript_id
            five_UTR_region_features[gene_id].append(feature)

    # Same layout, for the 3' UTR of each transcript.
    three_UTR_region_features = collections.defaultdict(lambda: list())
    for gene_id in three_UTR_region.keys():
        for transcript_id in three_UTR_region[gene_id].keys():
            iv = three_UTR_region[gene_id][transcript_id]
            feature = HTSeq.GenomicFeature(gene_id, "three_UTR_region", iv)
            feature.source = "IR_annotation"
            feature.attr = {}
            feature.attr["gene_id"] = gene_id
            feature.attr["transcript_id"] = transcript_id
            three_UTR_region_features[gene_id].append(feature)

    # Write all newly defined features into new gtf annotation file.
    logging.info("Writing annotation to file: %s" %
                 os.path.join(args.outdir, args.annofile))

    # NOTE(review): the handle is closed only on the normal path; an
    # exception while writing leaves it to the garbage collector.
    f = open(os.path.join(args.outdir, args.annofile), "w")
    # Each gene_region line is followed by all other features of that gene.
    for gene_region_feature in gene_region_features:
        f.write(gene_region_feature.get_gff_line())
        gene_id = gene_region_feature.attr["gene_id"]

        for feature in transcript_region_features[gene_id]:
            f.write(feature.get_gff_line())

        for feature in CDS_region_features[gene_id]:
            f.write(feature.get_gff_line())

        for feature in five_UTR_region_features[gene_id]:
            f.write(feature.get_gff_line())

        for feature in three_UTR_region_features[gene_id]:
            f.write(feature.get_gff_line())

        for feature in gene_exons_bins[gene_id]:
            f.write(feature.get_gff_line())

        for feature in gene_introns_bins[gene_id]:
            f.write(feature.get_gff_line())

        for feature in gene_constitutive_exons_bins[gene_id]:
            f.write(feature.get_gff_line())

        for feature in gene_constitutive_introns_bins[gene_id]:
            f.write(feature.get_gff_line())

        for feature in [
                item[0] for item in gene_constitutive_junction[gene_id]
        ]:
            f.write(feature.get_gff_line())

    f.close()
Esempio n. 10
0
def create_sliding_exon_window_GTF(windowSize):
    """Write half-overlapping sliding windows over every exon to a GTF file.

    Exon records are read from
    ``PATH_ANNOT/gencodeVM13/gencode.vM13.annotation.exon.gtf`` and one GFF
    line per window is written to
    ``PATH_ANNOT/gencodeVM13/gencode.vM13.exon.slidingwindow.gtf``.

    For ``+`` exons the windows run upward from the strand-aware start
    (``start_d``); for every other strand they run downward from it. The
    scan advances by ``windowSize // 2`` per window. When the next full
    window would reach the exon's ``end_d``, a final window covering the
    remainder is written, but only if it is longer than 1 base.

    :param windowSize: window length in bases; must be at least 2.
    :raises ValueError: if ``windowSize`` yields a step smaller than one
        base, because the scan would then never advance.
    """
    # Floor division keeps every window boundary an integer; true division
    # would turn the coordinates into floats on Python 3.
    overlap = windowSize // 2
    if overlap < 1:
        raise ValueError(
            "windowSize must be at least 2, got %r" % (windowSize,))

    gtf_file = HTSeq.GFF_Reader(
        PATH_ANNOT + "/gencodeVM13/gencode.vM13.annotation.exon.gtf",
        end_included=True)
    transcriptID = 1
    with open(PATH_ANNOT + "/gencodeVM13/gencode.vM13.exon.slidingwindow.gtf",
              "w") as slidingGTF:
        for feature in gtf_file:
            if feature.type != 'exon':
                continue
            interval = feature.iv
            transcriptID += 1
            if transcriptID % 1000 == 0:
                print('Gene: ' + str(transcriptID) + '/ 100000')

            # All windows of one exon share this prefix; only the trailing
            # window number differs.
            name_prefix = (feature.attr['transcript_name'] + '_' +
                           feature.attr['exon_number'] + '_window_' +
                           feature.attr['gene_type'] + '_')
            windowID = 1
            begin = interval.start_d
            if interval.strand == '+':
                end = begin + windowSize
                while end < interval.end_d:
                    window = HTSeq.GenomicInterval(interval.chrom, begin,
                                                   end + 1, interval.strand)
                    featureWindow = HTSeq.GenomicFeature(
                        name_prefix + str(windowID), 'window', window)
                    slidingGTF.write(featureWindow.get_gff_line())
                    windowID += 1
                    begin += overlap
                    end = begin + windowSize
                # Remainder between the last step and the exon end.
                window = HTSeq.GenomicInterval(interval.chrom, begin,
                                               interval.end_d,
                                               interval.strand)
            else:
                end = begin - windowSize
                while end > interval.end_d:
                    window = HTSeq.GenomicInterval(interval.chrom, end,
                                                   begin + 1,
                                                   interval.strand)
                    featureWindow = HTSeq.GenomicFeature(
                        name_prefix + str(windowID), 'window', window)
                    slidingGTF.write(featureWindow.get_gff_line())
                    windowID += 1
                    begin -= overlap
                    end = begin - windowSize
                # Remainder between the exon end and the last step.
                window = HTSeq.GenomicInterval(interval.chrom,
                                               interval.end_d + 1, begin,
                                               interval.strand)

            # Skip a degenerate remainder of at most one base.
            if window.length > 1:
                featureWindow = HTSeq.GenomicFeature(
                    name_prefix + str(windowID), 'window', window)
                slidingGTF.write(featureWindow.get_gff_line())
Esempio n. 11
0
def main():
    """Flatten an Ensembl GTF annotation into DEXSeq exonic parts.

    Command line: ``[options] <in.gtf> <out.gff>``.

    Options:
        ``-r/--aggregate`` (``yes``/``no``, default ``yes``): whether genes
        sharing an exon are merged into one aggregate gene; with ``no``,
        steps covered by more than one gene are dropped.
        ``-f/--featurecountsgtf``: additionally write a copy of the output
        in which ``aggregate_gene`` is replaced by ``gene`` and
        ``exonic_part`` by ``exon``.

    The process exits with status 1 when the number of positional arguments
    is not exactly two or when HTSeq cannot be imported.

    :raises ValueError: if one aggregate name occurs on two chromosomes or
        on two strands.
    """
    optParser = optparse.OptionParser(
        usage="python %prog [options] <in.gtf> <out.gff>",
        # Each fragment ends with a space so the concatenated help text
        # does not run words together.
        description=(
            "Script to prepare annotation for DEXSeq. "
            "This script takes an annotation file in Ensembl GTF format "
            "and outputs a 'flattened' annotation file suitable for use "
            "with the count_in_exons.py script "
        ),
        epilog=(
            "Written by Simon Anders ([email protected]), European Molecular Biology "
            "Laboratory (EMBL). (c) 2010. Released under the terms of the GNU General "
            "Public License v3. Part of the 'DEXSeq' package. "
            "Modified by Vivek Bhardwaj (just a bit!) to write featurecounts gtf as an option. "
            "Modified by Jost Vrabic Koren to work with python 3. "
        )
    )

    optParser.add_option(
        "-r", "--aggregate", type="choice", dest="aggregate",
        choices=("no", "yes"), default="yes",
        help=(
            "'yes' or 'no'. Indicates whether two or more genes sharing an exon should be merged"
            " into an 'aggregate gene'. If 'no', the exons that can not be assiged to a single gene"
            " are ignored."
        )
    )

    # Optional second output for featurecounts.
    optParser.add_option("-f", "--featurecountsgtf", type="string", dest="fcgtf", action="store",
                         help="gtf file to write for featurecounts.")

    (opts, args) = optParser.parse_args()

    if len(args) != 2:
        sys.stderr.write("Script to prepare annotation for DEXSeq.\n\n")
        sys.stderr.write("Usage: python %s <in.gtf> <out.gff>\n\n" % os.path.basename(sys.argv[0]))
        sys.stderr.write("This script takes an annotation file in Ensembl GTF format\n")
        sys.stderr.write("and outputs a 'flattened' annotation file suitable for use\n")
        sys.stderr.write("with the count_in_exons.py script.\n")
        sys.exit(1)

    # Imported here so that a missing HTSeq gives a readable message
    # instead of a traceback at module import time.
    try:
        import HTSeq
    except ImportError:
        sys.stderr.write("Could not import HTSeq. Please install the HTSeq Python framework\n")
        sys.stderr.write("available from http://www-huber.embl.de/users/anders/HTSeq\n")
        sys.exit(1)

    gtf_file, out_file = args
    aggregateGenes = opts.aggregate == "yes"

    # Step 1: Store all exons with their gene and transcript ID
    # in a GenomicArrayOfSets

    exons = HTSeq.GenomicArrayOfSets("auto", stranded=True)
    for f in HTSeq.GFF_Reader(gtf_file):
        if f.type != "exon":
            continue
        # ':' is replaced in gene IDs before they are stored.
        f.attr['gene_id'] = f.attr['gene_id'].replace(":", "_")
        exons[f.iv] += (f.attr['gene_id'], f.attr['transcript_id'])

    # Step 2: Form sets of overlapping genes

    # We produce the dict 'gene_sets', whose values are sets of gene IDs. Each set
    # contains IDs of genes that overlap, i.e., share bases (on the same strand).
    # The keys of 'gene_sets' are the IDs of all genes, and each key refers to
    # the set that contains the gene.
    # Each gene set forms an 'aggregate gene'.

    if aggregateGenes:
        gene_sets = collections.defaultdict(set)
        for iv, s in exons.steps():
            # For each step, make a set, 'full_set', of all the gene IDs occurring
            # in the present step, and also add all those gene IDs which have been
            # seen earlier to co-occur with each of the currently present gene IDs.
            full_set = set()
            for gene_id, transcript_id in s:
                full_set.add(gene_id)
                full_set |= gene_sets[gene_id]
            # Make sure that all genes that are now in full_set get associated
            # with full_set, i.e., get to know about their new partners
            for gene_id in full_set:
                assert gene_sets[gene_id] <= full_set
                gene_sets[gene_id] = full_set

    # Step 3: Go through the steps again to get the exonic sections. Each step
    # becomes an 'exonic part'. The exonic part is associated with an
    # aggregate gene, i.e., a gene set as determined in the previous step,
    # and a transcript set, containing all transcripts that occur in the step.
    # The results are stored in the dict 'aggregates', which contains, for each
    # aggregate ID, a list of all its exonic_part features.

    aggregates = collections.defaultdict(list)
    for iv, s in exons.steps():
        # Skip empty steps
        if len(s) == 0:
            continue
        step_gene_ids = {gene_id for gene_id, transcript_id in s}
        gene_id = list(s)[0][0]
        if aggregateGenes:
            # Take one of the gene IDs, find the others via gene sets, and
            # form the aggregate ID from all of them. Sorting makes the ID
            # independent of set iteration order, so repeated runs produce
            # the same output.
            assert step_gene_ids <= gene_sets[gene_id]
            aggregate_id = '+'.join(sorted(gene_sets[gene_id]))
        elif len(step_gene_ids) > 1:
            # Without aggregation, steps shared by several genes are ignored.
            continue
        else:
            aggregate_id = gene_id
        # Make the feature and store it in 'aggregates'
        f = HTSeq.GenomicFeature(aggregate_id, "exonic_part", iv)
        f.source = os.path.basename(sys.argv[0])
        transcript_set = {transcript_id for gene_id, transcript_id in s}
        f.attr = {
            'gene_id': aggregate_id,
            'transcripts': '+'.join(sorted(transcript_set)),
        }
        aggregates[aggregate_id].append(f)

    # Step 4: For each aggregate, number the exonic parts

    aggregate_features = []
    for parts in aggregates.values():
        # Consecutive parts of one aggregate must share name, chromosome and
        # strand, and must not overlap.
        for previous, current in zip(parts, parts[1:]):
            assert previous.name == current.name, str(current) + " has wrong name"
            assert previous.iv.end <= current.iv.start, str(current) + " starts too early"
            if previous.iv.chrom != current.iv.chrom:
                raise ValueError(
                    "Same name found on two chromosomes: %s, %s" % (str(previous), str(current))
                )
            if previous.iv.strand != current.iv.strand:
                raise ValueError(
                    "Same name found on two strands: %s, %s" % (str(previous), str(current))
                )
        first_iv = parts[0].iv
        aggr_feat = HTSeq.GenomicFeature(
            parts[0].name, "aggregate_gene",
            HTSeq.GenomicInterval(first_iv.chrom, first_iv.start, parts[-1].iv.end, first_iv.strand)
        )
        aggr_feat.source = os.path.basename(sys.argv[0])
        aggr_feat.attr = {'gene_id': aggr_feat.name}
        for number, part in enumerate(parts, 1):
            part.attr['exonic_part_number'] = "%03d" % number
        aggregate_features.append(aggr_feat)

    # Step 5: Sort the aggregates, then write everything out

    aggregate_features.sort(key=lambda f: (f.iv.chrom, f.iv.start))

    with open(out_file, "w") as fout:
        for aggr_feat in aggregate_features:
            fout.write(aggr_feat.get_gff_line())
            for f in aggregates[aggr_feat.name]:
                fout.write(f.get_gff_line())

    # If requested, write a featurecounts copy with renamed feature types.
    # Done in Python rather than through a shell 'sed' command so that file
    # names with spaces or shell metacharacters are handled safely.
    fcountgtf = opts.fcgtf
    if fcountgtf:
        with open(out_file) as flattened, open(fcountgtf, "w") as fc_out:
            for line in flattened:
                fc_out.write(
                    line.replace("aggregate_gene", "gene")
                        .replace("exonic_part", "exon"))
    print("Done!")
Esempio n. 12
0
                                         transcript]['gene_id'].iloc[0])
        genes = "+".join(genes)

        if genes not in gene_id.keys():  #if gene appears once
            f.attr['gene_id'] = genes
            gene_id[genes] = 1
        else:  #if gene appears more than once
            f.attr['gene_id'] = "__00".join([genes, str(gene_id[genes])])
            gene_id[genes] += 1
        transcipttogenes_id[transcript_id] = f.attr['gene_id']
    else:
        f.attr['gene_id'] = transcipttogenes_id[transcript_id]
    f.name = f.attr['gene_id']

    #Store f as GenomicFeature with new gene_id
    feat = HTSeq.GenomicFeature(
        f.name, f.type,
        HTSeq.GenomicInterval(f.iv.chrom, f.iv.start, f.iv.end, f.iv.strand))
    feat.attr = {}
    feat.attr['gene_id'] = f.attr['gene_id']
    if f.type == "exonic_part":
        feat.attr['transcripts'] = transcript_id
        feat.attr['exonic_part_number'] = f.attr['exonic_part_number']
    all_feat.append(feat)

###Step 5: Write every collected feature out, in the order it was stored
print("=====> WRITING RESULT TO FILE")
# The context manager closes the file even if serialising a feature fails.
with open(out_file, "w") as fout:
    for feat in all_feat:
        fout.write(feat.get_gff_line())
Esempio n. 13
0
def workflow1(file1, test=None):
    """workflow1: only consider "exon" and "CDS"

    For every transcript returned by ``generate_transcripts`` this names the
    CDS/exon features, appends derived ``_intron``, ``_transcript``,
    ``_utr5``, ``_utr3`` and ``_promoter`` features to the transcript's
    feature list, reports each list, and finally writes the regions between
    genes to ``<outdir>/intergenic.bed``.

    Besides its arguments the function uses several names that are not
    defined inside it: ``args`` (``pl``, ``no_whole_report``),
    ``chr_lengths``, ``genes``, ``fhandlers`` and ``outdir``.

    :param file1: gtf filename
    :param test: passed on to ``generate_transcripts``; when truthy the
        function also stops in ``pdb`` before the intergenic output
    :returns: None
    :rtype: None

    """
    transcripts = generate_transcripts(file1, test)
    ## * ######################### populate bed files: merge the range #########################
    for each_trans, feature_list in transcripts.iteritems():  # each_trans are just tids, using id to maintain order
        # Split the transcript id on ':' and '|'; fields 0 and 2 form the
        # "chr:gene_id" key used for the `genes` record below.
        temp = re.split("[:|]",each_trans)
        chr_gene_id = "%s:%s" % (temp[0],temp[2])
        feature_list.sort()  # sorted by start_d
        if feature_list['exon'] == [] and feature_list['CDS'] == []:
            logging.debug("BAD: {each_trans} was skipped in function workflow1 because of both CDS and exon are missing".format(
                **locals()))
            continue
        # Name the CDS features: numbered when there are several, plain
        # "|CDS" when there is exactly one.
        ordered_CDS=feature_list['CDS']
        if len(ordered_CDS) > 1:
            for i in range(1, len(ordered_CDS) + 1):
                ordered_CDS[i-1].name = "{each_trans}|CDS.{}".format(i, each_trans = each_trans)
        elif len(ordered_CDS) == 1:
            ordered_CDS[0].name = "%s|CDS" % each_trans
        ordered_exon=feature_list['exon']
        ## special branch with only CDS but no exon
        if len(ordered_exon) == 0:
            ordered_exon = ordered_CDS
        ## change exon name
        elif len(ordered_exon) > 1:
            for i in range(1, len(ordered_exon) + 1):
                ordered_exon[i-1].name = "{each_trans}|exon.{}".format(i, each_trans = each_trans)
        else:
            ordered_exon[0].name = "%s|exon" % each_trans
        ## normal branch, ordered_exon is main player
        chrom=ordered_exon[0].iv.chrom
        strand=ordered_exon[0].iv.strand
        biotype = feature_list.biotype()  # add tag things here, just "coding", "non_coding" for now
        feature_list.tag = "coding" if ordered_CDS != [] else "non_coding"
        # The biotype becomes a "|"-prefixed suffix of the transcript name,
        # or an empty string when there is none.
        if biotype:
            biotype = "|%s" % ("_".join(biotype),)  # biotype should be unique
        else:
            biotype = ""
        ## _intron
        # Introns are the gaps between consecutive exons. With exactly two
        # exons the single intron is unnumbered; with more they are numbered.
        # On the non-"+" strand the exon pair is used in swapped order.
        if len(ordered_exon) == 2:
            if strand == "+":
                intron = construct_iv(chrom, ordered_exon[0].iv.end, ordered_exon[1].iv.start, strand)
            else:
                intron = construct_iv(chrom, ordered_exon[1].iv.end, ordered_exon[0].iv.start, strand)
            feature_list.append(
                HTSeq.GenomicFeature("{each_trans}|intron".format(each_trans = each_trans),
                                     "_intron", intron)
            )
        elif len(ordered_exon) > 2:
            for i in range(1,len(ordered_exon)): # get all introns
                if strand == "+":
                    intron = construct_iv(chrom, ordered_exon[i-1].iv.end, ordered_exon[i].iv.start, strand)
                else:
                    intron = construct_iv(chrom, ordered_exon[i].iv.end, ordered_exon[i-1].iv.start, strand)
                feature_list.append(
                    HTSeq.GenomicFeature("{each_trans}|intron.{}".format(i, each_trans=each_trans),
                                         "_intron", intron))
        ## gene
        # Span from the first to the last exon; stored as a "_transcript"
        # feature and later recorded in `genes`.
        if strand == "+":
            gene = construct_iv(chrom, ordered_exon[0].iv.start, ordered_exon[-1].iv.end, strand)  # gene is an iv
        else:
            gene = construct_iv(chrom, ordered_exon[-1].iv.start, ordered_exon[0].iv.end, strand)
        feature_list.append(
            HTSeq.GenomicFeature("{each_trans}{biotype}".format(
                each_trans = each_trans,
                biotype = biotype), "_transcript", gene)
        )
        if len(ordered_CDS):
            ## _utr5
            # A 5' UTR is only built when the first CDS does not begin where
            # the first exon begins.
            if ordered_exon[0].iv.start_d != ordered_CDS[0].iv.start_d:
                if strand == "+":
                    utr5 = construct_iv(chrom, ordered_exon[0].iv.start, ordered_CDS[0].iv.start, strand)  # temporary
                else:
                    if ordered_CDS[0].iv.end > ordered_exon[0].iv.end:
                        # NOTE: this `continue` ends the iteration for this
                        # transcript, so the utr3, promoter and `genes`
                        # steps below are skipped for it.
                        continue
                    utr5 = construct_iv(chrom, ordered_CDS[0].iv.end, ordered_exon[0].iv.end, strand)
                # Exons overlapping the UTR span: one exon gives a single
                # unnumbered utr5; otherwise one numbered utr5 per exon,
                # with the last one clipped at the UTR boundary.
                prime5_exons = [x.iv for x in ordered_exon if x.iv.overlaps(utr5)]
                if len(prime5_exons) == 1:
                    feature_list.append(
                        HTSeq.GenomicFeature(
                            "{each_trans}|utr5".format(each_trans=each_trans),
                            "_utr5", utr5))
                else:
                    for i in range(1, len(prime5_exons) + 1):
                        if i == len(prime5_exons):
                            if strand == "+":
                                i_utr5 = construct_iv(chrom, prime5_exons[-1].start, utr5.end, strand)
                            else:
                                i_utr5 = construct_iv(chrom, utr5.start, prime5_exons[-1].end, strand)
                        else:
                            i_utr5 = prime5_exons[i-1]
                        feature_list.append(
                            HTSeq.GenomicFeature(
                                "{each_trans}|utr5.{}".format(i, each_trans=each_trans),
                                "_utr5", i_utr5))
            ## stop_codon and utr3
            if ordered_CDS[-1].iv.end_d != ordered_exon[-1].iv.end_d:
                ## get utr3
                if feature_list["stop_codon"] != []:
                    # With a stop_codon feature present, the UTR starts 3
                    # bases past the CDS (presumably to skip the stop codon);
                    # if exactly 3 bases remain there is no UTR at all.
                    if abs(ordered_CDS[-1].iv.end_d - ordered_exon[-1].iv.end_d) != 3:
                        if strand == "+":
                            utr3 = construct_iv(chrom, ordered_CDS[-1].iv.end + 3, ordered_exon[-1].iv.end, strand)
                        else:
                            utr3 = construct_iv(chrom, ordered_exon[-1].iv.start, ordered_CDS[-1].iv.start - 3, strand)
                    else:
                        utr3 = False  # this is important
                else:
                    if strand == "+":
                        if ordered_CDS[-1].iv.end > ordered_exon[-1].iv.end:
                            # NOTE: as above, this skips the promoter and
                            # `genes` steps for this transcript.
                            continue
                        utr3 = construct_iv(chrom, ordered_CDS[-1].iv.end, ordered_exon[-1].iv.end, strand)
                    else:
                        utr3 = construct_iv(chrom, ordered_exon[-1].iv.start, ordered_CDS[-1].iv.start, strand)
                if utr3:        # there may be no utr3
                    # Same splitting scheme as for utr5, except that here the
                    # first overlapping exon is the one clipped at the UTR
                    # boundary.
                    prime3_exons = [x.iv for x in feature_list['exon'] if x.iv.overlaps(utr3)]
                    if len(prime3_exons) == 1:
                        feature_list.append(
                            HTSeq.GenomicFeature(
                                "{each_trans}|utr3".format(each_trans=each_trans),
                                "_utr3", utr3
                            )
                        )
                    else:
                        for i in range(1, len(prime3_exons) + 1):
                            if i == 1:
                                if strand == "+":
                                    i_utr3 = construct_iv(chrom, utr3.start, prime3_exons[0].end, strand)
                                else:
                                    i_utr3 = construct_iv(chrom, prime3_exons[0].start, utr3.end, strand)
                            else:
                                i_utr3 = prime3_exons[i-1]
                            feature_list.append(
                                HTSeq.GenomicFeature(
                                    "{each_trans}|utr3.{}".format(i, each_trans=each_trans),
                                    "_utr3", i_utr3
                                )
                            )
        ## promoter
        # The promoter is the args.pl bases adjacent to the first exon,
        # clamped to [0, chromosome length].
        if strand == "+":
            ps = ordered_exon[0].iv.start - args.pl # promoter start position
            if ps <0:
                ps = 0
            promoter=construct_iv(chrom, ps, ordered_exon[0].iv.start, strand)
        else:                       # if strand == "-"
            ps = ordered_exon[0].iv.end + args.pl
#             if chrom == 'scaffold_42':  ##only for debug
#                 exit( '%s ' %  ordered_exon[0].name)
            if ps > chr_lengths[chrom]:
                ps = chr_lengths[chrom]
            if ordered_exon[0].iv.end > ps:
                # The exon ends beyond the clamped position: report it and
                # skip the promoter and `genes` record for this transcript.
                print ( 'skipping:%s, %s, %s, %s' % (chrom, ordered_exon[0].iv.end, ps, strand), file=sys.stderr )
                continue
            promoter=construct_iv(chrom, ordered_exon[0].iv.end, ps, strand)
        feature_list.append(
            HTSeq.GenomicFeature(
                "{each_trans}|promoter".format(each_trans=each_trans),
                "_promoter", promoter
            )
        )
        ## record
        genes[chr_gene_id].append(gene) # using chr_gene_id: "chr:gene_id"
    ## create folders
    ## write to file
    for each_trans, feature_list in transcripts.iteritems():
        feature_list.report()
        if not args.no_whole_report:
            feature_list.whole_report()
    ## close all fhandlers
    for x in fhandlers:
        try:
            x.close()
        # NOTE(review): a bare except also swallows KeyboardInterrupt and
        # SystemExit, not only errors raised by close().
        except:
            print("file handler %s cannot be closed, maybe you have closed it?" % x)
            pass
    if test:
        pdb.set_trace()
    ## * ################### try to output intergenic region ###################
    intergenic_f = open("%s/intergenic.bed" % outdir, "w")
    # chromosome id -> {gene id -> interval spanning all recorded
    # transcripts of that gene}
    outer_genes = OrderedDict()

    for chr_gene_id, gs in genes.iteritems():
        chr_id = chr_gene_id.split(":")[0]
        gene_id = chr_gene_id.split(":")[1]
        if chr_id not in outer_genes:
            outer_genes[chr_id] = {}
        if len(gs) > 1:
            gene = construct_iv(gs[0].chrom, min([x.start for x in gs]),
                                         max([x.end for x in gs]), gs[0].strand) # combine multiple transcripts of the same gene
            outer_genes[chr_id][gene_id] = gene
        else:
            outer_genes[chr_id][gene_id] = gs[0]

    for chr_id in outer_genes:
        # `flag` is the end of the region covered so far on this chromosome
        # (starting at 0); genes are visited in order of their start.
        flag = 0
        former_g = ""                   # gene before interval
        next_g = ""                     # gene after interval
        for gene_id in sorted(outer_genes[chr_id].keys(), key=lambda x: outer_genes[chr_id][x].start): # super-low efficiency
            chr_gene_id = "%s:%s" %(chr_id, gene_id)
            gene = outer_genes[chr_id][gene_id]
            e = gene.end
            s = gene.start
            assert s < e, (chr_gene_id, gene, s, e)
            if flag == s:               # gene starts exactly at the covered end: no gap to write
                flag = e
                former_g = chr_gene_id
                continue
            elif flag < s:              # normal stuff
                # Gap between the covered region and this gene: write it,
                # named "<former>--<next>".
                next_g = chr_gene_id
                _id = former_g + "--" + next_g
                inter_g = construct_iv(gene.chrom, flag, s, "+")
                intergenic_f.write(str_iv2(inter_g, _id))
                flag = e
                former_g = chr_gene_id
                continue
            elif flag >= e:
                # Gene lies entirely within the region already covered.
                logging.debug("Former({}) may overlap with gene_id({})".format(former_g, gene_id))
                continue
            elif flag < e:
                # Gene overlaps the covered region and extends past it.
                flag = e
                former_g = chr_gene_id
                continue

        # Trailing interval from the last covered position to the end of the
        # chromosome; `gene` is still the last gene visited above.
        next_g = ""
        _id = former_g + "--" + next_g
        logging.debug("{0}({1}) has start of {flag}, end of {2}".format(gene.chrom,
                                                                        chr_id,
                                                                        chr_lengths[chr_id],
                                                                        flag=flag))
        inter_g = construct_iv(gene.chrom, flag, chr_lengths[chr_id], "+")
        logging.debug("OUTPUT last interval, former is %s" % former_g)
        intergenic_f.write(str_iv2(inter_g, _id))

    intergenic_f.close()
# aggregate ID, a list of all its exonic_part features.

# Maps each aggregate ID to the list of its exonic_part features.
aggregates = collections.defaultdict(list)
i = -1
for i, (iv, s) in enumerate(exons.steps()):
    # Steps flagged in exon_omit_list are left out.
    if exon_omit_list[i]:
        continue
    # Any gene ID of the step leads to the step's gene set; the aggregate ID
    # is built from all members of that set.
    gene_id = list(s)[0][0]
    assert {gid for gid, _tid in s} <= gene_sets[gene_id]
    aggregate_id = '+'.join(gene_sets[gene_id])
    # Build the exonic_part feature for this step and file it under its
    # aggregate.
    f = HTSeq.GenomicFeature(aggregate_id, "exonic_part", iv)
    f.source = os.path.basename(sys.argv[1])
    transcript_set = {tid for _gid, tid in s}
    f.attr = {
        'gene_id': aggregate_id,
        'transcripts': '+'.join(transcript_set),
    }
    aggregates[aggregate_id].append(f)

# Step 4: For each aggregate, number the exonic parts

aggregate_features = []
for l in aggregates.values():
    name = l[0].name
    chrom = l[0].iv.chrom
    strand = l[0].iv.strand
    start = l[0].iv.start
Esempio n. 15
0
def create_sliding_gene_window_GTF(path_annot, annotation_file):
    '''
    Prepare the sliding-window GTF file used for peak detection.

    Records are read from
    ``<path_annot>/<annotation_file>.annotation.gene.gtf`` and one GFF line
    per window is written to
    ``<path_annot>/<annotation_file>.gene.slidingwindows.gtf``.

    For ``+`` features the windows run upward from the strand-aware start
    (``start_d``); for every other strand they run downward from it. Each
    window is ``WINDOW_SIZE`` wide and the scan advances by
    ``WINDOW_SIZE // 2``. When the next full window would reach the
    feature's ``end_d``, a final window covering the remainder is written.

    :param path_annot: directory holding the annotation files
    :param annotation_file: file name prefix of the annotation
    :raises ValueError: if ``WINDOW_SIZE`` yields a step smaller than one
        base, because the scan would then never advance.
    :return: None
    '''
    print('Create ' + annotation_file +
          '.gene.slidingwindows.gtf file for Peak detection')
    print('Every overlapping window of 100bp on every gene is calculated')
    # Floor division keeps every window boundary an integer; true division
    # would turn the coordinates into floats on Python 3.
    overlap = WINDOW_SIZE // 2
    if overlap < 1:
        raise ValueError(
            "WINDOW_SIZE must be at least 2, got %r" % (WINDOW_SIZE,))

    gtf_file = HTSeq.GFF_Reader(path_annot + '/' + annotation_file +
                                '.annotation.gene.gtf',
                                end_included=True)
    transcriptID = 1
    with open(path_annot + '/' + annotation_file + '.gene.slidingwindows.gtf',
              "w") as slidingGTF:
        for feature in gtf_file:
            interval = feature.iv
            transcriptID += 1
            if transcriptID % 10000 == 0:
                print('Gene: ' + str(transcriptID) + '/ 100000')

            # All windows of one feature share this prefix; only the
            # trailing window number differs.
            name_prefix = (feature.attr['gene_id'] + '_window_' +
                           feature.attr['gene_type'] + '_')
            windowID = 1
            begin = interval.start_d
            if interval.strand == '+':
                end = begin + WINDOW_SIZE
                while end < interval.end_d:
                    window = HTSeq.GenomicInterval(interval.chrom, begin,
                                                   end + 1, interval.strand)
                    featureWindow = HTSeq.GenomicFeature(
                        name_prefix + str(windowID), 'window', window)
                    slidingGTF.write(featureWindow.get_gff_line())
                    windowID += 1
                    begin += overlap
                    end = begin + WINDOW_SIZE
                # Remainder between the last step and the feature end.
                window = HTSeq.GenomicInterval(interval.chrom, begin,
                                               interval.end_d,
                                               interval.strand)
            else:
                end = begin - WINDOW_SIZE
                while end > interval.end_d:
                    window = HTSeq.GenomicInterval(interval.chrom, end,
                                                   begin + 1, interval.strand)
                    featureWindow = HTSeq.GenomicFeature(
                        name_prefix + str(windowID), 'window', window)
                    slidingGTF.write(featureWindow.get_gff_line())
                    windowID += 1
                    begin -= overlap
                    end = begin - WINDOW_SIZE
                # Remainder between the feature end and the last step.
                window = HTSeq.GenomicInterval(interval.chrom,
                                               interval.end_d + 1, begin,
                                               interval.strand)

            # The final window is always written, whatever its length.
            featureWindow = HTSeq.GenomicFeature(
                name_prefix + str(windowID), 'window', window)
            slidingGTF.write(featureWindow.get_gff_line())