Exemple #1
0
def intron_sizes(
        inputfile=None,
        outputfile=None,
        key_name=None):
    """
 Add a new key to transcript features containing a comma-separated list of intron sizes.
    """

    gtf = GTF(inputfile, check_ensembl_format=False)

    all_tx_ids = gtf.get_tx_ids(nr=True)
    intron_bo = gtf.get_introns(by_transcript=True,
                                name=["transcript_id"],
                                intron_nb_in_name=False,
                                feat_name=False)

    strands = gtf.select_by_key("feature",
                                "transcript").extract_data("transcript_id,strand",
                                                           as_dict_of_values=True,
                                                           no_na=True,
                                                           nr=True,
                                                           hide_undef=True)

    intron_size = {tx: [] for tx in all_tx_ids}

    for bed_line in intron_bo:
        intron_size[bed_line.name] += [str(bed_line.end - bed_line.start)]

    for tx_id in intron_size:
        if len(intron_size[tx_id]):
            if strands[tx_id] == "-":
                intron_size[tx_id] = ",".join(reversed(intron_size[tx_id]))
            else:
                intron_size[tx_id] = ",".join(intron_size[tx_id])
        else:
            intron_size[tx_id] = "0"
    if len(intron_size):
        gtf = gtf.add_attr_from_dict(feat="transcript",
                                     key="transcript_id",
                                     a_dict=intron_size,
                                     new_key=key_name)
    gtf.write(outputfile,
              gc_off=True)
    close_properly(outputfile, inputfile)
Exemple #2
0
def exon_sizes(inputfile=None, outputfile=None, key_name=None):
    """
 Add a new key to transcript features containing a comma-separated list of exon-size.
    """

    gtf = GTF(inputfile)

    all_tx_ids = gtf.get_tx_ids(nr=True)
    tx_to_size_list = dict()
    exons_starts = gtf.select_by_key("feature", "exon").extract_data(
        "transcript_id,start",
        as_dict_of_merged_list=True,
        no_na=True,
        nr=False)

    if not len(exons_starts):
        message("No exon found.", type="ERROR")

    exons_ends = gtf.select_by_key("feature", "exon").extract_data(
        "transcript_id,end", as_dict_of_merged_list=True, no_na=True, nr=False)

    strands = gtf.select_by_key("feature", "transcript").extract_data(
        "transcript_id,strand",
        as_dict_of_values=True,
        no_na=True,
        nr=True,
        hide_undef=True)

    for tx_id in all_tx_ids:
        size_list = []
        for s, e in zip(exons_starts[tx_id], exons_ends[tx_id]):
            size = str(int(e) - int(s) + 1)
            size_list += [size]
        if strands[tx_id] == "-":
            size_list = reversed(size_list)
        tx_to_size_list[tx_id] = ",".join(size_list)

    if len(tx_to_size_list):
        gtf = gtf.add_attr_from_dict(feat="transcript",
                                     key="transcript_id",
                                     a_dict=tx_to_size_list,
                                     new_key=key_name)
    gtf.write(outputfile, gc_off=True)
    close_properly(outputfile, inputfile)
Exemple #3
0
def nb_exons(inputfile=None,
             outputfile=None,
             key_name=None,
             text_format=False):
    """
    Count the number of exons in the gtf file.
    """

    gtf = GTF(inputfile)
    n_exons = defaultdict(int)

    # -------------------------------------------------------------------------
    # Computing number of  exon for each transcript in input GTF file
    #
    # -------------------------------------------------------------------------

    message("Computing number of exons for each transcript in input GTF file.")

    exon = gtf.select_by_key("feature", "exon")
    fields = exon.extract_data("transcript_id")

    for i in fields:
        tx_id = i[0]
        n_exons[tx_id] += 1

    if text_format:
        for tx_id in n_exons:
            outputfile.write(tx_id + "\t" + str(n_exons[tx_id]) +
                             "\ttranscript\n")
    else:

        if len(n_exons):
            gtf = gtf.add_attr_from_dict(feat="transcript",
                                         key="transcript_id",
                                         a_dict=n_exons,
                                         new_key=key_name)
        gtf.write(outputfile, gc_off=True)

    close_properly(outputfile, inputfile)
Exemple #4
0
def divergent(
        inputfile=None,
        outputfile=None,
        key_name=None,
        upstream=1500,
        downstream=1500,
        chrom_info=None,
        no_strandness=False,
        no_annotation=False):
    """
Find transcript with divergent promoters.
    """

    message("Using -u " + str(upstream) + ".")
    message("Using -d " + str(downstream) + ".")

    tx_with_divergent = dict()
    dist_to_divergent = dict()
    tss_pos = dict()

    message("Loading GTF.")

    gtf = GTF(inputfile)

    message("Getting transcript coordinates.")

    tx_feat = gtf.select_by_key("feature",
                                "transcript")
    message("Getting tss coordinates.")

    tss_bo = tx_feat.get_tss(name=["transcript_id", "gene_id"],
                             sep="||")

    # get tss position
    for i in tss_bo:
        tx_id_tss, gn_id_tss = i.name.split("||")
        tss_pos[tx_id_tss] = int(i.start)

    message("Getting promoter coordinates.")

    promoter_bo = tss_bo.slop(s=True,
                              l=upstream,
                              r=downstream,
                              g=chrom_info.name).cut([0, 1,
                                                      2, 3,
                                                      4, 5])
    message("Intersecting...")

    if no_strandness:
        prom_with_tss_bo = promoter_bo.intersect(tss_bo,
                                                 wb=True,
                                                 s=False,
                                                 S=False)
    else:
        prom_with_tss_bo = promoter_bo.intersect(tss_bo,
                                                 wb=True,
                                                 s=False,
                                                 S=True)

    tmp_file = make_tmp_file("promoter_slop", ".bed")
    promoter_bo.saveas(tmp_file.name)
    tmp_file = make_tmp_file("promoter_intersection_with_tss_as_", ".bed")
    prom_with_tss_bo.saveas(tmp_file.name)

    for i in prom_with_tss_bo:

        tx_id_tss, gn_id_tss = i.fields[9].split("||")
        tx_id_prom, gene_id_prom = i.fields[3].split("||")

        if gene_id_prom != gn_id_tss:
            if tx_id_prom in tx_with_divergent:
                dist = abs(tss_pos[tx_id_prom] - tss_pos[tx_id_tss])
                if dist < dist_to_divergent[tx_id_prom]:
                    dist_to_divergent[tx_id_prom] = dist
                    tx_with_divergent[tx_id_prom] = tx_id_tss
            else:

                dist = abs(tss_pos[tx_id_prom] - tss_pos[tx_id_tss])
                dist_to_divergent[tx_id_prom] = dist
                tx_with_divergent[tx_id_prom] = tx_id_tss

    if not no_annotation:

        if key_name is None:
            key_name = "divergent"
            key_name_dist = "dist_to_divergent"
        else:
            key_name_dist = "dist_" + key_name

        if len(tx_with_divergent):
            gtf = gtf.add_attr_from_dict(feat="transcript",
                                         key="transcript_id",
                                         a_dict=tx_with_divergent,
                                         new_key=key_name)

            gtf = gtf.add_attr_from_dict(feat="transcript",
                                         key="transcript_id",
                                         a_dict=dist_to_divergent,
                                         new_key=key_name_dist)

        gtf.write(outputfile,
                  gc_off=True)

    else:
        gtf.select_by_key("transcript_id",
                          ",".join(list(tx_with_divergent.keys()))).write(outputfile, gc_off=True)

    close_properly(outputfile, inputfile)
Exemple #5
0
def overlapping(
        inputfile=None,
        outputfile=None,
        key_name=None,
        upstream=1500,
        downstream=1500,
        chrom_info=None,
        feature_type='transcript',
        same_strandedness=False,
        diff_strandedness=False,
        annotate_gtf=False,
        bool=False,
        annotate_all=False,
        invert_match=False):
    """
Description: Find transcripts whose body/TSS/TTS do or do not overlap with any
transcript from another gene.
    """

    # ----------------------------------------------------------------------
    # Prepare key names
    # ----------------------------------------------------------------------

    if annotate_gtf:
        if key_name is None:
            key_info = ["overlap",
                        feature_type,
                        "u" + str(upstream / 1000) + "k",
                        "d" + str(downstream / 1000) + "k"
                        ]
            key_name = "_".join(key_info)

        if invert_match:
            message("--annotate-gtf and --invert-match are "
                    "mutually exclusive.",
                    type="ERROR")

    if same_strandedness and diff_strandedness:
        message("--same-strandedness and --diff-strandedness are "
                "mutually exclusive.",
                type="ERROR")

    message("Using -u " + str(upstream))
    message("Using -d " + str(downstream))

    overlapping_tx = defaultdict(list)

    # Load the GTF so that it won't be lost
    # if GTF stream comes from stdin
    gtf = GTF(inputfile)

    message("Getting transcript in bed format")

    tx_feat = gtf.select_by_key("feature",
                                "transcript")

    if annotate_all:
        overlapping_tx = gtf.extract_data(keys=["transcript_id"], as_dict=True, default_val="0")
        for i in overlapping_tx:
            overlapping_tx[i] = []

    # ----------------------------------------------------------------------
    # Get transcript limits
    # ----------------------------------------------------------------------

    tx_bed = tx_feat.to_bed(name=["transcript_id", "gene_id"], sep="||")

    message("Getting " + feature_type + " and 'slopping'.")

    if feature_type == "transcript":

        bed_obj = tx_bed.slop(s=True,
                              l=upstream,
                              r=downstream,
                              g=chrom_info.name).cut([0, 1, 2, 3, 4, 5])

    elif feature_type == "promoter":

        bed_obj = tx_feat.get_tss(name=["transcript_id", "gene_id"],
                                  sep="||").slop(s=True,
                                                 l=upstream,
                                                 r=downstream,
                                                 g=chrom_info.name).cut([0, 1,
                                                                         2, 3,
                                                                         4, 5])

    elif feature_type == "tts":

        bed_obj = tx_feat.get_tts(name=["transcript_id", "gene_id"],
                                  sep="||").slop(s=True,
                                                 l=upstream,
                                                 r=downstream,
                                                 g=chrom_info.name).cut([0, 1,
                                                                         2, 3,
                                                                         4, 5])
    else:
        message("Not implemented yet", type="ERROR")

    tmp_file = make_tmp_file(feature_type + "_slopped_region", ".bed")
    bed_obj.saveas(tmp_file.name)

    overlap_regions = bed_obj.intersect(tx_bed,
                                        wb=True,
                                        s=same_strandedness,
                                        S=diff_strandedness)

    tmp_file = make_tmp_file(feature_type + "_overlapping_regions", ".bed")
    overlap_regions.saveas(tmp_file.name)

    for i in overlap_regions:

        tx_other, gn_other = i.fields[9].split("||")
        tx_id, gene_id = i.fields[3].split("||")
        if gene_id != gn_other:
            overlapping_tx[tx_id] += [tx_other]

    if bool:
        for k, _ in overlapping_tx.items():
            if not len(overlapping_tx[k]):
                overlapping_tx[k] = "0"
            else:
                overlapping_tx[k] = "1"

    if not invert_match:

        if not annotate_gtf:
            value = ",".join(set(overlapping_tx.keys()))
            gtf.select_by_key("transcript_id",
                              value).write(outputfile,
                                           gc_off=True)
        else:

            if len(overlapping_tx):
                gtf = gtf.add_attr_from_dict(feat="transcript",
                                             key="transcript_id",
                                             a_dict=overlapping_tx,
                                             new_key=key_name)
            gtf.write(outputfile,
                      gc_off=True)

    else:
        values = ",".join(set(overlapping_tx.keys()))
        gtf.select_by_key("transcript_id",
                          values,
                          invert_match).write(outputfile, gc_off=True)

    gc.disable()
    close_properly(outputfile, inputfile)
def select_by_intron_size(inputfile=None,
                          outputfile=None,
                          intron_size=0,
                          merged=False,
                          invert_match=False,
                          delete_monoexonic=False,
                          add_intron_size=False):
    """
    Select genes which contain an intron of size at least s or whose sum of intron size is at least s
    """

    message("Searching for intronic regions.")

    gtf = GTF(inputfile, check_ensembl_format=False)

    introns_bo = gtf.get_introns(by_transcript=True,
                                 name=["transcript_id"],
                                 intron_nb_in_name=False).sort()

    # Get the list of transcripts
    all_tx_ids = gtf.get_tx_ids(nr=True)

    # The list of transcripts
    # to be deleted
    to_delete = OrderedDict()

    if merged:
        # Create a dict that will contain the sum of introns for
        # each transcript
        intron_sum_dict = OrderedDict.fromkeys(all_tx_ids, 0)

        for i in introns_bo:
            size = i.end - i.start
            tx_id = i.name
            intron_sum_dict[tx_id] += size

        for tx_id, sum_intron in list(intron_sum_dict.items()):

            if sum_intron != 0:
                if not invert_match:
                    if sum_intron < intron_size:
                        to_delete[tx_id] = 1

                else:
                    if sum_intron >= intron_size:
                        to_delete[tx_id] = 1
            else:
                if delete_monoexonic:
                    to_delete[tx_id] = 1

        if add_intron_size:
            gtf = gtf.add_attr_from_dict(feat="transcript",
                                         key="transcript_id",
                                         a_dict=intron_sum_dict,
                                         new_key="intron_size_sum")

    else:

        # Create a dict that will contain a list introns size
        # for each transcript

        intron_size_dict = defaultdict(list)

        for tx_id in all_tx_ids:
            intron_size_dict[tx_id] = []

        for i in introns_bo:
            size = i.end - i.start
            tx_id = i.name

            intron_size_dict[tx_id] += [size]

        for tx_id, list_size in list(intron_size_dict.items()):
            if not list_size:
                intron_size_dict[tx_id] = [0]
                if delete_monoexonic:
                    to_delete[tx_id] = 1
                continue

            for size in intron_size_dict[tx_id]:

                if not invert_match:
                    if size < intron_size:
                        to_delete[tx_id] = 1

                else:
                    if size >= intron_size:
                        to_delete[tx_id] = 1

        if add_intron_size:

            for tx_id, list_size in list(intron_size_dict.items()):
                list_size = [str(x) for x in list_size]
                intron_size_dict[tx_id] = ",".join(list_size)

            gtf = gtf.add_attr_from_dict(feat="transcript",
                                         key="transcript_id",
                                         a_dict=intron_size_dict,
                                         new_key="intron_size")

    all_tx_ids = gtf.get_tx_ids(nr=True)
    all_tx_ids = [x for x in all_tx_ids if x not in to_delete]
    msg_list = ",".join(list(to_delete.keys()))
    nb_char = min([len(msg_list), 40])
    msg_list = msg_list[0:nb_char]
    message("Deleting: " + msg_list + "...")

    gtf = gtf.select_by_key("transcript_id", ",".join(all_tx_ids))

    gtf.write(outputfile, gc_off=True)

    close_properly(outputfile, inputfile)
def closest_genes(
        inputfile=None,
        outputfile=None,
        from_region_type=None,
        no_header=False,
        nb_neighbors=1,
        to_region_type=None,
        same_strandedness=False,
        diff_strandedness=False,
        text_format=False,
        identifier="gene_id",
        collapse=False):
    """
    Find the n closest genes for each gene.
    """

    if same_strandedness and diff_strandedness:
        message("--same-strandedness and --diff-strandedness are "
                "mutually exclusive.",
                type="ERROR")

    # ----------------------------------------------------------------------
    # load GTF
    # ----------------------------------------------------------------------

    gtf = GTF(inputfile)
    gn_gtf = gtf.select_by_key("feature", "gene")
    gn_ids = gn_gtf.get_gn_ids(nr=True)

    if len(gn_gtf) == 0:
        message("No gene feature found. Please use convert_ensembl.",
                type="ERROR")
    if nb_neighbors >= (len(gn_gtf) - 1):
        message("Two much neighbors",
                type="ERROR")

    all_ids = gn_gtf.extract_data(identifier, as_list=True, no_na=False)

    if "." in all_ids:
        message("Some identifiers are undefined ('.').",
                type="ERROR")

    if len(all_ids) == 0:
        message("The identifier was not found.",
                type="ERROR")

    # ----------------------------------------------------------------------
    # load GTF and requested regions (for source/'from' transcript)
    # ----------------------------------------------------------------------

    if from_region_type == 'tss':
        from_regions = gn_gtf.get_5p_end(feat_type="gene",
                                         name=[identifier],
                                         ).cut([0, 1, 2,
                                                3, 4, 5]).sort()
    elif from_region_type == 'tts':
        from_regions = gn_gtf.get_3p_end(feat_type="gene",
                                         name=[identifier],
                                         ).cut([0, 1, 2,
                                                3, 4, 5]).sort()
    elif from_region_type == 'gene':
        from_regions = gn_gtf.to_bed(name=[identifier],
                                     ).cut([0, 1, 2,
                                            3, 4, 5]).sort()
    else:
        message("Unknown type.", type="ERROR")

    # ----------------------------------------------------------------------
    # load GTF and requested regions (for dest/'to' transcript)
    # ----------------------------------------------------------------------

    if to_region_type == 'tss':
        to_regions = gn_gtf.get_5p_end(feat_type="gene",
                                       name=[identifier],
                                       ).cut([0, 1, 2,
                                              3, 4, 5]).sort()
    elif to_region_type == 'tts':
        to_regions = gn_gtf.get_3p_end(feat_type="gene",
                                       name=[identifier],
                                       ).cut([0, 1, 2,
                                              3, 4, 5]).sort()

    elif to_region_type == 'gene':
        to_regions = gn_gtf.to_bed(name=[identifier],
                                   ).cut([0, 1, 2,
                                          3, 4, 5]).sort()
    else:
        message("Unknown type.", type="ERROR")

    # ----------------------------------------------------------------------
    # Search closest genes
    # ----------------------------------------------------------------------

    gene_closest = defaultdict(list)
    gene_closest_dist = defaultdict(list)

    closest_bo = from_regions.closest(b=to_regions,
                                      k=nb_neighbors,
                                      N=True,
                                      s=same_strandedness,
                                      S=diff_strandedness,
                                      d=True)

    for i in closest_bo:
        gene_closest[i[3]] += [i[9]]
        gene_closest_dist[i[3]] += [i[12]]

    if not text_format:

        if len(gene_closest):
            gtf = gtf.add_attr_from_dict(feat="gene",
                                         key=identifier,
                                         a_dict=gene_closest,
                                         new_key="closest_gn")

            gtf = gtf.add_attr_from_dict(feat="gene",
                                         key=identifier,
                                         a_dict=gene_closest_dist,
                                         new_key="closest_dist")

        gtf.write(outputfile, gc_off=True)

    else:
        if not no_header:
            outputfile.write("genes\tclosest_genes\tdistances\n")

        for gene in gn_ids:

            if not collapse:

                outputfile.write("\t".join([gene,
                                            ",".join(gene_closest[gene]),
                                            ",".join(gene_closest_dist[gene])]) + "\n")

            else:

                for closest, dist in zip(gene_closest[gene],
                                         gene_closest_dist[gene]):
                    outputfile.write("\t".join([gene,
                                                closest,
                                                dist]) + "\n")

        gc.disable()

    close_properly(outputfile, inputfile)
def convergent(inputfile=None,
               outputfile=None,
               upstream=1500,
               downstream=1500,
               chrom_info=None):
    """
    Find transcript with convergent tts.
    """

    message("Using -u " + str(upstream) + ".")
    message("Using -d " + str(downstream) + ".")

    tx_to_convergent_nm = dict()
    dist_to_convergent = dict()
    tts_pos = dict()

    message("Loading GTF.")

    gtf = GTF(inputfile)

    message("Getting transcript coordinates.")

    tx_feat = gtf.select_by_key("feature", "transcript")

    message("Getting tts coordinates.")

    tts_bo = tx_feat.get_tts(name=["transcript_id", "gene_id"], sep="||")

    # get tts position
    for i in tts_bo:
        tx_id_ov, gn_id_ov = i.name.split("||")
        tts_pos[tx_id_ov] = int(i.start)

    message("Getting tts coordinates.")

    tts_region_bo = tts_bo.slop(s=True,
                                l=upstream,
                                r=downstream,
                                g=chrom_info.name).cut([0, 1, 2, 3, 4, 5])

    message("Intersecting...")
    tts_intersect_bo = tts_region_bo.intersect(tts_bo,
                                               wb=True,
                                               s=False,
                                               S=True)

    tmp_file = make_tmp_file("tts_slop", ".bed")
    tts_region_bo.saveas(tmp_file.name)
    tmp_file = make_tmp_file("tts_slop_intersection_with_tts_as_", ".bed")
    tts_intersect_bo.saveas(tmp_file.name)

    for i in tts_intersect_bo:

        tx_id_main, gene_id_main = i.fields[3].split("||")
        tx_id_ov, gn_id_ov = i.fields[9].split("||")

        if gene_id_main != gn_id_ov:
            if tx_id_main in tx_to_convergent_nm:
                dist = abs(tts_pos[tx_id_main] - tts_pos[tx_id_ov])
                if dist < dist_to_convergent[tx_id_main]:
                    dist_to_convergent[tx_id_main] = dist
                    tx_to_convergent_nm[tx_id_main] = tx_id_ov
            else:
                dist = abs(tts_pos[tx_id_main] - tts_pos[tx_id_ov])
                dist_to_convergent[tx_id_main] = dist
                tx_to_convergent_nm[tx_id_main] = tx_id_ov

    if len(tx_to_convergent_nm):
        gtf = gtf.add_attr_from_dict(feat="transcript",
                                     key="transcript_id",
                                     a_dict=tx_to_convergent_nm,
                                     new_key="convergent")

        gtf = gtf.add_attr_from_dict(feat="transcript",
                                     key="transcript_id",
                                     a_dict=dist_to_convergent,
                                     new_key="dist_to_convergent")

    gtf.write(outputfile, gc_off=True)

    close_properly(outputfile, inputfile)
Exemple #9
0
def feature_size(inputfile=None,
                 outputfile=None,
                 ft_type="transcript",
                 names="transcript_id",
                 key_name='feature_size',
                 separator="|",
                 bed=False):
    """
 Get the size and limits (start/end) of features enclosed in the GTF. If bed
 format is requested returns the limits zero-based half open and the size as a score.
 Otherwise output GTF file with 'feat_size' as a new key and size as value.
    """

    message("Computing feature sizes.")

    gtf = GTF(inputfile)

    feat_list = gtf.get_feature_list(nr=True) + ['mature_rna']

    if ft_type not in feat_list + ["*"]:
        message("Unable to find requested feature.", type="ERROR")

    names = names.split(",")

    if ft_type != 'mature_rna':

        if bed:
            bed_obj = gtf.select_by_key("feature",
                                        ft_type).to_bed(name=names,
                                                        sep=separator,
                                                        add_feature_type=True)

            for i in bed_obj:
                i.score = str(i.end - i.start)
                write_properly(chomp(str(i)), outputfile)
        else:

            tmp_file = make_tmp_file(prefix="feature_size", suffix=".txt")

            elmt = gtf.extract_data("feature,start,end",
                                    as_list_of_list=True,
                                    no_na=False,
                                    hide_undef=False)

            for i in elmt:
                if i[0] != ft_type and ft_type != "*":
                    tmp_file.write("?\n")
                else:
                    tmp_file.write(str(int(i[2]) - int(i[1]) + 1) + "\n")

            tmp_file.close()

            gtf.add_attr_column(tmp_file, key_name).write(outputfile,
                                                          gc_off=True)

    else:

        tx_size = gtf.get_transcript_size()

        if bed:
            bed_obj = gtf.select_by_key("feature", 'transcript').to_bed(
                ['transcript_id'] + names,
                add_feature_type=False,
                sep=separator,
                more_name=['mature_rna'])

            for i in bed_obj:
                names = i.name.split(separator)
                tx_id = names.pop(0)
                i.score = tx_size[tx_id]
                i.name = separator.join(names)
                write_properly(chomp(str(i)), outputfile)
        else:

            if len(tx_size):
                gtf = gtf.add_attr_from_dict(feat="transcript",
                                             key="transcript_id",
                                             a_dict=tx_size,
                                             new_key=key_name)

            gtf.write(outputfile, gc_off=True)

    close_properly(outputfile, inputfile)