Beispiel #1
0
def convert(inputfile=None,
            outputfile=None,
            format="bed",
            names="gene_id,transcript_id",
            separator="|",
            more_names=''):
    """
    Convert a GTF to a BED-like format.

    With ``format="bed3"`` the seqid/start/end columns are written
    tab-separated, the start being decremented by one. With ``"bed"`` or
    ``"bed6"`` the work is delegated to ``GTF.write_bed`` using ``names``,
    ``separator`` and ``more_names``. Any other value writes nothing.
    """

    if format == "bed3":
        coordinates = GTF(inputfile, check_ensembl_format=False).extract_data(
            "seqid,start,end",
            as_list_of_list=True,
            hide_undef=False,
            no_na=False)

        for row in coordinates:
            # Shift the start coordinate by one (GTF one-based -> BED
            # zero-based start).
            row[1] = str(int(row[1]) - 1)
            outputfile.write("\t".join(row) + "\n")

    elif format in ("bed", "bed6"):
        GTF(inputfile, check_ensembl_format=False).write_bed(
            outputfile=outputfile,
            name=names,
            sep=separator,
            more_name=more_names)

    gc.disable()
    close_properly(outputfile, inputfile)
Beispiel #2
0
def count(inputfile=None, outputfile=None, header=None, additional_text=None):
    """
    Count how many times each feature type occurs in the GTF file.

    One tab-separated line ``feature<TAB>count[<TAB>additional_text]`` is
    written per feature, in order of first appearance. When ``header`` (a
    comma-separated string) is given, it is written first as a
    tab-separated line.
    """

    column_names = None if header is None else header.split(",")

    gtf = GTF(inputfile, check_ensembl_format=False)

    tally = OrderedDict()
    for record in gtf.extract_data("feature"):
        feature = record[0]
        tally[feature] = tally.get(feature, 0) + 1

    if column_names is not None:
        outputfile.write("\t".join(column_names) + "\n")

    for feature, occurrences in tally.items():
        columns = [feature, str(occurrences)]
        if additional_text is not None:
            columns.append(additional_text)
        outputfile.write("\t".join(columns) + "\n")

    gc.disable()
    close_properly(outputfile, inputfile)
Beispiel #3
0
def bed_to_gtf(inputfile=None,
               outputfile=None,
               ft_type="transcript",
               source="Unknown"):
    """
    Convert a BED file into GTF lines.

    Every BED record becomes one GTF line of type ``ft_type`` with
    ``source`` as second column. Empty strand, name and score fields are
    replaced by ".", "feature_<rank>" and "0" respectively. The record name
    is reused for gene_id (and transcript_id / exon_id depending on
    ``ft_type``).
    """

    message("Converting the bed file into GTF file.")

    if inputfile.name == '<stdin>':
        # BedTool is built from a path, so stdin is first copied to a
        # temporary file.
        spool = make_tmp_file(prefix="input_bed", suffix=".bed")
        for raw_line in inputfile:
            write_properly(chomp(str(raw_line)), spool)

        spool.close()
        inputfile.close()
        bed_path = spool.name
    else:
        bed_path = inputfile.name

    bed_obj = BedTool(bed_path)

    for rank, feat in enumerate(bed_obj, start=1):

        if feat.strand == "":
            feat.strand = "."
        if feat.name == "":
            feat.name = str("feature_" + str(rank))
        if feat.score == "":
            feat.score = "0"

        gene_attr = "gene_id \"" + feat.name + "\";"
        tx_attr = "transcript_id \"" + feat.name + "\";"

        if ft_type == "exon":
            exon_attr = "exon_id \"" + feat.name + "\";"
            key_value = " ".join([gene_attr, tx_attr, exon_attr])
        elif ft_type == "gene":
            key_value = gene_attr
        else:
            key_value = gene_attr + " " + tx_attr

        chrom_out = feat.chrom
        if pygtftk.utils.ADD_CHR == 1:
            chrom_out = "chr" + chrom_out

        # BED start is zero-based: add one for the GTF start column.
        gtf_columns = [
            chrom_out,
            source,
            ft_type,
            str(feat.start + 1),
            str(feat.end),
            str(feat.score),
            feat.strand,
            ".",
            key_value,
        ]

        write_properly("\t".join(gtf_columns), outputfile)

    gc.disable()
    close_properly(outputfile)
Beispiel #4
0
def intronic(inputfile=None,
             outputfile=None,
             names='transcript_id',
             separator="_",
             intron_nb_in_name=False,
             no_feature_name=False,
             by_transcript=False):
    """
    Extract intronic regions and write them one per line.

    When ``by_transcript`` is set, introns are requested per transcript and
    named from the comma-separated ``names`` keys joined with
    ``separator``; otherwise ``GTF.get_introns`` is called with its
    defaults.
    """

    message("Searching for intronic regions.")

    # The GTF is loaded through the GTF class (input may be <stdin>).
    gtf = GTF(inputfile, check_ensembl_format=False)

    if by_transcript:
        introns_bo = gtf.get_introns(by_transcript=True,
                                     name=names.split(","),
                                     sep=separator,
                                     intron_nb_in_name=intron_nb_in_name,
                                     feat_name=not no_feature_name)
    else:
        introns_bo = gtf.get_introns()

    for intron in introns_bo:
        write_properly(chomp(str(intron)), outputfile)

    gc.disable()
    close_properly(outputfile, inputfile)
Beispiel #5
0
def get_attr_value_list(inputfile=None,
                        outputfile=None,
                        key_name="gene_id",
                        print_key_name=False,
                        separator="\n",
                        count=False):
    """
    Write the values observed for one or several attributes.

    ``key_name`` is a comma-separated list of attribute keys. Without
    ``count`` each value is written on its own line (prefixed by the key and
    ``separator`` when ``print_key_name`` is set). With ``count`` each value
    is followed by its count, and a newline ``separator`` is replaced by a
    tab.
    """

    gtf = GTF(inputfile, check_ensembl_format=False)

    if count:
        # A newline separator would split value and count over two lines.
        if separator == "\n":
            separator = "\t"

        for akey in key_name.split(","):
            for pair in gtf.get_attr_value_list(akey, count=True):
                prefix = akey + separator if print_key_name else ""
                outputfile.write(prefix + pair[0] + separator + pair[1] +
                                 "\n")
    else:
        for akey in key_name.split(","):
            for value in gtf.get_attr_value_list(akey):
                if print_key_name:
                    outputfile.write(akey + separator + value + "\n")
                else:
                    outputfile.write(value + "\n")

    gc.disable()
    close_properly(outputfile, inputfile)
Beispiel #6
0
def midpoints(inputfile=None,
              outputfile=None,
              ft_type="transcript",
              names="transcript_id",
              separator="|"):
    """
    Write the midpoint coordinates of the requested features.

    Input read from <stdin>, or detected by BedTool as 'gff', is handled as
    a GTF: features of type ``ft_type`` are selected and their midpoints
    obtained from ``get_midpoints``. Any other input is treated as a set of
    intervals whose coordinates are rewritten in place to their midpoint.
    """

    message("Loading input file...")
    if inputfile.name == '<stdin>':
        is_gtf = True
    else:
        region_bo = BedTool(inputfile.name)
        if len(region_bo) == 0:
            message("Unable to find requested regions", type="ERROR")

        is_gtf = region_bo.file_type == 'gff'

    if is_gtf:

        gtf = GTF(inputfile.name, check_ensembl_format=False)

        selected = gtf.select_by_key("feature", ft_type)
        bed_obj = selected.get_midpoints(name=names.split(","),
                                         sep=separator)
        for line in bed_obj:
            write_properly(chomp(str(line)), outputfile)

    else:
        for line in region_bo:

            diff = line.end - line.start
            half = int(diff // 2)

            if diff % 2 == 0:
                # Even length: no single central base, keep the two
                # central ones.
                # e.g. 10-14 (zero-based) -> 11-14 (one-based);
                # midpoint is 12-13 (one-based) -> 11-13 (zero-based).
                line.start = line.start + half - 1
                line.end = line.start + 2
            else:
                # Odd length: a single central base.
                # e.g. 10-13 (zero-based) -> 11-13 (one-based);
                # midpoint is 12 (one-based) -> 11-12 (zero-based).
                line.end = line.start + half + 1
                line.start = line.end - 1

            outputfile.write(str(line))

    gc.disable()
    close_properly(outputfile, inputfile)
Beispiel #7
0
def del_attr(inputfile=None,
             outputfile=None,
             key="transcript_id",
             reg_exp=False,
             invert_match=False):
    """
    Delete extended attributes in the target gtf file.

    ``key`` is either a comma-separated list of attribute names or, when
    ``reg_exp`` is set, a regular expression searched against the extended
    attribute names of the GTF. With ``invert_match`` every extended
    attribute that is NOT selected is deleted instead.
    """

    # ----------------------------------------------------------------------
    # Read the GTF and get the list of attributes
    # ----------------------------------------------------------------------

    gtf = GTF(inputfile, check_ensembl_format=False)

    attr_list = gtf.attr_extended

    # ----------------------------------------------------------------------
    # If regExp, select the corresponding keys
    # ----------------------------------------------------------------------

    if reg_exp:
        try:
            rgxp = re.compile(key)
        except (re.error, TypeError):
            # NOTE(review): message(type="ERROR") is presumably fatal;
            # otherwise rgxp is undefined below -- confirm.
            message("Check the regular expression please.", type="ERROR")

        key_list = [attr for attr in attr_list if rgxp.search(attr)]
    else:
        key_list = key.split(",")

    # ----------------------------------------------------------------------
    # If invert-match select all but the selected
    # ----------------------------------------------------------------------

    if invert_match:
        key_to_del = [attr for attr in attr_list if attr not in key_list]
    else:
        key_to_del = key_list

    # ----------------------------------------------------------------------
    # Delete the keys
    # ----------------------------------------------------------------------

    # key_to_del (not key_list) must be used here, otherwise invert_match
    # is silently ignored.
    gtf.del_attr(feat="*", keys=",".join(key_to_del),
                 force=True).write(outputfile, gc_off=True)

    close_properly(outputfile, inputfile)
Beispiel #8
0
def convert_ensembl(inputfile=None, outputfile=None, no_check_gene_chr=False):
    """
    Convert the GTF file to ensembl format and write it to ``outputfile``.

    ``no_check_gene_chr`` is negated and forwarded as ``check_gene_chr`` to
    ``GTF.convert_to_ensembl``.
    """

    gtf = GTF(inputfile, check_ensembl_format=False)
    converted = gtf.convert_to_ensembl(check_gene_chr=not no_check_gene_chr)
    converted.write(outputfile, gc_off=True)

    close_properly(outputfile, inputfile)
Beispiel #9
0
def seqid_list(inputfile=None, outputfile=None, separator=""):
    """
    Write the seqid/chromosome names found in the GTF.

    Each name returned by ``get_chroms(nr=True)`` is written followed by
    ``separator`` (including after the last one).
    """

    gtf = GTF(inputfile, check_ensembl_format=False)
    chroms = gtf.get_chroms(nr=True)

    for seqid in chroms:
        outputfile.write(str(seqid) + separator)

    gc.disable()
    close_properly(outputfile, inputfile)
Beispiel #10
0
def add_exon_nb(inputfile=None, outputfile=None, exon_numbering_key=None):
    """
    Add the exon number to each exon (based on 5' to 3' orientation).

    The GTF is loaded from ``inputfile.name``; ``exon_numbering_key`` is
    forwarded to ``GTF.add_exon_number`` and the result is written to
    ``outputfile``.
    """

    message("Calling nb_exons.", type="DEBUG")

    gtf = GTF(inputfile.name, check_ensembl_format=False)
    numbered = gtf.add_exon_number(exon_numbering_key)
    numbered.write(outputfile, gc_off=True)

    close_properly(inputfile, outputfile)
def get_feature_list(inputfile=None, outputfile=None, separator=""):
    """
    Write the list of feature types enclosed in the GTF.

    Each feature returned by ``get_feature_list(nr=True)`` is written
    followed by ``separator`` (including after the last one).
    """

    features = GTF(inputfile,
                   check_ensembl_format=False).get_feature_list(nr=True)

    for feature in features:
        outputfile.write(str(feature) + separator)

    gc.disable()
    close_properly(outputfile, inputfile)
Beispiel #12
0
def select_by_numeric_value(inputfile=None,
                            outputfile=None,
                            test=None,
                            na_omit=None):
    """
    Select lines from a GTF file based on a boolean test on numeric values.

    ``test`` and ``na_omit`` are forwarded to ``GTF.eval_numeric``; the
    resulting GTF is written to ``outputfile``.
    """

    gtf = GTF(inputfile, check_ensembl_format=False)
    selected = gtf.eval_numeric(test, na_omit=na_omit)
    selected.write(outputfile, gc_off=True)

    close_properly(outputfile, inputfile)
def select_by_regexp(inputfile=None,
                     outputfile=None,
                     key=None,
                     regexp=None,
                     invert_match=False):
    """
    Select lines from a GTF file based on attributes and associated values.

    ``key``, ``regexp`` and ``invert_match`` are forwarded positionally to
    ``GTF.select_by_regexp``; the result is written to ``outputfile``.
    """

    gtf = GTF(inputfile, check_ensembl_format=False)
    matching = gtf.select_by_regexp(key, regexp, invert_match)
    matching.write(outputfile, gc_off=True)

    close_properly(outputfile, inputfile)
Beispiel #14
0
def add_prefix(inputfile=None,
               outputfile=None,
               key="transcript_id",
               text=None,
               target_feature="*",
               suffix=False):
    """
    Add a prefix (or a suffix) to target values.

    ``target_feature``, ``key``, ``text`` and ``suffix`` are forwarded
    positionally to ``GTF.add_prefix``; the result is written to
    ``outputfile``.
    """

    modified = GTF(inputfile, check_ensembl_format=False).add_prefix(
        target_feature, key, text, suffix)
    modified.write(outputfile, gc_off=True)

    close_properly(outputfile, inputfile)
Beispiel #15
0
def merge_attr(inputfile=None,
               outputfile=None,
               src_key="gene_id,transcript_id",
               separator="|",
               target_feature="*",
               dest_key="gene_tx_ids"):
    """
    Merge a set of attributes into a destination attribute.

    ``target_feature``, ``src_key``, ``dest_key`` and ``separator`` are
    forwarded positionally to ``GTF.merge_attr``; the result is written to
    ``outputfile``.
    """

    gtf = GTF(inputfile, check_ensembl_format=False)
    merged = gtf.merge_attr(target_feature, src_key, dest_key, separator)
    merged.write(outputfile, gc_off=True)

    close_properly(outputfile, inputfile)
Beispiel #16
0
def random_list(inputfile=None,
                outputfile=None,
                number=None,
                ft_type=None,
                seed_value=None):
    """
    Select a random list of genes or transcripts.

    ``number`` identifiers are sampled among the gene ids (when ``ft_type``
    is 'gene') or the transcript ids (any other value), and the GTF lines
    carrying these identifiers are written to ``outputfile``. When
    ``number`` exceeds the number of available identifiers, a warning is
    emitted and all of them are used. ``seed_value``, when given, seeds the
    random generator.
    """

    message("loading the GTF.")

    gtf = GTF(inputfile)

    message("Getting ID list.")

    # The same key is used to extract the ids and to select on them.
    # Building the selection key from ft_type separately crashed on
    # ft_type=None and mismatched for any value other than
    # 'gene'/'transcript'.
    my_id = "gene_id" if ft_type == 'gene' else "transcript_id"

    id_list = gtf.extract_data(my_id,
                               as_list=True,
                               nr=True,
                               hide_undef=True,
                               no_na=True)

    if number > len(id_list):
        message("To much feature. Using : " + str(len(id_list)),
                type="WARNING")
        number = len(id_list)

    if seed_value is not None:
        random.seed(seed_value, version=1)

    id_list = random.sample(id_list, number)

    message("Printing.")

    gtf.select_by_key(my_id, ",".join(id_list)).write(outputfile, gc_off=True)

    close_properly(outputfile, inputfile)
Beispiel #17
0
def select_by_nb_exon(inputfile=None,
                      outputfile=None,
                      min_exon_number=None,
                      max_exon_number=None):
    """
    Select transcripts based on their number of exons.

    ``min_exon_number`` and ``max_exon_number`` are forwarded to
    ``GTF.select_by_number_of_exons``; the result is written to
    ``outputfile``.
    """

    message("Selecting transcript by exon number (range: [{m},{M}])".format(
        m=str(min_exon_number), M=str(max_exon_number)))

    loaded = GTF(inputfile, check_ensembl_format=False)
    selected = loaded.select_by_number_of_exons(min_exon_number,
                                                max_exon_number)
    selected.write(outputfile, gc_off=True)

    close_properly(outputfile, inputfile)
Beispiel #18
0
def intron_sizes(
        inputfile=None,
        outputfile=None,
        key_name=None):
    """
    Add to transcript features a key holding a comma-separated list of
    intron sizes.

    Sizes are computed as end - start of each per-transcript intron. The
    list is reversed for transcripts on the "-" strand, and transcripts
    without intron get "0". The new attribute is named ``key_name``.
    """

    gtf = GTF(inputfile, check_ensembl_format=False)

    all_tx_ids = gtf.get_tx_ids(nr=True)
    intron_bo = gtf.get_introns(by_transcript=True,
                                name=["transcript_id"],
                                intron_nb_in_name=False,
                                feat_name=False)

    transcripts = gtf.select_by_key("feature", "transcript")
    strands = transcripts.extract_data("transcript_id,strand",
                                       as_dict_of_values=True,
                                       no_na=True,
                                       nr=True,
                                       hide_undef=True)

    # Collect the raw sizes per transcript (the intron name is the
    # transcript id).
    sizes_by_tx = {tx: [] for tx in all_tx_ids}
    for bed_line in intron_bo:
        sizes_by_tx[bed_line.name].append(str(bed_line.end - bed_line.start))

    # Turn each list into the string stored in the GTF.
    intron_size = {}
    for tx_id, sizes in sizes_by_tx.items():
        if not sizes:
            intron_size[tx_id] = "0"
        elif strands[tx_id] == "-":
            intron_size[tx_id] = ",".join(reversed(sizes))
        else:
            intron_size[tx_id] = ",".join(sizes)

    if intron_size:
        gtf = gtf.add_attr_from_dict(feat="transcript",
                                     key="transcript_id",
                                     a_dict=intron_size,
                                     new_key=key_name)
    gtf.write(outputfile, gc_off=True)
    close_properly(outputfile, inputfile)
Beispiel #19
0
def exon_sizes(inputfile=None, outputfile=None, key_name=None):
    """
    Add to transcript features a key holding a comma-separated list of
    exon sizes.

    Each size is end - start + 1. The list is reversed for transcripts on
    the "-" strand. The new attribute is named ``key_name``.
    """

    gtf = GTF(inputfile)

    all_tx_ids = gtf.get_tx_ids(nr=True)

    exons_starts = gtf.select_by_key("feature", "exon").extract_data(
        "transcript_id,start",
        as_dict_of_merged_list=True,
        no_na=True,
        nr=False)

    if len(exons_starts) == 0:
        message("No exon found.", type="ERROR")

    exons_ends = gtf.select_by_key("feature", "exon").extract_data(
        "transcript_id,end",
        as_dict_of_merged_list=True,
        no_na=True,
        nr=False)

    strands = gtf.select_by_key("feature", "transcript").extract_data(
        "transcript_id,strand",
        as_dict_of_values=True,
        no_na=True,
        nr=True,
        hide_undef=True)

    tx_to_size_list = {}
    for tx_id in all_tx_ids:
        sizes = [
            str(int(end) - int(start) + 1)
            for start, end in zip(exons_starts[tx_id], exons_ends[tx_id])
        ]
        if strands[tx_id] == "-":
            sizes.reverse()
        tx_to_size_list[tx_id] = ",".join(sizes)

    if tx_to_size_list:
        gtf = gtf.add_attr_from_dict(feat="transcript",
                                     key="transcript_id",
                                     a_dict=tx_to_size_list,
                                     new_key=key_name)
    gtf.write(outputfile, gc_off=True)
    close_properly(outputfile, inputfile)
Beispiel #20
0
def random_tx(inputfile=None,
              outputfile=None,
              max_transcript=None,
              seed_value=None):
    """
    Randomly keep up to ``max_transcript`` transcripts for each gene.

    Gene lines are discarded when loading. For every gene, positions in its
    transcript list are sampled; transcripts at non-sampled positions are
    removed from the output. ``seed_value``, when given, seeds the random
    generator.
    """

    message("loading the GTF.")

    gtf = GTF(inputfile).select_by_key("feature", "gene", invert_match=True)

    message("Getting gene_id and transcript_id")

    gene2tx = gtf.extract_data("gene_id,transcript_id",
                               as_dict_of_merged_list=True,
                               no_na=True,
                               nr=True)

    message("Selecting random transcript")

    if seed_value is not None:
        random.seed(seed_value, version=1)

    tx_to_delete = []

    for gn_id in gene2tx:
        candidates = gene2tx[gn_id]
        quota = min(max_transcript, len(candidates))
        kept_positions = set(
            random.sample(list(range(len(candidates))), quota))
        tx_to_delete.extend(tx for pos, tx in enumerate(candidates)
                            if pos not in kept_positions)

    message("Printing results")

    message("Selecting transcript.")
    remaining = gtf.select_by_key("transcript_id",
                                  ",".join(tx_to_delete),
                                  invert_match=True)
    remaining.write(outputfile, gc_off=True)

    close_properly(outputfile, inputfile)
def join_multi_file(inputfile=None,
                    outputfile=None,
                    target_feature=None,
                    key_to_join=None,
                    matrix_files=()):
    """
    Join attributes from a set of tabulated files.

    ``target_feature`` is a comma-separated list of feature types (or "*");
    when None, all feature types found in the GTF are targeted. Each file of
    ``matrix_files`` is joined in turn on ``key_to_join`` through
    ``GTF.add_attr_from_matrix_file``.
    """

    # -----------------------------------------------------------
    #  Load the GTF
    # -----------------------------------------------------------

    gtf = GTF(inputfile, check_ensembl_format=False)

    # -----------------------------------------------------------
    #  Check target feature
    # -----------------------------------------------------------

    feat_list = gtf.get_feature_list(nr=True)

    if target_feature is None:
        target_feature = ",".join(feat_list)
    else:
        accepted = feat_list + ["*"]
        for requested in target_feature.split(","):
            if requested not in accepted:
                message("Feature " + requested + " not found.", type="ERROR")

    # -----------------------------------------------------------
    #  Join every matrix file, then write
    # -----------------------------------------------------------

    for join_file in matrix_files:
        gtf = gtf.add_attr_from_matrix_file(feat=target_feature,
                                            key=key_to_join,
                                            inputfile=join_file.name)
    gtf.write(outputfile, gc_off=True)

    gc.disable()
    close_properly(outputfile, inputfile)
Beispiel #22
0
def get_attr_list(
        inputfile=None,
        outputfile=None,
        separator="\n"):
    """
    Write the list of attributes found in a GTF file.

    Attributes are written joined by ``separator``; nothing is appended
    after the last one.
    """

    attr_list = GTF(inputfile, check_ensembl_format=False).get_attr_list()
    last_pos = len(attr_list) - 1

    for pos, attr in enumerate(attr_list):
        if pos == last_pos:
            outputfile.write(attr)
        else:
            outputfile.write(attr + separator)

    gc.disable()
    close_properly(outputfile, inputfile)
Beispiel #23
0
def intergenic(inputfile=None, outputfile=None, chrom_info=None):
    """
    Extract intergenic regions.

    Regions returned by ``GTF.get_intergenic(chrom_info)`` are named
    "region_1", "region_2", ... in iteration order and written one per
    line.
    """

    message("Searching for intergenic regions.")

    intergenic_regions = GTF(inputfile).get_intergenic(chrom_info)

    for rank, region in enumerate(intergenic_regions, start=1):
        region.name = "region_" + str(rank)
        write_properly(chomp(str(region)), outputfile)

    gc.disable()
    close_properly(outputfile, inputfile)
Beispiel #24
0
def count_key_values(inputfile=None,
                     outputfile=None,
                     keys="gene_id,transcript_id",
                     uniq=True,
                     additional_text=None):
    """
    Count the number of values for a set of keys.

    ``keys`` is a comma-separated list of keys, or "*" for every attribute
    of the GTF. Values "." and "?" are ignored. With ``uniq`` distinct
    values are counted, otherwise every occurrence is. One line
    ``key<TAB>count[<TAB>additional_text]`` is written per key.
    """

    gtf = GTF(inputfile, check_ensembl_format=False)

    val_list = defaultdict(set) if uniq else defaultdict(list)

    if keys == "*":
        key_list = gtf.get_attr_list()
        keys = ",".join(key_list)
    else:
        key_list = keys.split(",")

    for row in gtf.extract_data(keys, as_list_of_list=True):
        for a_key, value in zip(key_list, row):
            if value in ['.', '?']:
                continue
            if uniq:
                val_list[a_key].add(value)
            else:
                val_list[a_key].append(value)

    for a_key in key_list:
        line = a_key + "\t" + str(len(val_list[a_key]))
        if additional_text is not None:
            line = line + "\t" + additional_text
        outputfile.write(line + "\n")

    gc.disable()
    close_properly(outputfile, inputfile)
Beispiel #25
0
def nb_exons(inputfile=None,
             outputfile=None,
             key_name=None,
             text_format=False):
    """
    Count the number of exons of each transcript in the gtf file.

    With ``text_format`` one line ``transcript_id<TAB>count<TAB>transcript``
    is written per transcript. Otherwise the count is added to transcript
    features under ``key_name`` and the GTF is written.
    """

    gtf = GTF(inputfile)
    n_exons = defaultdict(int)

    # -------------------------------------------------------------------------
    # Count one exon line per transcript_id
    # -------------------------------------------------------------------------

    message("Computing number of exons for each transcript in input GTF file.")

    exon_lines = gtf.select_by_key("feature", "exon")

    for record in exon_lines.extract_data("transcript_id"):
        n_exons[record[0]] += 1

    if text_format:
        for tx_id, exon_count in n_exons.items():
            outputfile.write(tx_id + "\t" + str(exon_count) +
                             "\ttranscript\n")
    else:
        if n_exons:
            gtf = gtf.add_attr_from_dict(feat="transcript",
                                         key="transcript_id",
                                         a_dict=n_exons,
                                         new_key=key_name)
        gtf.write(outputfile, gc_off=True)

    close_properly(outputfile, inputfile)
def nb_transcripts(inputfile=None,
                   outputfile=None,
                   text_format=False,
                   key_name=""):
    """
    Compute the number of transcripts per gene.

    With ``text_format`` one line ``gene_id<TAB>count`` is written per gene.
    Otherwise the counts are first written to a temporary file which is then
    joined to gene features under ``key_name`` before writing the GTF.
    """

    gtf = GTF(inputfile)

    message("Computing the number of transcript per gene in input GTF file.")

    # Gene -> transcripts mapping as provided by the GTF object.
    n_tx = gtf.get_gn_to_tx()

    if text_format:
        for gene_id in n_tx:
            outputfile.write(gene_id + "\t" + str(len(n_tx[gene_id])) + "\n")
    else:
        tmp_file = make_tmp_file(prefix="nb_tx", suffix=".txt")

        for gene_id in n_tx:
            tmp_file.write(gene_id + "\t" + str(len(n_tx[gene_id])) + "\n")

        tmp_file.close()

        annotated = gtf.add_attr_from_file(feat="gene",
                                           key="gene_id",
                                           new_key=key_name,
                                           inputfile=tmp_file.name)
        annotated.write(outputfile, gc_off=True)

    close_properly(outputfile, inputfile)
Beispiel #27
0
def divergent(
        inputfile=None,
        outputfile=None,
        key_name=None,
        upstream=1500,
        downstream=1500,
        chrom_info=None,
        no_strandness=False,
        no_annotation=False):
    """
    Find transcripts with divergent promoters.

    A promoter is the TSS extended by ``upstream``/``downstream`` bases
    (strand-aware slop bounded by ``chrom_info``). Promoters are
    intersected with TSSs (opposite strand only, unless ``no_strandness``).
    For each promoter transcript, the closest TSS belonging to another gene
    is retained. Unless ``no_annotation`` is set, the retained transcript
    and its distance are added as attributes (``key_name`` and
    "dist_" + ``key_name``, defaulting to "divergent" and
    "dist_to_divergent"); otherwise only transcripts having a divergent
    partner are written.
    """

    message("Using -u " + str(upstream) + ".")
    message("Using -d " + str(downstream) + ".")

    tx_with_divergent = {}
    dist_to_divergent = {}
    tss_pos = {}

    message("Loading GTF.")

    gtf = GTF(inputfile)

    message("Getting transcript coordinates.")

    tx_feat = gtf.select_by_key("feature", "transcript")

    message("Getting tss coordinates.")

    tss_bo = tx_feat.get_tss(name=["transcript_id", "gene_id"], sep="||")

    # Record the TSS position of every transcript.
    for tss in tss_bo:
        tx_id, _ = tss.name.split("||")
        tss_pos[tx_id] = int(tss.start)

    message("Getting promoter coordinates.")

    slopped = tss_bo.slop(s=True,
                          l=upstream,
                          r=downstream,
                          g=chrom_info.name)
    promoter_bo = slopped.cut([0, 1, 2, 3, 4, 5])

    message("Intersecting...")

    # S=True restricts hits to the opposite strand.
    prom_with_tss_bo = promoter_bo.intersect(tss_bo,
                                             wb=True,
                                             s=False,
                                             S=not no_strandness)

    # Intermediate results are saved to temporary files.
    tmp_file = make_tmp_file("promoter_slop", ".bed")
    promoter_bo.saveas(tmp_file.name)
    tmp_file = make_tmp_file("promoter_intersection_with_tss_as_", ".bed")
    prom_with_tss_bo.saveas(tmp_file.name)

    for hit in prom_with_tss_bo:

        # Field 3 names the promoter, field 9 names the overlapping TSS.
        tx_id_tss, gn_id_tss = hit.fields[9].split("||")
        tx_id_prom, gene_id_prom = hit.fields[3].split("||")

        if gene_id_prom == gn_id_tss:
            continue

        dist = abs(tss_pos[tx_id_prom] - tss_pos[tx_id_tss])

        # Keep the first candidate, then only strictly closer ones.
        if (tx_id_prom not in tx_with_divergent
                or dist < dist_to_divergent[tx_id_prom]):
            dist_to_divergent[tx_id_prom] = dist
            tx_with_divergent[tx_id_prom] = tx_id_tss

    if no_annotation:
        selected_ids = ",".join(list(tx_with_divergent.keys()))
        gtf.select_by_key("transcript_id",
                          selected_ids).write(outputfile, gc_off=True)
    else:
        if key_name is None:
            key_name = "divergent"
            key_name_dist = "dist_to_divergent"
        else:
            key_name_dist = "dist_" + key_name

        if tx_with_divergent:
            gtf = gtf.add_attr_from_dict(feat="transcript",
                                         key="transcript_id",
                                         a_dict=tx_with_divergent,
                                         new_key=key_name)

            gtf = gtf.add_attr_from_dict(feat="transcript",
                                         key="transcript_id",
                                         a_dict=dist_to_divergent,
                                         new_key=key_name_dist)

        gtf.write(outputfile, gc_off=True)

    close_properly(outputfile, inputfile)
Beispiel #28
0
def tabulate(inputfile=None,
             outputfile=None,
             key=None,
             no_unset=False,
             unique=False,
             no_basic=False,
             accept_undef=False,
             select_gene_ids=False,
             select_gene_names=False,
             select_transcript_ids=False,
             select_exon_ids=False,
             separator="\t",
             no_header=False):
    """
    Convert a GTF to tabulated format.

    ``key`` names the columns to extract ("all" or "*" for every attribute,
    basic ones being skipped with ``no_basic``). The ``select_*`` flags are
    shortcuts overriding ``key``. Rows containing "." are dropped with
    ``no_unset``; rows containing "?" are dropped unless ``accept_undef``.
    With ``unique`` a row is written only once. A header line is written
    unless ``no_header``.
    """

    # ----------------------------------------------------------------------
    # Check mode
    # ----------------------------------------------------------------------

    if select_transcript_ids:
        key = "transcript_id"
    elif select_gene_ids:
        key = "gene_id"
    elif select_gene_names:
        # Was "gene_id", which made this flag a duplicate of
        # select_gene_ids.
        key = "gene_name"
    elif select_exon_ids:
        key = "exon_id"

    # Field values that cause a row to be skipped.
    rejected = []
    if no_unset:
        rejected.append(".")
    if not accept_undef:
        rejected.append("?")

    # ----------------------------------------------------------------------
    # Read GTF and process
    # ----------------------------------------------------------------------

    gtf = GTF(inputfile, check_ensembl_format=False)

    if key in ["all", "*"]:
        attr_list = gtf.get_attr_list(add_basic=not no_basic)
        tab = gtf.extract_data(attr_list)
    else:
        tab = gtf.extract_data(key)

    if not no_header:
        message("Writing header")
        write_properly(separator.join(tab.colnames),
                       outputfile)

    message("Writing")

    try:
        seen = set()
        for row in tab:
            if unique:
                signature = tuple(row)
                if signature in seen:
                    continue
                seen.add(signature)

            if rejected and any(x in rejected for x in row.fields):
                continue

            row.write(outputfile, separator)

    except (BrokenPipeError, IOError):
        def _void_f(*args, **kwargs):
            pass

        # The reader went away (e.g. piping into `head`): silence stdout so
        # that later writes/flushes do not raise again.
        message("Received a boken pipe signal", type="WARNING")
        sys.stdout.write = _void_f
        sys.stdout.flush = _void_f

    gc.disable()
    close_properly(outputfile, inputfile)
Beispiel #29
0
def discretize_key(inputfile=None,
                   outputfile=None,
                   src_key=None,
                   dest_key="disc_key",
                   nb_levels=2,
                   percentiles=False,
                   percentiles_of_uniq=False,
                   precision=2,
                   log=False,
                   labels=None):
    """
    Create a new key by discretizing a numeric key.

    The values of ``src_key`` that can be converted to float are binned
    with ``pandas.cut`` and the resulting class is stored, for the
    corresponding lines, under ``dest_key``. Non-numeric values are ignored.

    :param inputfile: The input GTF (handed over to ``GTF``).
    :param outputfile: The file the annotated GTF is written to.
    :param src_key: The name of the key holding the numeric values.
    :param dest_key: The name of the key to create.
    :param nb_levels: The number of classes to create (at least 2).
    :param percentiles: If True, use percentiles as breaks instead of \
    equal-width bins.
    :param percentiles_of_uniq: If True, compute the percentiles from the \
    set of unique values.
    :param precision: Number of decimals kept when building the interval \
    labels (0 produces integer bounds).
    :param log: If True, values are log2-transformed before binning. A \
    pseudocount of 1 is added when a zero is encountered.
    :param labels: A comma-separated string of class names (one per level). \
    If None, interval-like labels (e.g. ``[0.0_1.5]``, ``(1.5_3.0]``) are \
    generated.
    """

    if nb_levels < 2:
        message("--nb-levels has to be greater than or equal to 2.",
                type="ERROR")

    # -------------------------------------------------------------------------
    #
    # Check labels and nb_levels
    #
    # -------------------------------------------------------------------------

    if labels is not None:
        labels = labels.split(",")
        if len(labels) != nb_levels:
            message(
                "The number of labels should be the same as the number of levels.",
                type="ERROR")
        if len(labels) != len(set(labels)):
            message("Redundant labels not allowed.", type="ERROR")

    # -------------------------------------------------------------------------
    #
    # Load GTF. Retrieve values for src-key
    #
    # -------------------------------------------------------------------------

    gtf = GTF(inputfile, check_ensembl_format=False)
    src_values = gtf.extract_data(src_key, as_list=True)

    # '.' and '?' are the placeholders tested here for lines
    # lacking a usable value for the key.
    if all(x in ('.', '?') for x in src_values):
        message('The key was not found in this GTF.', type="ERROR")

    # Keep the numeric values together with their position in src_values
    # so that the computed class can be mapped back later on.
    dest_values = []
    dest_pos = []

    for pos, val in enumerate(src_values):
        try:
            num = float(val)
        except ValueError:
            continue
        dest_values.append(num)
        dest_pos.append(pos)

    min_val = min(dest_values, default=None)
    max_val = max(dest_values, default=None)

    if min_val is None:
        message("Did not find numeric values in the source key.", type="ERROR")
    if min_val == max_val:
        message(
            "The minimum and maximum values found in the source key are the same.",
            type="ERROR")

    if log:
        # The transformation has to be applied whenever it is requested,
        # the pseudocount being needed only when a zero is present.
        pseudo_count = 0
        if 0 in dest_values:
            message("Encountered zero values before log transformation.",
                    type="WARNING",
                    force=True)
            message("Adding a pseudocount (+1).", type="WARNING", force=True)
            pseudo_count = 1

        dest_values = list(np.log2([x + pseudo_count for x in dest_values]))

        # update max/min values
        max_val = max(dest_values)
        min_val = min(dest_values)

    # Lower the minimum so that the smallest value falls inside the first
    # bin when explicit breaks (percentiles) are used.
    # NOTE(review): pandas.cut documents an extension of 0.1% of the *range*
    # when bins is an int; max_val / 1000 is used here. Kept as is.
    min_val = min_val - max_val / 1000

    # -------------------------------------------------------------------------
    #
    # Compute percentiles if required
    #
    # -------------------------------------------------------------------------

    if percentiles:
        if percentiles_of_uniq:
            dest_values_tmp = [min_val] + list(set(dest_values))
        else:
            dest_values_tmp = [min_val] + dest_values
        n = nb_levels

        # The last break is requested explicitly as 100 so that rounding
        # errors in 100 / n * n can not produce an out of range percentile.
        q = [np.percentile(dest_values_tmp, 100 / n * i) for i in range(n)]
        q.append(np.percentile(dest_values_tmp, 100))

        if len(q) != len(set(q)):
            message("No ties are accepted in  percentiles.",
                    type="WARNING",
                    force=True)
            message("Breaks: " + str(q), type="WARNING", force=True)
            message("Try -u. Exiting", type="ERROR")

    # -------------------------------------------------------------------------
    #
    # Create a factor
    #
    # -------------------------------------------------------------------------

    (breaks, cat_label) = pandas.cut(dest_values,
                                     bins=q if percentiles else nb_levels,
                                     labels=labels,
                                     retbins=True)

    if labels is None:
        edges = list(cat_label)
        # The include_lowest argument of pandas is not working.
        # Using this workaround to avoid minimum value outside of data range.
        edges[0] = min(dest_values)

        # Convert to built-in numbers: the string form of numpy scalars
        # nested in a container depends on the numpy version.
        if precision == 0:
            edges = [int(round(x, precision)) for x in edges]
        else:
            edges = [float(round(x, precision)) for x in edges]

        # First class is closed on both sides ("[a_b]"), the
        # following ones are left-open ("(a_b]").
        cat_label = []
        for rank, (low, high) in enumerate(zip(edges[:-1], edges[1:])):
            opening = "[" if rank == 0 else "("
            cat_label.append("{}{}_{}]".format(opening, low, high))

        # Categorical.categories can not be assigned on recent pandas
        # versions. rename_categories returns the relabelled object.
        breaks = breaks.rename_categories(cat_label)

    message("Categories: " + str(list(breaks.categories)),
            type="INFO",
            force=True)

    # -------------------------------------------------------------------------
    #
    # Write to disk
    #
    # -------------------------------------------------------------------------

    tmp_file = make_tmp_file(prefix="discretized_keys", suffix=".txt")

    # One "position<TAB>class" line per numeric value.
    with tmp_file as tp_file:
        for p, v in zip(dest_pos, breaks):
            tp_file.write(str(p) + "\t" + str(v) + '\n')

    gtf.add_attr_to_pos(tmp_file, new_key=dest_key).write(outputfile,
                                                          gc_off=True)

    close_properly(outputfile, inputfile)
Beispiel #30
0
def overlapping(
        inputfile=None,
        outputfile=None,
        key_name=None,
        upstream=1500,
        downstream=1500,
        chrom_info=None,
        feature_type='transcript',
        same_strandedness=False,
        diff_strandedness=False,
        annotate_gtf=False,
        bool=False,
        annotate_all=False,
        invert_match=False):
    """
    Find transcripts whose body/TSS/TTS do or do not overlap with any
    transcript from another gene.

    The regions derived from transcripts (whole body, TSS or TTS according
    to ``feature_type``) are extended by ``upstream``/``downstream`` and
    intersected with transcript coordinates. Hits involving a transcript of
    the same gene are discarded. Depending on the arguments, matching
    transcripts are selected, excluded (``invert_match``) or the GTF is
    annotated with a new key (``annotate_gtf``).

    ``chrom_info`` must expose a ``name`` attribute (used as the ``g``
    argument of ``slop``).
    """

    # ----------------------------------------------------------------------
    # Prepare key names
    # ----------------------------------------------------------------------

    if annotate_gtf:
        if key_name is None:
            key_name = "_".join(("overlap",
                                 feature_type,
                                 "u" + str(upstream / 1000) + "k",
                                 "d" + str(downstream / 1000) + "k"))

        if invert_match:
            message("--annotate-gtf and --invert-match are "
                    "mutually exclusive.",
                    type="ERROR")

    if same_strandedness and diff_strandedness:
        message("--same-strandedness and --diff-strandedness are "
                "mutually exclusive.",
                type="ERROR")

    message("Using -u " + str(upstream))
    message("Using -d " + str(downstream))

    # Load the GTF so that it won't be lost
    # if GTF stream comes from stdin
    gtf = GTF(inputfile)

    message("Getting transcript in bed format")

    tx_feat = gtf.select_by_key("feature", "transcript")

    # Maps a transcript id to the list of overlapping transcripts
    # from other genes.
    if annotate_all:
        # Start from every transcript_id value found in the GTF.
        overlapping_tx = gtf.extract_data(keys=["transcript_id"],
                                          as_dict=True,
                                          default_val="0")
        for tx in overlapping_tx:
            overlapping_tx[tx] = []
    else:
        overlapping_tx = defaultdict(list)

    # ----------------------------------------------------------------------
    # Get transcript limits
    # ----------------------------------------------------------------------

    id_fields = ["transcript_id", "gene_id"]
    id_sep = "||"

    tx_bed = tx_feat.to_bed(name=id_fields, sep=id_sep)

    message("Getting " + feature_type + " and 'slopping'.")

    def _slop_and_trim(regions):
        # Extend the regions (strand-aware) and keep the first six columns.
        slopped = regions.slop(s=True,
                               l=upstream,
                               r=downstream,
                               g=chrom_info.name)
        return slopped.cut([0, 1, 2, 3, 4, 5])

    if feature_type == "transcript":
        bed_obj = _slop_and_trim(tx_bed)
    elif feature_type == "promoter":
        bed_obj = _slop_and_trim(tx_feat.get_tss(name=id_fields, sep=id_sep))
    elif feature_type == "tts":
        bed_obj = _slop_and_trim(tx_feat.get_tts(name=id_fields, sep=id_sep))
    else:
        message("Not implemented yet", type="ERROR")

    slopped_file = make_tmp_file(feature_type + "_slopped_region", ".bed")
    bed_obj.saveas(slopped_file.name)

    overlap_regions = bed_obj.intersect(tx_bed,
                                        wb=True,
                                        s=same_strandedness,
                                        S=diff_strandedness)

    overlap_file = make_tmp_file(feature_type + "_overlapping_regions", ".bed")
    overlap_regions.saveas(overlap_file.name)

    # Field 3 is the name of the slopped region, field 9 the name of the
    # transcript it intersects; both are "transcript_id||gene_id".
    for hit in overlap_regions:
        fields = hit.fields
        tx_other, gn_other = fields[9].split(id_sep)
        tx_id, gene_id = fields[3].split(id_sep)
        if gene_id == gn_other:
            continue
        overlapping_tx[tx_id].append(tx_other)

    if bool:
        # Replace the lists by a "0"/"1" flag.
        for tx in overlapping_tx:
            overlapping_tx[tx] = "1" if len(overlapping_tx[tx]) else "0"

    # ----------------------------------------------------------------------
    # Write output
    # ----------------------------------------------------------------------

    if invert_match:
        tx_ids = ",".join(set(overlapping_tx.keys()))
        selected = gtf.select_by_key("transcript_id", tx_ids, invert_match)
        selected.write(outputfile, gc_off=True)
    elif annotate_gtf:
        if len(overlapping_tx) > 0:
            gtf = gtf.add_attr_from_dict(feat="transcript",
                                         key="transcript_id",
                                         a_dict=overlapping_tx,
                                         new_key=key_name)
        gtf.write(outputfile, gc_off=True)
    else:
        tx_ids = ",".join(set(overlapping_tx.keys()))
        selected = gtf.select_by_key("transcript_id", tx_ids)
        selected.write(outputfile, gc_off=True)

    gc.disable()
    close_properly(outputfile, inputfile)