Exemple #1
0
def count(inputfile=None, outputfile=None, header=None, additional_text=None):
    """
    Count how many times each feature type occurs in the gtf file.

    One tab-separated line (feature, count and, when provided,
    additional_text) is written to outputfile per feature type, in order of
    first appearance. When header is provided (a comma-separated string), it
    is written first as a tab-separated line.
    """

    column_names = None if header is None else header.split(",")

    gtf = GTF(inputfile, check_ensembl_format=False)

    tally = OrderedDict()

    for record in gtf.extract_data("feature"):
        feature = record[0]
        tally[feature] = tally.get(feature, 0) + 1

    if column_names is not None:
        outputfile.write("\t".join(column_names) + "\n")

    for feature, nb in tally.items():
        fields = [feature, str(nb)]
        if additional_text is not None:
            # Optional trailing column repeated on every line.
            fields.append(additional_text)
        outputfile.write("\t".join(fields) + "\n")

    gc.disable()
    close_properly(outputfile, inputfile)
Exemple #2
0
def intronic(inputfile=None,
             outputfile=None,
             names='transcript_id',
             separator="_",
             intron_nb_in_name=False,
             no_feature_name=False,
             by_transcript=False):
    """
    Extract intronic regions and write them to outputfile, one per line.

    When by_transcript is set, introns are requested per transcript and the
    naming options (names, separator, intron_nb_in_name, no_feature_name)
    are forwarded to GTF.get_introns(); otherwise get_introns() is called
    with its defaults and the naming options are ignored.
    """

    message("Searching for intronic regions.")

    # Need to load if the gtf comes from <stdin>
    gtf = GTF(inputfile, check_ensembl_format=False)

    if by_transcript:
        introns_bo = gtf.get_introns(by_transcript=True,
                                     name=names.split(","),
                                     sep=separator,
                                     intron_nb_in_name=intron_nb_in_name,
                                     feat_name=not no_feature_name)
    else:
        introns_bo = gtf.get_introns()

    for intron in introns_bo:
        write_properly(chomp(str(intron)), outputfile)

    gc.disable()
    close_properly(outputfile, inputfile)
Exemple #3
0
def get_attr_value_list(inputfile=None,
                        outputfile=None,
                        key_name="gene_id",
                        print_key_name=False,
                        separator="\n",
                        count=False):
    """
    Write the values observed for one or several attributes.

    key_name is a comma-separated list of attribute names. Without count,
    one value is written per line (prefixed by the key name and separator
    when print_key_name is set). With count, each line holds the two items
    returned by GTF.get_attr_value_list(..., count=True) joined by
    separator; a newline separator is replaced by a tab in that mode.
    """

    gtf = GTF(inputfile, check_ensembl_format=False)

    if count:
        if separator == "\n":
            separator = "\t"

        for akey in key_name.split(","):
            for entry in gtf.get_attr_value_list(akey, count=True):
                fields = [entry[0], entry[1]]
                if print_key_name:
                    fields.insert(0, akey)
                outputfile.write(separator.join(fields) + "\n")
    else:
        for akey in key_name.split(","):
            for value in gtf.get_attr_value_list(akey):
                if print_key_name:
                    outputfile.write(akey + separator + value + "\n")
                else:
                    outputfile.write(value + "\n")

    gc.disable()
    close_properly(outputfile, inputfile)
Exemple #4
0
def convert(inputfile=None,
            outputfile=None,
            format="bed",
            names="gene_id,transcript_id",
            separator="|",
            more_names=''):
    """
    Convert a GTF to another format.

    "bed3" writes seqid, start - 1 and end as tab-separated columns.
    "bed" and "bed6" delegate to GTF.write_bed() with the naming options.
    Any other format value writes nothing.
    """

    if format == "bed3":
        gtf = GTF(inputfile, check_ensembl_format=False)
        rows = gtf.extract_data("seqid,start,end",
                                as_list_of_list=True,
                                hide_undef=False,
                                no_na=False)

        for row in rows:
            # Shift the start coordinate down by one for the BED output.
            row[1] = str(int(row[1]) - 1)
            outputfile.write("\t".join(row) + "\n")

    elif format in ("bed", "bed6"):
        GTF(inputfile,
            check_ensembl_format=False).write_bed(outputfile=outputfile,
                                                  name=names,
                                                  sep=separator,
                                                  more_name=more_names)

    gc.disable()
    close_properly(outputfile, inputfile)
Exemple #5
0
def midpoints(inputfile=None,
              outputfile=None,
              ft_type="transcript",
              names="transcript_id",
              separator="|"):
    """
    Get the midpoint coordinates for the requested feature.

    The input is treated as a GTF when it is named '<stdin>' or when
    BedTool reports its file type as 'gff'. In that case the features of
    type ft_type are selected and GTF.get_midpoints() is called with the
    names (comma-separated keys) and separator naming options. Otherwise
    each region of the input is shrunk in place to its midpoint and written
    to outputfile; ft_type, names and separator are unused in that branch.
    """

    message("Loading input file...")
    if inputfile.name == '<stdin>':
        # Input coming from <stdin> is not inspected: it is taken as GTF.
        is_gtf = True
    else:
        region_bo = BedTool(inputfile.name)
        if len(region_bo) == 0:
            # NOTE(review): presumably message(type="ERROR") aborts the
            # program -- confirm, otherwise execution continues below.
            message("Unable to find requested regions", type="ERROR")

        if region_bo.file_type == 'gff':
            is_gtf = True
        else:
            is_gtf = False

    if is_gtf:

        # NOTE(review): the file name (possibly '<stdin>') is passed here,
        # not the file object -- confirm GTF() handles that name.
        gtf = GTF(inputfile.name, check_ensembl_format=False)

        bed_obj = gtf.select_by_key("feature", ft_type).get_midpoints(
            name=names.split(","), sep=separator)
        for line in bed_obj:
            write_properly(chomp(str(line)), outputfile)

    else:
        for line in region_bo:

            diff = line.end - line.start

            if diff % 2 != 0:
                # Odd length: there is a single central base.
                # e.g. 10-13 (zero based) -> 11-13 one based
                # midpoint is 12 (one-based) -> 11-12 (zero based)
                # The new end is computed first because the new start is
                # derived from it.
                line.end = line.start + int(diff // 2) + 1
                line.start = line.end - 1
            else:
                # Even length: no real center, keep the two central bases.
                # e.g. 10-14 (zero based) -> 11-14 one based
                # midpoint is 12-13 (one-based) -> 11-13 (zero based)
                # The new start is computed first because the new end is
                # derived from it.

                line.start = line.start + int(diff // 2) - 1
                line.end = line.start + 2

            outputfile.write(str(line))

    gc.disable()
    close_properly(outputfile, inputfile)
Exemple #6
0
def del_attr(inputfile=None,
             outputfile=None,
             key="transcript_id",
             reg_exp=False,
             invert_match=False):
    """
    Delete extended attributes in the target gtf file.

    key is a comma-separated list of attribute names or, when reg_exp is
    set, a regular expression searched in each extended attribute name of
    the GTF. When invert_match is set, every extended attribute that is NOT
    selected by key is deleted instead of the selected ones.
    """

    # ----------------------------------------------------------------------
    # Read the GTF and get the list of attributes
    # ----------------------------------------------------------------------

    gtf = GTF(inputfile, check_ensembl_format=False)

    attr_list = gtf.attr_extended

    # ----------------------------------------------------------------------
    # If regExp, select the corresponding keys
    # ----------------------------------------------------------------------

    if reg_exp:
        try:
            rgxp = re.compile(key)
        except (re.error, TypeError):
            message("Check the regular expression please.", type="ERROR")
            # Only reached if message() returns: without a compiled pattern
            # nothing sensible can be done, so propagate the original error.
            raise

        key_list = [attr for attr in attr_list if rgxp.search(attr)]
    else:
        key_list = key.split(",")

    # ----------------------------------------------------------------------
    # If invert-match select all but the selected
    # ----------------------------------------------------------------------

    if invert_match:
        key_to_del = [attr for attr in attr_list if attr not in key_list]
    else:
        key_to_del = key_list

    # ----------------------------------------------------------------------
    # Delete the keys
    # ----------------------------------------------------------------------

    # key_to_del (not key_list) must be used so that invert_match is honoured.
    gtf.del_attr(feat="*", keys=",".join(key_to_del),
                 force=True).write(outputfile, gc_off=True)

    close_properly(outputfile, inputfile)
Exemple #7
0
def select_by_max_exon_nb(inputfile=None, outputfile=None):
    """
    Select, for each gene, the transcript with the highest number of exons
    and write the resulting GTF to outputfile.
    """

    message("Selecting transcript with the highest number of exon "
            "for each gene.")

    selected = GTF(inputfile,
                   check_ensembl_format=False).select_by_max_exon_nb()
    selected.write(outputfile, gc_off=True)
def get_feature_list(inputfile=None, outputfile=None, separator=""):
    """
    Write the features enclosed in the GTF (as returned by
    GTF.get_feature_list(nr=True)), each followed by separator.
    """

    gtf = GTF(inputfile, check_ensembl_format=False)
    features = gtf.get_feature_list(nr=True)

    for feature in features:
        outputfile.write(str(feature) + separator)

    gc.disable()
    close_properly(outputfile, inputfile)
def add_prefix(inputfile=None,
               outputfile=None,
               key="transcript_id",
               text=None,
               target_feature="*",
               suffix=False):
    """
    Add a prefix to target values and write the resulting GTF.

    target_feature, key, text and suffix are forwarded, in that order, to
    GTF.add_prefix().
    """

    gtf = GTF(inputfile, check_ensembl_format=False)

    modified = gtf.add_prefix(target_feature, key, text, suffix)
    modified.write(outputfile, gc_off=True)

    close_properly(outputfile, inputfile)
Exemple #10
0
def convert_ensembl(inputfile=None, outputfile=None, no_check_gene_chr=False):
    """
    Convert the GTF file to ensembl format and write it to outputfile.

    The gene chromosome check of GTF.convert_to_ensembl() is enabled unless
    no_check_gene_chr is set.
    """

    gtf = GTF(inputfile, check_ensembl_format=False)
    converted = gtf.convert_to_ensembl(check_gene_chr=not no_check_gene_chr)
    converted.write(outputfile, gc_off=True)

    close_properly(outputfile, inputfile)
Exemple #11
0
def select_by_nb_exon(inputfile=None,
                      outputfile=None,
                      min_exon_number=None,
                      max_exon_number=None):
    """
    Select transcripts whose number of exons lies in the requested range
    (min_exon_number and max_exon_number are forwarded to
    GTF.select_by_number_of_exons()) and write the resulting GTF.
    """

    message("Selecting transcript by exon number (range: [{m},{M}])".format(
        m=str(min_exon_number), M=str(max_exon_number)))

    loaded = GTF(inputfile, check_ensembl_format=False)
    gtf = loaded.select_by_number_of_exons(min_exon_number, max_exon_number)

    gtf.write(outputfile, gc_off=True)

    close_properly(outputfile, inputfile)
Exemple #12
0
def select_by_go(inputfile=None,
                 outputfile=None,
                 go_id=None,
                 https_proxy=None,
                 http_proxy=None,
                 list_datasets=None,
                 species=None,
                 invert_match=False):
    """
    Select lines from a GTF file using a Gene Ontology ID (e.g GO:0050789).

    The "GO:" prefix is added to go_id when missing. With list_datasets,
    the available Biomart datasets are written (without their
    "_gene_ensembl" suffix) and the program exits. Otherwise Biomart is
    queried for species and go_id, and each non-empty line of the response
    is used as a gene_id to select (or exclude, with invert_match) in the
    GTF.
    """

    if not go_id.startswith("GO:"):
        go_id = "GO:" + go_id

    is_associated = OrderedDict()

    bm = Biomart(http_proxy=http_proxy,
                 https_proxy=https_proxy)

    bm.get_datasets('ENSEMBL_MART_ENSEMBL')

    if list_datasets:
        for dataset in sorted(bm.datasets):
            write_properly(dataset.replace("_gene_ensembl", ""), outputfile)
        sys.exit()

    if species + "_gene_ensembl" not in bm.datasets:
        message("Unknow dataset/species.", type="ERROR")

    bm.query({'query': XML.format(species=species, go=go_id)})

    for line in bm.response.content.decode().split("\n"):
        if line != '':
            is_associated[line] = 1

    gtf = GTF(inputfile)

    gtf_associated = gtf.select_by_key("gene_id",
                                       ",".join(is_associated),
                                       invert_match)

    gtf_associated.write(outputfile, gc_off=True)
Exemple #13
0
def seqid_list(inputfile=None, outputfile=None, separator=""):
    """
    Write the seqid/chromosomes of the GTF (as returned by
    GTF.get_chroms(nr=True)), each followed by separator.
    """

    gtf = GTF(inputfile, check_ensembl_format=False)
    chroms = gtf.get_chroms(nr=True)

    for chrom in chroms:
        outputfile.write(str(chrom) + separator)

    gc.disable()
    close_properly(outputfile, inputfile)
Exemple #14
0
def add_exon_nb(inputfile=None, outputfile=None, exon_numbering_key=None):
    """
    Add the exon number to each exon (based on 5' to 3' orientation).

    The GTF is loaded from inputfile.name; exon_numbering_key is forwarded
    to GTF.add_exon_number() and the result is written to outputfile.
    """

    message("Calling nb_exons.", type="DEBUG")

    gtf = GTF(inputfile.name, check_ensembl_format=False)
    numbered = gtf.add_exon_number(exon_numbering_key)
    numbered.write(outputfile, gc_off=True)

    close_properly(inputfile, outputfile)
Exemple #15
0
def random_tx(inputfile=None,
              outputfile=None,
              max_transcript=None,
              seed_value=None):
    """
    Select randomly up to max_transcript transcripts for each gene.

    Gene lines are discarded when loading. For each gene, a random sample
    of transcript positions is kept; all other transcripts are removed from
    the GTF written to outputfile. When seed_value is provided, the random
    generator is seeded with it (version=1).
    """

    message("loading the GTF.")

    gtf = GTF(inputfile).select_by_key("feature", "gene", invert_match=True)

    message("Getting gene_id and transcript_id")

    gene2tx = gtf.extract_data("gene_id,transcript_id",
                               as_dict_of_merged_list=True,
                               no_na=True,
                               nr=True)

    message("Selecting random transcript")

    if seed_value is not None:
        random.seed(seed_value, version=1)

    tx_to_delete = []

    for candidates in gene2tx.values():
        nb_tx = len(candidates)
        nb_to_keep = min(max_transcript, nb_tx)
        kept_positions = set(random.sample(list(range(nb_tx)), nb_to_keep))
        tx_to_delete.extend(tx for pos, tx in enumerate(candidates)
                            if pos not in kept_positions)

    message("Printing results")

    message("Selecting transcript.")
    remaining = gtf.select_by_key("transcript_id",
                                  ",".join(tx_to_delete),
                                  invert_match=True)
    remaining.write(outputfile, gc_off=True)

    close_properly(outputfile, inputfile)
Exemple #16
0
def intergenic(inputfile=None, outputfile=None, chrom_info=None):
    """
    Extract intergenic regions.

    The regions returned by GTF.get_intergenic(chrom_info) are named
    "region_1", "region_2", ... in iteration order and written to
    outputfile, one per line.
    """

    message("Searching for intergenic regions.")

    gtf = GTF(inputfile)

    intergenic_regions = gtf.get_intergenic(chrom_info)

    for rank, region in enumerate(intergenic_regions, start=1):
        region.name = "region_" + str(rank)
        write_properly(chomp(str(region)), outputfile)

    gc.disable()
    close_properly(outputfile, inputfile)
Exemple #17
0
def get_attr_list(
        inputfile=None,
        outputfile=None,
        separator="\n"):
    """
    Write the list of attributes of a GTF file.

    Attributes are separated by separator; nothing is written after the
    last one.
    """

    gtf = GTF(inputfile, check_ensembl_format=False)
    attr_list = gtf.get_attr_list()
    last_pos = len(attr_list) - 1

    for pos, attr in enumerate(attr_list):
        if pos == last_pos:
            outputfile.write(attr)
        else:
            outputfile.write(attr + separator)

    gc.disable()
    close_properly(outputfile, inputfile)
Exemple #18
0
def select_by_numeric_value(inputfile=None,
                            outputfile=None,
                            test=None,
                            na_omit=None):
    """
    Select lines from a GTF file based on a boolean test on numeric values.

    test and na_omit are forwarded to GTF.eval_numeric(); the result is
    written to outputfile.
    """

    gtf = GTF(inputfile, check_ensembl_format=False)
    selected = gtf.eval_numeric(test, na_omit=na_omit)
    selected.write(outputfile, gc_off=True)
    close_properly(outputfile, inputfile)
Exemple #19
0
def count_key_values(inputfile=None,
                     outputfile=None,
                     keys="gene_id,transcript_id",
                     uniq=True,
                     additional_text=None):
    """
    Count the number of values for a set of keys.

    keys is a comma-separated list of keys, or "*" for every attribute
    returned by GTF.get_attr_list(). Values equal to '.' or '?' are
    skipped. With uniq, distinct values are counted; otherwise every
    occurrence is counted. One tab-separated line (key, count and, when
    provided, additional_text) is written per key.
    """

    gtf = GTF(inputfile, check_ensembl_format=False)

    val_list = defaultdict(set) if uniq else defaultdict(list)

    if keys == "*":
        key_list = gtf.get_attr_list()
        keys = ",".join(key_list)
    else:
        key_list = keys.split(",")

    for row in gtf.extract_data(keys, as_list_of_list=True):
        for k, v in zip(key_list, row):
            if v in ('.', '?'):
                continue
            if uniq:
                val_list[k].add(v)
            else:
                val_list[k].append(v)

    for k in key_list:
        fields = [k, str(len(val_list[k]))]
        if additional_text is not None:
            fields.append(additional_text)
        outputfile.write("\t".join(fields) + "\n")

    gc.disable()
    close_properly(outputfile, inputfile)
Exemple #20
0
def exon_sizes(inputfile=None, outputfile=None, key_name=None):
    """
    Add a new key to transcript features containing a comma-separated list
    of exon sizes.

    Each size is computed as end - start + 1. The list is reversed for
    transcripts whose strand is "-". The new attribute is named key_name.
    """

    gtf = GTF(inputfile)

    all_tx_ids = gtf.get_tx_ids(nr=True)

    exons_starts = gtf.select_by_key("feature", "exon").extract_data(
        "transcript_id,start",
        as_dict_of_merged_list=True,
        no_na=True,
        nr=False)

    if len(exons_starts) == 0:
        message("No exon found.", type="ERROR")

    exons_ends = gtf.select_by_key("feature", "exon").extract_data(
        "transcript_id,end",
        as_dict_of_merged_list=True,
        no_na=True,
        nr=False)

    strands = gtf.select_by_key("feature", "transcript").extract_data(
        "transcript_id,strand",
        as_dict_of_values=True,
        no_na=True,
        nr=True,
        hide_undef=True)

    tx_to_size_list = {}

    for tx_id in all_tx_ids:
        sizes = [str(int(end) - int(start) + 1)
                 for start, end in zip(exons_starts[tx_id],
                                       exons_ends[tx_id])]
        if strands[tx_id] == "-":
            sizes.reverse()
        tx_to_size_list[tx_id] = ",".join(sizes)

    if tx_to_size_list:
        gtf = gtf.add_attr_from_dict(feat="transcript",
                                     key="transcript_id",
                                     a_dict=tx_to_size_list,
                                     new_key=key_name)
    gtf.write(outputfile, gc_off=True)
    close_properly(outputfile, inputfile)
def select_by_regexp(inputfile=None,
                     outputfile=None,
                     key=None,
                     regexp=None,
                     invert_match=False):
    """
    Select lines from a GTF file based on attributes and associated values.

    key, regexp and invert_match are forwarded, in that order, to
    GTF.select_by_regexp(); the result is written to outputfile.
    """

    gtf = GTF(inputfile, check_ensembl_format=False)
    selected = gtf.select_by_regexp(key, regexp, invert_match)
    selected.write(outputfile, gc_off=True)

    close_properly(outputfile, inputfile)
def nb_transcripts(inputfile=None,
                   outputfile=None,
                   text_format=False,
                   key_name=""):
    """
    Compute the number of transcripts per gene.

    With text_format, one "gene<TAB>count" line is written per gene to
    outputfile. Otherwise the counts are written to a temporary file that
    is used to add the key_name attribute to gene features, and the
    resulting GTF is written to outputfile.
    """

    gtf = GTF(inputfile)

    message("Computing the number of transcript per gene in input GTF file.")

    # Computation of transcript number is performed on exon lines
    # Just in case some transcript lines would be lacking (but they should
    # not...)

    n_tx = gtf.get_gn_to_tx()

    if text_format:
        for gene_id in n_tx:
            outputfile.write(gene_id + "\t" + str(len(n_tx[gene_id])) + "\n")
    else:
        tmp_file = make_tmp_file(prefix="nb_tx", suffix=".txt")

        for gene_id in n_tx:
            tmp_file.write(gene_id + "\t" + str(len(n_tx[gene_id])) + "\n")

        # Close the file before it is read back through its name.
        tmp_file.close()

        annotated = gtf.add_attr_from_file(feat="gene",
                                           key="gene_id",
                                           new_key=key_name,
                                           inputfile=tmp_file.name)
        annotated.write(outputfile, gc_off=True)

    close_properly(outputfile, inputfile)
Exemple #23
0
def intron_sizes(
        inputfile=None,
        outputfile=None,
        key_name=None):
    """
    Add a new key to transcript features containing a comma-separated list
    of intron sizes.

    Each size is computed as end - start of the intron interval. The list
    is reversed for transcripts whose strand is "-"; transcripts without
    intron get "0". The new attribute is named key_name.
    """

    gtf = GTF(inputfile, check_ensembl_format=False)

    all_tx_ids = gtf.get_tx_ids(nr=True)
    intron_bo = gtf.get_introns(by_transcript=True,
                                name=["transcript_id"],
                                intron_nb_in_name=False,
                                feat_name=False)

    transcripts = gtf.select_by_key("feature", "transcript")
    strands = transcripts.extract_data("transcript_id,strand",
                                       as_dict_of_values=True,
                                       no_na=True,
                                       nr=True,
                                       hide_undef=True)

    intron_size = {tx: [] for tx in all_tx_ids}

    for bed_line in intron_bo:
        intron_size[bed_line.name].append(str(bed_line.end - bed_line.start))

    # Replace each list of sizes by its final string representation.
    for tx_id, sizes in intron_size.items():
        if not sizes:
            intron_size[tx_id] = "0"
            continue
        if strands[tx_id] == "-":
            sizes.reverse()
        intron_size[tx_id] = ",".join(sizes)

    if intron_size:
        gtf = gtf.add_attr_from_dict(feat="transcript",
                                     key="transcript_id",
                                     a_dict=intron_size,
                                     new_key=key_name)
    gtf.write(outputfile, gc_off=True)
    close_properly(outputfile, inputfile)
Exemple #24
0
def merge_attr(inputfile=None,
               outputfile=None,
               src_key="gene_id,transcript_id",
               separator="|",
               target_feature="*",
               dest_key="gene_tx_ids"):
    """
    Merge a set of attributes into a destination attribute.

    target_feature, src_key, dest_key and separator are forwarded, in that
    order, to GTF.merge_attr(); the result is written to outputfile.
    """

    gtf = GTF(inputfile, check_ensembl_format=False)
    merged = gtf.merge_attr(target_feature, src_key, dest_key, separator)
    merged.write(outputfile, gc_off=True)

    close_properly(outputfile, inputfile)
def select_by_tx_size(inputfile=None,
                      outputfile=None,
                      min_size=None,
                      max_size=None):
    """
    Select transcripts by size.

    min_size and max_size are forwarded to GTF.select_by_transcript_size();
    the result is written to outputfile.
    """

    template = "Selecting 'mature/spliced transcript by size (range: [{m},{M}])."
    message(template.format(m=str(min_size), M=str(max_size)))

    selected = GTF(inputfile).select_by_transcript_size(min_size, max_size)
    selected.write(outputfile, gc_off=True)
def select_most_5p_tx(inputfile=None, outputfile=None, keep_gene_lines=False):
    """
    Select the most 5' transcript of each gene.

    Unless keep_gene_lines is set, the result is additionally passed through
    select_by_key("feature", "gene", 1) before being written to outputfile.
    """

    message("Selecting the most 5' transcript of each gene.")

    gtf = GTF(inputfile).select_5p_transcript()

    if not keep_gene_lines:
        gtf = gtf.select_by_key("feature", "gene", 1)

    gtf.write(outputfile, gc_off=True)
Exemple #27
0
def short_long(inputfile=None,
               outputfile=None,
               longs=None,
               keep_gene_lines=False):
    """
    Select the shortest transcript for each gene, or the longest one when
    longs is set.

    Unless keep_gene_lines is set, the result is additionally passed through
    select_by_key("feature", "gene", 1) before being written to outputfile.
    """

    loaded = GTF(inputfile, check_ensembl_format=False)

    gtf = (loaded.select_longuest_transcripts() if longs
           else loaded.select_shortest_transcripts())

    if not keep_gene_lines:
        gtf = gtf.select_by_key("feature", "gene", 1)

    gtf.write(outputfile, gc_off=True)
Exemple #28
0
def random_list(inputfile=None,
                outputfile=None,
                number=None,
                ft_type=None,
                seed_value=None):
    """
    Select a random list of genes or transcripts.

    IDs are gene IDs when ft_type is 'gene' and transcript IDs otherwise.
    number IDs are sampled (capped, with a warning, to the number of
    available IDs) and the lines matching them on the key
    ft_type + "_id" are written to outputfile. When seed_value is provided,
    the random generator is seeded with it (version=1).
    """

    message("loading the GTF.")

    gtf = GTF(inputfile)

    message("Getting ID list.")

    id_key = "gene_id" if ft_type == 'gene' else "transcript_id"
    id_list = gtf.extract_data(id_key,
                               as_list=True,
                               nr=True,
                               hide_undef=True,
                               no_na=True)

    available = len(id_list)
    if number > available:
        message("To much feature. Using : " + str(available),
                type="WARNING")
        number = available

    if seed_value is not None:
        random.seed(seed_value, version=1)

    id_list = random.sample(id_list, number)

    message("Printing.")

    my_id = ft_type + "_id"

    selected = gtf.select_by_key(my_id, ",".join(id_list))
    selected.write(outputfile, gc_off=True)

    close_properly(outputfile, inputfile)
def join_multi_file(inputfile=None,
                    outputfile=None,
                    target_feature=None,
                    key_to_join=None,
                    matrix_files=()):
    """
    Join attributes from a set of tabulated files.

    target_feature is a comma-separated list of features; each must be a
    feature found in the GTF or "*". When it is None, all features found in
    the GTF are targeted. Each file of matrix_files is joined in turn (by
    name) on key_to_join, and the resulting GTF is written to outputfile.
    """

    # -----------------------------------------------------------
    #  load the GTF
    # -----------------------------------------------------------

    gtf = GTF(inputfile, check_ensembl_format=False)

    # -----------------------------------------------------------
    #  Check target feature
    # -----------------------------------------------------------

    feat_list = gtf.get_feature_list(nr=True)

    if target_feature is None:
        target_feature = ",".join(feat_list)
    else:
        allowed = feat_list + ["*"]

        for feature in target_feature.split(","):
            if feature not in allowed:
                message("Feature " + feature + " not found.", type="ERROR")

    # -----------------------------------------------------------
    #  Do it
    # -----------------------------------------------------------

    for join_file in matrix_files:
        gtf = gtf.add_attr_from_matrix_file(feat=target_feature,
                                            key=key_to_join,
                                            inputfile=join_file.name)
    gtf.write(outputfile, gc_off=True)

    gc.disable()
    close_properly(outputfile, inputfile)
Exemple #30
0
def nb_exons(inputfile=None,
             outputfile=None,
             key_name=None,
             text_format=False):
    """
    Count the number of exons of each transcript in the gtf file.

    With text_format, one "transcript_id<TAB>count<TAB>transcript" line is
    written per transcript. Otherwise the counts are added to transcript
    features as the key_name attribute and the resulting GTF is written to
    outputfile.
    """

    gtf = GTF(inputfile)
    n_exons = defaultdict(int)

    # -------------------------------------------------------------------------
    # Computing number of  exon for each transcript in input GTF file
    #
    # -------------------------------------------------------------------------

    message("Computing number of exons for each transcript in input GTF file.")

    exon = gtf.select_by_key("feature", "exon")

    for record in exon.extract_data("transcript_id"):
        n_exons[record[0]] += 1

    if text_format:
        for tx_id, nb in n_exons.items():
            outputfile.write(tx_id + "\t" + str(nb) + "\ttranscript\n")
    else:
        if n_exons:
            gtf = gtf.add_attr_from_dict(feat="transcript",
                                         key="transcript_id",
                                         a_dict=n_exons,
                                         new_key=key_name)
        gtf.write(outputfile, gc_off=True)

    close_properly(outputfile, inputfile)