def select_by_key(inputfile=None,
                  outputfile=None,
                  key=None,
                  value=None,
                  invert_match=False,
                  file_with_values=None,
                  col=0,
                  select_transcripts=False,
                  select_genes=False,
                  select_exons=False,
                  select_cds=False,
                  select_start_codon=False,
                  bed_format=False,
                  log=False,
                  separator="|",
                  names="transcript_id"):
    """Select lines from a GTF file based on attributes and
    associated values.

    The selection criterion is resolved in this order:

    1. One of the ``select_*`` shortcuts (transcripts, CDS, start codon,
       genes, exons - first one set wins), which forces ``key="feature"``
       and the matching feature name as ``value``.
    2. ``key`` together with ``value`` (a comma-separated string).
    3. ``key`` together with ``file_with_values`` (an open, tab-separated
       text file) and ``col``.

    :param inputfile: Input passed to ``GTF(...)``.
    :param outputfile: Output handle; receives GTF lines, or BED-like lines
        when ``bed_format`` is set.
    :param key: Name of the key to test.
    :param value: Comma-separated value(s) to select.
    :param invert_match: Forwarded to ``gtf.select_by_key``.
    :param file_with_values: Open file whose column ``col`` holds the values.
    :param col: Column of ``file_with_values`` to use. The values collected
        here for the log are read from index ``col - 1`` of each
        tab-split line (so ``0`` picks the last column).
    :param bed_format: Write ``seqid, start, end, name, score, strand``
        lines instead of GTF; ``name`` joins the ``names`` keys with
        ``separator``.
    :param log: Emit messages about the number of features before
        selection, the fraction kept, and the requested values that do not
        exist for ``key`` in the input.
    :param separator: Separator used to join ``names`` in BED output.
    :param names: Comma-separated keys used to build the BED name column.

    Argument errors are reported through ``message(..., type="ERROR")``.
    """

    # ----------------------------------------------------------------------
    # Check mode
    # ----------------------------------------------------------------------

    # Shortcut flags, listed in the priority order in which they are tested.
    shortcuts = (
        (select_transcripts, "transcript"),
        (select_cds, "CDS"),
        (select_start_codon, "start_codon"),
        (select_genes, "gene"),
        (select_exons, "exon"),
    )
    shortcut_feature = next((feat for flag, feat in shortcuts if flag), None)

    if shortcut_feature is not None:
        key = "feature"
        value = shortcut_feature

    elif file_with_values is None:
        if key is None or value is None:
            message(
                "Key and value are mandatory. Alternatively use -e/t/g/f or -f with -k.",
                type="ERROR")

    else:
        if key is None:
            message("Please set -k.", type="ERROR")
        if value is not None:
            message("The -f and -v arguments are mutually exclusive.",
                    type="ERROR")

    # ----------------------------------------------------------------------
    # Load file with value
    # ----------------------------------------------------------------------

    gtf = GTF(inputfile, check_ensembl_format=False)
    # Values present in the input for this key (before any selection).
    all_values = gtf.extract_data(key, as_list=True, no_na=True, nr=True)

    if log:
        feat_before = len(gtf)

    if not file_with_values:
        value_list = value.split(",")
        gtf = gtf.select_by_key(key, value, invert_match)
    else:
        # Strip the end-of-line before splitting: otherwise a value taken
        # from the last column keeps its "\n" and can never match.
        value_list = [
            line.rstrip("\n").split("\t")[col - 1]
            for line in file_with_values
        ]

        # The handle has been consumed above; hand a fresh one to the
        # selection step.
        # NOTE(review): this reopened handle is not closed here - confirm
        # whether gtf.select_by_key() takes care of it.
        file_with_values.close()
        file_with_values = open(file_with_values.name)

        gtf = gtf.select_by_key(key=key,
                                invert_match=invert_match,
                                file_with_values=file_with_values,
                                col=col)

    if log:

        # Requested values absent from the input, de-duplicated and kept in
        # the order they were requested so the message is reproducible.
        known_values = set(all_values)
        not_found = [
            v for v in dict.fromkeys(value_list) if v not in known_values
        ]

        feat_after = len(gtf)
        # An empty input has nothing to select from: report 0% rather than
        # dividing by zero.
        pct = feat_after / feat_before * 100 if feat_before else 0.0

        message("Number of features before selection: %d" % feat_before)
        message("Fraction of feature selected: %.2f%%" % pct)

        # Show at most 50 characters of the missing values.
        nfj = ",".join(not_found)
        etc = "..." if len(nfj) > 50 else ""
        message("Values not found: [" + nfj[:50] + etc + "].")

    # ----------------------------------------------------------------------
    # Write GTF file
    # ----------------------------------------------------------------------

    if not bed_format:

        gtf.write(outputfile, gc_off=True)

    else:
        nb_tokens = len(names.split(","))
        keys = "seqid,start,end," + names + ",score,strand"
        nb_fields = len(keys.split(","))

        for rec in gtf.extract_data_iter_list(keys, zero_based=True):
            # Columns 3 .. 3 + nb_tokens hold the requested names; they are
            # merged into the single BED name column.
            bed_line = [
                rec[0],
                rec[1],
                rec[2],
                separator.join(rec[3:(3 + nb_tokens)]),
                rec[nb_fields - 2],
                rec[nb_fields - 1],
            ]
            outputfile.write("\t".join(bed_line) + "\n")

    close_properly(outputfile, inputfile)
# Example #2
# 0
def splicing_site(inputfile=None,
                  outputfile=None,
                  exon_numbering_key=False,
                  names="exon_id,transcript_id,gene_id",
                  separator="\t"):
    """
    Compute the locations of splice donor and acceptor sites. You may extend
    them in 3' and 5' depending on your needs.

    For every exon line, up to two tab-separated records are written to
    ``outputfile``: ``seqid, start, end, name, exon number, strand``.
    ``name`` is the site type (``donor`` or ``acceptor``) followed by the
    values of the ``names`` keys, joined with ``separator``.

    * The site flanking the exon end is reported as ``(end, end + 1)``.
    * The site flanking the exon start is reported as
      ``(start - 2, start - 1)``.

    The first exon of a transcript (number 1) has no acceptor and the last
    one (number equal to the transcript exon count) has no donor.

    :param inputfile: Input passed to ``GTF(...)``.
    :param outputfile: Writable handle receiving the records.
    :param exon_numbering_key: Name of the key holding the exon number.
    :param names: Comma-separated keys appended to the name column.
    :param separator: Separator used inside the name column.
    """

    if not exon_numbering_key:
        # The declared default (False) cannot be concatenated into the key
        # list below and used to raise TypeError.
        # NOTE(review): "exon_nbr" is assumed to be the key written by
        # add_exon_nb - confirm against the command-line default.
        exon_numbering_key = "exon_nbr"

    gtf = GTF(inputfile)

    # Indexed below by transcript_id to get the exon count of a transcript.
    nb_exons = gtf.nb_exons()

    info = "feature,seqid,start,end,transcript_id," + exon_numbering_key
    info += ",strand," + names

    for fields in gtf.extract_data_iter_list(info):

        feature, seqid, start, end, tx_id, exon_nb, strand = fields[:7]

        if feature != "exon":
            continue

        if exon_nb == ".":
            message(
                "Some exon lines do not contain any numbering. "
                "Use add_exon_nb or set --exon-numbering-key to the proper key.",
                type="ERROR")

        if strand not in ("+", "-"):
            continue

        rank = int(exon_nb)
        has_next_exon = rank < nb_exons[tx_id]
        has_prev_exon = rank > 1

        # On the forward strand the donor flanks the exon end and the
        # acceptor flanks the exon start; on the reverse strand the roles
        # are swapped.
        if strand == "+":
            end_site = ("donor", has_next_exon)
            start_site = ("acceptor", has_prev_exon)
        else:
            end_site = ("acceptor", has_prev_exon)
            start_site = ("donor", has_next_exon)

        extra = list(fields[7:])

        # The end-flanking site is always written first.
        label, wanted = end_site
        if wanted:
            out = [
                seqid, end,
                str(int(end) + 1),
                separator.join([label] + extra), exon_nb, strand
            ]
            outputfile.write("\t".join(out) + "\n")

        label, wanted = start_site
        if wanted:
            out = [
                seqid,
                str(int(start) - 2),
                str(int(start) - 1),
                separator.join([label] + extra), exon_nb, strand
            ]
            outputfile.write("\t".join(out) + "\n")

    # NOTE(review): garbage collection is switched off before closing the
    # files; the reason is not visible in this function - keep as is.
    gc.disable()
    close_properly(outputfile, inputfile)