Esempio n. 1
0
def count(inputfile=None, outputfile=None, header=None, additional_text=None):
    """
    Count the number of features in the gtf file.
    """

    if header is not None:
        header = header.split(",")

    gtf = GTF(inputfile, check_ensembl_format=False)

    feat_nb = OrderedDict()

    for i in gtf.extract_data("feature"):
        i = i[0]
        if i in feat_nb:
            feat_nb[i] += 1
        else:
            feat_nb[i] = 1

    if header is not None:
        outputfile.write("\t".join(header) + "\n")

    for i in feat_nb:
        if additional_text is None:
            outputfile.write(i + "\t" + str(feat_nb[i]) + "\n")
        else:
            outputfile.write(i + "\t" + str(feat_nb[i]) + "\t" +
                             additional_text + "\n")
    gc.disable()
    close_properly(outputfile, inputfile)
Esempio n. 2
0
def convert(inputfile=None,
            outputfile=None,
            format="bed",
            names="gene_id,transcript_id",
            separator="|",
            more_names=''):
    """
 Convert a GTF to various format.
    """

    if format == "bed3":
        gtf = GTF(inputfile, check_ensembl_format=False)

        for i in gtf.extract_data("seqid,start,end",
                                  as_list_of_list=True,
                                  hide_undef=False,
                                  no_na=False):
            i[1] = str(int(i[1]) - 1)
            outputfile.write("\t".join(i) + "\n")

    elif format in ["bed", "bed6"]:
        gtf = GTF(inputfile,
                  check_ensembl_format=False).write_bed(outputfile=outputfile,
                                                        name=names,
                                                        sep=separator,
                                                        more_name=more_names)
    gc.disable()
    close_properly(outputfile, inputfile)
Esempio n. 3
0
def random_list(inputfile=None,
                outputfile=None,
                number=None,
                ft_type=None,
                seed_value=None):
    """
    Select a random list of genes or transcripts.
    """

    message("loading the GTF.")

    gtf = GTF(inputfile)

    message("Getting ID list.")

    if ft_type == 'gene':
        id_list = gtf.extract_data("gene_id",
                                   as_list=True,
                                   nr=True,
                                   hide_undef=True,
                                   no_na=True)
    else:
        id_list = gtf.extract_data("transcript_id",
                                   as_list=True,
                                   nr=True,
                                   hide_undef=True,
                                   no_na=True)

    if number > len(id_list):
        message("To much feature. Using : " + str(len(id_list)),
                type="WARNING")
        number = len(id_list)

    if seed_value is not None:
        random.seed(seed_value, version=1)

    id_list = random.sample(id_list, number)

    message("Printing.")

    my_id = ft_type + "_id"

    gtf.select_by_key(my_id, ",".join(id_list)).write(outputfile, gc_off=True)

    close_properly(outputfile, inputfile)
Esempio n. 4
0
def random_tx(inputfile=None,
              outputfile=None,
              max_transcript=None,
              seed_value=None):
    """
    Select randomly up to m transcript for each gene.
    """

    message("loading the GTF.")

    gtf = GTF(inputfile).select_by_key("feature", "gene", invert_match=True)

    message("Getting gene_id and transcript_id")

    gene2tx = gtf.extract_data("gene_id,transcript_id",
                               as_dict_of_merged_list=True,
                               no_na=True,
                               nr=True)

    message("Selecting random transcript")

    if seed_value is not None:
        random.seed(seed_value, version=1)

    tx_to_delete = []

    for gn_id in gene2tx:
        tx_list = gene2tx[gn_id]
        nb_tx = len(tx_list)
        max_cur = min(max_transcript, nb_tx)
        pos_to_keep = random.sample(list(range(len(tx_list))), max_cur)
        tx_list = [j for i, j in enumerate(tx_list) if i not in pos_to_keep]
        tx_to_delete += tx_list

    message("Printing results")

    message("Selecting transcript.")
    gtf.select_by_key("transcript_id",
                      ",".join(tx_to_delete),
                      invert_match=True).write(outputfile, gc_off=True)

    close_properly(outputfile, inputfile)
Esempio n. 5
0
def count_key_values(inputfile=None,
                     outputfile=None,
                     keys="gene_id,transcript_id",
                     uniq=True,
                     additional_text=None):
    """
 Count the number values for a set of keys.
    """

    gtf = GTF(inputfile, check_ensembl_format=False)

    if uniq:
        val_list = defaultdict(set)
    else:
        val_list = defaultdict(list)

    if keys == "*":
        key_list = gtf.get_attr_list()
        keys = ",".join(key_list)
    else:
        key_list = keys.split(",")

    for i in gtf.extract_data(keys, as_list_of_list=True):

        for k, v in zip(key_list, i):
            if v in ['.', '?']:
                continue
            if uniq:
                val_list[k].add(v)
            else:
                val_list[k] += [v]

    for i in key_list:
        if additional_text is None:
            outputfile.write(i + "\t" + str(len(val_list[i])) + "\n")
        else:
            outputfile.write(i + "\t" + str(len(val_list[i])) + "\t" +
                             additional_text + "\n")
    gc.disable()
    close_properly(outputfile, inputfile)
Esempio n. 6
0
def tabulate(inputfile=None,
             outputfile=None,
             key=None,
             no_unset=False,
             unique=False,
             no_basic=False,
             accept_undef=False,
             select_gene_ids=False,
             select_gene_names=False,
             select_transcript_ids=False,
             select_exon_ids=False,
             separator="\t",
             no_header=False):
    """Convert a GTF to tabulated format.
    """

    # ----------------------------------------------------------------------
    # Check mode
    # ----------------------------------------------------------------------

    if select_transcript_ids:
        key = "transcript_id"

    elif select_gene_ids:
        key = "gene_id"

    elif select_gene_names:
        key = "gene_id"

    elif select_exon_ids:
        key = "exon_id"

    no_undef = False
    if not accept_undef:
        no_undef = True
    # ----------------------------------------------------------------------
    # REad GTF and process
    # ----------------------------------------------------------------------

    gtf = GTF(inputfile, check_ensembl_format=False)

    if key in ["all", "*"]:
        if no_basic:
            attr_list = gtf.get_attr_list(add_basic=False)
        else:
            attr_list = gtf.get_attr_list(add_basic=True)
        tab = gtf.extract_data(attr_list)
    else:
        tab = gtf.extract_data(key)

    if not no_header:
        message("Writing header")
        write_properly(separator.join(tab.colnames),
                       outputfile)

    message("Writing")

    try:
        if not unique:
            if no_unset:
                if no_undef:
                    for i in tab:
                        if any([True for x in i.fields if x in [".", "?"]]):
                            continue
                        i.write(outputfile, separator)
                else:
                    for i in tab:
                        if any([True for x in i.fields if x in ["."]]):
                            continue
                        i.write(outputfile, separator)

            else:
                if no_undef:
                    for i in tab:
                        if any([True for x in i.fields if x in ["?"]]):
                            continue
                        i.write(outputfile, separator)
                else:
                    for i in tab:
                        i.write(outputfile, separator)

        else:
            printed = {}
            if no_unset:
                if no_undef:
                    for i in tab:
                        t = tuple(i)
                        if t not in printed:
                            if any([True for x in i.fields if x in [".", "?"]]):
                                continue
                            i.write(outputfile, separator)
                        printed[t] = 1
                else:
                    for i in tab:
                        t = tuple(i)
                        if t not in printed:
                            if any([True for x in i.fields if x in ["."]]):
                                continue
                            i.write(outputfile, separator)
                        printed[t] = 1
            else:
                if no_undef:
                    for i in tab:
                        t = tuple(i)
                        if t not in printed:
                            if any([True for x in i.fields if x in ["?"]]):
                                continue
                            i.write(outputfile, separator)
                        printed[t] = 1
                else:
                    for i in tab:
                        t = tuple(i)
                        if t not in printed:
                            i.write(outputfile, separator)
                        printed[t] = 1

    except (BrokenPipeError, IOError):
        def _void_f(*args, **kwargs):
            pass

        message("Received a boken pipe signal", type="WARNING")
        sys.stdout.write = _void_f
        sys.stdout.flush = _void_f

    gc.disable()
    close_properly(outputfile, inputfile)
Esempio n. 7
0
def discretize_key(inputfile=None,
                   outputfile=None,
                   src_key=None,
                   dest_key="disc_key",
                   nb_levels=2,
                   percentiles=False,
                   percentiles_of_uniq=False,
                   precision=2,
                   log=False,
                   labels=None):
    """
    Create a new key by discretizing a numeric key.
    """

    if nb_levels < 2:
        message("--nb-levels has to be greater than 2.", type="ERROR")

    # -------------------------------------------------------------------------
    #
    # Check labels and nb_levels
    #
    # -------------------------------------------------------------------------

    if labels is not None:
        labels = labels.split(",")
        if len(labels) != nb_levels:
            message(
                "The number of labels should be the same as the number of levels.",
                type="ERROR")
        if len(labels) != len(set(labels)):
            message("Redundant labels not allowed.", type="ERROR")

    # -------------------------------------------------------------------------
    #
    # Load GTF. Retrieve values for src-key
    #
    # -------------------------------------------------------------------------

    gtf = GTF(inputfile, check_ensembl_format=False)
    src_values = gtf.extract_data(src_key, as_list=True)

    if len([x for x in src_values if x not in ['.', '?']]) == 0:
        message('The key was not found in this GTF.', type="ERROR")

    min_val = None
    max_val = None

    dest_values = []
    dest_pos = []

    for p, v in enumerate(src_values):
        try:
            a = float(v)
            if min_val is not None:
                if a > max_val:
                    max_val = a
                if a < min_val:
                    min_val = a
            else:
                min_val = a
                max_val = a

            dest_values += [a]
            dest_pos += [p]
        except ValueError:
            pass

    if min_val is None:
        message("Did not find numeric values in the source key.", type="ERROR")
    if min_val == max_val:
        message(
            "The minimum and maximum values found in the source key are the same.",
            type="ERROR")

    if log:
        if 0 in dest_values:
            message("Encountered zero values before log transformation.",
                    type="WARNING",
                    force=True)
            message("Adding a pseudocount (+1).", type="WARNING", force=True)

            pseudo_count = 1
            dest_values = list(np.log2([x + pseudo_count
                                        for x in dest_values]))

        # update max/min values
        max_val = max(dest_values)
        min_val = min(dest_values)

    # Apply the same rule as pandas.cut when bins is an int.
    min_val = min_val - max_val / 1000

    # -------------------------------------------------------------------------
    #
    # Compute percentiles if required
    #
    # -------------------------------------------------------------------------

    if percentiles:
        if percentiles_of_uniq:
            dest_values_tmp = [min_val] + list(set(dest_values))
        else:
            dest_values_tmp = [min_val] + dest_values
        n = nb_levels

        q = [np.percentile(dest_values_tmp, 100 / n * i) for i in range(0, n)]
        q = q + [np.percentile(dest_values_tmp, 100)]

        if len(q) != len(set(q)):
            message("No ties are accepted in  percentiles.",
                    type="WARNING",
                    force=True)
            message("Breaks: " + str(q), type="WARNING", force=True)
            message("Try -u. Exiting", type="ERROR")

    # -------------------------------------------------------------------------
    #
    # Create a factor
    #
    # -------------------------------------------------------------------------

    if percentiles:

        (breaks, cat_label) = pandas.cut(dest_values,
                                         bins=q,
                                         labels=labels,
                                         retbins=True)
    else:
        (breaks, cat_label) = pandas.cut(dest_values,
                                         bins=nb_levels,
                                         labels=labels,
                                         retbins=True)

    if labels is None:
        # The include_lowest argument of pandas is not working.
        # Using this workaround to avoid minimum value outside of data range.
        cat_label[0] = min(dest_values)
        cat_label = [round(x, precision) for x in cat_label]
        if precision == 0:
            cat_label = [int(x) for x in cat_label]
        cat_label = [str(x) for x in list(zip(cat_label[:-1], cat_label[1:]))]
        cat_label[0] = cat_label[0].replace("(", "[")
        cat_label = [x.replace(")", "]") for x in cat_label]
        cat_label = [str(x).replace(", ", "_") for x in cat_label]

        # The string can be very problematic later...
        breaks.categories = cat_label

    message("Categories: " + str(list(breaks.categories)),
            type="INFO",
            force=True)

    # -------------------------------------------------------------------------
    #
    # Write to disk
    #
    # -------------------------------------------------------------------------

    tmp_file = make_tmp_file(prefix="discretized_keys", suffix=".txt")

    with tmp_file as tp_file:
        for p, v in zip(dest_pos, breaks):
            tp_file.write(str(p) + "\t" + str(v) + '\n')

    gtf.add_attr_to_pos(tmp_file, new_key=dest_key).write(outputfile,
                                                          gc_off=True)

    close_properly(outputfile, inputfile)
Esempio n. 8
0
def overlapping(
        inputfile=None,
        outputfile=None,
        key_name=None,
        upstream=1500,
        downstream=1500,
        chrom_info=None,
        feature_type='transcript',
        same_strandedness=False,
        diff_strandedness=False,
        annotate_gtf=False,
        bool=False,
        annotate_all=False,
        invert_match=False):
    """
Description: Find transcripts whose body/TSS/TTS do or do not overlap with any
transcript from another gene.
    """

    # ----------------------------------------------------------------------
    # Prepare key names
    # ----------------------------------------------------------------------

    if annotate_gtf:
        if key_name is None:
            key_info = ["overlap",
                        feature_type,
                        "u" + str(upstream / 1000) + "k",
                        "d" + str(downstream / 1000) + "k"
                        ]
            key_name = "_".join(key_info)

        if invert_match:
            message("--annotate-gtf and --invert-match are "
                    "mutually exclusive.",
                    type="ERROR")

    if same_strandedness and diff_strandedness:
        message("--same-strandedness and --diff-strandedness are "
                "mutually exclusive.",
                type="ERROR")

    message("Using -u " + str(upstream))
    message("Using -d " + str(downstream))

    overlapping_tx = defaultdict(list)

    # Load the GTF so that it won't be lost
    # if GTF stream comes from stdin
    gtf = GTF(inputfile)

    message("Getting transcript in bed format")

    tx_feat = gtf.select_by_key("feature",
                                "transcript")

    if annotate_all:
        overlapping_tx = gtf.extract_data(keys=["transcript_id"], as_dict=True, default_val="0")
        for i in overlapping_tx:
            overlapping_tx[i] = []

    # ----------------------------------------------------------------------
    # Get transcript limits
    # ----------------------------------------------------------------------

    tx_bed = tx_feat.to_bed(name=["transcript_id", "gene_id"], sep="||")

    message("Getting " + feature_type + " and 'slopping'.")

    if feature_type == "transcript":

        bed_obj = tx_bed.slop(s=True,
                              l=upstream,
                              r=downstream,
                              g=chrom_info.name).cut([0, 1, 2, 3, 4, 5])

    elif feature_type == "promoter":

        bed_obj = tx_feat.get_tss(name=["transcript_id", "gene_id"],
                                  sep="||").slop(s=True,
                                                 l=upstream,
                                                 r=downstream,
                                                 g=chrom_info.name).cut([0, 1,
                                                                         2, 3,
                                                                         4, 5])

    elif feature_type == "tts":

        bed_obj = tx_feat.get_tts(name=["transcript_id", "gene_id"],
                                  sep="||").slop(s=True,
                                                 l=upstream,
                                                 r=downstream,
                                                 g=chrom_info.name).cut([0, 1,
                                                                         2, 3,
                                                                         4, 5])
    else:
        message("Not implemented yet", type="ERROR")

    tmp_file = make_tmp_file(feature_type + "_slopped_region", ".bed")
    bed_obj.saveas(tmp_file.name)

    overlap_regions = bed_obj.intersect(tx_bed,
                                        wb=True,
                                        s=same_strandedness,
                                        S=diff_strandedness)

    tmp_file = make_tmp_file(feature_type + "_overlapping_regions", ".bed")
    overlap_regions.saveas(tmp_file.name)

    for i in overlap_regions:

        tx_other, gn_other = i.fields[9].split("||")
        tx_id, gene_id = i.fields[3].split("||")
        if gene_id != gn_other:
            overlapping_tx[tx_id] += [tx_other]

    if bool:
        for k, _ in overlapping_tx.items():
            if not len(overlapping_tx[k]):
                overlapping_tx[k] = "0"
            else:
                overlapping_tx[k] = "1"

    if not invert_match:

        if not annotate_gtf:
            value = ",".join(set(overlapping_tx.keys()))
            gtf.select_by_key("transcript_id",
                              value).write(outputfile,
                                           gc_off=True)
        else:

            if len(overlapping_tx):
                gtf = gtf.add_attr_from_dict(feat="transcript",
                                             key="transcript_id",
                                             a_dict=overlapping_tx,
                                             new_key=key_name)
            gtf.write(outputfile,
                      gc_off=True)

    else:
        values = ",".join(set(overlapping_tx.keys()))
        gtf.select_by_key("transcript_id",
                          values,
                          invert_match).write(outputfile, gc_off=True)

    gc.disable()
    close_properly(outputfile, inputfile)
Esempio n. 9
0
def get_tx_seq(inputfile=None,
               outputfile=None,
               genome=None,
               with_introns=False,
               delete_version=False,
               del_chr=False,
               separator="",
               no_rev_comp=False,
               label="",
               sleuth_format=True,
               explicit=True,
               assembly="bla"):
    """
    Description: Get transcripts sequences in fasta format from a GTF file.
    """

    # -----------------------------------------------------------
    #  Check chromosomes in fasta file
    # -----------------------------------------------------------

    genome_chr_list = []

    message("%d fasta files found." % len(genome))

    as_gz_ext = [True for x in genome if x.name.endswith(".gz")]

    if any(as_gz_ext):
        message("Genome in gz format is not currently supported.",
                type="ERROR")

    if len(genome) == 1:
        message("Checking fasta file chromosome list")
        genome = genome[0]
        with genome as genome_file:
            for i in genome_file:
                if i.startswith(">"):
                    i = i.rstrip("\n")
                    genome_chr_list += [i[1:]]
    else:
        message("Merging fasta files")
        tmp_genome = make_tmp_file(prefix="genome", suffix=".fa")
        with tmp_genome as tg:
            for curr_file in genome:
                message("Merging %s" % curr_file.name)
                with curr_file as cf:
                    shutil.copyfileobj(cf, tg, 1024 * 1024 * 100)

        message("Checking fasta file chromosome list")
        genome = open(tmp_genome.name, "r")
        with genome as genome_file:
            for i in genome_file:
                if i.startswith(">"):
                    i = i.rstrip("\n")
                    genome_chr_list += [i[1:]]

    rev_comp = not no_rev_comp

    message("Chromosomes in fasta file: " + ",".join(genome_chr_list))

    # -----------------------------------------------------------
    #  Read gtf
    # -----------------------------------------------------------

    gtf = GTF(inputfile)
    nb_tx_before = gtf.extract_data("transcript_id",
                                    as_list=True,
                                    no_na=True,
                                    nr=True)

    # -----------------------------------------------------------
    #  Select genes falling in chrom defined in the fasta file
    # -----------------------------------------------------------

    message("Chromosomes in gtf file: " + ",".join(gtf.get_chroms(nr=True)))

    message("Selecting chromosome defined in the fasta file")

    gtf = gtf.select_by_key(key="seqid", value=",".join(genome_chr_list))

    message("Chromosomes in gtf file: " + ",".join(gtf.get_chroms(nr=True)))

    if len(gtf) == 0:
        message("No genes were found on chromosomes defined in fasta file.",
                type="ERROR")

    nb_tx_after = gtf.extract_data("transcript_id",
                                   as_list=True,
                                   no_na=True,
                                   nr=True)
    if len(nb_tx_after) != len(nb_tx_before):
        diff = list(set(nb_tx_before) - set(nb_tx_after))
        message("Some transcripts had"
                " no corresponding chromosome"
                " in the fasta file: " + ",".join(diff)[0:100] + "...")

    message("Using genome file: " + genome.name)
    message("Retrieving fasta sequences from " + genome.name)
    fasta_seq = gtf.get_sequences(genome=genome.name,
                                  intron=with_introns,
                                  rev_comp=rev_comp)

    tx_gtf = gtf.select_by_key("feature", "transcript")

    if sleuth_format:

        tx_biotype = tx_gtf.extract_data("transcript_id,transcript_biotype",
                                         as_dict_of_lists=True,
                                         hide_undef=False)
        gn_biotype = tx_gtf.extract_data("gene_id,gene_biotype",
                                         as_dict_of_lists=True,
                                         hide_undef=False)

        for i in fasta_seq:
            gene_id = i.gene_id
            transcript_id = i.transcript_id
            chrom = i.chrom

            gn_bio = gn_biotype[i.gene_id][0]
            tx_bio = tx_biotype[i.transcript_id][0]

            if delete_version:
                transcript_id = re.sub('\.[0-9]+$', '', transcript_id)
                gene_id = re.sub('\.[0-9]+$', '', gene_id)
            if del_chr:
                chrom = chrom.replace('chr', '')

            header = " ".join([
                transcript_id, ":".join([
                    "chromosome", assembly, chrom,
                    str(i.start),
                    str(i.end), "1"
                ]), "gene:" + gene_id, "gene_biotype:" + gn_bio,
                "transcript_biotype:" + tx_bio
            ])

            outputfile.write(">" + header + "\n")
            outputfile.write(i.sequence + "\n")
    else:
        tx_info = tx_gtf.extract_data("transcript_id," + label,
                                      as_dict_of_lists=True,
                                      hide_undef=False)
        for i in fasta_seq:
            if not explicit:
                header = separator.join(tx_info[i.transcript_id])
            else:
                header = [
                    str(x[0]) + "=" + x[1]
                    for x in zip(label.split(","), tx_info[i.transcript_id])
                ]
                header = separator.join(header)
            outputfile.write(">" + header + "\n")
            outputfile.write(i.sequence + "\n")

    gc.disable()
    close_properly(outputfile, inputfile)
Esempio n. 10
0
def select_by_key(inputfile=None,
                  outputfile=None,
                  key=None,
                  value=None,
                  invert_match=False,
                  file_with_values=None,
                  col=0,
                  select_transcripts=False,
                  select_genes=False,
                  select_exons=False,
                  select_cds=False,
                  select_start_codon=False,
                  bed_format=False,
                  log=False,
                  separator="|",
                  names="transcript_id"):
    """Select lines from a GTF file based on attributes and
    associated values.
    """

    # ----------------------------------------------------------------------
    # Check mode
    # ----------------------------------------------------------------------

    if select_transcripts:
        key = "feature"
        value = "transcript"

    elif select_cds:
        key = "feature"
        value = "CDS"

    elif select_start_codon:
        key = "feature"
        value = "start_codon"

    elif select_genes:
        key = "feature"
        value = "gene"

    elif select_exons:
        key = "feature"
        value = "exon"

    elif file_with_values is None:
        if key is None or value is None:
            message(
                "Key and value are mandatory. Alternatively use -e/t/g/f or -f with -k.",
                type="ERROR")

    elif file_with_values is not None:
        if key is None:
            message("Please set -k.", type="ERROR")
        if value is not None:
            message("The -f and -v arguments are mutually exclusive.",
                    type="ERROR")

    # ----------------------------------------------------------------------
    # Load file with value
    # ----------------------------------------------------------------------

    gtf = GTF(inputfile, check_ensembl_format=False)
    all_values = gtf.extract_data(key, as_list=True, no_na=True, nr=True)

    if log:
        feat_before = len(gtf)

    if not file_with_values:
        value_list = value.split(",")
        gtf = gtf.select_by_key(key, value, invert_match)
    else:
        value_list = []

        for line in file_with_values:
            cols = line.split("\t")
            value_list += [cols[col - 1]]
        file_with_values.close()
        file_with_values = open(file_with_values.name)

        gtf = gtf.select_by_key(key=key,
                                invert_match=invert_match,
                                file_with_values=file_with_values,
                                col=col)

    if log:

        not_found = list(set(value_list) - set(all_values))
        feat_after = len(gtf)
        pct = feat_after / feat_before * 100

        message("Number of features before selection: %d" % feat_before)
        message("Fraction of feature selected: %.2f%%" % pct)

        if len(not_found):
            nfj = ",".join(not_found)
            max_letter = min(len(nfj), 50)
            if len(nfj) > 50:
                etc = "..."
            else:
                etc = ""
            message("Values not found: [" + ",".join(not_found)[:max_letter] +
                    etc + "].")
        else:
            message("Values not found: [].")

    # ----------------------------------------------------------------------
    # Write GTF file
    # ----------------------------------------------------------------------

    if not bed_format:

        gtf.write(outputfile, gc_off=True)

    else:
        nb_tokens = len(names.split(","))
        keys = "seqid,start,end," + names + ",score,strand"
        nb_fields = len(keys.split(","))

        for i in gtf.extract_data_iter_list(keys, zero_based=True):
            outputfile.write("\t".join([
                i[0],
                i[1],
                i[2],
                separator.join(i[3:(3 + nb_tokens)]),
                i[nb_fields - 2],
                i[nb_fields - 1],
            ]) + "\n")

    close_properly(outputfile, inputfile)
Esempio n. 11
0
def feature_size(inputfile=None,
                 outputfile=None,
                 ft_type="transcript",
                 names="transcript_id",
                 key_name='feature_size',
                 separator="|",
                 bed=False):
    """
 Get the size and limits (start/end) of features enclosed in the GTF. If bed
 format is requested returns the limits zero-based half open and the size as a score.
 Otherwise output GTF file with 'feat_size' as a new key and size as value.
    """

    message("Computing feature sizes.")

    gtf = GTF(inputfile)

    feat_list = gtf.get_feature_list(nr=True) + ['mature_rna']

    if ft_type not in feat_list + ["*"]:
        message("Unable to find requested feature.", type="ERROR")

    names = names.split(",")

    if ft_type != 'mature_rna':

        if bed:
            bed_obj = gtf.select_by_key("feature",
                                        ft_type).to_bed(name=names,
                                                        sep=separator,
                                                        add_feature_type=True)

            for i in bed_obj:
                i.score = str(i.end - i.start)
                write_properly(chomp(str(i)), outputfile)
        else:

            tmp_file = make_tmp_file(prefix="feature_size", suffix=".txt")

            elmt = gtf.extract_data("feature,start,end",
                                    as_list_of_list=True,
                                    no_na=False,
                                    hide_undef=False)

            for i in elmt:
                if i[0] != ft_type and ft_type != "*":
                    tmp_file.write("?\n")
                else:
                    tmp_file.write(str(int(i[2]) - int(i[1]) + 1) + "\n")

            tmp_file.close()

            gtf.add_attr_column(tmp_file, key_name).write(outputfile,
                                                          gc_off=True)

    else:

        tx_size = gtf.get_transcript_size()

        if bed:
            bed_obj = gtf.select_by_key("feature", 'transcript').to_bed(
                ['transcript_id'] + names,
                add_feature_type=False,
                sep=separator,
                more_name=['mature_rna'])

            for i in bed_obj:
                names = i.name.split(separator)
                tx_id = names.pop(0)
                i.score = tx_size[tx_id]
                i.name = separator.join(names)
                write_properly(chomp(str(i)), outputfile)
        else:

            if len(tx_size):
                gtf = gtf.add_attr_from_dict(feat="transcript",
                                             key="transcript_id",
                                             a_dict=tx_size,
                                             new_key=key_name)

            gtf.write(outputfile, gc_off=True)

    close_properly(outputfile, inputfile)