Ejemplo n.º 1
0
def nb_transcripts(inputfile=None,
                   outputfile=None,
                   text_format=False,
                   key_name=""):
    """
    Count the transcripts attached to every gene of the input GTF.

    :param inputfile: GTF input handed to ``GTF()``.
    :param outputfile: Writable handle receiving the result.
    :param text_format: If true, write a two-column ``gene_id<TAB>count``
        table to *outputfile*. Otherwise write the GTF itself, with the
        count added to gene features under *key_name*.
    :param key_name: Attribute key used for the count when the output is
        a GTF.
    """

    gtf = GTF(inputfile)

    message("Computing the number of transcript per gene in input GTF file.")

    # NOTE(review): the original comment states that counting relies on
    # exon lines (in case transcript lines are missing); that depends on
    # how get_gn_to_tx() is implemented -- confirm.
    gene_to_transcripts = gtf.get_gn_to_tx()

    # The same gene/count table is produced in both modes; only its
    # destination changes: the final output in text mode, a temporary
    # file to merge back into the GTF otherwise.
    if text_format:
        table = outputfile
    else:
        table = make_tmp_file(prefix="nb_tx", suffix=".txt")

    for gene_id in gene_to_transcripts:
        tx_count = len(gene_to_transcripts[gene_id])
        table.write("\t".join((gene_id, str(tx_count))) + "\n")

    if not text_format:
        # The table must be flushed to disk before it is read back by name.
        table.close()
        annotated = gtf.add_attr_from_file(feat="gene",
                                           key="gene_id",
                                           new_key=key_name,
                                           inputfile=table.name)
        annotated.write(outputfile, gc_off=True)

    close_properly(outputfile, inputfile)
Ejemplo n.º 2
0
def tss_dist(inputfile=None, outputfile=None):
    """
    Computes the distance between TSS of gene transcripts.

    For every gene, each unordered pair of its transcripts is written to
    *outputfile* as one tab-separated row: gene id, the two transcript ids,
    the absolute difference between their TSS coordinates, and the TSS
    number of each transcript. Within a row, the transcript with the
    strictly lower TSS number is listed first. Genes and transcripts are
    processed in sorted id order, and a header line precedes the rows.

    A broken pipe (or any other I/O error) raised while writing is reported
    as a warning and standard output is silenced, instead of propagating.

    :param inputfile: GTF input handed to ``GTF()``.
    :param outputfile: Writable handle receiving the table.
    """
    from itertools import combinations

    gtf = GTF(inputfile, check_ensembl_format=True)

    gn_tss_dist = defaultdict(dict)

    message("Getting TSSs.")
    tss = gtf.get_tss(name=["transcript_id", "gene_id"], as_dict=True)

    # Keys combine both requested names as "<transcript_id>|<gene_id>".
    for k in tss:
        tx_id, gn_id = k.split("|")
        gn_tss_dist[gn_id][tx_id] = int(tss[k])

    gn_to_tx_to_tss = gtf.get_gn_to_tx(as_dict_of_dict=True)

    message("Computing distances.")

    header = "\t".join([
        "gene_id", "transcript_id_1", "transcript_id_2", "dist", "tss_num_1",
        "tss_num_2"
    ]) + "\n"

    try:
        # The header is written inside the guarded section so that a pipe
        # closed before the first write is handled like any later one.
        outputfile.write(header)

        for gn_id in sorted(gn_tss_dist):
            tx_to_pos = gn_tss_dist[gn_id]

            # combinations() yields the pairs in the same (i < j) order as
            # two nested index loops over the sorted transcript list.
            for tx_a, tx_b in combinations(sorted(tx_to_pos), 2):
                dist = str(abs(tx_to_pos[tx_a] - tx_to_pos[tx_b]))
                num_a = gn_to_tx_to_tss[gn_id][tx_a]
                num_b = gn_to_tx_to_tss[gn_id][tx_b]

                # Keep the pair as is only when the first TSS number is
                # strictly lower; ties are swapped as well.
                if not num_a < num_b:
                    tx_a, tx_b = tx_b, tx_a
                    num_a, num_b = num_b, num_a

                outputfile.write("\t".join([
                    gn_id, tx_a, tx_b, dist,
                    str(num_a),
                    str(num_b)
                ]) + "\n")

    except (BrokenPipeError, IOError):

        def _void_f(*args, **kwargs):
            pass

        message("Received a broken pipe signal", type="WARNING")
        # Silence stdout so later writes/flushes do not raise again.
        sys.stdout.write = _void_f
        sys.stdout.flush = _void_f

    close_properly(outputfile, inputfile)
Ejemplo n.º 3
0
def tss_numbering(inputfile=None,
                  outputfile=None,
                  compute_dist=False,
                  key_name='tss_number',
                  key_name_dist='dist_to_first_tss',
                  add_nb_tss_to_gene=False,
                  gene_key='nb_tss'):
    """
    Adds the TSS number of each transcript to the GTF.

    Each transcript feature receives its TSS number under *key_name*.
    Optionally, gene features receive the highest TSS number found among
    their transcripts, and transcript features receive the distance between
    their TSS and the TSS of the first transcript of their gene. The
    resulting GTF is written to *outputfile*.

    :param inputfile: GTF input handed to ``GTF()``.
    :param outputfile: Writable handle receiving the annotated GTF.
    :param compute_dist: If true, add *key_name_dist* to transcripts.
    :param key_name: Attribute key for the TSS number of a transcript.
    :param key_name_dist: Attribute key for the distance to the first TSS.
    :param add_nb_tss_to_gene: If true, add *gene_key* to genes.
    :param gene_key: Attribute key for the highest TSS number of a gene.
    """
    from contextlib import ExitStack

    gtf = GTF(inputfile, check_ensembl_format=True)

    message("Getting TSSs.")
    tss = gtf.get_tss(name=["transcript_id"], as_dict=True)

    # if_dict_of_dict is true, get_gn_to_tx() returns a dict of dict
    # that maps gene_id to transcript_id and transcript_id to TSS
    # numbering (1 for most 5', then 2...). For transcripts having
    # the same TSSs, the tss number will be the same.
    gn_to_tx_to_tss = gtf.get_gn_to_tx(as_dict_of_dict=True)

    message("Numbering TSSs.")

    tss_number_file = make_tmp_file(prefix='tx_to_tss_number', suffix='.txt')

    # gene_id -> highest TSS number seen so far (kept as a string, ready
    # to be written out).
    gn_how_many_tss = dict()

    for gn_id, tx_to_num in gn_to_tx_to_tss.items():
        for tx_id, num in tx_to_num.items():
            tss_num = str(num)
            tss_number_file.write(tx_id + "\t" + tss_num + "\n")
            if (gn_id not in gn_how_many_tss
                    or int(tss_num) > int(gn_how_many_tss[gn_id])):
                gn_how_many_tss[gn_id] = tss_num

    tss_number_file.close()

    # Every table handed to add_attr_from_file() is registered here and
    # closed once the final GTF has been written, so no handle is leaked.
    with ExitStack() as tables:

        gtf = gtf.add_attr_from_file(
            feat='transcript',
            key='transcript_id',
            new_key=key_name,
            inputfile=tables.enter_context(open(tss_number_file.name)),
            has_header=False)

        if add_nb_tss_to_gene:

            gn_how_many_tss_file = make_tmp_file(prefix='gn_how_many_tss',
                                                 suffix='.txt')

            for a_key, a_val in gn_how_many_tss.items():
                gn_how_many_tss_file.write(a_key + "\t" + a_val + "\n")

            gn_how_many_tss_file.close()

            gtf = gtf.add_attr_from_file(
                feat='gene',
                key='gene_id',
                new_key=gene_key,
                inputfile=tables.enter_context(
                    open(gn_how_many_tss_file.name)),
                has_header=False)

        if compute_dist:
            # presumably transcripts of each gene ordered from the most
            # 5' TSS onwards (ordered_5p=True) -- confirm against GTF API.
            gn_to_tx_ordered_by_tss = gtf.get_gn_to_tx(ordered_5p=True)
            tss_dist_file = make_tmp_file(prefix='tx_tss_dist_to_first_tss',
                                          suffix='.txt')

            for gn_id in gn_to_tx_to_tss:
                tx_list = gn_to_tx_ordered_by_tss[gn_id]
                tx_first = tx_list[0]
                # The first tss has distance 0 to the
                # first tss...
                tss_dist_file.write(tx_first + "\t0\n")
                first_pos = int(tss[tx_first])
                for tx_id in tx_list[1:]:
                    dist_to_first = abs(first_pos - int(tss[tx_id]))
                    tss_dist_file.write(tx_id + "\t" + str(dist_to_first) +
                                        "\n")

            tss_dist_file.close()

            gtf = gtf.add_attr_from_file(
                feat='transcript',
                key='transcript_id',
                new_key=key_name_dist,
                inputfile=tables.enter_context(open(tss_dist_file.name)),
                has_header=False)

        gtf.write(outputfile, gc_off=True)

    close_properly(outputfile, inputfile)