def nb_transcripts(inputfile=None,
                   outputfile=None,
                   text_format=False,
                   key_name=""):
    """
    Count the transcripts attached to each gene.

    When *text_format* is true, one ``gene_id<TAB>count`` line per gene is
    written straight to *outputfile*. Otherwise the same table goes to a
    temporary file, is added to the gene features under the *key_name*
    attribute, and the resulting GTF is written to *outputfile*.
    """
    gtf = GTF(inputfile)

    message("Computing the number of transcript per gene in input GTF file.")

    # The transcript count is derived from exon lines, in case some
    # transcript lines are missing (they should not be).
    gene_to_transcripts = gtf.get_gn_to_tx()

    annotate_gtf = not text_format

    # Choose the destination of the count table once, before the loop.
    if annotate_gtf:
        table = make_tmp_file(prefix="nb_tx", suffix=".txt")
    else:
        table = outputfile

    for gene_id in gene_to_transcripts:
        count = len(gene_to_transcripts[gene_id])
        table.write("\t".join((gene_id, str(count))) + "\n")

    if annotate_gtf:
        # The table must be flushed to disk before it is read back by name.
        table.close()
        annotated = gtf.add_attr_from_file(feat="gene",
                                           key="gene_id",
                                           new_key=key_name,
                                           inputfile=table.name)
        annotated.write(outputfile, gc_off=True)

    close_properly(outputfile, inputfile)
def tss_dist(inputfile=None, outputfile=None):
    """
    Computes the distance between TSS of gene transcripts.

    For each gene, every pair of its transcripts yields one tab-separated
    line in *outputfile* with the columns ``gene_id``, ``transcript_id_1``,
    ``transcript_id_2``, ``dist``, ``tss_num_1`` and ``tss_num_2``.

    ``dist`` is the absolute difference between the two TSS coordinates
    returned by ``get_tss()``. The TSS numbers come from
    ``get_gn_to_tx(as_dict_of_dict=True)``. Within a pair, the transcript
    with the strictly lower TSS number is listed first; when both numbers
    are equal, the transcript coming second in sorted id order is listed
    first.

    Genes and transcripts are processed in sorted id order. An I/O error
    raised while writing (e.g. a closed pipe) stops the output with a
    warning instead of propagating.
    """
    gtf = GTF(inputfile, check_ensembl_format=True)

    # gene_id -> transcript_id -> TSS coordinate
    gn_tss_dist = defaultdict(dict)

    message("Getting TSSs.")
    tss = gtf.get_tss(name=["transcript_id", "gene_id"], as_dict=True)

    # Keys are of the form "transcript_id|gene_id".
    for k in tss:
        tx_id, gn_id = k.split("|")
        gn_tss_dist[gn_id][tx_id] = int(tss[k])

    # gene_id -> transcript_id -> TSS number
    gn_to_tx_to_tss = gtf.get_gn_to_tx(as_dict_of_dict=True)

    message("Computing distances.")

    header = ["gene_id",
              "transcript_id_1",
              "transcript_id_2",
              "dist",
              "tss_num_1",
              "tss_num_2"]
    outputfile.write("\t".join(header) + "\n")

    try:
        for gn_id in sorted(gn_tss_dist):
            tss_pos = gn_tss_dist[gn_id]
            tx_list = sorted(tss_pos)

            # Visit each unordered pair (tx_a, tx_b) exactly once.
            for idx, tx_a in enumerate(tx_list):
                for tx_b in tx_list[idx + 1:]:
                    dist = str(abs(tss_pos[tx_a] - tss_pos[tx_b]))
                    num_a = gn_to_tx_to_tss[gn_id][tx_a]
                    num_b = gn_to_tx_to_tss[gn_id][tx_b]

                    # Strictly lower TSS number goes first; ties put
                    # tx_b first.
                    if num_a < num_b:
                        row = [gn_id, tx_a, tx_b,
                               dist, str(num_a), str(num_b)]
                    else:
                        row = [gn_id, tx_b, tx_a,
                               dist, str(num_b), str(num_a)]

                    outputfile.write("\t".join(row) + "\n")

    except (BrokenPipeError, IOError):
        def _void_f(*args, **kwargs):
            pass

        message("Received a broken pipe signal", type="WARNING")
        # Turn further writes/flushes on stdout into no-ops
        # (presumably so that nothing raises again on the closed pipe).
        sys.stdout.write = _void_f
        sys.stdout.flush = _void_f

    close_properly(outputfile, inputfile)
def tss_numbering(inputfile=None,
                  outputfile=None,
                  compute_dist=False,
                  key_name='tss_number',
                  key_name_dist='dist_to_first_tss',
                  add_nb_tss_to_gene=False,
                  gene_key='nb_tss'):
    """
    Add the TSS number of each transcript to the GTF.

    Every transcript receives, under the *key_name* attribute, the TSS
    number given by ``get_gn_to_tx(as_dict_of_dict=True)`` (1 for the most
    5' TSS of the gene, then 2...; transcripts sharing the same TSS share
    the same number).

    If *add_nb_tss_to_gene* is true, each gene feature also receives, under
    *gene_key*, the highest TSS number found among its transcripts.

    If *compute_dist* is true, each transcript also receives, under
    *key_name_dist*, the absolute distance between its TSS and the TSS of
    the first transcript of the gene in ``get_gn_to_tx(ordered_5p=True)``
    order (0 for that first transcript).

    The resulting GTF is written to *outputfile*.
    """
    gtf = GTF(inputfile, check_ensembl_format=True)

    message("Getting TSSs.")
    # transcript_id -> TSS coordinate (only read when compute_dist is set).
    tss = gtf.get_tss(name=["transcript_id"], as_dict=True)

    # If as_dict_of_dict is true, get_gn_to_tx() returns a dict of dict
    # that maps gene_id to transcript_id and transcript_id to TSS
    # numbering (1 for most 5', then 2...). For transcripts having
    # the same TSSs, the tss number will be the same.
    gn_to_tx_to_tss = gtf.get_gn_to_tx(as_dict_of_dict=True)

    message("Numbering TSSs.")

    tss_number_file = make_tmp_file(prefix='tx_to_tss_number', suffix='.txt')

    # gene_id -> highest TSS number seen so far (kept as a string).
    gn_how_many_tss = dict()

    for gn_id in gn_to_tx_to_tss:
        for tx_id in gn_to_tx_to_tss[gn_id]:
            tss_num = str(gn_to_tx_to_tss[gn_id][tx_id])
            tss_number_file.write(tx_id + "\t" + tss_num + "\n")

            if (gn_id not in gn_how_many_tss
                    or int(tss_num) > int(gn_how_many_tss[gn_id])):
                gn_how_many_tss[gn_id] = tss_num

    tss_number_file.close()

    # NOTE(review): the 'with' blocks below assume add_attr_from_file()
    # has finished reading the file when it returns -- confirm.
    with open(tss_number_file.name) as tss_number_fh:
        gtf = gtf.add_attr_from_file(feat='transcript',
                                     key='transcript_id',
                                     new_key=key_name,
                                     inputfile=tss_number_fh,
                                     has_header=False)

    if add_nb_tss_to_gene:
        gn_how_many_tss_file = make_tmp_file(prefix='gn_how_many_tss',
                                             suffix='.txt')

        for a_key, a_val in gn_how_many_tss.items():
            gn_how_many_tss_file.write(a_key + "\t" + a_val + "\n")

        gn_how_many_tss_file.close()

        with open(gn_how_many_tss_file.name) as gn_how_many_tss_fh:
            gtf = gtf.add_attr_from_file(feat='gene',
                                         key='gene_id',
                                         new_key=gene_key,
                                         inputfile=gn_how_many_tss_fh,
                                         has_header=False)

    if compute_dist:
        gn_to_tx_ordered_by_tss = gtf.get_gn_to_tx(ordered_5p=True)

        tss_dist_file = make_tmp_file(prefix='tx_tss_dist_to_first_tss',
                                      suffix='.txt')

        for gn_id in gn_to_tx_to_tss:
            tx_list = gn_to_tx_ordered_by_tss[gn_id]
            tx_first = tx_list[0]
            first_tss = int(tss[tx_first])

            # The first tss has distance 0 to the first tss...
            tss_dist_file.write(tx_first + "\t0\n")

            for tx_id in tx_list[1:]:
                dist_to_first = abs(first_tss - int(tss[tx_id]))
                tss_dist_file.write(tx_id + "\t" + str(dist_to_first) + "\n")

        tss_dist_file.close()

        with open(tss_dist_file.name) as tss_dist_fh:
            gtf = gtf.add_attr_from_file(feat='transcript',
                                         key='transcript_id',
                                         new_key=key_name_dist,
                                         inputfile=tss_dist_fh,
                                         has_header=False)

    gtf.write(outputfile, gc_off=True)

    close_properly(outputfile, inputfile)