def nb_transcripts(inputfile=None,
                   outputfile=None,
                   text_format=False,
                   key_name=""):
    """
    Compute the number of transcripts per gene.

    The gene -> transcripts mapping is obtained from ``GTF.get_gn_to_tx()``
    and the size of each entry is reported.

    * With ``text_format`` set, one ``<gene><TAB><count>`` line per gene is
      written directly to ``outputfile``.
    * Otherwise the same lines go to a temporary file, which is joined onto
      the ``gene`` features (matching on ``gene_id``) as attribute
      ``key_name``; the resulting GTF is then written to ``outputfile``.

    :param inputfile: Input handed to the ``GTF`` constructor.
    :param outputfile: Writable output stream.
    :param text_format: If true, emit the two-column table instead of a GTF.
    :param key_name: Name of the attribute receiving the count (GTF mode).
    """

    gtf = GTF(inputfile)

    message("Computing the number of transcript per gene in input GTF file.")

    # Note from the original author: the transcript count is derived from
    # exon lines, in case some transcript lines are missing (they should
    # not be...).
    tx_by_gene = gtf.get_gn_to_tx()

    # Decide once where the table goes: straight to the output in text mode,
    # otherwise to a temporary file used for the join below.
    counts_file = None if text_format else make_tmp_file(prefix="nb_tx",
                                                         suffix=".txt")
    sink = outputfile if text_format else counts_file

    for gene_id in tx_by_gene:
        nb_tx = str(len(tx_by_gene[gene_id]))
        sink.write("\t".join((gene_id, nb_tx)) + "\n")

    if counts_file is not None:
        # The file must be flushed/closed before it is re-read by name.
        counts_file.close()
        annotated = gtf.add_attr_from_file(feat="gene",
                                           key="gene_id",
                                           new_key=key_name,
                                           inputfile=counts_file.name)
        annotated.write(outputfile, gc_off=True)

    close_properly(outputfile, inputfile)
def tss_numbering(inputfile=None,
                  outputfile=None,
                  compute_dist=False,
                  key_name='tss_number',
                  key_name_dist='dist_to_first_tss',
                  add_nb_tss_to_gene=False,
                  gene_key='nb_tss'):
    """
    Number the TSSs of each gene and, optionally, compute TSS distances.

    Every transcript receives attribute ``key_name`` holding its TSS number
    within its gene. Two optional annotations can be added:

    * ``add_nb_tss_to_gene``: each gene feature receives ``gene_key``, the
      highest TSS number found among its transcripts.
    * ``compute_dist``: each transcript receives ``key_name_dist``, the
      absolute distance between its TSS and the TSS of the first transcript
      of the gene in 5' order (that first transcript gets 0).

    The annotated GTF is finally written to ``outputfile``.

    :param inputfile: Input handed to the ``GTF`` constructor.
    :param outputfile: Writable output stream.
    :param compute_dist: Whether to add the distance-to-first-TSS attribute.
    :param key_name: Attribute name for the TSS number (transcript level).
    :param key_name_dist: Attribute name for the distance to the first TSS.
    :param add_nb_tss_to_gene: Whether to add the TSS count to gene features.
    :param gene_key: Attribute name for the per-gene TSS count.
    """

    gtf = GTF(inputfile, check_ensembl_format=True)

    message("Getting TSSs.")
    tss = gtf.get_tss(name=["transcript_id"], as_dict=True)

    # Note from the original author: when as_dict_of_dict is true,
    # get_gn_to_tx() returns a dict of dict that maps gene_id to
    # transcript_id and transcript_id to TSS numbering (1 for most 5',
    # then 2...). Transcripts sharing the same TSS share the same number.
    gn_to_tx_to_tss = gtf.get_gn_to_tx(as_dict_of_dict=True)

    message("Numbering TSSs.")

    tss_number_file = make_tmp_file(prefix='tx_to_tss_number',
                                    suffix='.txt')

    # gene_id -> highest TSS number seen for that gene (kept as int so the
    # comparison does not need a str/int round trip at every step).
    gn_how_many_tss = dict()

    for gn_id in gn_to_tx_to_tss:
        for tx_id in gn_to_tx_to_tss[gn_id]:
            tss_num = int(gn_to_tx_to_tss[gn_id][tx_id])
            tss_number_file.write(tx_id + "\t" + str(tss_num) + "\n")
            if tss_num > gn_how_many_tss.get(gn_id, tss_num - 1):
                gn_how_many_tss[gn_id] = tss_num

    tss_number_file.close()

    # The temporary file is handed over by path (as the sibling functions in
    # this file do) rather than through open(): the previous open() handles
    # were never closed.
    gtf = gtf.add_attr_from_file(feat='transcript',
                                 key='transcript_id',
                                 new_key=key_name,
                                 inputfile=tss_number_file.name,
                                 has_header=False)

    if add_nb_tss_to_gene:

        gn_how_many_tss_file = make_tmp_file(prefix='gn_how_many_tss',
                                             suffix='.txt')

        for a_key, a_val in gn_how_many_tss.items():
            gn_how_many_tss_file.write(a_key + "\t" + str(a_val) + "\n")

        gn_how_many_tss_file.close()

        gtf = gtf.add_attr_from_file(feat='gene',
                                     key='gene_id',
                                     new_key=gene_key,
                                     inputfile=gn_how_many_tss_file.name,
                                     has_header=False)

    if compute_dist:

        gn_to_tx_ordered_by_tss = gtf.get_gn_to_tx(ordered_5p=True)

        tss_dist_file = make_tmp_file(prefix='tx_tss_dist_to_first_tss',
                                      suffix='.txt')

        for gn_id in gn_to_tx_to_tss:
            tx_list = gn_to_tx_ordered_by_tss[gn_id]
            # Indexing (instead of pop(0)) leaves the mapping untouched and
            # still raises IndexError on an unexpectedly empty list.
            tx_first = tx_list[0]

            # The first TSS is at distance 0 from the first TSS...
            tss_dist_file.write(tx_first + "\t0\n")

            for tx_id in tx_list[1:]:
                dist_to_first = abs(int(tss[tx_first]) - int(tss[tx_id]))
                tss_dist_file.write(tx_id + "\t" + str(dist_to_first) + "\n")

        tss_dist_file.close()

        gtf = gtf.add_attr_from_file(feat='transcript',
                                     key='transcript_id',
                                     new_key=key_name_dist,
                                     inputfile=tss_dist_file.name,
                                     has_header=False)

    gtf.write(outputfile,
              gc_off=True)

    close_properly(outputfile, inputfile)
def join_attr(inputfile=None,
              outputfile=None,
              join_file=None,
              has_header=False,
              new_key=None,
              target_feature=None,
              key_to_join=None,
              matrix=None):
    """
    Join attributes from a tabulated file onto the features of a GTF.

    Two modes are available:

    * plain mode (``matrix`` false): values from ``join_file`` are added
      under the single attribute ``new_key`` via
      ``GTF.add_attr_from_file``;
    * matrix mode (``matrix`` true): ``join_file`` is handed to
      ``GTF.add_attr_from_matrix_file`` and ``new_key`` must not be given.

    :param inputfile: Input handed to the ``GTF`` constructor.
    :param outputfile: Writable output stream.
    :param join_file: File object whose ``name`` (path) is passed on.
    :param has_header: Forwarded to ``add_attr_from_file`` (plain mode).
    :param new_key: Name of the attribute to create (plain mode only).
    :param target_feature: Comma-separated feature names ("*" is accepted);
        defaults to every feature present in the GTF.
    :param key_to_join: Attribute key used to match GTF lines and file rows.
    :param matrix: Select matrix mode.

    NOTE(review): ``message(..., type="ERROR")`` presumably aborts
    execution; confirm against its implementation.
    """

    # -----------------------------------------------------------
    #  Check argument consistency
    # -----------------------------------------------------------

    if matrix is True:
        if new_key is not None:
            message("--new-key and --matrix are mutually exclusive.",
                    type="ERROR")
    elif new_key is None:
        message("--new-key is required when --matrix is False.",
                type="ERROR")

    # -----------------------------------------------------------
    #  load the GTF
    # -----------------------------------------------------------

    gtf = GTF(inputfile, check_ensembl_format=False)

    # -----------------------------------------------------------
    #  Check target feature
    # -----------------------------------------------------------

    feat_list = gtf.get_feature_list(nr=True)

    if target_feature is None:
        # No restriction requested: target every feature type of the GTF.
        target_feature = ",".join(feat_list)
    else:
        accepted = feat_list + ["*"]
        unknown = [feat for feat in target_feature.split(",")
                   if feat not in accepted]
        for feat in unknown:
            message("Feature " + feat + " not found.",
                    type="ERROR")

    # -----------------------------------------------------------
    #  Do it
    # -----------------------------------------------------------

    if matrix:
        gtf = gtf.add_attr_from_matrix_file(feat=target_feature,
                                            key=key_to_join,
                                            inputfile=join_file.name)
    else:
        gtf = gtf.add_attr_from_file(feat=target_feature,
                                     key=key_to_join,
                                     new_key=new_key,
                                     inputfile=join_file.name,
                                     has_header=has_header)

    gtf.write(outputfile,
              gc_off=True)

    gc.disable()
    close_properly(outputfile, inputfile)