def select_by_max_exon_nb(inputfile=None,
                          outputfile=None):
    """
    Keep, for each gene, the transcript with the highest number of exons.

    :param inputfile: The GTF input handed over to the GTF constructor.
    :param outputfile: The destination the selected lines are written to.
    """

    message("Selecting transcript with the highest number of exon for each gene.")

    # The ensembl format check is explicitly switched off when loading.
    loaded = GTF(inputfile, check_ensembl_format=False)
    selected = loaded.select_by_max_exon_nb()

    selected.write(outputfile, gc_off=True)
def select_most_5p_tx(inputfile=None,
                      outputfile=None,
                      keep_gene_lines=False):
    """
    Select the most 5' transcript of each gene.

    :param inputfile: The GTF input handed over to the GTF constructor.
    :param outputfile: The destination the selected lines are written to.
    :param keep_gene_lines: When false, an additional selection on \
    feature 'gene' is applied with its third argument set to 1 \
    (presumably an inverted match, i.e. gene lines are dropped).
    """

    message("Selecting the most 5' transcript of each gene.")

    selection = GTF(inputfile).select_5p_transcript()

    if not keep_gene_lines:
        selection = selection.select_by_key("feature", "gene", 1)

    selection.write(outputfile, gc_off=True)
def select_by_nb_exon(inputfile=None,
                      outputfile=None,
                      min_exon_number=None,
                      max_exon_number=None):
    """
    Select transcripts whose number of exons lies in a requested range.

    :param inputfile: The GTF input handed over to the GTF constructor.
    :param outputfile: The destination the selected lines are written to.
    :param min_exon_number: Lower bound passed to select_by_number_of_exons().
    :param max_exon_number: Upper bound passed to select_by_number_of_exons().
    """

    template = "Selecting transcript by exon number (range: [{m},{M}])"
    message(template.format(m=str(min_exon_number),
                            M=str(max_exon_number)))

    loaded = GTF(inputfile, check_ensembl_format=False)
    kept = loaded.select_by_number_of_exons(min_exon_number,
                                            max_exon_number)
    kept.write(outputfile, gc_off=True)

    close_properly(outputfile, inputfile)
def exon_sizes(inputfile=None,
               outputfile=None,
               key_name=None):
    """
    Add a new key to transcript features holding a comma-separated list
    of exon sizes.

    Each size is computed as end - start + 1. For transcripts whose strand
    is '-', the list is written in reverse order.

    :param inputfile: The GTF input handed over to the GTF constructor.
    :param outputfile: The destination the annotated GTF is written to.
    :param key_name: The name of the attribute to add.
    """

    gtf = GTF(inputfile)

    tx_ids = gtf.get_tx_ids(nr=True)

    starts_by_tx = gtf.select_by_key("feature",
                                     "exon").extract_data(
        "transcript_id,start",
        as_dict_of_merged_list=True,
        no_na=True,
        nr=False)

    if not len(starts_by_tx):
        message("No exon found.", type="ERROR")

    ends_by_tx = gtf.select_by_key("feature",
                                   "exon").extract_data(
        "transcript_id,end",
        as_dict_of_merged_list=True,
        no_na=True,
        nr=False)

    strand_by_tx = gtf.select_by_key("feature",
                                     "transcript").extract_data(
        "transcript_id,strand",
        as_dict_of_values=True,
        no_na=True,
        nr=True,
        hide_undef=True)

    sizes_by_tx = dict()

    for tx_id in tx_ids:
        sizes = [str(int(end) - int(start) + 1)
                 for start, end in zip(starts_by_tx[tx_id],
                                       ends_by_tx[tx_id])]

        if strand_by_tx[tx_id] == "-":
            sizes.reverse()

        sizes_by_tx[tx_id] = ",".join(sizes)

    # Nothing to annotate when no transcript id was found.
    if len(sizes_by_tx):
        gtf = gtf.add_attr_from_dict(feat="transcript",
                                     key="transcript_id",
                                     a_dict=sizes_by_tx,
                                     new_key=key_name)

    gtf.write(outputfile, gc_off=True)

    close_properly(outputfile, inputfile)
def intron_sizes(
        inputfile=None,
        outputfile=None,
        key_name=None):
    """
    Add a new key to transcript features holding a comma-separated list
    of intron sizes.

    Each size is computed as end - start on the intron records returned by
    get_introns(). Transcripts without any intron get the value "0". For
    transcripts whose strand is '-', the list is written in reverse order.

    :param inputfile: The GTF input handed over to the GTF constructor.
    :param outputfile: The destination the annotated GTF is written to.
    :param key_name: The name of the attribute to add.
    """

    gtf = GTF(inputfile, check_ensembl_format=False)

    tx_ids = gtf.get_tx_ids(nr=True)

    introns = gtf.get_introns(by_transcript=True,
                              name=["transcript_id"],
                              intron_nb_in_name=False,
                              feat_name=False)

    strand_by_tx = gtf.select_by_key("feature",
                                     "transcript").extract_data(
        "transcript_id,strand",
        as_dict_of_values=True,
        no_na=True,
        nr=True,
        hide_undef=True)

    # Every known transcript starts with an empty list of intron sizes.
    sizes_by_tx = dict()
    for tx_id in tx_ids:
        sizes_by_tx[tx_id] = []

    for intron in introns:
        sizes_by_tx[intron.name].append(str(intron.end - intron.start))

    # Replace each list by its textual representation.
    for tx_id, sizes in list(sizes_by_tx.items()):
        if not len(sizes):
            sizes_by_tx[tx_id] = "0"
            continue

        # The strand is only looked up for transcripts having introns.
        if strand_by_tx[tx_id] == "-":
            sizes = reversed(sizes)

        sizes_by_tx[tx_id] = ",".join(sizes)

    if len(sizes_by_tx):
        gtf = gtf.add_attr_from_dict(feat="transcript",
                                     key="transcript_id",
                                     a_dict=sizes_by_tx,
                                     new_key=key_name)

    gtf.write(outputfile, gc_off=True)

    close_properly(outputfile, inputfile)
def short_long(inputfile=None,
               outputfile=None,
               longs=None,
               keep_gene_lines=False):
    """
    Select the shortest transcript of each gene, or the longest one when
    'longs' is set.

    :param inputfile: The GTF input handed over to the GTF constructor.
    :param outputfile: The destination the selected lines are written to.
    :param longs: When true, select_longuest_transcripts() is used instead \
    of select_shortest_transcripts().
    :param keep_gene_lines: When false, an additional selection on \
    feature 'gene' is applied with its third argument set to 1 \
    (presumably an inverted match, i.e. gene lines are dropped).
    """

    loaded = GTF(inputfile, check_ensembl_format=False)

    picked = (loaded.select_longuest_transcripts() if longs
              else loaded.select_shortest_transcripts())

    if not keep_gene_lines:
        picked = picked.select_by_key("feature", "gene", 1)

    picked.write(outputfile, gc_off=True)
def join_multi_file(inputfile=None,
                    outputfile=None,
                    target_feature=None,
                    key_to_join=None,
                    matrix_files=()):
    """
    Join attributes from a set of tabulated (matrix) files.

    :param inputfile: The GTF input handed over to the GTF constructor.
    :param outputfile: The destination the annotated GTF is written to.
    :param target_feature: Comma-separated feature names ('*' is accepted). \
    When None, all the features found in the GTF are targeted.
    :param key_to_join: The key passed to add_attr_from_matrix_file().
    :param matrix_files: File objects; only their 'name' attribute is used.
    """

    # -----------------------------------------------------------
    #  Load the GTF
    # -----------------------------------------------------------

    gtf = GTF(inputfile, check_ensembl_format=False)

    # -----------------------------------------------------------
    #  Validate (or build) the list of target features
    # -----------------------------------------------------------

    known_features = gtf.get_feature_list(nr=True)

    if target_feature is None:
        target_feature = ",".join(known_features)
    else:
        for requested in target_feature.split(","):
            if requested not in known_features + ["*"]:
                message("Feature " + requested + " not found.",
                        type="ERROR")

    # -----------------------------------------------------------
    #  Join each matrix file in turn, then write once
    # -----------------------------------------------------------

    for matrix_file in matrix_files:
        gtf = gtf.add_attr_from_matrix_file(feat=target_feature,
                                            key=key_to_join,
                                            inputfile=matrix_file.name)

    gtf.write(outputfile, gc_off=True)

    gc.disable()
    close_properly(outputfile, inputfile)
def nb_exons(inputfile=None,
             outputfile=None,
             key_name=None,
             text_format=False):
    """
    Count the number of exons of each transcript of the GTF file.

    :param inputfile: The GTF input handed over to the GTF constructor.
    :param outputfile: The destination of the result.
    :param key_name: The name of the attribute added to transcript lines \
    (GTF output only).
    :param text_format: When true, write one tabulated line per transcript \
    (id, count, the literal 'transcript') instead of a GTF.
    """

    gtf = GTF(inputfile)

    # -------------------------------------------------------------------------
    #  Count exon lines per transcript_id
    # -------------------------------------------------------------------------

    message("Computing number of exons for each transcript in input GTF file.")

    exon_count = defaultdict(int)
    exon_lines = gtf.select_by_key("feature", "exon")

    for record in exon_lines.extract_data("transcript_id"):
        exon_count[record[0]] += 1

    # -------------------------------------------------------------------------
    #  Output
    # -------------------------------------------------------------------------

    if text_format:
        for tx_id, count in exon_count.items():
            outputfile.write(tx_id + "\t" + str(count) + "\ttranscript\n")
    else:
        if len(exon_count):
            gtf = gtf.add_attr_from_dict(feat="transcript",
                                         key="transcript_id",
                                         a_dict=exon_count,
                                         new_key=key_name)
        gtf.write(outputfile, gc_off=True)

    close_properly(outputfile, inputfile)
def overlapping(
        inputfile=None,
        outputfile=None,
        key_name=None,
        upstream=1500,
        downstream=1500,
        chrom_info=None,
        feature_type='transcript',
        same_strandedness=False,
        diff_strandedness=False,
        annotate_gtf=False,
        bool=False,
        annotate_all=False,
        invert_match=False):
    """
    Find transcripts whose body/TSS/TTS do or do not overlap with any
    transcript from another gene.

    :param inputfile: The GTF input handed over to the GTF constructor.
    :param outputfile: The destination of the result.
    :param key_name: Name of the attribute added with annotate_gtf. When \
    None it is built from feature_type, upstream and downstream.
    :param upstream: Extension passed as 'l' to slop().
    :param downstream: Extension passed as 'r' to slop().
    :param chrom_info: File object; its 'name' is passed as 'g' to slop().
    :param feature_type: One of 'transcript', 'promoter' or 'tts'.
    :param same_strandedness: Passed as 's' to intersect().
    :param diff_strandedness: Passed as 'S' to intersect().
    :param annotate_gtf: Add the overlapping transcripts as an attribute \
    instead of selecting lines.
    :param bool: Replace the list of overlapping transcripts by "0"/"1".
    :param annotate_all: Start with an (empty) entry for every transcript.
    :param invert_match: Passed to select_by_key() for the final selection.
    """

    # ----------------------------------------------------------------------
    # Argument checks and default key name
    # ----------------------------------------------------------------------

    if annotate_gtf:
        if key_name is None:
            key_name = "_".join(["overlap",
                                 feature_type,
                                 "u" + str(upstream / 1000) + "k",
                                 "d" + str(downstream / 1000) + "k"])
        if invert_match:
            message("--annotate-gtf and --invert-match are "
                    "mutually exclusive.",
                    type="ERROR")

    if same_strandedness and diff_strandedness:
        message("--same-strandedness and --diff-strandedness are "
                "mutually exclusive.",
                type="ERROR")

    message("Using -u " + str(upstream))
    message("Using -d " + str(downstream))

    overlapping_tx = defaultdict(list)

    # Load the GTF right away so that it is not lost
    # when the GTF stream comes from stdin.
    gtf = GTF(inputfile)

    message("Getting transcript in bed format")

    tx_feat = gtf.select_by_key("feature", "transcript")

    if annotate_all:
        overlapping_tx = gtf.extract_data(keys=["transcript_id"],
                                          as_dict=True,
                                          default_val="0")
        for known_tx in overlapping_tx:
            overlapping_tx[known_tx] = []

    # ----------------------------------------------------------------------
    # Build the (extended) regions to be tested
    # ----------------------------------------------------------------------

    tx_bed = tx_feat.to_bed(name=["transcript_id", "gene_id"], sep="||")

    message("Getting " + feature_type + " and 'slopping'.")

    if feature_type == "transcript":
        anchors = tx_bed
    elif feature_type == "promoter":
        anchors = tx_feat.get_tss(name=["transcript_id", "gene_id"],
                                  sep="||")
    elif feature_type == "tts":
        anchors = tx_feat.get_tts(name=["transcript_id", "gene_id"],
                                  sep="||")
    else:
        message("Not implemented yet", type="ERROR")

    bed_obj = anchors.slop(s=True,
                           l=upstream,
                           r=downstream,
                           g=chrom_info.name).cut([0, 1, 2, 3, 4, 5])

    tmp_file = make_tmp_file(feature_type + "_slopped_region", ".bed")
    bed_obj.saveas(tmp_file.name)

    # ----------------------------------------------------------------------
    # Intersect with transcripts and keep hits from another gene
    # ----------------------------------------------------------------------

    overlap_regions = bed_obj.intersect(tx_bed,
                                        wb=True,
                                        s=same_strandedness,
                                        S=diff_strandedness)

    tmp_file = make_tmp_file(feature_type + "_overlapping_regions", ".bed")
    overlap_regions.saveas(tmp_file.name)

    for hit in overlap_regions:
        tx_other, gn_other = hit.fields[9].split("||")
        tx_id, gene_id = hit.fields[3].split("||")
        if gene_id != gn_other:
            overlapping_tx[tx_id] += [tx_other]

    if bool:
        for tx_id in overlapping_tx:
            overlapping_tx[tx_id] = "1" if len(overlapping_tx[tx_id]) else "0"

    # ----------------------------------------------------------------------
    # Output
    # ----------------------------------------------------------------------

    if invert_match:
        values = ",".join(set(overlapping_tx))
        gtf.select_by_key("transcript_id",
                          values,
                          invert_match).write(outputfile, gc_off=True)
    elif annotate_gtf:
        if len(overlapping_tx):
            gtf = gtf.add_attr_from_dict(feat="transcript",
                                         key="transcript_id",
                                         a_dict=overlapping_tx,
                                         new_key=key_name)
        gtf.write(outputfile, gc_off=True)
    else:
        value = ",".join(set(overlapping_tx))
        gtf.select_by_key("transcript_id",
                          value).write(outputfile, gc_off=True)

    gc.disable()
    close_properly(outputfile, inputfile)
def feature_size(inputfile=None,
                 outputfile=None,
                 ft_type="transcript",
                 names="transcript_id",
                 key_name='feature_size',
                 separator="|",
                 bed=False):
    """
    Get the size of the features enclosed in the GTF.

    With 'bed', one bed line is written per selected feature with the size
    stored as the score. Otherwise a GTF is written with the size stored
    under the key 'key_name'. The pseudo-feature 'mature_rna' uses the
    sizes returned by get_transcript_size().

    :param inputfile: The GTF input handed over to the GTF constructor.
    :param outputfile: The destination of the result.
    :param ft_type: A feature found in the GTF, 'mature_rna' or '*'.
    :param names: Comma-separated keys used to build the bed name.
    :param key_name: The name of the attribute added to the GTF.
    :param separator: The separator used inside the bed name.
    :param bed: Write bed lines instead of a GTF.
    """

    message("Computing feature sizes.")

    gtf = GTF(inputfile)

    accepted = gtf.get_feature_list(nr=True) + ['mature_rna']

    if ft_type not in accepted + ["*"]:
        message("Unable to find requested feature.", type="ERROR")

    name_keys = names.split(",")

    if ft_type == 'mature_rna':

        tx_size = gtf.get_transcript_size()

        if bed:
            tx_bed = gtf.select_by_key("feature",
                                       'transcript').to_bed(
                ['transcript_id'] + name_keys,
                add_feature_type=False,
                sep=separator,
                more_name=['mature_rna'])

            for bed_line in tx_bed:
                # The transcript id was prepended to the name only to
                # retrieve the size; it is removed from the output name.
                tokens = bed_line.name.split(separator)
                tx_id = tokens.pop(0)
                bed_line.score = tx_size[tx_id]
                bed_line.name = separator.join(tokens)
                write_properly(chomp(str(bed_line)), outputfile)
        else:
            if len(tx_size):
                gtf = gtf.add_attr_from_dict(feat="transcript",
                                             key="transcript_id",
                                             a_dict=tx_size,
                                             new_key=key_name)
            gtf.write(outputfile, gc_off=True)

    elif bed:

        feat_bed = gtf.select_by_key("feature",
                                     ft_type).to_bed(name=name_keys,
                                                     sep=separator,
                                                     add_feature_type=True)
        for bed_line in feat_bed:
            bed_line.score = str(bed_line.end - bed_line.start)
            write_properly(chomp(str(bed_line)), outputfile)

    else:

        # One value per GTF line is written to a temporary file:
        # the size for requested features, "?" for the others.
        tmp_file = make_tmp_file(prefix="feature_size", suffix=".txt")

        records = gtf.extract_data("feature,start,end",
                                   as_list_of_list=True,
                                   no_na=False,
                                   hide_undef=False)

        for feature, start, end in ((r[0], r[1], r[2]) for r in records):
            if ft_type == "*" or feature == ft_type:
                tmp_file.write(str(int(end) - int(start) + 1) + "\n")
            else:
                tmp_file.write("?\n")

        tmp_file.close()

        gtf.add_attr_column(tmp_file, key_name).write(outputfile,
                                                      gc_off=True)

    close_properly(outputfile, inputfile)
def retrieve(species_name='homo_sapiens',
             outputfile=None,
             release=None,
             to_stdout=False,
             list_only=False,
             delete=False,
             hide_species_name=None,
             ensembl_collection='vertebrate'):
    """Retrieve a GTF file from ensembl.

    :param species_name: The species directory to look for (a lower case \
    version is tried when the name is not found as is).
    :param outputfile: Optional file object; the downloaded file is renamed \
    to outputfile.name. When None the current directory is used.
    :param release: The release number. When None, the highest \
    'release-<number>' directory found on the server is used.
    :param to_stdout: Also write the downloaded GTF to "-".
    :param list_only: Only print the available GTF urls, then exit.
    :param delete: Remove the downloaded file once processed.
    :param hide_species_name: With list_only, print the url only.
    :param ensembl_collection: 'vertebrate', 'protists', 'fungi', 'plants' \
    or 'metazoa'.

    NOTE: message(..., type="ERROR") is presumably expected to abort, as the
    code following each error relies on it.

    :Example:

    >>> # retrieve("Xenopus_tropicalis")

    """

    genomes_collections = ['protists', 'fungi', 'plants', 'metazoa']

    if outputfile is None:
        outputdir = os.getcwd()
    else:
        outputdir = os.path.dirname(os.path.abspath(outputfile.name))

    if species_name is None and not list_only:
        message("Choose --species-name or --list-only.", type='ERROR')

    if outputfile is not None:
        if not os.path.exists(outputdir):
            message("Output directory does not exists. Exiting.",
                    type='ERROR')
        elif os.path.isdir(outputfile.name):
            message("Output file is a directory !.", type='ERROR')

    # Will contain the name of the requested gtf.
    target_gtf = None

    # -------------------------------------------------------------------------
    # Check ensembl repository
    # -------------------------------------------------------------------------

    if ensembl_collection == 'vertebrate':
        host = "ftp.ensembl.org"
        user = "******"  # your login
        password = "******"
    elif ensembl_collection in genomes_collections:
        host = "ftp.ensemblgenomes.org"
        user = "******"
        password = "******"
    else:
        # Without this branch 'host' would be undefined below.
        message("Unknown ensembl collection: %s." % str(ensembl_collection),
                type="ERROR")

    try:
        message("Trying to connect")
        ftp = ftputil.FTPHost(host, user, password)
        if pygtftk.utils.VERBOSITY:
            message("Connected to ensembl FTP website.")
    except FTPOSError as err:
        message(str(err))
        message("Unable to connect (FTPOSError).", type="ERROR")

    message("Connection successful.")

    try:
        ftp.chdir('/pub')
        message("Successfully change directory to pub")
    except Exception:
        message("Unable to change directory to 'pub'.", type="ERROR")

    if ensembl_collection in genomes_collections:
        try:
            ftp.chdir(ensembl_collection)
            message("Successfully change directory to " + ensembl_collection)
        except Exception:
            message("Unable to change directory to '%s'." % ensembl_collection,
                    type="ERROR")

    try:
        all_releases = ftp.listdir(ftp.curdir)
    except Exception as e:
        print(str(e))
        message("Unable to list directory.", type="ERROR")

    if release is not None:
        # str() so that an integer release number is accepted too.
        release_dir = "release-" + str(release)
        if release_dir not in all_releases:
            message("This release number could not be found. Aborting",
                    type="ERROR")
    else:
        release_regexp = re.compile(r"release-(\d+)")
        version_list = []
        for ver in all_releases:
            hit = release_regexp.search(ver)
            if hit:
                version_list += [int(hit.group(1))]

        if not version_list:
            message("Unable to find any release directory.", type="ERROR")

        release = max(version_list)
        release_dir = "release-" + str(release)
        message("Latest version is %d." % release)

    try:
        ftp.chdir(release_dir)
        message("Changed release directory: %s" % release_dir, type="DEBUG")
    except Exception:
        message("Unable to change directory to '%s'." % release_dir,
                type="ERROR")

    ftp.chdir('gtf')

    try:
        all_species = ftp.listdir(ftp.curdir)
        all_species = [x for x in all_species if ftp.path.isdir(x)]
    except Exception:
        message("Unable to list directory.", type="ERROR")

    if list_only:

        base_url = 'ftp://' + host + ftp.getcwd() + '/'

        species_list = []
        url_list = []

        for sp in all_species:
            for gtf_name in ftp.listdir(sp):
                if gtf_name.endswith('.gtf.gz'):
                    species_list += [sp]
                    url_list += [base_url + sp + "/" + gtf_name]

        for sp, url in zip(species_list, url_list):
            if hide_species_name:
                print(url)
            else:
                print(sp.ljust(50) + url)

        sys.exit()

    else:

        if species_name not in all_species:
            message("Species could not be found for release: %s" % str(release))
            message("Trying species name in lower case.")
            species_name = species_name.lower()
            if species_name not in all_species:
                message("Species could not be found for release: %s" % str(release),
                        type="ERROR")

        ftp.chdir(species_name)
        gtf_list = ftp.listdir(ftp.curdir)

        # choice 1 (only regular chromosome)
        gtf_sub = [x for x in gtf_list if x.endswith("chr.gtf.gz")]

        # choice 2: any gtf that is neither choice 1 nor 'abinitio'.
        gtf_sub_2 = [x for x in gtf_list if "abinitio.gtf.gz" not in x]
        gtf_sub_2 = [x for x in gtf_sub_2 if x.endswith(".gtf.gz")]
        if gtf_sub:
            gtf_sub_2.remove(gtf_sub[0])

        # choice 3: abinitio
        gtf_sub_3 = [x for x in gtf_list if x.endswith("abinitio.gtf.gz")]

        # The three lists above cover every '*.gtf.gz' file. When all of
        # them are empty there is nothing to download: target_gtf stays
        # None and the error below is reported.
        candidates = gtf_sub or gtf_sub_2 or gtf_sub_3
        if candidates:
            target_gtf = candidates[0]

    # -------------------------------------------------------------------------
    # Download if requested
    # -------------------------------------------------------------------------

    if target_gtf is not None:
        if not list_only:
            local_path = os.path.join(outputdir, target_gtf)

            message("Downloading GTF file : " + target_gtf)

            ftp.download(target_gtf, target_gtf)

            os.rename(target_gtf, local_path)

            if to_stdout:
                gtf = GTF(local_path, check_ensembl_format=False)
                gtf.write("-", gc_off=True)

            if delete:
                os.remove(local_path)
            elif outputfile is not None:
                message("Renaming.")
                os.rename(local_path, outputfile.name)
    else:
        # 'release' is an int when it was deduced from the server listing.
        message("Species could not be found for release: " + str(release),
                type='ERROR')

    gc.disable()
def select_by_intron_size(inputfile=None,
                          outputfile=None,
                          intron_size=0,
                          merged=False,
                          invert_match=False,
                          delete_monoexonic=False,
                          add_intron_size=False):
    """
    Select transcripts according to the size of their introns.

    Without 'merged', a transcript is discarded as soon as one of its
    introns is shorter than 'intron_size'. With 'merged', the test is
    applied to the sum of its intron sizes. 'invert_match' discards on
    'greater or equal' instead. Transcripts without intron are only
    discarded when 'delete_monoexonic' is set.

    :param inputfile: The GTF input handed over to the GTF constructor.
    :param outputfile: The destination the selected lines are written to.
    :param intron_size: The size threshold.
    :param merged: Test the sum of intron sizes of each transcript.
    :param invert_match: Invert the size test.
    :param delete_monoexonic: Discard transcripts without intron.
    :param add_intron_size: Add 'intron_size_sum' (merged) or 'intron_size' \
    as a new key to transcript lines.
    """

    message("Searching for intronic regions.")

    gtf = GTF(inputfile, check_ensembl_format=False)

    introns_bo = gtf.get_introns(by_transcript=True,
                                 name=["transcript_id"],
                                 intron_nb_in_name=False).sort()

    all_tx_ids = gtf.get_tx_ids(nr=True)

    def is_rejected(a_size):
        # The size test leading to the deletion of a transcript.
        if invert_match:
            return a_size >= intron_size
        return a_size < intron_size

    # The transcripts to be deleted (used as an ordered set).
    to_delete = OrderedDict()

    if merged:

        # Sum of intron sizes for each transcript.
        intron_sum_dict = OrderedDict.fromkeys(all_tx_ids, 0)

        for intron in introns_bo:
            intron_sum_dict[intron.name] += intron.end - intron.start

        for tx_id, sum_intron in list(intron_sum_dict.items()):
            if sum_intron == 0:
                if delete_monoexonic:
                    to_delete[tx_id] = 1
            elif is_rejected(sum_intron):
                to_delete[tx_id] = 1

        if add_intron_size:
            gtf = gtf.add_attr_from_dict(feat="transcript",
                                         key="transcript_id",
                                         a_dict=intron_sum_dict,
                                         new_key="intron_size_sum")

    else:

        # List of intron sizes for each transcript.
        intron_size_dict = defaultdict(list)

        for tx_id in all_tx_ids:
            intron_size_dict[tx_id] = []

        for intron in introns_bo:
            intron_size_dict[intron.name].append(intron.end - intron.start)

        for tx_id, list_size in list(intron_size_dict.items()):

            if not list_size:
                # No intron: recorded as a single size of 0 and never
                # submitted to the size test.
                intron_size_dict[tx_id] = [0]
                if delete_monoexonic:
                    to_delete[tx_id] = 1
                continue

            if any(is_rejected(size) for size in list_size):
                to_delete[tx_id] = 1

        if add_intron_size:
            for tx_id, list_size in list(intron_size_dict.items()):
                intron_size_dict[tx_id] = ",".join(str(x) for x in list_size)

            gtf = gtf.add_attr_from_dict(feat="transcript",
                                         key="transcript_id",
                                         a_dict=intron_size_dict,
                                         new_key="intron_size")

    kept_tx_ids = [x for x in gtf.get_tx_ids(nr=True) if x not in to_delete]

    # Only the first 40 characters of the deleted ids are displayed.
    message("Deleting: " + ",".join(to_delete)[0:40] + "...")

    gtf = gtf.select_by_key("transcript_id", ",".join(kept_tx_ids))

    gtf.write(outputfile, gc_off=True)

    close_properly(outputfile, inputfile)
def select_by_key(inputfile=None,
                  outputfile=None,
                  key=None,
                  value=None,
                  invert_match=False,
                  file_with_values=None,
                  col=0,
                  select_transcripts=False,
                  select_genes=False,
                  select_exons=False,
                  select_cds=False,
                  select_start_codon=False,
                  bed_format=False,
                  log=False,
                  separator="|",
                  names="transcript_id"):
    """Select lines from a GTF file based on attributes and associated values.

    :param inputfile: The GTF input handed over to the GTF constructor.
    :param outputfile: The destination of the result.
    :param key: The key used for the selection.
    :param value: Comma-separated values to look for.
    :param invert_match: Passed to GTF.select_by_key().
    :param file_with_values: A tabulated file object providing the values \
    (mutually exclusive with 'value').
    :param col: The column of 'file_with_values' holding the values; \
    col - 1 is used as the index.
    :param select_transcripts: Shortcut for key 'feature', value 'transcript'.
    :param select_genes: Shortcut for key 'feature', value 'gene'.
    :param select_exons: Shortcut for key 'feature', value 'exon'.
    :param select_cds: Shortcut for key 'feature', value 'CDS'.
    :param select_start_codon: Shortcut for key 'feature', value 'start_codon'.
    :param bed_format: Write bed lines instead of a GTF.
    :param log: Report the fraction of selected lines and unknown values.
    :param separator: The separator used inside the bed name.
    :param names: Comma-separated keys used to build the bed name.
    """

    # ----------------------------------------------------------------------
    # Check mode
    # ----------------------------------------------------------------------

    if select_transcripts:
        key = "feature"
        value = "transcript"
    elif select_cds:
        key = "feature"
        value = "CDS"
    elif select_start_codon:
        key = "feature"
        value = "start_codon"
    elif select_genes:
        key = "feature"
        value = "gene"
    elif select_exons:
        key = "feature"
        value = "exon"
    elif file_with_values is None:
        if key is None or value is None:
            message(
                "Key and value are mandatory. Alternatively use -e/t/g/f or -f with -k.",
                type="ERROR")
    else:
        if key is None:
            message("Please set -k.", type="ERROR")
        if value is not None:
            message("The -f and -v arguments are mutually exclusive.",
                    type="ERROR")

    # ----------------------------------------------------------------------
    # Load the GTF and apply the selection
    # ----------------------------------------------------------------------

    gtf = GTF(inputfile, check_ensembl_format=False)

    all_values = gtf.extract_data(key, as_list=True, no_na=True, nr=True)

    if log:
        feat_before = len(gtf)

    if not file_with_values:
        value_list = value.split(",")
        gtf = gtf.select_by_key(key, value, invert_match)
    else:
        value_list = []

        for line in file_with_values:
            # The end of line is removed so that a value taken from the
            # last column can be compared to the values of the GTF.
            cols = line.rstrip("\r\n").split("\t")
            value_list += [cols[col - 1]]

        # The file has been consumed above: it is re-opened so that
        # GTF.select_by_key() can read it from the start.
        file_with_values.close()
        file_with_values = open(file_with_values.name)

        gtf = gtf.select_by_key(key=key,
                                invert_match=invert_match,
                                file_with_values=file_with_values,
                                col=col)

    if log:
        not_found = list(set(value_list) - set(all_values))
        feat_after = len(gtf)

        # An empty input would otherwise raise a ZeroDivisionError.
        pct = feat_after / feat_before * 100 if feat_before else 0.0

        message("Number of features before selection: %d" % feat_before)
        message("Fraction of feature selected: %.2f%%" % pct)

        if len(not_found):
            # Only the first 50 characters are displayed.
            nfj = ",".join(not_found)
            etc = "..." if len(nfj) > 50 else ""
            message("Values not found: [" + nfj[:50] + etc + "].")
        else:
            message("Values not found: [].")

    # ----------------------------------------------------------------------
    # Write GTF file
    # ----------------------------------------------------------------------

    if not bed_format:
        gtf.write(outputfile, gc_off=True)
    else:
        nb_tokens = len(names.split(","))
        keys = "seqid,start,end," + names + ",score,strand"
        nb_fields = len(keys.split(","))

        for i in gtf.extract_data_iter_list(keys, zero_based=True):
            outputfile.write("\t".join([i[0],
                                        i[1],
                                        i[2],
                                        separator.join(i[3:(3 + nb_tokens)]),
                                        i[nb_fields - 2],
                                        i[nb_fields - 1],
                                        ]) + "\n")

    close_properly(outputfile, inputfile)
def closest_genes(
        inputfile=None,
        outputfile=None,
        from_region_type=None,
        no_header=False,
        nb_neighbors=1,
        to_region_type=None,
        same_strandedness=False,
        diff_strandedness=False,
        text_format=False,
        identifier="gene_id",
        collapse=False):
    """
    Find the n closest genes for each gene.

    :param inputfile: The GTF input handed over to the GTF constructor.
    :param outputfile: The destination of the result.
    :param from_region_type: Source region: 'tss', 'tts' or 'gene'.
    :param no_header: Do not write the header line (text output).
    :param nb_neighbors: Passed as 'k' to closest().
    :param to_region_type: Destination region: 'tss', 'tts' or 'gene'.
    :param same_strandedness: Passed as 's' to closest().
    :param diff_strandedness: Passed as 'S' to closest().
    :param text_format: Write a tabulated file instead of a GTF.
    :param identifier: The key used to name the regions.
    :param collapse: Text output with one line per (gene, neighbor) pair.

    NOTE(review): the text output iterates over gene_id values whereas the
    results are keyed by 'identifier' -- confirm this is intended when
    identifier is not 'gene_id'.
    """

    if same_strandedness and diff_strandedness:
        message("--same-strandedness and --diff-strandedness are "
                "mutually exclusive.",
                type="ERROR")

    # ----------------------------------------------------------------------
    # Load the GTF and check the gene lines
    # ----------------------------------------------------------------------

    gtf = GTF(inputfile)
    gn_gtf = gtf.select_by_key("feature", "gene")
    gn_ids = gn_gtf.get_gn_ids(nr=True)

    if len(gn_gtf) == 0:
        message("No gene feature found. Please use convert_ensembl.",
                type="ERROR")

    if nb_neighbors >= (len(gn_gtf) - 1):
        message("Two much neighbors",
                type="ERROR")

    all_ids = gn_gtf.extract_data(identifier, as_list=True, no_na=False)

    if "." in all_ids:
        message("Some identifiers are undefined ('.').",
                type="ERROR")

    if len(all_ids) == 0:
        message("The identifier was not found.",
                type="ERROR")

    # ----------------------------------------------------------------------
    # Requested regions for source ('from') and destination ('to') genes
    # ----------------------------------------------------------------------

    def regions_of(region_type):
        # Six-column, sorted bed regions for the requested region type.
        if region_type == 'tss':
            regions = gn_gtf.get_5p_end(feat_type="gene",
                                        name=[identifier])
        elif region_type == 'tts':
            regions = gn_gtf.get_3p_end(feat_type="gene",
                                        name=[identifier])
        elif region_type == 'gene':
            regions = gn_gtf.to_bed(name=[identifier])
        else:
            message("Unknown type.", type="ERROR")

        return regions.cut([0, 1, 2, 3, 4, 5]).sort()

    from_regions = regions_of(from_region_type)
    to_regions = regions_of(to_region_type)

    # ----------------------------------------------------------------------
    # Search closest genes
    # ----------------------------------------------------------------------

    gene_closest = defaultdict(list)
    gene_closest_dist = defaultdict(list)

    closest_bo = from_regions.closest(b=to_regions,
                                      k=nb_neighbors,
                                      N=True,
                                      s=same_strandedness,
                                      S=diff_strandedness,
                                      d=True)

    for hit in closest_bo:
        # Field 3: name of the source region, field 9: name of the
        # neighbor, field 12: last column (requested with d=True).
        gene_closest[hit[3]].append(hit[9])
        gene_closest_dist[hit[3]].append(hit[12])

    # ----------------------------------------------------------------------
    # Output
    # ----------------------------------------------------------------------

    if text_format:
        if not no_header:
            outputfile.write("genes\tclosest_genes\tdistances\n")

        for gene in gn_ids:
            if collapse:
                for closest, dist in zip(gene_closest[gene],
                                         gene_closest_dist[gene]):
                    outputfile.write("\t".join([gene, closest, dist]) + "\n")
            else:
                outputfile.write("\t".join([gene,
                                            ",".join(gene_closest[gene]),
                                            ",".join(gene_closest_dist[gene])]) + "\n")
    else:
        if len(gene_closest):
            gtf = gtf.add_attr_from_dict(feat="gene",
                                         key=identifier,
                                         a_dict=gene_closest,
                                         new_key="closest_gn")

            gtf = gtf.add_attr_from_dict(feat="gene",
                                         key=identifier,
                                         a_dict=gene_closest_dist,
                                         new_key="closest_dist")

        gtf.write(outputfile, gc_off=True)

    gc.disable()
    close_properly(outputfile, inputfile)
def convergent(inputfile=None,
               outputfile=None,
               upstream=1500,
               downstream=1500,
               chrom_info=None):
    """
    Find transcripts with convergent tts.

    The tts of each transcript is extended (slop) and intersected with the
    tts located on the other strand. For each transcript, the closest tts
    belonging to another gene is kept and written under the keys
    'convergent' and 'dist_to_convergent'.

    :param inputfile: The GTF input handed over to the GTF constructor.
    :param outputfile: The destination the annotated GTF is written to.
    :param upstream: Extension passed as 'l' to slop().
    :param downstream: Extension passed as 'r' to slop().
    :param chrom_info: File object; its 'name' is passed as 'g' to slop().
    """

    message("Using -u " + str(upstream) + ".")
    message("Using -d " + str(downstream) + ".")

    message("Loading GTF.")
    gtf = GTF(inputfile)

    message("Getting transcript coordinates.")
    tx_feat = gtf.select_by_key("feature", "transcript")

    message("Getting tts coordinates.")
    tts_bo = tx_feat.get_tts(name=["transcript_id", "gene_id"],
                             sep="||")

    # Position (bed start) of the tts of each transcript.
    tts_pos = dict()
    for tts in tts_bo:
        tx_id, _gn_id = tts.name.split("||")
        tts_pos[tx_id] = int(tts.start)

    message("Getting tts coordinates.")
    tts_region_bo = tts_bo.slop(s=True,
                                l=upstream,
                                r=downstream,
                                g=chrom_info.name).cut([0, 1, 2, 3, 4, 5])

    message("Intersecting...")
    tts_intersect_bo = tts_region_bo.intersect(tts_bo,
                                               wb=True,
                                               s=False,
                                               S=True)

    tmp_file = make_tmp_file("tts_slop", ".bed")
    tts_region_bo.saveas(tmp_file.name)
    tmp_file = make_tmp_file("tts_slop_intersection_with_tts_as_", ".bed")
    tts_intersect_bo.saveas(tmp_file.name)

    tx_to_convergent_nm = dict()
    dist_to_convergent = dict()

    for hit in tts_intersect_bo:

        tx_id_main, gene_id_main = hit.fields[3].split("||")
        tx_id_ov, gn_id_ov = hit.fields[9].split("||")

        # Only tts from another gene are considered.
        if gene_id_main == gn_id_ov:
            continue

        dist = abs(tts_pos[tx_id_main] - tts_pos[tx_id_ov])

        # Keep the first hit, then any strictly closer one.
        if (tx_id_main not in tx_to_convergent_nm
                or dist < dist_to_convergent[tx_id_main]):
            dist_to_convergent[tx_id_main] = dist
            tx_to_convergent_nm[tx_id_main] = tx_id_ov

    if len(tx_to_convergent_nm):
        gtf = gtf.add_attr_from_dict(feat="transcript",
                                     key="transcript_id",
                                     a_dict=tx_to_convergent_nm,
                                     new_key="convergent")
        gtf = gtf.add_attr_from_dict(feat="transcript",
                                     key="transcript_id",
                                     a_dict=dist_to_convergent,
                                     new_key="dist_to_convergent")

    gtf.write(outputfile, gc_off=True)

    close_properly(outputfile, inputfile)
def tss_numbering(inputfile=None,
                  outputfile=None,
                  compute_dist=False,
                  key_name='tss_number',
                  key_name_dist='dist_to_first_tss',
                  add_nb_tss_to_gene=False,
                  gene_key='nb_tss'):
    """
    Add the TSS number of each transcript and, optionally, the number of
    TSSs of each gene and the distance of each TSS to the first one.

    :param inputfile: The GTF input handed over to the GTF constructor.
    :param outputfile: The destination the annotated GTF is written to.
    :param compute_dist: Add the distance to the first TSS of the gene.
    :param key_name: The key receiving the TSS number (transcript lines).
    :param key_name_dist: The key receiving the distance (transcript lines).
    :param add_nb_tss_to_gene: Add the highest TSS number to gene lines.
    :param gene_key: The key receiving this number (gene lines).
    """

    gtf = GTF(inputfile, check_ensembl_format=True)

    message("Getting TSSs.")
    tss = gtf.get_tss(name=["transcript_id"], as_dict=True)
    tx_to_gn = gtf.get_tx_to_gn()

    # Gene -> transcript -> TSS position.
    # NOTE: this mapping is not read again in this function.
    gn_tss_dist = defaultdict(dict)
    for tx_id in tss:
        gn_tss_dist[tx_to_gn[tx_id]][tx_id] = int(tss[tx_id])

    # With as_dict_of_dict, get_gn_to_tx() returns a dict of dict
    # that maps gene_id to transcript_id and transcript_id to TSS
    # numbering (1 for most 5', then 2...). For transcripts having
    # the same TSSs, the tss number will be the same.
    gn_to_tx_to_tss = gtf.get_gn_to_tx(as_dict_of_dict=True)

    message("Numbering TSSs.")

    tss_number_file = make_tmp_file(prefix='tx_to_tss_number', suffix='.txt')

    # Highest TSS number of each gene (kept as a string).
    gn_how_many_tss = dict()

    for gn_id, tx_to_number in gn_to_tx_to_tss.items():
        for tx_id in tx_to_number:
            tss_num = str(tx_to_number[tx_id])
            tss_number_file.write(tx_id + "\t" + tss_num + "\n")

            if (gn_id not in gn_how_many_tss
                    or int(tss_num) > int(gn_how_many_tss[gn_id])):
                gn_how_many_tss[gn_id] = tss_num

    tss_number_file.close()

    gtf = gtf.add_attr_from_file(feat='transcript',
                                 key='transcript_id',
                                 new_key=key_name,
                                 inputfile=open(tss_number_file.name),
                                 has_header=False)

    if add_nb_tss_to_gene:

        gn_how_many_tss_file = make_tmp_file(prefix='gn_how_many_tss',
                                             suffix='.txt')

        for gn_id, nb_tss in gn_how_many_tss.items():
            gn_how_many_tss_file.write(gn_id + "\t" + nb_tss + "\n")

        gn_how_many_tss_file.close()

        gtf = gtf.add_attr_from_file(feat='gene',
                                     key='gene_id',
                                     new_key=gene_key,
                                     inputfile=open(gn_how_many_tss_file.name),
                                     has_header=False)

    if compute_dist:

        gn_to_tx_ordered_by_tss = gtf.get_gn_to_tx(ordered_5p=True)

        tss_dist_file = make_tmp_file(prefix='tx_tss_dist_to_first_tss',
                                      suffix='.txt')

        for gn_id in gn_to_tx_to_tss:
            tx_list = gn_to_tx_ordered_by_tss[gn_id]
            tx_first = tx_list[0]

            # The first tss is at distance 0 of itself.
            tss_dist_file.write(tx_first + "\t0\n")

            for tx_id in tx_list[1:]:
                dist_to_first = abs(int(tss[tx_first]) - int(tss[tx_id]))
                tss_dist_file.write(tx_id + "\t" + str(dist_to_first) + "\n")

        tss_dist_file.close()

        gtf = gtf.add_attr_from_file(feat='transcript',
                                     key='transcript_id',
                                     new_key=key_name_dist,
                                     inputfile=open(tss_dist_file.name),
                                     has_header=False)

    gtf.write(outputfile, gc_off=True)

    close_properly(outputfile, inputfile)
def divergent(
        inputfile=None,
        outputfile=None,
        key_name=None,
        upstream=1500,
        downstream=1500,
        chrom_info=None,
        no_strandness=False,
        no_annotation=False):
    """
    Find transcripts with divergent promoters.

    The tss of each transcript is extended (slop) into a promoter region
    which is intersected with the tss. For each transcript, the closest
    tss belonging to another gene is kept.

    :param inputfile: The GTF input handed over to the GTF constructor.
    :param outputfile: The destination of the result.
    :param key_name: The key receiving the divergent transcript \
    ('divergent' when None). The distance goes to 'dist_' + key_name \
    ('dist_to_divergent' when key_name is None).
    :param upstream: Extension passed as 'l' to slop().
    :param downstream: Extension passed as 'r' to slop().
    :param chrom_info: File object; its 'name' is passed as 'g' to slop().
    :param no_strandness: When true, the intersection is not restricted to \
    the opposite strand (S=False).
    :param no_annotation: Select the lines of the transcripts found \
    instead of annotating the GTF.
    """

    message("Using -u " + str(upstream) + ".")
    message("Using -d " + str(downstream) + ".")

    message("Loading GTF.")
    gtf = GTF(inputfile)

    message("Getting transcript coordinates.")
    tx_feat = gtf.select_by_key("feature", "transcript")

    message("Getting tss coordinates.")
    tss_bo = tx_feat.get_tss(name=["transcript_id", "gene_id"],
                             sep="||")

    # Position (bed start) of the tss of each transcript.
    tss_pos = dict()
    for tss in tss_bo:
        tx_id, _gn_id = tss.name.split("||")
        tss_pos[tx_id] = int(tss.start)

    message("Getting promoter coordinates.")
    promoter_bo = tss_bo.slop(s=True,
                              l=upstream,
                              r=downstream,
                              g=chrom_info.name).cut([0, 1, 2, 3, 4, 5])

    message("Intersecting...")
    prom_with_tss_bo = promoter_bo.intersect(tss_bo,
                                             wb=True,
                                             s=False,
                                             S=not no_strandness)

    tmp_file = make_tmp_file("promoter_slop", ".bed")
    promoter_bo.saveas(tmp_file.name)
    tmp_file = make_tmp_file("promoter_intersection_with_tss_as_", ".bed")
    prom_with_tss_bo.saveas(tmp_file.name)

    tx_with_divergent = dict()
    dist_to_divergent = dict()

    for hit in prom_with_tss_bo:

        tx_id_tss, gn_id_tss = hit.fields[9].split("||")
        tx_id_prom, gene_id_prom = hit.fields[3].split("||")

        # Only tss from another gene are considered.
        if gene_id_prom == gn_id_tss:
            continue

        dist = abs(tss_pos[tx_id_prom] - tss_pos[tx_id_tss])

        # Keep the first hit, then any strictly closer one.
        if (tx_id_prom not in tx_with_divergent
                or dist < dist_to_divergent[tx_id_prom]):
            dist_to_divergent[tx_id_prom] = dist
            tx_with_divergent[tx_id_prom] = tx_id_tss

    if no_annotation:
        gtf.select_by_key("transcript_id",
                          ",".join(list(tx_with_divergent.keys()))).write(outputfile,
                                                                           gc_off=True)
    else:
        if key_name is None:
            key_name = "divergent"
            key_name_dist = "dist_to_divergent"
        else:
            key_name_dist = "dist_" + key_name

        if len(tx_with_divergent):
            gtf = gtf.add_attr_from_dict(feat="transcript",
                                         key="transcript_id",
                                         a_dict=tx_with_divergent,
                                         new_key=key_name)
            gtf = gtf.add_attr_from_dict(feat="transcript",
                                         key="transcript_id",
                                         a_dict=dist_to_divergent,
                                         new_key=key_name_dist)

        gtf.write(outputfile, gc_off=True)

    close_properly(outputfile, inputfile)
def join_attr(inputfile=None,
              outputfile=None,
              join_file=None,
              has_header=False,
              new_key=None,
              target_feature=None,
              key_to_join=None,
              matrix=None):
    """
    Join attributes from a tabulated file.

    :param inputfile: The GTF input handed over to the GTF constructor.
    :param outputfile: The destination the annotated GTF is written to.
    :param join_file: File object; only its 'name' attribute is used.
    :param has_header: Passed to add_attr_from_file() (non-matrix mode).
    :param new_key: The name of the new key (required unless matrix is True).
    :param target_feature: Comma-separated feature names ('*' is accepted). \
    When None, all the features found in the GTF are targeted.
    :param key_to_join: The key used to join the file and the GTF.
    :param matrix: When true, the file is joined with \
    add_attr_from_matrix_file().
    """

    # -----------------------------------------------------------
    #  Check argument consistency
    # -----------------------------------------------------------

    if matrix is True and new_key is not None:
        message("--new-key and --matrix are mutually exclusive.",
                type="ERROR")

    if matrix is not True and new_key is None:
        message("--new-key is required when --matrix is False.",
                type="ERROR")

    # -----------------------------------------------------------
    #  Load the GTF
    # -----------------------------------------------------------

    gtf = GTF(inputfile, check_ensembl_format=False)

    # -----------------------------------------------------------
    #  Validate (or build) the list of target features
    # -----------------------------------------------------------

    known_features = gtf.get_feature_list(nr=True)

    if target_feature is None:
        target_feature = ",".join(known_features)
    else:
        for requested in target_feature.split(","):
            if requested not in known_features + ["*"]:
                message("Feature " + requested + " not found.",
                        type="ERROR")

    # -----------------------------------------------------------
    #  Join, then write
    # -----------------------------------------------------------

    if matrix:
        gtf = gtf.add_attr_from_matrix_file(feat=target_feature,
                                            key=key_to_join,
                                            inputfile=join_file.name)
    else:
        gtf = gtf.add_attr_from_file(feat=target_feature,
                                     key=key_to_join,
                                     new_key=new_key,
                                     inputfile=join_file.name,
                                     has_header=has_header)

    gtf.write(outputfile, gc_off=True)

    gc.disable()
    close_properly(outputfile, inputfile)