def midpoints(inputfile=None,
              outputfile=None,
              ft_type="transcript",
              names="transcript_id",
              separator="|"):
    """
    Write the midpoint coordinates of the requested features.

    Standard input is always handled as a GTF. Any other input is opened with
    BedTool: a 'gff' file type is handled as a GTF, everything else is
    processed record by record as intervals.

    :param inputfile: input file object; only its ``name`` is used for loading.
    :param outputfile: output file object.
    :param ft_type: feature type selected from the GTF (GTF input only).
    :param names: comma-separated keys used to name the midpoints (GTF input only).
    :param separator: separator used to join the names (GTF input only).
    """

    message("Loading input file...")

    region_bo = None
    is_gtf = inputfile.name == '<stdin>'

    if not is_gtf:
        region_bo = BedTool(inputfile.name)

        if len(region_bo) == 0:
            message("Unable to find requested regions", type="ERROR")

        is_gtf = region_bo.file_type == 'gff'

    if is_gtf:
        gtf = GTF(inputfile.name, check_ensembl_format=False)
        selected = gtf.select_by_key("feature", ft_type)
        midpoint_bed = selected.get_midpoints(name=names.split(","),
                                              sep=separator)

        for record in midpoint_bed:
            write_properly(chomp(str(record)), outputfile)
    else:
        for record in region_bo:
            span = record.end - record.start
            half = int(span // 2)

            if span % 2 == 0:
                # Even length: there is no single central base, so the two
                # central bases are kept.
                # e.g. 10-14 (zero-based) -> 11-13 (zero-based).
                record.start = record.start + half - 1
                record.end = record.start + 2
            else:
                # Odd length: a single central base.
                # e.g. 10-13 (zero-based) -> 11-12 (zero-based).
                record.end = record.start + half + 1
                record.start = record.end - 1

            outputfile.write(str(record))

    gc.disable()
    close_properly(outputfile, inputfile)
def random_list(inputfile=None,
                outputfile=None,
                number=None,
                ft_type=None,
                seed_value=None):
    """
    Select a random list of genes or transcripts.

    :param inputfile: GTF input, passed as-is to GTF().
    :param outputfile: destination passed to GTF.write().
    :param number: number of identifiers to draw. When larger than the number
        of available identifiers, a warning is emitted and all are used.
    :param ft_type: 'gene' to sample gene_id values; any other value
        (including None) samples transcript_id values.
    :param seed_value: optional seed given to random.seed() to make the
        sampling reproducible.
    """

    message("loading the GTF.")

    gtf = GTF(inputfile)

    message("Getting ID list.")

    # The key used to collect the identifiers must also be the key used for
    # the final selection. Deriving the selection key from ft_type + "_id"
    # failed for ft_type=None and mismatched the sampled transcript ids for
    # any value other than 'gene' or 'transcript'.
    id_key = "gene_id" if ft_type == 'gene' else "transcript_id"

    id_list = gtf.extract_data(id_key,
                               as_list=True,
                               nr=True,
                               hide_undef=True,
                               no_na=True)

    if number > len(id_list):
        message("To much feature. Using : " + str(len(id_list)),
                type="WARNING")
        number = len(id_list)

    if seed_value is not None:
        random.seed(seed_value, version=1)

    id_list = random.sample(id_list, number)

    message("Printing.")

    gtf.select_by_key(id_key, ",".join(id_list)).write(outputfile,
                                                       gc_off=True)

    close_properly(outputfile, inputfile)
def exon_sizes(inputfile=None,
               outputfile=None,
               key_name=None):
    """
    Add to transcript features a new key holding a comma-separated list of
    exon sizes.

    Sizes are computed as end - start + 1 for each exon; the list is reversed
    for transcripts whose strand is "-".

    :param inputfile: GTF input, passed as-is to GTF().
    :param outputfile: destination passed to GTF.write().
    :param key_name: name of the attribute added to transcript features.
    """

    gtf = GTF(inputfile)

    all_tx_ids = gtf.get_tx_ids(nr=True)

    exons_starts = gtf.select_by_key("feature", "exon").extract_data(
        "transcript_id,start",
        as_dict_of_merged_list=True,
        no_na=True,
        nr=False)

    if len(exons_starts) == 0:
        message("No exon found.", type="ERROR")

    exons_ends = gtf.select_by_key("feature", "exon").extract_data(
        "transcript_id,end",
        as_dict_of_merged_list=True,
        no_na=True,
        nr=False)

    strands = gtf.select_by_key("feature", "transcript").extract_data(
        "transcript_id,strand",
        as_dict_of_values=True,
        no_na=True,
        nr=True,
        hide_undef=True)

    tx_to_size_list = dict()

    for tx_id in all_tx_ids:
        sizes = [str(int(end) - int(start) + 1)
                 for start, end in zip(exons_starts[tx_id],
                                       exons_ends[tx_id])]

        if strands[tx_id] == "-":
            sizes.reverse()

        tx_to_size_list[tx_id] = ",".join(sizes)

    if tx_to_size_list:
        gtf = gtf.add_attr_from_dict(feat="transcript",
                                     key="transcript_id",
                                     a_dict=tx_to_size_list,
                                     new_key=key_name)

    gtf.write(outputfile, gc_off=True)

    close_properly(outputfile, inputfile)
def random_tx(inputfile=None,
              outputfile=None,
              max_transcript=None,
              seed_value=None):
    """
    Randomly keep at most ``max_transcript`` transcripts for each gene.

    Gene features are discarded when loading; the transcripts that were not
    drawn are then removed from the output.

    :param inputfile: GTF input, passed as-is to GTF().
    :param outputfile: destination passed to GTF.write().
    :param max_transcript: maximum number of transcripts kept per gene.
    :param seed_value: optional seed given to random.seed().
    """

    message("loading the GTF.")

    gtf = GTF(inputfile).select_by_key("feature", "gene", invert_match=True)

    message("Getting gene_id and transcript_id")

    gene2tx = gtf.extract_data("gene_id,transcript_id",
                               as_dict_of_merged_list=True,
                               no_na=True,
                               nr=True)

    message("Selecting random transcript")

    if seed_value is not None:
        random.seed(seed_value, version=1)

    tx_to_delete = []

    for gn_id in gene2tx:
        candidates = gene2tx[gn_id]
        nb_tx = len(candidates)

        # Draw the positions to keep; every other position is discarded.
        kept_positions = random.sample(list(range(nb_tx)),
                                       min(max_transcript, nb_tx))

        tx_to_delete.extend(tx
                            for pos, tx in enumerate(candidates)
                            if pos not in kept_positions)

    message("Printing results")

    message("Selecting transcript.")

    remaining = gtf.select_by_key("transcript_id",
                                  ",".join(tx_to_delete),
                                  invert_match=True)
    remaining.write(outputfile, gc_off=True)

    close_properly(outputfile, inputfile)
def select_by_go(inputfile=None,
                 outputfile=None,
                 go_id=None,
                 https_proxy=None,
                 http_proxy=None,
                 list_datasets=None,
                 species=None,
                 invert_match=False):
    """
    Select lines from a GTF file using a Gene Ontology ID (e.g GO:0050789).

    :param inputfile: GTF input, passed as-is to GTF().
    :param outputfile: destination of the selected lines (or of the dataset
        list when ``list_datasets`` is set).
    :param go_id: the GO identifier; the "GO:" prefix is added when missing.
    :param https_proxy: forwarded to Biomart().
    :param http_proxy: forwarded to Biomart().
    :param list_datasets: when true, only print the available datasets
        (without the "_gene_ensembl" suffix) and exit.
    :param species: dataset prefix; ``species + "_gene_ensembl"`` must be one
        of the Biomart datasets.
    :param invert_match: forwarded to GTF.select_by_key().
    """

    if not go_id.startswith("GO:"):
        go_id = "GO:" + go_id

    is_associated = OrderedDict()

    bm = Biomart(http_proxy=http_proxy,
                 https_proxy=https_proxy)

    bm.get_datasets('ENSEMBL_MART_ENSEMBL')

    if list_datasets:
        for dataset in sorted(bm.datasets):
            write_properly(dataset.replace("_gene_ensembl", ""), outputfile)
        sys.exit()

    if species + "_gene_ensembl" not in bm.datasets:
        message("Unknow dataset/species.", type="ERROR")

    bm.query({'query': XML.format(species=species, go=go_id)})

    # One identifier per non-empty line of the response.
    for gene_id in bm.response.content.decode().split("\n"):
        if gene_id != '':
            is_associated[gene_id] = 1

    gtf = GTF(inputfile)

    gtf_associated = gtf.select_by_key("gene_id",
                                       ",".join(is_associated),
                                       invert_match)

    gtf_associated.write(outputfile, gc_off=True)
def intron_sizes(inputfile=None,
                 outputfile=None,
                 key_name=None):
    """
    Add to transcript features a new key holding a comma-separated list of
    intron sizes.

    Transcripts for which no intron is returned get the value "0". The list
    is reversed for transcripts whose strand is "-".

    :param inputfile: GTF input, passed as-is to GTF().
    :param outputfile: destination passed to GTF.write().
    :param key_name: name of the attribute added to transcript features.
    """

    gtf = GTF(inputfile, check_ensembl_format=False)

    all_tx_ids = gtf.get_tx_ids(nr=True)

    intron_bo = gtf.get_introns(by_transcript=True,
                                name=["transcript_id"],
                                intron_nb_in_name=False,
                                feat_name=False)

    strands = gtf.select_by_key("feature", "transcript").extract_data(
        "transcript_id,strand",
        as_dict_of_values=True,
        no_na=True,
        nr=True,
        hide_undef=True)

    # Every transcript starts with an empty list so that intron-less
    # transcripts are still reported.
    intron_size = {tx: [] for tx in all_tx_ids}

    for bed_line in intron_bo:
        intron_size[bed_line.name].append(str(bed_line.end - bed_line.start))

    for tx_id, sizes in intron_size.items():
        if not sizes:
            intron_size[tx_id] = "0"
        elif strands[tx_id] == "-":
            intron_size[tx_id] = ",".join(reversed(sizes))
        else:
            intron_size[tx_id] = ",".join(sizes)

    if intron_size:
        gtf = gtf.add_attr_from_dict(feat="transcript",
                                     key="transcript_id",
                                     a_dict=intron_size,
                                     new_key=key_name)

    gtf.write(outputfile, gc_off=True)

    close_properly(outputfile, inputfile)
def short_long(inputfile=None,
               outputfile=None,
               longs=None,
               keep_gene_lines=False):
    """
    Select the shortest transcript for each gene, or the longest one when
    ``longs`` is set.

    :param inputfile: GTF input, passed as-is to GTF().
    :param outputfile: destination passed to GTF.write().
    :param longs: when true, keep the longest transcripts instead of the
        shortest ones.
    :param keep_gene_lines: when false, 'gene' features are removed from the
        output.
    """

    gtf = GTF(inputfile, check_ensembl_format=False)

    if longs:
        gtf = gtf.select_longuest_transcripts()
    else:
        gtf = gtf.select_shortest_transcripts()

    if not keep_gene_lines:
        # Inverted selection: keep everything that is not a 'gene' feature.
        gtf = gtf.select_by_key("feature", "gene", invert_match=True)

    gtf.write(outputfile, gc_off=True)
def nb_exons(inputfile=None,
             outputfile=None,
             key_name=None,
             text_format=False):
    """
    Count the number of exons of each transcript in the GTF file.

    :param inputfile: GTF input, passed as-is to GTF().
    :param outputfile: output file object.
    :param key_name: name of the attribute added to transcript features
        (GTF output only).
    :param text_format: when true, write tab-separated lines
        (transcript id, exon count, the literal "transcript") instead of a GTF.
    """

    gtf = GTF(inputfile)

    # -------------------------------------------------------------------------
    # Count the exon records attached to each transcript_id
    # -------------------------------------------------------------------------

    message("Computing number of exons for each transcript in input GTF file.")

    n_exons = defaultdict(int)

    exon_gtf = gtf.select_by_key("feature", "exon")

    for row in exon_gtf.extract_data("transcript_id"):
        n_exons[row[0]] += 1

    if text_format:
        for tx_id, count in n_exons.items():
            outputfile.write(tx_id + "\t" + str(count) + "\ttranscript\n")
    else:
        if n_exons:
            gtf = gtf.add_attr_from_dict(feat="transcript",
                                         key="transcript_id",
                                         a_dict=n_exons,
                                         new_key=key_name)
        gtf.write(outputfile, gc_off=True)

    close_properly(outputfile, inputfile)
def select_by_key(inputfile=None,
                  outputfile=None,
                  key=None,
                  value=None,
                  invert_match=False,
                  file_with_values=None,
                  col=0,
                  select_transcripts=False,
                  select_genes=False,
                  select_exons=False,
                  select_cds=False,
                  select_start_codon=False,
                  bed_format=False,
                  log=False,
                  separator="|",
                  names="transcript_id"):
    """
    Select lines from a GTF file based on attributes and associated values.

    :param inputfile: GTF input, passed as-is to GTF().
    :param outputfile: output file object.
    :param key: the key to test.
    :param value: comma-separated list of accepted values for ``key``.
    :param invert_match: forwarded to GTF.select_by_key().
    :param file_with_values: open, tab-separated file providing the values
        (mutually exclusive with ``value``).
    :param col: column of ``file_with_values`` holding the values. The value
        is read at index ``col - 1``, so the default 0 reads the last column.
    :param select_transcripts: shortcut for key="feature", value="transcript".
    :param select_genes: shortcut for key="feature", value="gene".
    :param select_exons: shortcut for key="feature", value="exon".
    :param select_cds: shortcut for key="feature", value="CDS".
    :param select_start_codon: shortcut for key="feature",
        value="start_codon".
    :param bed_format: write BED-like lines instead of a GTF.
    :param log: report selection statistics through message().
    :param separator: separator used to join the name tokens (BED output).
    :param names: comma-separated keys used as BED name (BED output).
    """

    # ----------------------------------------------------------------------
    # Check mode. The shortcuts are tested in this order; the first one set
    # wins.
    # ----------------------------------------------------------------------

    feature_shortcuts = ((select_transcripts, "transcript"),
                         (select_cds, "CDS"),
                         (select_start_codon, "start_codon"),
                         (select_genes, "gene"),
                         (select_exons, "exon"))

    for is_requested, feature in feature_shortcuts:
        if is_requested:
            key = "feature"
            value = feature
            break
    else:
        if file_with_values is None:
            if key is None or value is None:
                message(
                    "Key and value are mandatory. Alternatively use -e/t/g/f or -f with -k.",
                    type="ERROR")
        else:
            if key is None:
                message("Please set -k.", type="ERROR")
            if value is not None:
                message("The -f and -v arguments are mutually exclusive.",
                        type="ERROR")

    # ----------------------------------------------------------------------
    # Load file with value
    # ----------------------------------------------------------------------

    gtf = GTF(inputfile, check_ensembl_format=False)
    all_values = gtf.extract_data(key, as_list=True, no_na=True, nr=True)

    if log:
        feat_before = len(gtf)

    if not file_with_values:
        value_list = value.split(",")
        gtf = gtf.select_by_key(key, value, invert_match)
    else:
        value_list = []

        for line in file_with_values:
            # Strip the end of line first: otherwise values read from the
            # last column keep their newline and can never match.
            cols = line.rstrip("\r\n").split("\t")
            value_list += [cols[col - 1]]

        file_with_values.close()

        # The file was consumed above; hand a fresh handle to the selection
        # and make sure it is released afterwards.
        with open(file_with_values.name) as values_handle:
            gtf = gtf.select_by_key(key=key,
                                    invert_match=invert_match,
                                    file_with_values=values_handle,
                                    col=col)

    if log:
        # Sorted so that the reported list is stable from run to run.
        not_found = sorted(set(value_list) - set(all_values))
        feat_after = len(gtf)
        # An empty input GTF would otherwise raise ZeroDivisionError.
        pct = feat_after / feat_before * 100 if feat_before else 0.0

        message("Number of features before selection: %d" % feat_before)
        message("Fraction of feature selected: %.2f%%" % pct)

        if not_found:
            nfj = ",".join(not_found)
            # Only the first 50 characters are displayed.
            etc = "..." if len(nfj) > 50 else ""
            message("Values not found: [" + nfj[:50] + etc + "].")
        else:
            message("Values not found: [].")

    # ----------------------------------------------------------------------
    # Write GTF file
    # ----------------------------------------------------------------------

    if not bed_format:
        gtf.write(outputfile, gc_off=True)
    else:
        nb_tokens = len(names.split(","))
        keys = "seqid,start,end," + names + ",score,strand"
        nb_fields = len(keys.split(","))

        for fields in gtf.extract_data_iter_list(keys, zero_based=True):
            outputfile.write("\t".join([fields[0],
                                        fields[1],
                                        fields[2],
                                        separator.join(
                                            fields[3:(3 + nb_tokens)]),
                                        fields[nb_fields - 2],
                                        fields[nb_fields - 1]]) + "\n")

    close_properly(outputfile, inputfile)
def mk_matrix(inputfile=None,
              outputfile=None,
              bigwiglist=None,
              ft_type=None,
              pseudo_count=0,
              upstream=1000,
              downstream=1000,
              bin_around_frac=0.1,
              chrom_info=None,
              bin_nb=100,
              nb_proc=None,
              labels=None,
              no_stranded=False,
              zero_to_na=False):
    """
    Description: Create a matrix to be used by 'profile' and 'heatmap'
    commands.

    The coverage of the main regions (and, for 'transcript' and
    'user_regions', of their upstream/downstream flanks) is computed with
    bw_profile_mp(), merged with pandas, written to ``outputfile`` after a
    '#'-prefixed comment line and finally zipped (``<outputfile>.zip``). The
    uncompressed file and the intermediate files are deleted.

    :param inputfile: input file object (GTF or BED); only ``name`` is used.
    :param outputfile: output file object. A trailing ".zip" in its name is
        dropped since the suffix is added back when compressing.
    :param bigwiglist: list of file objects pointing to bigwig files.
    :param ft_type: one of 'transcript', 'promoter', 'tts' (GTF input) or
        'user_regions', 'single_nuc' (BED input).
    :param pseudo_count: forwarded to bw_profile_mp().
    :param upstream: extension (bp) in 5' of the regions.
    :param downstream: extension (bp) in 3' of the regions.
    :param bin_around_frac: fraction of ``bin_nb`` used as number of bins for
        the flanking regions (at least one bin).
    :param chrom_info: file object whose ``name`` is passed as genome file
        (``g``) to slop/flank.
    :param bin_nb: number of bins for the main regions.
    :param nb_proc: forwarded to bw_profile_mp().
    :param labels: comma-separated labels, one per bigwig file. Derived from
        the bigwig file names when None.
    :param no_stranded: when true, coverage is computed without strand
        (``stranded=not no_stranded``).
    :param zero_to_na: forwarded to bw_profile_mp().
    """

    # -------------------------------------------------------------------------
    # Check argument consistency
    # -------------------------------------------------------------------------

    if ft_type in ['single_nuc', 'promoter', 'tts']:
        region_size = upstream + downstream + 1
        if region_size < bin_nb:
            message(
                "The region (-u/-d) needs to be extended given the number "
                "of bins (--bin-nb)",
                type="ERROR")

    # -------------------------------------------------------------------------
    # Check output file name does not ends with .zip
    # -------------------------------------------------------------------------

    zip_suffix = ".zip"

    if outputfile.name.endswith(zip_suffix):
        # Only the trailing suffix is dropped: str.replace() would also alter
        # any other ".zip" occurrence in the path.
        outfn = outputfile.name[:-len(zip_suffix)]
        # The handle received from the caller is not used anymore.
        outputfile.close()
        outputfile = open(outfn, "w")

    # -------------------------------------------------------------------------
    # Check input file is in bed or GTF format
    # -------------------------------------------------------------------------

    message("Loading input file...")

    if inputfile.name == '<stdin>':
        gtf = GTF(inputfile.name)
        is_gtf = True
        if ft_type == 'user_regions':
            message(
                "--ft-type can not be set to user_regions"
                " when a gtf is provided.",
                type="ERROR")
    else:
        try:
            region_bo = BedTool(inputfile.name)
            len(region_bo)
        except IndexError:
            message("Unable to read the input file. Check format",
                    type="ERROR")

        if len(region_bo) == 0:
            message("Unable to find requested regions", type="ERROR")

        if region_bo.file_type == 'gff':
            message('Loading the GTF file.')
            gtf = GTF(inputfile.name)
            is_gtf = True
        else:
            is_gtf = False

            if ft_type != 'user_regions' and ft_type != 'single_nuc':
                message(
                    "Set --ft-type to 'user_regions' or 'single_nuc'"
                    " when using input bed file.",
                    type="ERROR")

            # Check that the strand is provided and
            # check it is located in the right column
            # (not checked by BedTool...).
            if region_bo.field_count() < 6:
                if not no_stranded:
                    message("Strand is undefined. Use -nst.", type="ERROR")
            else:
                region_name = dict()
                for i in region_bo:
                    if region_name.get(i.name, None) is None:
                        region_name[i.name] = 1
                    else:
                        message(
                            "Regions in bed file should have "
                            "unique identifier (col 4).",
                            type="ERROR")

                    if i.strand[0] not in ['.', '+', '-']:
                        message("Strand should be one of '+','-' or '.'.",
                                type="ERROR")

                    if ft_type == 'single_nuc':
                        if i.end - i.start != 1:
                            message(
                                "Region length should be 1 nucleotide "
                                "long when 'single_nuc' is set. Use 'user_regions'.",
                                type="ERROR")
                    elif ft_type == 'user_regions':
                        if i.end - i.start == 1:
                            message(
                                "Region length should not be 1 nucleotide "
                                "long when 'user_regions' is set. Use 'single_nuc'.",
                                type="ERROR")

    # -------------------------------------------------------------------------
    # Create a list of labels for the diagrams.
    # Take user input in account
    # -------------------------------------------------------------------------

    message('Checking labels.')

    if labels is not None:
        labels = labels.split(",")

        # Ensure the number of labels is the same as the number of bw files.
        if len(labels) != len(bigwiglist):
            message(
                "The number of labels should be the same as the number of"
                " bigwig files.",
                type="ERROR")

        # Ensure labels are non-redundant
        if len(labels) > len(set(labels)):
            message("Labels must be unique.", type="ERROR")
    else:
        # Default labels: bigwig file names without directory nor extension.
        labels = [os.path.splitext(os.path.basename(bw.name))[0]
                  for bw in bigwiglist]

    # -------------------------------------------------------------------------
    # Get the requested transcript lines in bed format
    # -------------------------------------------------------------------------

    message('Getting the list of chromosomes declared in bigwig files.')

    bw_chrom = list()

    for bw_file in bigwiglist:
        bw = pyBigWig.open(bw_file.name)
        try:
            bw_chrom += list(bw.chroms().keys())
        finally:
            # Release the bigwig handle as soon as the chromosome list is read.
            bw.close()

    bed_col = [0, 1, 2, 3, 4, 5]

    if is_gtf:

        message('Selecting chromosomes declared in bigwig from gtf.')

        # NOTE(review): the chromosome-restricted selection below is
        # immediately overwritten by the unrestricted one, so transcripts are
        # currently NOT limited to the chromosomes declared in the bigwig
        # files. Behaviour is kept as-is; confirm which one is intended.
        tmp = gtf.select_by_key("feature", "transcript").select_by_key(
            "seqid", ",".join(bw_chrom))

        tmp = gtf.select_by_key("feature", "transcript")

        tmp_tx_name = tmp.extract_data("transcript_id", as_list=True)

        # If several transcript records are associated to
        # the same transcript_id, raise an error.
        if len(tmp_tx_name) > len(set(tmp_tx_name)):
            message('Transcripts should have a unique identifier.',
                    type="ERROR")

        message('Selecting requested regions.')

        # ----------------------------------------------------------------------
        # Slop tss and promoters.
        # No need if transcript was requested (it will be flanked by upstream
        # and downstream regions later on).
        # ----------------------------------------------------------------------

        if ft_type == 'transcript':
            message("Getting transcript boundaries (input gtf).")
            main_region_bo = tmp.to_bed(name=["transcript_id"])

        elif ft_type == 'promoter':
            message("Getting promoter regions [-%d,+%d]."
                    % (upstream, downstream))
            main_region_bo = tmp.get_tss(name=["transcript_id"]).slop(
                s=True,
                l=upstream,
                r=downstream,
                g=chrom_info.name)

        elif ft_type == 'tts':
            main_region_bo = tmp.get_tts(name=["transcript_id"]).slop(
                s=True,
                l=upstream,
                r=downstream,
                g=chrom_info.name)

        else:
            # Without this branch main_region_bo stays undefined and the
            # function fails later with a NameError.
            message(
                "--ft-type should be 'transcript', 'promoter' or 'tts'"
                " when a gtf is provided.",
                type="ERROR")

    else:
        message("Loading regions")

        if ft_type == 'user_regions':
            main_region_bo = BedTool(inputfile.name).cut(bed_col)
        elif ft_type == 'single_nuc':
            main_region_bo = BedTool(inputfile.name).cut(bed_col).slop(
                s=True,
                l=upstream,
                r=downstream,
                g=chrom_info.name)
        else:
            message("Unknown method.", type="ERROR")

    # Save for traceability
    main_region_bed = make_tmp_file(prefix="region" + ft_type, suffix=".bed")
    main_region_bo.saveas(main_region_bed.name)

    # -------------------------------------------------------------------------
    # Print a header in the output file
    # -------------------------------------------------------------------------

    message("Preparing comments")

    comments = "#"
    comments += "ft_type:" + ft_type + ";"
    comments += "from:" + str(upstream) + ";"
    comments += "to:" + str(downstream) + ";"
    comments += "labels:" + ",".join(labels) + ";"

    # -------------------------------------------------------------------------
    # Compute coverage of requested region
    # Each worker will send a file
    # -------------------------------------------------------------------------

    def _coverage(bed_path, n_bins, region_type):
        # Run bw_profile_mp() on one bed file with the shared settings.
        return bw_profile_mp(in_bed_file=bed_path,
                             nb_proc=nb_proc,
                             big_wig=[x.name for x in bigwiglist],
                             bin_nb=n_bins,
                             pseudo_count=pseudo_count,
                             stranded=not no_stranded,
                             type=region_type,
                             labels=labels,
                             outputfile=outputfile.name,
                             zero_to_na=zero_to_na,
                             verbose=pygtftk.utils.VERBOSITY)

    outputfile_list = {}

    message("Using %d bins for main region." % bin_nb)

    outputfile_list["main"] = _coverage(main_region_bed.name, bin_nb, "main")

    # -------------------------------------------------------------------------
    # If transcript was requested
    # we must process flanking regions
    # We need to retrieve coverage of promoter [-upstream, 0]
    # as transcript coverage window size will depend on transcript length.
    # For promoter the length of windows will be fixed.
    # -------------------------------------------------------------------------

    if ft_type in ['transcript', 'user_regions']:

        # Number of bins for TTS and TSS
        around_bin_nb = int(round(bin_nb * bin_around_frac))
        if around_bin_nb < 1:
            around_bin_nb = 1

        if upstream > 0:

            if ft_type == 'transcript':
                message("Getting promoter (using %d bins)." % around_bin_nb)
                ups_region_bo = tmp.get_tss(name=["transcript_id"]).slop(
                    s=True,
                    l=upstream,
                    r=-1,
                    g=chrom_info.name).cut(bed_col)
            else:
                message("Getting upstream regions (%d bins)." % around_bin_nb)
                ups_region_bo = main_region_bo.flank(s=True,
                                                     l=upstream,
                                                     r=0,
                                                     g=chrom_info.name)

            upstream_bed_file = make_tmp_file(
                prefix="upstream_region" + ft_type,
                suffix=".bed")

            ups_region_bo.saveas(upstream_bed_file.name)

            outputfile_list["upstream"] = _coverage(upstream_bed_file.name,
                                                    around_bin_nb,
                                                    "upstream")

        if downstream > 0:

            if ft_type == 'transcript':
                message("Getting TTS (using %d bins)." % around_bin_nb)
                dws_region_bo = tmp.get_tts(name=["transcript_id"]).slop(
                    s=True,
                    l=-1,
                    r=downstream,
                    g=chrom_info.name).cut(bed_col)
            else:
                message("Getting downstream regions (%d bins)." % around_bin_nb)
                dws_region_bo = main_region_bo.flank(s=True,
                                                     l=0,
                                                     r=downstream,
                                                     g=chrom_info.name)

            dws_bed_file = make_tmp_file(prefix="dowstream_region" + ft_type,
                                         suffix=".bed")

            dws_region_bo.saveas(dws_bed_file.name)

            outputfile_list["downstream"] = _coverage(dws_bed_file.name,
                                                      around_bin_nb,
                                                      "downstream")

    # -------------------------------------------------------------------------
    # Merge file using pandas
    # -------------------------------------------------------------------------

    merge_keys = ['bwig', 'chrom', 'gene', 'strand']

    message("Reading (pandas): " + outputfile_list["main"].name, type="DEBUG")
    df_main = pd.read_csv(outputfile_list["main"].name, sep="\t")

    # Save start and end of the main regions.
    # They will be joined back later.
    df_copy = df_main[merge_keys + ['start', 'end']]

    df_main.pop('start')
    df_main.pop('end')

    if "upstream" in outputfile_list:
        message("Merging upstream file")
        message("Reading (pandas): " + outputfile_list["upstream"].name,
                type="DEBUG")
        df_up = pd.read_csv(outputfile_list["upstream"].name, sep="\t")
        # Keyword form: the positional 'axis' argument is not accepted by
        # recent pandas releases.
        df_up = df_up.drop(columns=['start', 'end'])
        df_main = df_up.merge(df_main.loc[:, df_main.columns],
                              on=merge_keys)

    if "downstream" in outputfile_list:
        message("Merging downstream file")
        message("Reading (pandas): " + outputfile_list["downstream"].name,
                type="DEBUG")
        df_dws = pd.read_csv(outputfile_list["downstream"].name, sep="\t")
        df_dws = df_dws.drop(columns=['start', 'end'])
        df_main = df_main.merge(df_dws.loc[:, df_dws.columns],
                                on=merge_keys)

    # Join start and end back and move them to columns 2 and 3.
    df_main = df_main.merge(df_copy.loc[:, df_copy.columns],
                            on=merge_keys)
    df_start = df_main.pop('start')
    df_end = df_main.pop('end')
    df_main.insert(2, 'start', df_start)
    df_main.insert(3, 'end', df_end)

    message("Writing to file")
    outputfile.close()

    with open(outputfile.name, 'a') as f:
        f.write(comments + "\n")
        df_main.to_csv(f,
                       sep="\t",
                       index=False,
                       mode='a',
                       columns=df_main.columns,
                       na_rep='NA')

    # -------------------------------------------------------------------------
    # Compress
    # -------------------------------------------------------------------------

    message("Compressing")
    path = os.path.abspath(outputfile.name)
    filename = os.path.basename(path)
    message("filename: " + filename, type="DEBUG")
    zip_filename = filename + '.zip'
    message("zip_filename: " + zip_filename, type="DEBUG")
    zip_path = os.path.join(os.path.dirname(path), zip_filename)
    message("zip_path: " + zip_path, type="DEBUG")

    with zipfile.ZipFile(zip_path, 'w', allowZip64=True) as zf:
        zf.write(filename=path, arcname=filename)

    # Only the zip archive is kept.
    for region_type in outputfile_list:
        message("deleting " + outputfile_list[region_type].name)
        os.remove(outputfile_list[region_type].name)

    os.remove(outputfile.name)

    gc.disable()
    close_properly(inputfile, outputfile)
def get_feat_seq(inputfile=None,
                 outputfile=None,
                 genome=None,
                 feature_type="exon",
                 separator="",
                 no_rev_comp=False,
                 label="",
                 rev_comp_to_header=False,
                 unique=False):
    """
    Description: Get feature sequences in fasta format from a GTF file.

    :param inputfile: GTF input, passed as-is to GTF().
    :param outputfile: output file object receiving the fasta records.
    :param genome: open, uncompressed fasta file object (its ``name`` is also
        handed to bedtools).
    :param feature_type: the feature whose sequences are extracted.
    :param separator: separator used to join the label tokens and to append
        the rev_comp tag to the header.
    :param no_rev_comp: when true, sequences are not reverse-complemented
        (``s=not no_rev_comp``).
    :param label: comma-separated keys used to build the fasta header.
    :param rev_comp_to_header: append "rev_comp" / "no_rev_comp" to headers.
    :param unique: print only the first record for a given header.
    """

    # -------------------------------------------------------------------------
    # Should sequences be reverse-complemented
    # -------------------------------------------------------------------------

    force_strandedness = not no_rev_comp

    # -------------------------------------------------------------------------
    # Check chrom to avoid segfault
    # https://github.com/dputhier/libgtftk/issues/27
    # -------------------------------------------------------------------------

    if genome.name.endswith(".gz"):
        message("Genome in gz format is not currently supported.",
                type="ERROR")

    message("Fasta files found: %s" % genome.name)
    message("Checking fasta file chromosome list")

    # A set: it is only used for membership tests.
    with genome as geno:
        genome_chr_set = {line.rstrip("\n")[1:]
                          for line in geno
                          if line.startswith(">")}

    gtf = GTF(inputfile, check_ensembl_format=False)

    gtf_chr_list = gtf.get_chroms(nr=True)

    message("Comparing chromosomes from GTF and Fasta files.")
    gtf_chr_list_found = [x for x in gtf_chr_list if x in genome_chr_set]

    if len(gtf_chr_list_found) == 0:
        message("Chromosome from GTF were not found in fasta file",
                type="ERROR")

    if len(gtf_chr_list_found) != len(gtf_chr_list):
        not_found = [x for x in gtf_chr_list if x not in gtf_chr_list_found]
        message("Some chromosomes were not found in the fasta file: %s"
                % ",".join(not_found),
                type="ERROR")

    # -------------------------------------------------------------------------
    # Retrieving fasta sequences
    # -------------------------------------------------------------------------

    message("Retrieving fasta sequences.")

    try:
        # The nameOnly argument is not supported
        # through all Bedtools versions
        feat_seq = gtf.select_by_key("feature", feature_type).to_bed(
            name=label.split(","),
            sep=separator).sequence(fi=genome.name,
                                    nameOnly=True,
                                    s=force_strandedness)
    except BEDToolsError:
        feat_seq = gtf.select_by_key("feature", feature_type).to_bed(
            name=label.split(","),
            sep=separator).sequence(fi=genome.name,
                                    name=True,
                                    s=force_strandedness)

    if force_strandedness:
        header_tag = separator + "rev_comp"
    else:
        header_tag = separator + "no_rev_comp"

    id_printed = set()
    # False while the sequence of a duplicated header must be skipped.
    to_print = True

    with open(feat_seq.seqfn) as fasta:
        for line in fasta:

            if line.startswith(">"):

                # Work on the header without its end of line, otherwise the
                # tag below would land on the sequence line.
                header = line.rstrip("\n")

                # This (+/-) may be added by pybedtools
                # but can be accessed through --label
                header = re.sub(r"\(\+\)$", "", header)
                header = re.sub(r"\(-\)$", "", header)

                if rev_comp_to_header:
                    header += header_tag

                if unique and header in id_printed:
                    to_print = False

                if to_print:
                    outputfile.write(header + "\n")
                    id_printed.add(header)

            else:
                if not to_print:
                    # Skip this sequence and resume with the next record.
                    to_print = True
                else:
                    outputfile.write(line)

    gc.disable()
    close_properly(outputfile, inputfile)
def feature_size(inputfile=None,
                 outputfile=None,
                 ft_type="transcript",
                 names="transcript_id",
                 key_name='feature_size',
                 separator="|",
                 bed=False):
    """
    Get the size and limits (start/end) of features enclosed in the GTF.

    If bed format is requested, write the selected features as BED lines with
    the size as score. Otherwise write a GTF with ``key_name`` as a new key
    and the size as value.

    :param inputfile: GTF input, passed as-is to GTF().
    :param outputfile: output file object.
    :param ft_type: a feature found in the GTF, 'mature_rna' or '*'.
    :param names: comma-separated keys used as BED name.
    :param key_name: name of the added attribute (GTF output).
    :param separator: separator used to join the names (BED output).
    :param bed: write BED instead of GTF.
    """

    message("Computing feature sizes.")

    gtf = GTF(inputfile)

    known_features = gtf.get_feature_list(nr=True) + ['mature_rna']

    if ft_type not in known_features + ["*"]:
        message("Unable to find requested feature.", type="ERROR")

    name_keys = names.split(",")

    if ft_type == 'mature_rna':

        tx_size = gtf.get_transcript_size()

        if bed:
            tx_bed = gtf.select_by_key("feature", 'transcript').to_bed(
                ['transcript_id'] + name_keys,
                add_feature_type=False,
                sep=separator,
                more_name=['mature_rna'])

            for record in tx_bed:
                # The transcript_id was put first in the name only to look up
                # the size; it is removed from the printed name.
                tokens = record.name.split(separator)
                tx_id = tokens.pop(0)
                record.score = tx_size[tx_id]
                record.name = separator.join(tokens)
                write_properly(chomp(str(record)), outputfile)
        else:
            if len(tx_size):
                gtf = gtf.add_attr_from_dict(feat="transcript",
                                             key="transcript_id",
                                             a_dict=tx_size,
                                             new_key=key_name)
            gtf.write(outputfile, gc_off=True)

    elif bed:
        feat_bed = gtf.select_by_key("feature", ft_type).to_bed(
            name=name_keys,
            sep=separator,
            add_feature_type=True)

        for record in feat_bed:
            record.score = str(record.end - record.start)
            write_properly(chomp(str(record)), outputfile)

    else:
        # One value per GTF line ("?" for lines of another feature type),
        # added back to the GTF as a new attribute column.
        tmp_file = make_tmp_file(prefix="feature_size", suffix=".txt")

        rows = gtf.extract_data("feature,start,end",
                                as_list_of_list=True,
                                no_na=False,
                                hide_undef=False)

        for row in rows:
            if ft_type == "*" or row[0] == ft_type:
                tmp_file.write(str(int(row[2]) - int(row[1]) + 1) + "\n")
            else:
                tmp_file.write("?\n")

        tmp_file.close()

        gtf.add_attr_column(tmp_file, key_name).write(outputfile,
                                                      gc_off=True)

    close_properly(outputfile, inputfile)
def convergent(inputfile=None,
               outputfile=None,
               upstream=1500,
               downstream=1500,
               chrom_info=None):
    """
    Find transcripts with a convergent tts.

    For each transcript, the closest tts from another gene found on the
    opposite strand within the extended tts region is stored in the
    'convergent' key and its distance in 'dist_to_convergent'.

    :param inputfile: GTF input, passed as-is to GTF().
    :param outputfile: destination passed to GTF.write().
    :param upstream: extension in 5' of the tts (``l`` of slop).
    :param downstream: extension in 3' of the tts (``r`` of slop).
    :param chrom_info: file object whose ``name`` is passed to slop as ``g``.
    """

    message("Using -u " + str(upstream) + ".")
    message("Using -d " + str(downstream) + ".")

    tx_to_convergent_nm = dict()
    dist_to_convergent = dict()
    tts_pos = dict()

    message("Loading GTF.")

    gtf = GTF(inputfile)

    message("Getting transcript coordinates.")

    tx_feat = gtf.select_by_key("feature", "transcript")

    message("Getting tts coordinates.")

    tts_bo = tx_feat.get_tts(name=["transcript_id", "gene_id"],
                             sep="||")

    # Remember the tts position of every transcript.
    for tts in tts_bo:
        tx_id_ov, gn_id_ov = tts.name.split("||")
        tts_pos[tx_id_ov] = int(tts.start)

    message("Getting tts coordinates.")

    tts_region_bo = tts_bo.slop(s=True,
                                l=upstream,
                                r=downstream,
                                g=chrom_info.name).cut([0, 1, 2, 3, 4, 5])

    message("Intersecting...")

    tts_intersect_bo = tts_region_bo.intersect(tts_bo,
                                               wb=True,
                                               s=False,
                                               S=True)

    # Intermediate results are saved to temporary files.
    tmp_file = make_tmp_file("tts_slop", ".bed")
    tts_region_bo.saveas(tmp_file.name)
    tmp_file = make_tmp_file("tts_slop_intersection_with_tts_as_", ".bed")
    tts_intersect_bo.saveas(tmp_file.name)

    for hit in tts_intersect_bo:

        tx_id_main, gene_id_main = hit.fields[3].split("||")
        tx_id_ov, gn_id_ov = hit.fields[9].split("||")

        # Transcripts of the same gene are not candidates.
        if gene_id_main == gn_id_ov:
            continue

        dist = abs(tts_pos[tx_id_main] - tts_pos[tx_id_ov])

        is_first = tx_id_main not in tx_to_convergent_nm

        if is_first or dist < dist_to_convergent[tx_id_main]:
            dist_to_convergent[tx_id_main] = dist
            tx_to_convergent_nm[tx_id_main] = tx_id_ov

    if tx_to_convergent_nm:
        gtf = gtf.add_attr_from_dict(feat="transcript",
                                     key="transcript_id",
                                     a_dict=tx_to_convergent_nm,
                                     new_key="convergent")

        gtf = gtf.add_attr_from_dict(feat="transcript",
                                     key="transcript_id",
                                     a_dict=dist_to_convergent,
                                     new_key="dist_to_convergent")

    gtf.write(outputfile, gc_off=True)

    close_properly(outputfile, inputfile)
def get_5p_3p_coords(inputfile=None,
                     outputfile=None,
                     ft_type="transcript",
                     names="transcript_id",
                     separator="|",
                     more_names='',
                     transpose=0,
                     invert=False,
                     explicit=False):
    """
    Get the 5p or 3p coordinate for each feature (e.g TSS or TTS for a
    transcript).

    :param inputfile: GTF input, passed as-is to GTF().
    :param outputfile: output file object.
    :param ft_type: the feature type (``feat_type`` of get_5p_end/get_3p_end).
    :param names: comma-separated keys used as name, or "*" to use the
        attribute list of the transcript features.
    :param separator: separator used to join the names.
    :param more_names: comma-separated list forwarded as ``more_name``.
    :param transpose: shift applied to the coordinates; added on the "+"
        strand, subtracted on the "-" strand.
    :param invert: get the 3' end instead of the 5' end.
    :param explicit: forwarded to get_5p_end/get_3p_end.
    """

    more_names = [] if more_names is None else more_names.split(',')

    which_end = "3'" if invert else "5'"
    message("Computing " + which_end + " coordinates of '" + ft_type + "'.")

    gtf = GTF(inputfile, check_ensembl_format=False)

    if names == "*":
        nms = gtf.select_by_key("feature",
                                "transcript").get_attr_list(add_basic=False)
    else:
        nms = names.split(",")

    get_end = gtf.get_3p_end if invert else gtf.get_5p_end

    bed_obj = get_end(feat_type=ft_type,
                      name=nms,
                      sep=separator,
                      more_name=more_names,
                      explicit=explicit)

    if len(bed_obj) == 0:
        message("Requested feature could not be found. Use convert_ensembl maybe.",
                type="ERROR")

    if transpose == 0:
        for record in bed_obj:
            write_properly(chomp(str(record)), outputfile)
    else:
        for record in bed_obj:

            if record.strand == "+":
                shift = transpose
            elif record.strand == "-":
                shift = -transpose
            else:
                shift = None

            if shift is None:
                # Any other strand value yields an empty line.
                out_list = list()
            else:
                out_list = [record.chrom,
                            str(record.start + shift),
                            str(record.end + shift),
                            record.name,
                            record.score,
                            record.strand]

            outputfile.write("\t".join(out_list) + "\n")

    gc.disable()
    close_properly(outputfile, inputfile)
def _closest_genes_regions(gn_gtf, region_type, identifier):
    """Return the sorted, 6-column BED regions ('tss', 'tts' or 'gene') of the gene features."""
    if region_type == 'tss':
        regions = gn_gtf.get_5p_end(feat_type="gene", name=[identifier])
    elif region_type == 'tts':
        regions = gn_gtf.get_3p_end(feat_type="gene", name=[identifier])
    elif region_type == 'gene':
        regions = gn_gtf.to_bed(name=[identifier])
    else:
        message("Unknown type.", type="ERROR")
        return None

    return regions.cut([0, 1, 2, 3, 4, 5]).sort()


def closest_genes(
        inputfile=None,
        outputfile=None,
        from_region_type=None,
        no_header=False,
        nb_neighbors=1,
        to_region_type=None,
        same_strandedness=False,
        diff_strandedness=False,
        text_format=False,
        identifier="gene_id",
        collapse=False):
    """
    Find the n closest genes for each gene.

    The result is either added to the gene features of the GTF (keys
    'closest_gn' and 'closest_dist') or written as a tabulated text file.

    :param inputfile: The input GTF (handed over to GTF()).
    :param outputfile: The output file.
    :param from_region_type: Region of the source gene: 'tss', 'tts' or 'gene'.
    :param no_header: In text format, do not write the header line.
    :param nb_neighbors: The number of neighbors requested (closest 'k').
    :param to_region_type: Region of the target genes: 'tss', 'tts' or 'gene'.
    :param same_strandedness: Forwarded as 's' to closest.
    :param diff_strandedness: Forwarded as 'S' to closest.
    :param text_format: Write a text table instead of an annotated GTF.
    :param identifier: The key used to name the genes.
    :param collapse: In text format, write one line per (gene, neighbor) \
    pair instead of comma-separated lists.
    """

    if same_strandedness and diff_strandedness:
        message("--same-strandedness and --diff-strandedness are "
                "mutually exclusive.",
                type="ERROR")

    # ----------------------------------------------------------------------
    # load GTF
    # ----------------------------------------------------------------------

    gtf = GTF(inputfile)
    gn_gtf = gtf.select_by_key("feature", "gene")
    gn_ids = gn_gtf.get_gn_ids(nr=True)

    if len(gn_gtf) == 0:
        message("No gene feature found. Please use convert_ensembl.",
                type="ERROR")

    if nb_neighbors >= (len(gn_gtf) - 1):
        message("Two much neighbors",
                type="ERROR")

    all_ids = gn_gtf.extract_data(identifier, as_list=True, no_na=False)

    if "." in all_ids:
        message("Some identifiers are undefined ('.').",
                type="ERROR")

    if len(all_ids) == 0:
        message("The identifier was not found.",
                type="ERROR")

    # ----------------------------------------------------------------------
    # Requested regions for source ('from') and destination ('to') genes
    # ----------------------------------------------------------------------

    from_regions = _closest_genes_regions(gn_gtf, from_region_type, identifier)
    to_regions = _closest_genes_regions(gn_gtf, to_region_type, identifier)

    # ----------------------------------------------------------------------
    # Search closest genes
    # ----------------------------------------------------------------------

    gene_closest = defaultdict(list)
    gene_closest_dist = defaultdict(list)

    closest_bo = from_regions.closest(b=to_regions,
                                      k=nb_neighbors,
                                      N=True,
                                      s=same_strandedness,
                                      S=diff_strandedness,
                                      d=True)

    # Both inputs have 6 columns and d=True appends the distance:
    # column 4 is the source name, column 10 the neighbor name and
    # column 13 the distance.
    for hit in closest_bo:
        gene_closest[hit[3]].append(hit[9])
        gene_closest_dist[hit[3]].append(hit[12])

    if not text_format:

        if len(gene_closest):
            gtf = gtf.add_attr_from_dict(feat="gene",
                                         key=identifier,
                                         a_dict=gene_closest,
                                         new_key="closest_gn")

            gtf = gtf.add_attr_from_dict(feat="gene",
                                         key=identifier,
                                         a_dict=gene_closest_dist,
                                         new_key="closest_dist")

        gtf.write(outputfile, gc_off=True)

    else:
        if not no_header:
            outputfile.write("genes\tclosest_genes\tdistances\n")

        # The result dicts are keyed by the value of 'identifier'. Rows must
        # be looked up with those same values: iterating over gene ids with
        # another identifier would only produce empty rows.
        if identifier == "gene_id":
            row_ids = gn_ids
        else:
            row_ids = list(dict.fromkeys(all_ids))

        for gene in row_ids:

            if not collapse:
                outputfile.write("\t".join([gene,
                                            ",".join(gene_closest[gene]),
                                            ",".join(gene_closest_dist[gene])]) + "\n")
            else:
                for closest, dist in zip(gene_closest[gene],
                                         gene_closest_dist[gene]):
                    outputfile.write("\t".join([gene, closest, dist]) + "\n")

    gc.disable()
    close_properly(outputfile, inputfile)
def overlapping(
        inputfile=None,
        outputfile=None,
        key_name=None,
        upstream=1500,
        downstream=1500,
        chrom_info=None,
        feature_type='transcript',
        same_strandedness=False,
        diff_strandedness=False,
        annotate_gtf=False,
        bool=False,
        annotate_all=False,
        invert_match=False):
    """
    Find transcripts whose body, promoter or TTS region does (or does not)
    overlap a transcript from another gene.

    :param inputfile: The input GTF (handed over to GTF()).
    :param outputfile: The destination passed to GTF.write().
    :param key_name: Name of the added key; when None and annotate_gtf is \
    set it is built from feature_type, upstream and downstream.
    :param upstream: Left extension (slop 'l') of the regions.
    :param downstream: Right extension (slop 'r') of the regions.
    :param chrom_info: Object whose .name is given to slop as genome file.
    :param feature_type: 'transcript', 'promoter' or 'tts'.
    :param same_strandedness: Forwarded as 's' to intersect.
    :param diff_strandedness: Forwarded as 'S' to intersect.
    :param annotate_gtf: Add the result as a key instead of selecting lines.
    :param bool: Replace the lists of overlapping transcripts by "0"/"1".
    :param annotate_all: Start with an entry for every transcript.
    :param invert_match: Write the transcripts that are NOT in the result.
    """

    # ----------------------------------------------------------------------
    # Key name and argument checks
    # ----------------------------------------------------------------------

    if annotate_gtf:
        if key_name is None:
            key_name = "_".join(["overlap",
                                 feature_type,
                                 "u" + str(upstream / 1000) + "k",
                                 "d" + str(downstream / 1000) + "k"])
        if invert_match:
            message("--annotate-gtf and --invert-match are "
                    "mutually exclusive.",
                    type="ERROR")

    if same_strandedness and diff_strandedness:
        message("--same-strandedness and --diff-strandedness are "
                "mutually exclusive.",
                type="ERROR")

    message("Using -u " + str(upstream))
    message("Using -d " + str(downstream))

    overlapping_tx = defaultdict(list)

    # The GTF is loaded once here so that a stream coming
    # from stdin is not lost.
    gtf = GTF(inputfile)

    message("Getting transcript in bed format")

    tx_feat = gtf.select_by_key("feature", "transcript")

    if annotate_all:
        overlapping_tx = gtf.extract_data(keys=["transcript_id"],
                                          as_dict=True,
                                          default_val="0")
        for tx in overlapping_tx:
            overlapping_tx[tx] = []

    # ----------------------------------------------------------------------
    # Transcript limits and regions to be tested
    # ----------------------------------------------------------------------

    tx_bed = tx_feat.to_bed(name=["transcript_id", "gene_id"], sep="||")

    message("Getting " + feature_type + " and 'slopping'.")

    def _slop(bed):
        # Extend the regions and keep the first six columns.
        extended = bed.slop(s=True,
                            l=upstream,
                            r=downstream,
                            g=chrom_info.name)
        return extended.cut([0, 1, 2, 3, 4, 5])

    if feature_type == "transcript":
        bed_obj = _slop(tx_bed)
    elif feature_type == "promoter":
        bed_obj = _slop(tx_feat.get_tss(name=["transcript_id", "gene_id"],
                                        sep="||"))
    elif feature_type == "tts":
        bed_obj = _slop(tx_feat.get_tts(name=["transcript_id", "gene_id"],
                                        sep="||"))
    else:
        message("Not implemented yet", type="ERROR")

    tmp_file = make_tmp_file(feature_type + "_slopped_region", ".bed")
    bed_obj.saveas(tmp_file.name)

    overlap_regions = bed_obj.intersect(tx_bed,
                                        wb=True,
                                        s=same_strandedness,
                                        S=diff_strandedness)

    tmp_file = make_tmp_file(feature_type + "_overlapping_regions", ".bed")
    overlap_regions.saveas(tmp_file.name)

    for hit in overlap_regions:
        # Column 10 names the overlapped transcript, column 4 the region.
        tx_other, gn_other = hit.fields[9].split("||")
        tx_id, gene_id = hit.fields[3].split("||")

        if gene_id != gn_other:
            overlapping_tx[tx_id] += [tx_other]

    if bool:
        for tx in overlapping_tx:
            overlapping_tx[tx] = "1" if len(overlapping_tx[tx]) else "0"

    # ----------------------------------------------------------------------
    # Write
    # ----------------------------------------------------------------------

    if invert_match:
        values = ",".join(set(overlapping_tx.keys()))
        gtf.select_by_key("transcript_id",
                          values,
                          invert_match).write(outputfile, gc_off=True)
    elif annotate_gtf:
        if len(overlapping_tx):
            gtf = gtf.add_attr_from_dict(feat="transcript",
                                         key="transcript_id",
                                         a_dict=overlapping_tx,
                                         new_key=key_name)
        gtf.write(outputfile, gc_off=True)
    else:
        value = ",".join(set(overlapping_tx.keys()))
        gtf.select_by_key("transcript_id",
                          value).write(outputfile, gc_off=True)

    gc.disable()
    close_properly(outputfile, inputfile)
def coverage(
        inputfile=None,
        outputfile=None,
        bw_list=None,
        labels=None,
        pseudo_count=1,
        nb_window=1,
        ft_type="promoter",
        n_highest=None,
        downstream=1000,
        key_name="cov",
        zero_to_na=False,
        name_column=None,
        upstream=1000,
        chrom_info=None,
        nb_proc=1,
        matrix_out=False,
        stat='mean'):
    """
    Compute transcript coverage with one or several bigWig.

    :param inputfile: The input GTF or BED file (an object with a .name).
    :param outputfile: The output file.
    :param bw_list: The bigWig files (objects with a .name).
    :param labels: Comma-separated labels, one per bigWig. When None the \
    labels are the bigWig base names without extension.
    :param pseudo_count: Forwarded to bw_cov_mp().
    :param nb_window: The number of bins (forwarded as 'bin_nb').
    :param ft_type: The regions of interest when the input is a GTF.
    :param n_highest: The number of windows used for the score \
    (defaults to nb_window, cannot exceed it).
    :param downstream: Right extension (slop 'r') of the regions.
    :param key_name: Not used in this function body.
    :param zero_to_na: Forwarded to bw_cov_mp().
    :param name_column: Comma-separated keys used to name the regions \
    (GTF input only).
    :param upstream: Left extension (slop 'l') of the regions.
    :param chrom_info: Object whose .name is given to slop as genome file.
    :param nb_proc: Forwarded to bw_cov_mp().
    :param matrix_out: Write one row per region and one column per label.
    :param stat: Forwarded to bw_cov_mp().
    """

    # Local import: only needed to escape the labels used as regex prefix.
    import re

    # -------------------------------------------------------------------------
    # Create a list of labels.
    # Take user input in account
    # -------------------------------------------------------------------------

    bw_list = [x.name for x in bw_list]

    if len(bw_list) != len(set(bw_list)):
        message("Found the same bigwigs several times.",
                type="ERROR")

    message('Checking labels.')

    if labels is not None:
        labels = labels.split(",")

        # Ensure the number of labels is the same as the number of bw files.
        if len(labels) != len(bw_list):
            message("The number of labels should be the same as the number of"
                    " bigwig files.", type="ERROR")

        # Ensure labels are non-redundant
        if len(labels) > len(set(labels)):
            message("Labels must be unique.", type="ERROR")
    else:
        labels = [os.path.splitext(os.path.basename(bw))[0]
                  for bw in bw_list]

    # -------------------------------------------------------------------------
    # Check the number of windows
    # -------------------------------------------------------------------------

    if n_highest is None:
        n_highest = nb_window

    message('Number of bins: %d' % nb_window)
    message('N highest values: %d' % n_highest)

    if n_highest > nb_window:
        message('The number of window used for computing the score'
                ' (-n) can not be greater than the number of'
                ' windows (-w)', type="ERROR")
        sys.exit()

    # -------------------------------------------------------------------------
    # Check input file is in bed or GTF format
    # -------------------------------------------------------------------------

    message("Loading input file...")

    if inputfile.name == '<stdin>':
        gtf = GTF(inputfile.name)
        is_gtf = True
    else:
        region_bo = BedTool(inputfile.name)

        if len(region_bo) == 0:
            message("Unable to find requested regions",
                    type="ERROR")

        if region_bo.file_type == 'gff':
            gtf = GTF(inputfile.name)
            is_gtf = True
        else:
            is_gtf = False

    # -------------------------------------------------------------------------
    # Get regions of interest
    # -------------------------------------------------------------------------

    # name_column is only used for GTF input: do not fail on the None
    # default when it is not needed.
    if name_column is not None:
        name_column = name_column.split(",")

    if is_gtf:

        message("Getting regions of interest...")

        ft_type_lower = ft_type.lower()

        if ft_type_lower == "intergenic":
            base_bo = gtf.get_intergenic(chrom_info, 0, 0)
        elif ft_type_lower == "intron":
            base_bo = gtf.get_introns()
        elif ft_type == "intron_by_tx":
            base_bo = gtf.get_introns(by_transcript=True,
                                      name=name_column)
        elif ft_type_lower in ["promoter", "tss"]:
            base_bo = gtf.get_tss(name=name_column)
        elif ft_type_lower in ["tts", "terminator"]:
            base_bo = gtf.get_tts(name=name_column)
        else:
            base_bo = gtf.select_by_key("feature",
                                        ft_type,
                                        0).to_bed(name=name_column)

        region_bo = base_bo.slop(s=True,
                                 l=upstream,
                                 r=downstream,
                                 g=chrom_info.name).sort()

        if len(region_bo) == 0:
            message("Unable to find requested regions",
                    type="ERROR")
    else:
        region_bo = region_bo.slop(s=True,
                                   l=upstream,
                                   r=downstream,
                                   g=chrom_info.name).sort()

    region_bed = make_tmp_file(prefix="region", suffix=".bed")
    region_bo.saveas(region_bed.name)

    # -------------------------------------------------------------------------
    # Compute coverage
    # -------------------------------------------------------------------------

    # The region file is only needed during the computation: make sure
    # its handle is released afterwards.
    with open(region_bed.name) as region_file:
        result_bed = bw_cov_mp(bw_list=bw_list,
                               region_file=region_file,
                               labels=labels,
                               bin_nb=nb_window,
                               pseudo_count=pseudo_count,
                               zero_to_na=zero_to_na,
                               nb_proc=nb_proc,
                               n_highest=n_highest,
                               stat=stat,
                               verbose=pygtftk.utils.VERBOSITY)

    if matrix_out:
        result_bed.close()

        df_first = pd.read_csv(result_bed.name, sep="\t", header=None)

        # Columns are integer labels (header=None): reorder them as
        # chrom, start, end, name, strand, value. DataFrame.ix is no
        # longer available in pandas >= 1.0, .loc is the label-based
        # equivalent.
        df_first = df_first.loc[:, [0, 1, 2, 3, 5, 4]]

        key_columns = [0, 1, 2, 3, 5]
        df_final = None

        for label in labels:
            # The name column is prefixed by '<label>|'. The label is
            # escaped as it may contain regex metacharacters (e.g. '.').
            prefix = r"^" + re.escape(label) + r"\|"

            # Sub data frame with the coverage values of the current bigWig.
            tmp_df = df_first[df_first[3].str.match(prefix)].copy()
            tmp_df[3] = tmp_df[3].replace(prefix, r"", regex=True)

            # Give the value column a unique name so that successive
            # merges never produce clashing '_x'/'_y' suffixes.
            tmp_df = tmp_df.rename(columns={4: label})

            if df_final is None:
                df_final = tmp_df
            else:
                # Join on chrom, start, end, name, strand.
                df_final = df_final.merge(tmp_df, on=key_columns)

        df_final.columns = ["chrom",
                            "start",
                            "end",
                            "name",
                            "strand"] + labels

        df_final.to_csv(outputfile, sep="\t", index=False)

    else:
        nb_line = 0
        for line in result_bed:
            outputfile.write(line)
            nb_line += 1

        if nb_line == 0:
            message("No line available in output...",
                    type="ERROR")

    gc.disable()
    close_properly(inputfile, outputfile)
def _get_tx_seq_fasta_headers(fasta_file):
    """Return the header of each record of an open fasta file, without the leading '>'."""
    return [line.rstrip("\n")[1:] for line in fasta_file if line.startswith(">")]


def get_tx_seq(inputfile=None,
               outputfile=None,
               genome=None,
               with_introns=False,
               delete_version=False,
               del_chr=False,
               separator="",
               no_rev_comp=False,
               label="",
               sleuth_format=True,
               explicit=True,
               assembly="bla"):
    """
    Get transcripts sequences in fasta format from a GTF file.

    :param inputfile: The input GTF (handed over to GTF()).
    :param outputfile: An object with a write() method receiving the fasta.
    :param genome: A list of open fasta files. Several files are merged \
    into a temporary file first.
    :param with_introns: Forwarded as 'intron' to get_sequences().
    :param delete_version: Remove a trailing '.<digits>' from transcript \
    and gene ids (sleuth format only).
    :param del_chr: Remove 'chr' from the chromosome name \
    (sleuth format only).
    :param separator: The separator of the header fields (non sleuth format).
    :param no_rev_comp: Do not ask get_sequences() for reverse-complement.
    :param label: Comma-separated keys written in the header \
    (non sleuth format).
    :param sleuth_format: Write the header as \
    'tx_id chromosome:assembly:chrom:start:end:1 gene:... gene_biotype:... transcript_biotype:...'.
    :param explicit: Write header fields as key=value (non sleuth format).
    :param assembly: The assembly name written in the sleuth header.
    """

    # -----------------------------------------------------------
    #  Check chromosomes in fasta file
    # -----------------------------------------------------------

    message("%d fasta files found." % len(genome))

    if any(x.name.endswith(".gz") for x in genome):
        message("Genome in gz format is not currently supported.",
                type="ERROR")

    if len(genome) == 1:
        message("Checking fasta file chromosome list")
        genome = genome[0]
    else:
        message("Merging fasta files")
        tmp_genome = make_tmp_file(prefix="genome", suffix=".fa")
        with tmp_genome as tg:
            for curr_file in genome:
                message("Merging %s" % curr_file.name)
                with curr_file as cf:
                    shutil.copyfileobj(cf, tg, 1024 * 1024 * 100)

        message("Checking fasta file chromosome list")
        genome = open(tmp_genome.name, "r")

    # Only genome.name is needed afterwards: the handle can be closed
    # once the headers have been read.
    with genome as genome_file:
        genome_chr_list = _get_tx_seq_fasta_headers(genome_file)

    rev_comp = not no_rev_comp

    message("Chromosomes in fasta file: " + ",".join(genome_chr_list))

    # -----------------------------------------------------------
    #  Read gtf
    # -----------------------------------------------------------

    gtf = GTF(inputfile)
    nb_tx_before = gtf.extract_data("transcript_id",
                                    as_list=True,
                                    no_na=True,
                                    nr=True)

    # -----------------------------------------------------------
    #  Select genes falling in chrom defined in the fasta file
    # -----------------------------------------------------------

    message("Chromosomes in gtf file: " + ",".join(gtf.get_chroms(nr=True)))
    message("Selecting chromosome defined in the fasta file")

    gtf = gtf.select_by_key(key="seqid", value=",".join(genome_chr_list))

    message("Chromosomes in gtf file: " + ",".join(gtf.get_chroms(nr=True)))

    if len(gtf) == 0:
        message("No genes were found on chromosomes defined in fasta file.",
                type="ERROR")

    nb_tx_after = gtf.extract_data("transcript_id",
                                   as_list=True,
                                   no_na=True,
                                   nr=True)

    if len(nb_tx_after) != len(nb_tx_before):
        diff = list(set(nb_tx_before) - set(nb_tx_after))
        message("Some transcripts had"
                " no corresponding chromosome"
                " in the fasta file: " + ",".join(diff)[0:100] + "...")

    message("Using genome file: " + genome.name)
    message("Retrieving fasta sequences from " + genome.name)

    fasta_seq = gtf.get_sequences(genome=genome.name,
                                  intron=with_introns,
                                  rev_comp=rev_comp)

    tx_gtf = gtf.select_by_key("feature", "transcript")

    if sleuth_format:

        tx_biotype = tx_gtf.extract_data("transcript_id,transcript_biotype",
                                         as_dict_of_lists=True,
                                         hide_undef=False)
        gn_biotype = tx_gtf.extract_data("gene_id,gene_biotype",
                                         as_dict_of_lists=True,
                                         hide_undef=False)

        # Raw string: '\.' in a regular string literal is an invalid
        # escape sequence. Compiled once, outside of the loop.
        version_suffix = re.compile(r'\.[0-9]+$')

        for i in fasta_seq:
            gene_id = i.gene_id
            transcript_id = i.transcript_id
            chrom = i.chrom

            gn_bio = gn_biotype[i.gene_id][0]
            tx_bio = tx_biotype[i.transcript_id][0]

            if delete_version:
                transcript_id = version_suffix.sub('', transcript_id)
                gene_id = version_suffix.sub('', gene_id)

            if del_chr:
                chrom = chrom.replace('chr', '')

            header = " ".join([
                transcript_id,
                ":".join([
                    "chromosome",
                    assembly,
                    chrom,
                    str(i.start),
                    str(i.end),
                    "1"
                ]),
                "gene:" + gene_id,
                "gene_biotype:" + gn_bio,
                "transcript_biotype:" + tx_bio
            ])

            outputfile.write(">" + header + "\n")
            outputfile.write(i.sequence + "\n")
    else:
        tx_info = tx_gtf.extract_data("transcript_id," + label,
                                      as_dict_of_lists=True,
                                      hide_undef=False)
        label_keys = label.split(",")

        for i in fasta_seq:
            if not explicit:
                header = separator.join(tx_info[i.transcript_id])
            else:
                header = separator.join(
                    [str(key) + "=" + val
                     for key, val in zip(label_keys, tx_info[i.transcript_id])])

            outputfile.write(">" + header + "\n")
            outputfile.write(i.sequence + "\n")

    gc.disable()
    close_properly(outputfile, inputfile)
def select_by_intron_size(inputfile=None,
                          outputfile=None,
                          intron_size=0,
                          merged=False,
                          invert_match=False,
                          delete_monoexonic=False,
                          add_intron_size=False):
    """
    Discard transcripts based on the size of their introns and write
    the remaining ones.

    Without invert_match a transcript is discarded as soon as one of its
    introns (or, when merged is set, the sum of its introns) is smaller
    than intron_size. With invert_match the test is 'greater than or
    equal to' instead. Transcripts without intron are only discarded
    when delete_monoexonic is set.

    :param inputfile: The input GTF (handed over to GTF()).
    :param outputfile: The destination passed to GTF.write().
    :param intron_size: The size threshold.
    :param merged: Test the sum of the intron sizes of each transcript.
    :param invert_match: Invert the size test.
    :param delete_monoexonic: Also discard transcripts without intron.
    :param add_intron_size: Add the sizes to the transcript features \
    ('intron_size_sum' when merged, 'intron_size' otherwise).
    """

    message("Searching for intronic regions.")

    gtf = GTF(inputfile, check_ensembl_format=False)

    introns_bo = gtf.get_introns(by_transcript=True,
                                 name=["transcript_id"],
                                 intron_nb_in_name=False).sort()

    # Every transcript of the GTF.
    all_tx_ids = gtf.get_tx_ids(nr=True)

    # Transcripts to be discarded (used as an ordered set).
    to_delete = OrderedDict()

    def _is_rejected(size):
        # The size test, depending on the requested direction.
        if invert_match:
            return size >= intron_size
        return size < intron_size

    if merged:
        # transcript -> sum of intron sizes (0 when no intron was found).
        intron_sum_dict = OrderedDict.fromkeys(all_tx_ids, 0)

        for intron in introns_bo:
            intron_sum_dict[intron.name] += intron.end - intron.start

        for tx_id, sum_intron in list(intron_sum_dict.items()):
            if sum_intron == 0:
                if delete_monoexonic:
                    to_delete[tx_id] = 1
            elif _is_rejected(sum_intron):
                to_delete[tx_id] = 1

        if add_intron_size:
            gtf = gtf.add_attr_from_dict(feat="transcript",
                                         key="transcript_id",
                                         a_dict=intron_sum_dict,
                                         new_key="intron_size_sum")
    else:
        # transcript -> list of intron sizes.
        intron_size_dict = defaultdict(list)

        for tx_id in all_tx_ids:
            intron_size_dict[tx_id] = []

        for intron in introns_bo:
            intron_size_dict[intron.name].append(intron.end - intron.start)

        for tx_id, list_size in list(intron_size_dict.items()):
            if not list_size:
                # No intron: reported with a size of 0.
                intron_size_dict[tx_id] = [0]
                if delete_monoexonic:
                    to_delete[tx_id] = 1
            elif any(_is_rejected(size) for size in list_size):
                to_delete[tx_id] = 1

        if add_intron_size:
            for tx_id, list_size in list(intron_size_dict.items()):
                intron_size_dict[tx_id] = ",".join([str(x) for x in list_size])

            gtf = gtf.add_attr_from_dict(feat="transcript",
                                         key="transcript_id",
                                         a_dict=intron_size_dict,
                                         new_key="intron_size")

    kept_tx_ids = [x for x in gtf.get_tx_ids(nr=True) if x not in to_delete]

    # Only the first 40 characters of the list are displayed.
    msg_list = ",".join(to_delete.keys())[:40]
    message("Deleting: " + msg_list + "...")

    gtf = gtf.select_by_key("transcript_id", ",".join(kept_tx_ids))

    gtf.write(outputfile, gc_off=True)

    close_properly(outputfile, inputfile)
def divergent(
        inputfile=None,
        outputfile=None,
        key_name=None,
        upstream=1500,
        downstream=1500,
        chrom_info=None,
        no_strandness=False,
        no_annotation=False):
    """
    Find transcripts whose promoter region contains the TSS of a
    transcript from another gene.

    For every transcript the closest such TSS is kept. The result is either
    added to the transcript features (transcript id and distance) or used to
    select the matching transcripts when no_annotation is set.

    :param inputfile: The input GTF (handed over to GTF()).
    :param outputfile: The destination passed to GTF.write().
    :param key_name: Name of the added key ('divergent' when None). The \
    distance key is 'dist_' + key_name ('dist_to_divergent' when None).
    :param upstream: Left extension (slop 'l') applied to each TSS.
    :param downstream: Right extension (slop 'r') applied to each TSS.
    :param chrom_info: Object whose .name is given to slop as genome file.
    :param no_strandness: Do not require the overlapped TSS to be on the \
    opposite strand.
    :param no_annotation: Write only the selected transcripts instead of \
    annotating the whole GTF.
    """

    message("Using -u " + str(upstream) + ".")
    message("Using -d " + str(downstream) + ".")

    message("Loading GTF.")
    gtf = GTF(inputfile)

    message("Getting transcript coordinates.")
    tx_feat = gtf.select_by_key("feature", "transcript")

    message("Getting tss coordinates.")
    tss_bo = tx_feat.get_tss(name=["transcript_id", "gene_id"], sep="||")

    # Remember the TSS coordinate of every transcript.
    tss_pos = {}
    for tss in tss_bo:
        tss_tx_id, _ = tss.name.split("||")
        tss_pos[tss_tx_id] = int(tss.start)

    message("Getting promoter coordinates.")
    promoter_bo = tss_bo.slop(s=True,
                              l=upstream,
                              r=downstream,
                              g=chrom_info.name)
    promoter_bo = promoter_bo.cut([0, 1, 2, 3, 4, 5])

    message("Intersecting...")
    # Opposite strand is required unless no_strandness is set.
    prom_with_tss_bo = promoter_bo.intersect(tss_bo,
                                             wb=True,
                                             s=False,
                                             S=not no_strandness)

    # Keep a copy of the intermediate results in temporary files.
    tmp_file = make_tmp_file("promoter_slop", ".bed")
    promoter_bo.saveas(tmp_file.name)
    tmp_file = make_tmp_file("promoter_intersection_with_tss_as_", ".bed")
    prom_with_tss_bo.saveas(tmp_file.name)

    tx_with_divergent = {}
    dist_to_divergent = {}

    for hit in prom_with_tss_bo:
        # Column 10 names the overlapped TSS, column 4 the promoter.
        tx_id_tss, gn_id_tss = hit.fields[9].split("||")
        tx_id_prom, gene_id_prom = hit.fields[3].split("||")

        if gene_id_prom == gn_id_tss:
            # Transcripts of the same gene are not considered.
            continue

        dist = abs(tss_pos[tx_id_prom] - tss_pos[tx_id_tss])

        # First candidate for this transcript, or a strictly closer one.
        if (tx_id_prom not in tx_with_divergent
                or dist < dist_to_divergent[tx_id_prom]):
            dist_to_divergent[tx_id_prom] = dist
            tx_with_divergent[tx_id_prom] = tx_id_tss

    if no_annotation:
        selected = ",".join(tx_with_divergent.keys())
        gtf.select_by_key("transcript_id",
                          selected).write(outputfile, gc_off=True)
    else:
        if key_name is None:
            key_name = "divergent"
            key_name_dist = "dist_to_divergent"
        else:
            key_name_dist = "dist_" + key_name

        if tx_with_divergent:
            gtf = gtf.add_attr_from_dict(feat="transcript",
                                         key="transcript_id",
                                         a_dict=tx_with_divergent,
                                         new_key=key_name)
            gtf = gtf.add_attr_from_dict(feat="transcript",
                                         key="transcript_id",
                                         a_dict=dist_to_divergent,
                                         new_key=key_name_dist)

        gtf.write(outputfile, gc_off=True)

    close_properly(outputfile, inputfile)
def rm_dup_tss(inputfile=None, outputfile=None):
    """
    If several transcripts of a gene share the same tss, select only one.

    Gene features and the redundant transcripts are removed from the
    written GTF.

    :param inputfile: The input GTF (handed over to GTF()).
    :param outputfile: The destination passed to GTF.write().
    """

    # ----------------------------------------------------------------------
    # Get the TSS
    # ----------------------------------------------------------------------

    gtf = GTF(inputfile)
    tss_bo = gtf.get_tss(["gene_id", "transcript_id"])

    # ----------------------------------------------------------------------
    # Sort the file by name (4th col) to ensure reproducibility between calls.
    # ----------------------------------------------------------------------

    with open(tss_bo.fn) as f:
        lines = [line.split('\t') for line in f]

    tmp_file = make_tmp_file(prefix="tss_sorted_by_tx_id", suffix=".bed")

    for line in sorted(lines, key=operator.itemgetter(3)):
        tmp_file.write('\t'.join(line))

    tmp_file.close()

    tss_bo = BedTool(tmp_file.name)

    # ----------------------------------------------------------------------
    # Get the list of non redundant TSSs
    # ----------------------------------------------------------------------

    # gene_id -> {tss position: first transcript seen at this position}
    gene_dict = defaultdict(dict)
    to_delete = []

    message("Looking for redundant TSS (gene-wise).")

    for line in tss_bo:
        tss = line.start
        gene_id, tx_id = line.name.split("|")

        if tss in gene_dict[gene_id]:
            # Another transcript of this gene already uses this TSS.
            to_delete.append(tx_id)
        else:
            gene_dict[gene_id][tss] = tx_id

    # Display at most the first ten deleted transcripts, starting with the
    # very first one.
    message("Deleted transcripts: " + ",".join(to_delete[:10]) + "...",
            type="DEBUG")

    # ----------------------------------------------------------------------
    # Write
    # ----------------------------------------------------------------------

    gtf.select_by_key("feature",
                      "gene",
                      invert_match=True
                      ).select_by_key("transcript_id",
                                      ",".join(to_delete),
                                      invert_match=True).write(outputfile,
                                                               gc_off=True)