def __call__(self, parser, namespace, values, option_string=None):
    """Argparse Action entry point for the chromosome-info argument.

    Turns the user-supplied value into an open, read-mode file object
    containing tab-separated chromosome names and sizes, then stores that
    file object on the namespace. Three cases are handled:

      - a known UCSC assembly name (e.g. 'hg38'): sizes are fetched
        through pybedtools and written to a temporary file;
      - the literal 'simple': a tiny two-chromosome toy genome
        (useful for tests and examples);
      - anything else: treated as a path to an existing chrom-info file,
        which is opened and validated.

    :param parser: the argparse parser (unused here).
    :param namespace: the argparse namespace receiving the attribute.
    :param values: assembly name, 'simple', or a file path.
    :param option_string: the option flag that triggered this action.
    """
    if values in ["mm8", "mm9", "mm10", "hg19", "hg38", "rn3", "rn4"]:
        # Fetch chromosome sizes for the requested assembly via pybedtools.
        chr_size = pybedtools.helpers.chromsizes(values)
        # Discard haplotype chromosomes, unplaced contigs, unlocalized
        # contigs, alternate loci and the mitochondrial chromosome.
        regexp = re.compile('(_random)|(^chrUn)|(_hap\d+)|(_alt)|(^chrM$)')
        chr_size = {key: chr_size[key] for key in chr_size if not regexp.search(key)}
        tmp_file_chr = make_tmp_file(prefix='chromsize', suffix='.txt')
        for chrom, size in chr_size.items():
            # chromsizes() maps each chromosome to a (start, end) pair;
            # only the end coordinate (the size) is written out.
            tmp_file_chr.write(chrom + "\t" + str(size[1]) + "\n")
        tmp_file_chr.close()
        # Hand back an open read handle on the freshly written file.
        values = open(tmp_file_chr.name, 'r')
    elif values == 'simple':
        # Minimal fake genome used for demos/tests.
        chr_size = {'chr1': 300, 'chr2': 600}
        tmp_file_chr = make_tmp_file(prefix='chromsize', suffix='.txt')
        for chrom, size in chr_size.items():
            tmp_file_chr.write(chrom + "\t" + str(size) + "\n")
        tmp_file_chr.close()
        values = open(tmp_file_chr.name, 'r')
    else:
        # User-provided chrom-info file: check existence, open it and
        # validate its format (chrom_info_as_dict errors out on bad input).
        check_file_or_dir_exists(values)
        values = open(values, "r")
        chrom_info_as_dict(values)
    # Add the attribute
    setattr(namespace, self.dest, values)
def bed_to_gtf(inputfile=None, outputfile=None, ft_type="transcript", source="Unknown"):
    """Convert a bed file to a gtf. This will make the poor bed feel as
    if it was a nice gtf (but with lots of empty fields...). May be
    helpful sometimes...
    """
    message("Converting the bed file into GTF file.")

    # A stream from stdin cannot be handed to BedTool directly:
    # spool it to a temporary file first.
    if inputfile.name == '<stdin>':
        tmp_file = make_tmp_file(prefix="input_bed", suffix=".bed")
        for line in inputfile:
            write_properly(chomp(str(line)), tmp_file)
        tmp_file.close()
        inputfile.close()
        bed_records = BedTool(tmp_file.name)
    else:
        bed_records = BedTool(inputfile.name)

    feat_num = 1
    for rec in bed_records:
        # Supply defaults for optional bed columns that are absent.
        strand = rec.strand if rec.strand != "" else "."
        name = rec.name if rec.name != "" else "feature_" + str(feat_num)
        score = rec.score if rec.score != "" else "0"

        # Build the GTF attribute column for the requested feature type.
        if ft_type == "exon":
            attributes = ('gene_id "{0}"; transcript_id "{0}"; '
                          'exon_id "{0}";').format(name)
        elif ft_type == "gene":
            attributes = 'gene_id "{0}";'.format(name)
        else:
            attributes = 'gene_id "{0}"; transcript_id "{0}";'.format(name)

        # Optionally prefix chromosome names with 'chr'.
        if pygtftk.utils.ADD_CHR == 1:
            chrom_out = "chr" + rec.chrom
        else:
            chrom_out = rec.chrom

        # GTF is 1-based inclusive while bed starts are 0-based.
        fields = [chrom_out,
                  source,
                  ft_type,
                  str(rec.start + 1),
                  str(rec.end),
                  str(score),
                  strand,
                  ".",
                  attributes]
        write_properly("\t".join(fields), outputfile)
        feat_num += 1

    gc.disable()
    close_properly(outputfile)
def nb_transcripts(inputfile=None, outputfile=None, text_format=False, key_name=""):
    """Compute the number of transcript per gene."""
    gtf = GTF(inputfile)

    message("Computing the number of transcript per gene in input GTF file.")

    # Mapping: gene_id -> list of transcript ids.
    gn_to_tx = gtf.get_gn_to_tx()

    # In text mode the counts go straight to the output file; otherwise
    # they are staged in a temporary file and injected back into the GTF
    # as a new gene-level attribute.
    if text_format:
        dest = outputfile
    else:
        dest = make_tmp_file(prefix="nb_tx", suffix=".txt")

    for gene_id in gn_to_tx:
        dest.write(gene_id + "\t" + str(len(gn_to_tx[gene_id])) + "\n")

    if not text_format:
        dest.close()
        gtf.add_attr_from_file(feat="gene",
                               key="gene_id",
                               new_key=key_name,
                               inputfile=dest.name).write(outputfile, gc_off=True)

    close_properly(outputfile, inputfile)
def discretize_key(inputfile=None,
                   outputfile=None,
                   src_key=None,
                   dest_key="disc_key",
                   nb_levels=2,
                   percentiles=False,
                   percentiles_of_uniq=False,
                   precision=2,
                   log=False,
                   labels=None):
    """Create a new key by discretizing a numeric key.

    :param inputfile: the input GTF stream/file.
    :param outputfile: the output GTF file.
    :param src_key: the numeric key to discretize.
    :param dest_key: the name of the new key holding the class labels.
    :param nb_levels: number of classes (must be >= 2).
    :param percentiles: if True, cut at percentile boundaries instead of
        equal-width bins.
    :param percentiles_of_uniq: compute percentiles on unique values only.
    :param precision: rounding precision for auto-generated labels.
    :param log: log2-transform values before binning.
    :param labels: optional comma-separated list of class labels
        (one per level, no duplicates).
    """
    if nb_levels < 2:
        message("--nb-levels has to be greater than 2.", type="ERROR")

    # -------------------------------------------------------------------------
    # Check labels and nb_levels
    # -------------------------------------------------------------------------
    if labels is not None:
        labels = labels.split(",")
        if len(labels) != nb_levels:
            message(
                "The number of labels should be the same as the number of levels.",
                type="ERROR")
        if len(labels) != len(set(labels)):
            message("Redundant labels not allowed.", type="ERROR")

    # -------------------------------------------------------------------------
    # Load GTF. Retrieve values for src-key
    # -------------------------------------------------------------------------
    gtf = GTF(inputfile, check_ensembl_format=False)
    src_values = gtf.extract_data(src_key, as_list=True)

    # '.' and '?' are the undefined-value placeholders produced by extract_data.
    if len([x for x in src_values if x not in ['.', '?']]) == 0:
        message('The key was not found in this GTF.', type="ERROR")

    min_val = None
    max_val = None

    dest_values = []   # numeric values, in order of appearance
    dest_pos = []      # positions (line indices) of those numeric values

    # Track min/max while collecting the values that parse as floats;
    # non-numeric entries are silently skipped.
    for p, v in enumerate(src_values):
        try:
            a = float(v)
            if min_val is not None:
                if a > max_val:
                    max_val = a
                if a < min_val:
                    min_val = a
            else:
                min_val = a
                max_val = a
            dest_values += [a]
            dest_pos += [p]
        except ValueError:
            pass

    if min_val is None:
        message("Did not find numeric values in the source key.",
                type="ERROR")
    if min_val == max_val:
        message(
            "The minimum and maximum values found in the source key are the same.",
            type="ERROR")

    if log:
        if 0 in dest_values:
            message("Encountered zero values before log transformation.",
                    type="WARNING", force=True)
            message("Adding a pseudocount (+1).",
                    type="WARNING", force=True)
        # NOTE(review): reconstructed nesting — the pseudocount appears to be
        # applied to all values whenever --log is requested, while the warning
        # above only fires when zeros are present. Confirm against upstream.
        pseudo_count = 1
        dest_values = list(np.log2([x + pseudo_count for x in dest_values]))

        # update max/min values
        max_val = max(dest_values)
        min_val = min(dest_values)

    # Apply the same rule as pandas.cut when bins is an int:
    # extend the lower bound slightly so the minimum falls inside a bin.
    min_val = min_val - max_val / 1000

    # -------------------------------------------------------------------------
    # Compute percentiles if required
    # -------------------------------------------------------------------------
    if percentiles:
        if percentiles_of_uniq:
            dest_values_tmp = [min_val] + list(set(dest_values))
        else:
            dest_values_tmp = [min_val] + dest_values
        n = nb_levels

        # Break points: the 0th..(n-1)th n-quantiles plus the maximum.
        q = [np.percentile(dest_values_tmp, 100 / n * i) for i in range(0, n)]
        q = q + [np.percentile(dest_values_tmp, 100)]

        # pandas.cut requires strictly monotonic bin edges.
        if len(q) != len(set(q)):
            message("No ties are accepted in percentiles.",
                    type="WARNING", force=True)
            message("Breaks: " + str(q), type="WARNING", force=True)
            message("Try -u. Exiting", type="ERROR")

    # -------------------------------------------------------------------------
    # Create a factor
    # -------------------------------------------------------------------------
    if percentiles:
        (breaks, cat_label) = pandas.cut(dest_values,
                                         bins=q,
                                         labels=labels,
                                         retbins=True)
    else:
        (breaks, cat_label) = pandas.cut(dest_values,
                                         bins=nb_levels,
                                         labels=labels,
                                         retbins=True)

    if labels is None:
        # No user labels: derive interval labels such as "[a_b]" from the
        # computed bin edges.
        # The include_lowest argument of pandas is not working.
        # Using this workaround to avoid minimum value outside of data range.
        cat_label[0] = min(dest_values)
        cat_label = [round(x, precision) for x in cat_label]
        if precision == 0:
            cat_label = [int(x) for x in cat_label]
        # Turn consecutive edge pairs into "(a, b)" strings, then normalize
        # the bracket/separator characters.
        cat_label = [str(x) for x in list(zip(cat_label[:-1], cat_label[1:]))]
        cat_label[0] = cat_label[0].replace("(", "[")
        cat_label = [x.replace(")", "]") for x in cat_label]
        # The string can be very problematic later...
        cat_label = [str(x).replace(", ", "_") for x in cat_label]
        breaks.categories = cat_label

    message("Categories: " + str(list(breaks.categories)),
            type="INFO",
            force=True)

    # -------------------------------------------------------------------------
    # Write to disk
    # -------------------------------------------------------------------------
    tmp_file = make_tmp_file(prefix="discretized_keys", suffix=".txt")

    # One "position<TAB>class" line per numeric value found above.
    with tmp_file as tp_file:
        for p, v in zip(dest_pos, breaks):
            tp_file.write(str(p) + "\t" + str(v) + '\n')

    gtf.add_attr_to_pos(tmp_file,
                        new_key=dest_key).write(outputfile, gc_off=True)

    close_properly(outputfile, inputfile)
def overlapping(
        inputfile=None,
        outputfile=None,
        key_name=None,
        upstream=1500,
        downstream=1500,
        chrom_info=None,
        feature_type='transcript',
        same_strandedness=False,
        diff_strandedness=False,
        annotate_gtf=False,
        bool=False,
        annotate_all=False,
        invert_match=False):
    """
    Description: Find transcripts whose body/TSS/TTS do or do not overlap with any
    transcript from another gene.

    :param inputfile: the input GTF stream/file.
    :param outputfile: the output GTF file.
    :param key_name: name of the new key when annotating (auto-built if None).
    :param upstream: bases added upstream when slopping regions.
    :param downstream: bases added downstream when slopping regions.
    :param chrom_info: chromosome-size file (needed by bedtools slop).
    :param feature_type: 'transcript', 'promoter' or 'tts'.
    :param same_strandedness: only report same-strand overlaps.
    :param diff_strandedness: only report opposite-strand overlaps.
    :param annotate_gtf: write the whole GTF with a new overlap key
        instead of selecting transcripts.
    :param bool: encode the overlap as "0"/"1" instead of a transcript list.
        NOTE(review): this parameter name shadows the builtin 'bool';
        kept as-is because it is part of the public interface.
    :param annotate_all: give every transcript an entry (empty list if
        no overlap) rather than only overlapping ones.
    :param invert_match: select transcripts WITHOUT overlap.
    """
    # ----------------------------------------------------------------------
    # Prepare key names
    # ----------------------------------------------------------------------
    if annotate_gtf:
        if key_name is None:
            # Default key encodes feature type and slop sizes in kb.
            key_info = ["overlap",
                        feature_type,
                        "u" + str(upstream / 1000) + "k",
                        "d" + str(downstream / 1000) + "k"
                        ]
            key_name = "_".join(key_info)

        if invert_match:
            message("--annotate-gtf and --invert-match are "
                    "mutually exclusive.",
                    type="ERROR")

    if same_strandedness and diff_strandedness:
        message("--same-strandedness and --diff-strandedness are "
                "mutually exclusive.",
                type="ERROR")

    message("Using -u " + str(upstream))
    message("Using -d " + str(downstream))

    # transcript_id -> list of overlapping transcript ids (other genes only)
    overlapping_tx = defaultdict(list)

    # Load the GTF so that it won't be lost
    # if GTF stream comes from stdin
    gtf = GTF(inputfile)

    message("Getting transcript in bed format")

    tx_feat = gtf.select_by_key("feature", "transcript")

    if annotate_all:
        # Pre-seed every transcript with an empty list so non-overlapping
        # transcripts still receive an annotation.
        overlapping_tx = gtf.extract_data(keys=["transcript_id"],
                                          as_dict=True,
                                          default_val="0")
        for i in overlapping_tx:
            overlapping_tx[i] = []

    # ----------------------------------------------------------------------
    # Get transcript limits
    # ----------------------------------------------------------------------
    tx_bed = tx_feat.to_bed(name=["transcript_id", "gene_id"], sep="||")

    message("Getting " + feature_type + " and 'slopping'.")

    # Build the query regions (body/TSS/TTS), extended by -u/-d, and keep
    # only the six standard bed columns.
    if feature_type == "transcript":
        bed_obj = tx_bed.slop(s=True,
                              l=upstream,
                              r=downstream,
                              g=chrom_info.name).cut([0, 1, 2, 3, 4, 5])
    elif feature_type == "promoter":
        bed_obj = tx_feat.get_tss(name=["transcript_id", "gene_id"],
                                  sep="||").slop(s=True,
                                                 l=upstream,
                                                 r=downstream,
                                                 g=chrom_info.name).cut([0, 1, 2, 3, 4, 5])
    elif feature_type == "tts":
        bed_obj = tx_feat.get_tts(name=["transcript_id", "gene_id"],
                                  sep="||").slop(s=True,
                                                 l=upstream,
                                                 r=downstream,
                                                 g=chrom_info.name).cut([0, 1, 2, 3, 4, 5])
    else:
        message("Not implemented yet", type="ERROR")

    # Keep a copy of the slopped regions for traceability/debugging.
    tmp_file = make_tmp_file(feature_type + "_slopped_region", ".bed")
    bed_obj.saveas(tmp_file.name)

    overlap_regions = bed_obj.intersect(tx_bed,
                                        wb=True,
                                        s=same_strandedness,
                                        S=diff_strandedness)

    tmp_file = make_tmp_file(feature_type + "_overlapping_regions", ".bed")
    overlap_regions.saveas(tmp_file.name)

    # Field 3 is the query "tx||gene", field 9 the hit "tx||gene"
    # (query bed6 + hit bed columns from -wb).
    for i in overlap_regions:

        tx_other, gn_other = i.fields[9].split("||")
        tx_id, gene_id = i.fields[3].split("||")

        # Only count overlaps with transcripts from ANOTHER gene.
        if gene_id != gn_other:
            overlapping_tx[tx_id] += [tx_other]

    if bool:
        # Collapse the lists into a "0"/"1" flag.
        for k, _ in overlapping_tx.items():
            if not len(overlapping_tx[k]):
                overlapping_tx[k] = "0"
            else:
                overlapping_tx[k] = "1"

    if not invert_match:

        if not annotate_gtf:
            # Select only the transcripts with an overlap record.
            value = ",".join(set(overlapping_tx.keys()))
            gtf.select_by_key("transcript_id",
                              value).write(outputfile,
                                           gc_off=True)
        else:
            # Write the whole GTF, annotated with the new key.
            if len(overlapping_tx):
                gtf = gtf.add_attr_from_dict(feat="transcript",
                                             key="transcript_id",
                                             a_dict=overlapping_tx,
                                             new_key=key_name)
            gtf.write(outputfile, gc_off=True)

    else:
        # Inverted selection: transcripts NOT in the overlap set.
        values = ",".join(set(overlapping_tx.keys()))
        gtf.select_by_key("transcript_id",
                          values,
                          invert_match).write(outputfile, gc_off=True)

    gc.disable()
    close_properly(outputfile, inputfile)
def mk_matrix(inputfile=None,
              outputfile=None,
              bigwiglist=None,
              ft_type=None,
              pseudo_count=0,
              upstream=1000,
              downstream=1000,
              bin_around_frac=0.1,
              chrom_info=None,
              bin_nb=100,
              nb_proc=None,
              labels=None,
              no_stranded=False,
              zero_to_na=False):
    """
    Description: Create a matrix to be used by 'profile' and 'heatmap' commands.

    :param inputfile: a GTF (or a bed when ft_type is 'user_regions'/'single_nuc').
    :param outputfile: destination file; the result is zipped and the plain
        file removed afterwards.
    :param bigwiglist: list of bigwig file objects to compute coverage from.
    :param ft_type: 'transcript', 'promoter', 'tts', 'single_nuc' or 'user_regions'.
    :param pseudo_count: value added to coverage counts.
    :param upstream: bases upstream of the reference position/region.
    :param downstream: bases downstream of the reference position/region.
    :param bin_around_frac: fraction of bin_nb used for flanking regions.
    :param chrom_info: chromosome-size file (needed by bedtools slop/flank).
    :param bin_nb: number of bins for the main region.
    :param nb_proc: number of processes for coverage computation.
    :param labels: comma-separated bigwig labels (defaults to file basenames).
    :param no_stranded: ignore strand when computing coverage.
    :param zero_to_na: report missing values as NA instead of zero.
    """

    # -------------------------------------------------------------------------
    # Check argument consistency
    # -------------------------------------------------------------------------
    if ft_type in ['single_nuc', 'promoter', 'tts']:
        region_size = upstream + downstream + 1
        if region_size < bin_nb:
            message("The region (-u/-d) needs to be extended given the number "
                    "of bins (--bin-nb)",
                    type="ERROR")

    # -------------------------------------------------------------------------
    # Check output file name does not end with .zip
    # (the zip extension is appended at the very end)
    # -------------------------------------------------------------------------
    if outputfile.name.endswith(".zip"):
        outfn = outputfile.name.replace(".zip", "")
        outputfile = open(outfn, "w")

    # -------------------------------------------------------------------------
    # Check input file is in bed or GTF format
    # -------------------------------------------------------------------------
    message("Loading input file...")
    if inputfile.name == '<stdin>':
        gtf = GTF(inputfile.name)
        is_gtf = True
        if ft_type == 'user_regions':
            message("--ft-type can not be set to user_regions"
                    " when a gtf is provided.",
                    type="ERROR")
    else:
        try:
            region_bo = BedTool(inputfile.name)
            len(region_bo)
        except IndexError:
            message("Unable to read the input file. Check format",
                    type="ERROR")
        if len(region_bo) == 0:
            message("Unable to find requested regions",
                    type="ERROR")

        if region_bo.file_type == 'gff':
            message('Loading the GTF file.')
            gtf = GTF(inputfile.name)
            is_gtf = True
        else:
            is_gtf = False

            if ft_type != 'user_regions' and ft_type != 'single_nuc':
                message("Set --ft-type to 'user_regions' or 'single_nuc'"
                        " when using input bed file.",
                        type="ERROR")
            # Check that the strand is provided and
            # check it is located in the right column
            # (not checked by BedTool...).
            if region_bo.field_count() < 6:
                if not no_stranded:
                    message("Strand is undefined. Use -nst.",
                            type="ERROR")
            else:
                # Validate names (col 4), strand values and region lengths.
                region_name = dict()
                for i in region_bo:
                    if region_name.get(i.name, None) is None:
                        region_name[i.name] = 1
                    else:
                        message("Regions in bed file should have "
                                "unique identifier (col 4).",
                                type="ERROR")
                    if i.strand[0] not in ['.', '+', '-']:
                        message("Strand should be one of '+','-' or '.'.",
                                type="ERROR")
                    if ft_type == 'single_nuc':
                        if i.end - i.start != 1:
                            message("Region length should be 1 nucleotide "
                                    "long when 'single_nuc' is set. Use 'user_regions'.",
                                    type="ERROR")
                    elif ft_type == 'user_regions':
                        if i.end - i.start == 1:
                            message("Region length should not be 1 nucleotide "
                                    "long when 'user_regions' is set. Use 'single_nuc'.",
                                    type="ERROR")

    # -------------------------------------------------------------------------
    # Create a list of labels for the diagrams.
    # Take user input in account
    # -------------------------------------------------------------------------
    message('Checking labels.')

    if labels is not None:
        labels = labels.split(",")
        # Ensure the number of labels is the same as the number of bw files.
        if len(labels) != len(bigwiglist):
            message("The number of labels should be the same as the number of"
                    " bigwig files.",
                    type="ERROR")
        # Ensure labels are non-redundant
        if len(labels) > len(set(labels)):
            message("Labels must be unique.",
                    type="ERROR")
    else:
        labels = []
        for i in range(len(bigwiglist)):
            labels += [
                os.path.splitext(
                    os.path.basename(bigwiglist[i].name))[0]
            ]

    # -------------------------------------------------------------------------
    # Get the requested transcript lines in bed format
    # Tx are restricted to those found on chromosomes
    # declared in the bigwig file.
    # -------------------------------------------------------------------------
    message('Getting the list of chromosomes declared in bigwig files.')
    bw_chrom = list()
    for i in bigwiglist:
        bw_chrom += list(pyBigWig.open(i.name).chroms().keys())

    bed_col = [0, 1, 2, 3, 4, 5]

    if is_gtf:
        message('Selecting chromosomes declared in bigwig from gtf.')
        tmp = gtf.select_by_key("feature", "transcript").select_by_key(
            "seqid", ",".join(bw_chrom))
        # BUGFIX: a second assignment
        #   tmp = gtf.select_by_key("feature", "transcript")
        # used to overwrite the line above, silently discarding the
        # seqid (bigwig chromosome) restriction promised by the comments
        # and by the message above. It has been removed.

        tmp_tx_name = tmp.extract_data("transcript_id", as_list=True)

        # If several transcript records are associated to
        # the same transcript_id, raise an error.
        if len(tmp_tx_name) > len(set(tmp_tx_name)):
            message('Transcripts should have a unique identifier.',
                    type="ERROR")

        message('Selecting requested regions.')

        # ----------------------------------------------------------------------
        # Slop tss and promoters.
        # No need if transcript was requested (it will be flanked by upstream
        # and downstream regions later on).
        # ----------------------------------------------------------------------
        if ft_type == 'transcript':
            message("Getting transcript boundaries (input gtf).")
            main_region_bo = tmp.to_bed(name=["transcript_id"])
        elif ft_type == 'promoter':
            message("Getting promoter regions [-%d,+%d]."
                    % (upstream, downstream))
            main_region_bo = tmp.get_tss(name=["transcript_id"]).slop(
                s=True,
                l=upstream,
                r=downstream,
                g=chrom_info.name)
        elif ft_type == 'tts':
            main_region_bo = tmp.get_tts(name=["transcript_id"]).slop(
                s=True,
                l=upstream,
                r=downstream,
                g=chrom_info.name)
    else:
        message("Loading regions")
        if ft_type == 'user_regions':
            main_region_bo = BedTool(inputfile.name).cut(bed_col)
        elif ft_type == 'single_nuc':
            main_region_bo = BedTool(inputfile.name).cut(bed_col).slop(
                s=True,
                l=upstream,
                r=downstream,
                g=chrom_info.name)
        else:
            message("Unknown method.")

    # Save for traceability
    main_region_bed = make_tmp_file(prefix="region" + ft_type, suffix=".bed")
    main_region_bo.saveas(main_region_bed.name)

    # -------------------------------------------------------------------------
    # Print a header in the output file
    # -------------------------------------------------------------------------
    message("Preparing comments")

    comments = "#"
    comments += "ft_type:" + ft_type + ";"
    comments += "from:" + str(upstream) + ";"
    comments += "to:" + str(downstream) + ";"
    comments += "labels:" + ",".join(labels) + ";"

    # -------------------------------------------------------------------------
    # Compute coverage of requested region
    # Each worker will send a file
    # -------------------------------------------------------------------------
    outputfile_list = {}
    message("Using %d bins for main region." % bin_nb)

    tmp_file = bw_profile_mp(in_bed_file=main_region_bed.name,
                             nb_proc=nb_proc,
                             big_wig=[x.name for x in bigwiglist],
                             bin_nb=bin_nb,
                             pseudo_count=pseudo_count,
                             stranded=not no_stranded,
                             type="main",
                             labels=labels,
                             outputfile=outputfile.name,
                             zero_to_na=zero_to_na,
                             verbose=pygtftk.utils.VERBOSITY)

    outputfile_list["main"] = tmp_file

    # -------------------------------------------------------------------------
    # If transcript was requested
    # we must process flanking regions
    # We need to retrieve coverage of promoter [-upstream, 0]
    # as transcript coverage window size will depend on transcript length.
    # For promoter the length of windows will be fixed.
    # -------------------------------------------------------------------------
    if ft_type in ['transcript', 'user_regions']:

        # Number of bins for TTS and TSS
        around_bin_nb = int(round(bin_nb * bin_around_frac))
        if around_bin_nb < 1:
            around_bin_nb = 1

        if upstream > 0:

            if ft_type == 'transcript':
                message("Getting promoter (using %d bins)." % around_bin_nb)
                ups_region_bo = tmp.get_tss(name=["transcript_id"]).slop(
                    s=True,
                    l=upstream,
                    r=-1,
                    g=chrom_info.name).cut(bed_col)
            else:
                message("Getting upstream regions (%d bins)." % around_bin_nb)
                ups_region_bo = main_region_bo.flank(s=True,
                                                     l=upstream,
                                                     r=0,
                                                     g=chrom_info.name)

            upstream_bed_file = make_tmp_file(prefix="upstream_region" + ft_type,
                                              suffix=".bed")
            ups_region_bo.saveas(upstream_bed_file.name)

            tmp_file = bw_profile_mp(in_bed_file=upstream_bed_file.name,
                                     nb_proc=nb_proc,
                                     big_wig=[x.name for x in bigwiglist],
                                     bin_nb=around_bin_nb,
                                     pseudo_count=pseudo_count,
                                     stranded=not no_stranded,
                                     type="upstream",
                                     labels=labels,
                                     outputfile=outputfile.name,
                                     zero_to_na=zero_to_na,
                                     verbose=pygtftk.utils.VERBOSITY)

            outputfile_list["upstream"] = tmp_file

        if downstream > 0:

            if ft_type == 'transcript':
                message("Getting TTS (using %d bins)." % around_bin_nb)
                dws_region_bo = tmp.get_tts(name=["transcript_id"]).slop(
                    s=True,
                    l=-1,
                    r=downstream,
                    g=chrom_info.name).cut(bed_col)
            else:
                message("Getting downstream regions (%d bins)." % around_bin_nb)
                dws_region_bo = main_region_bo.flank(s=True,
                                                     l=0,
                                                     r=downstream,
                                                     g=chrom_info.name)

            dws_bed_file = make_tmp_file(prefix="dowstream_region" + ft_type,
                                         suffix=".bed")
            dws_region_bo.saveas(dws_bed_file.name)

            tmp_file = bw_profile_mp(in_bed_file=dws_bed_file.name,
                                     nb_proc=nb_proc,
                                     big_wig=[x.name for x in bigwiglist],
                                     bin_nb=around_bin_nb,
                                     pseudo_count=pseudo_count,
                                     stranded=not no_stranded,
                                     type="downstream",
                                     labels=labels,
                                     outputfile=outputfile.name,
                                     zero_to_na=zero_to_na,
                                     verbose=pygtftk.utils.VERBOSITY)

            outputfile_list["downstream"] = tmp_file

    # -------------------------------------------------------------------------
    # Merge files using pandas
    # -------------------------------------------------------------------------
    message("Reading (pandas): " + outputfile_list["main"].name, type="DEBUG")
    df_main = pd.read_csv(outputfile_list["main"].name, sep="\t")

    # save strand and end
    # They will be re-joined/added later
    df_copy = df_main[['bwig', 'chrom', 'gene', 'strand', 'start', 'end']]

    df_start = df_main.pop('start')
    df_end = df_main.pop('end')

    if "upstream" in outputfile_list:
        message("Merging upstream file")
        message("Reading (pandas): " + outputfile_list["upstream"].name,
                type="DEBUG")
        df_up = pd.read_csv(outputfile_list["upstream"].name, sep="\t")
        df_up = df_up.drop(['start', 'end'], 1)
        df_main = df_up.merge(df_main.loc[:, df_main.columns],
                              on=['bwig', 'chrom', 'gene', 'strand'])

    if "downstream" in outputfile_list:
        message("Merging downstream file")
        message("Reading (pandas): " + outputfile_list["downstream"].name,
                type="DEBUG")
        df_dws = pd.read_csv(outputfile_list["downstream"].name, sep="\t")
        df_dws = df_dws.drop(['start', 'end'], 1)
        df_main = df_main.merge(df_dws.loc[:, df_dws.columns],
                                on=['bwig', 'chrom', 'gene', 'strand'])

    # join start and end back, keeping them as columns 3 and 4.
    df_main = df_main.merge(df_copy.loc[:, df_copy.columns],
                            on=['bwig', 'chrom', 'gene', 'strand'])
    df_start = df_main.pop('start')
    df_end = df_main.pop('end')
    df_main.insert(2, 'start', df_start)
    df_main.insert(3, 'end', df_end)

    message("Writing to file")
    outputfile.close()

    with open(outputfile.name, 'a') as f:
        f.write(comments + "\n")
        df_main.to_csv(f,
                       sep="\t",
                       index=False,
                       mode='a',
                       columns=df_main.columns,
                       na_rep='NA')

    # -------------------------------------------------------------------------
    # Compress
    # -------------------------------------------------------------------------
    message("Compressing")
    path = os.path.abspath(outputfile.name)
    filename = os.path.basename(path)
    message("filename: " + filename, type="DEBUG")
    zip_filename = filename + '.zip'
    message("zip_filename: " + zip_filename, type="DEBUG")
    zip_path = os.path.join(os.path.dirname(path), zip_filename)
    message("zip_path: " + zip_path, type="DEBUG")

    with zipfile.ZipFile(zip_path, 'w', allowZip64=True) as zf:
        zf.write(filename=path, arcname=filename)

    # Clean up the per-worker files and the uncompressed output.
    for i in outputfile_list:
        message("deleting " + outputfile_list[i].name)
        os.remove(outputfile_list[i].name)
    os.remove(outputfile.name)

    gc.disable()
    close_properly(inputfile, outputfile)
def __call__(self, string):
    """Validate *string* (a path) for this argparse file type.

    Checks that the filename extension matches one of the expected
    formats, creates the destination directory in write mode, and — in
    read mode for bed files — validates the content and converts
    bed3/4/5 records to bed6. Returns the file object produced by the
    parent class's __call__.

    :param string: the path supplied on the command line.
    """
    # ---------------------------------------------------------------
    # Check file extension
    # ---------------------------------------------------------------
    fasta_format_1 = '(\.[Ff][Aa][Ss][Tt][Aa]$)|(\.[Ff][Nn][Aa]$)'
    fasta_format_2 = '|(\.[Ff][Aa]$)|(\.[Ff][Aa][Ss]$)|(\.[Ff][Ff][Nn]$)|(\.[Ff][Rr][Nn]$)'
    fasta_regexp = fasta_format_1 + fasta_format_2
    fasta_regexp_gz = re.sub("\$", "\.[Gg][Zz]$", fasta_regexp)
    bed_regexp = '\.[Bb][Ee][Dd][3456]{0,1}$'
    bed_regexp_gz = re.sub("\$", "\.[Gg][Zz]$", bed_regexp)
    gtf_regexp = '\.[Gg][Tt][Ff]$'
    gtf_regexp_gz = re.sub("\$", "\.[Gg][Zz]$", gtf_regexp)
    txt_regexp = '(\.[Tt][Xx][Tt]$)|(\.[Cc][Ss][Vv]$)|(\.[Dd][Ss][Vv]$)|(\.[Tt][Aa][Bb]$)|(\.[Tt][Ss][Vv]$)'
    txt_regexp_gz = re.sub("\$", "\.[Gg][Zz]$", txt_regexp)
    bigwig_regexp = '(\.[Bb][Ww]$)|(\.[Bb][Ii][Gg][Ww][Ii][Gg]$)'
    zip_regexp = '\.[Zz][Ii][Pp]$'
    pdf_regexp = '\.[Pp][Dd][Ff]$'

    ext2regexp = {'bed': bed_regexp,
                  'bed.gz': bed_regexp_gz,
                  'gtf': gtf_regexp,
                  'gtf.gz': gtf_regexp_gz,
                  'fasta': fasta_regexp,
                  'fasta.gz': fasta_regexp_gz,
                  'txt': txt_regexp,
                  'txt.gz': txt_regexp_gz,
                  'bigwig': bigwig_regexp,
                  'zip': zip_regexp,
                  'pdf': pdf_regexp}

    # Set verbosity system wide as depending on
    # command line argument order, VERBOSITY (-V) can
    # be evaluated later...
    if '-V' in sys.argv:
        sys_args = ' '.join(sys.argv)
        verbosity_val = re.search('-V ?([01234])?', sys_args)
        # BUGFIX: group(1) is None when -V is given without a level
        # (the capture group is optional); int(None) used to raise a
        # TypeError. Fall back to 0, as in the no-match case.
        if verbosity_val and verbosity_val.group(1) is not None:
            pygtftk.utils.VERBOSITY = int(verbosity_val.group(1))
        else:
            pygtftk.utils.VERBOSITY = 0

    # Accept the file if ANY of the expected extensions matches.
    match = False
    if isinstance(self.file_ext, str):
        extension_list = [self.file_ext]
    else:
        extension_list = list(self.file_ext)

    for this_ext in extension_list:
        if re.search(ext2regexp[this_ext], string):
            match = True
            break

    if not match:
        message('Not a valid filename extension :' + string, type="WARNING")
        message('Extension expected: ' + ext2regexp[this_ext], type="ERROR")
        sys.exit()

    # ---------------------------------------------------------------
    # Check directory
    # ---------------------------------------------------------------
    outputdir = os.path.dirname(os.path.abspath(string))

    if not os.path.exists(outputdir):
        if 'w' in self._mode:
            message("Directory not found. Creating.", type="WARNING")
            os.makedirs(outputdir)

    # ---------------------------------------------------------------
    # Check format
    # ---------------------------------------------------------------
    # if bed3, bed4, bed5 convert to bed6
    if self._mode == 'r':
        if self.file_ext == 'bed':
            message("Checking BED file format (" + string + ").",
                    type="INFO")
            try:
                file_bo = BedTool(string)
                nb_line = len(file_bo)
            except Exception:
                # BUGFIX: was a bare 'except:' which also swallowed
                # SystemExit/KeyboardInterrupt.
                msg = "Unable to load file: " + string + "."
                message(msg, type="ERROR")
                sys.exit()

            if nb_line == 0:
                msg = "It seems that file " + string + " is empty."
                message(msg, type="ERROR")
                sys.exit()

            if file_bo.file_type != 'bed':
                msg = "File {f} is not a valid bed file."
                msg = msg.format(f=string)
                message(msg, type="ERROR")
                sys.exit()

            region_nb = 0
            field_count = file_bo.field_count()

            if field_count != 6:
                message("Converting to bed6 format (" + string + ").",
                        type="WARNING")
                tmp_file = make_tmp_file(prefix="bed6_", suffix=".bed")
                for record in file_bo:
                    region_nb += 1
                    # Synthesize a name when columns < 4; pad score/strand.
                    if field_count < 4:
                        name = 'region_' + str(region_nb)
                    else:
                        name = record.name
                    fields = record.fields[0:3]
                    fields += [name, '0', '.']
                    tmp_file.write("\t".join(fields) + "\n")
                close_properly(tmp_file)
                string = tmp_file.name  # we will work with string

    if 'w' in self._mode:
        self._mode = 'w'

    return super(FormattedFile, self).__call__(string)
def bw_cov_mp(bw_list=None, region_file=None, labels=None, bin_nb=None, nb_proc=None, n_highest=None, zero_to_na=False, pseudo_count=None, stat='mean', verbose=False):
    """
    Compute bigwig coverage (multi-processed) for a set of regions.

    :param bw_list: the list of bigWig files to be processed.
    :param region_file: the bed file containing the region for which coverage is to be computed.
    :param labels: shortname for bigwigs.
    :param bin_nb: The number of bin into which the region should be splitted.
    :param nb_proc: Number of threads to be used.
    :param n_highest: compute the mean coverage based on the n highest values in the bins.
    :param pseudo_count: The value for a pseudo-count.
    :param verbose: run in verbose mode.
    :param stat: mean (default) or sum.
    :param zero_to_na: Convert missing values to NA, not zero.

    Returns a file.
    """
    region_count = len(BedTool(region_file.name))
    message("Received " + str(region_count) + " regions to proceed for each bigwig")

    # Split the region indices into one chunk per worker.
    chunks = intervals(list(range(region_count)), nb_proc)

    # Each worker gets its chunk followed by the same constant arguments,
    # in the positional order expected by _big_wig_coverage_worker.
    shared = (bw_list,
              region_file.name,
              bin_nb,
              pseudo_count,
              n_highest,
              False,
              False,
              None,
              labels,
              zero_to_na,
              stat,
              verbose)
    worker_args = [(chunk,) + shared for chunk in chunks]

    workers = multiprocessing.Pool(nb_proc)
    per_chunk = workers.map_async(_big_wig_coverage_worker,
                                  worker_args).get(9999999)

    # A worker signals failure by returning False.
    if False in per_chunk:
        sys.stderr.write("Aborting...")
        sys.exit()

    # Flatten the per-worker lists into a single list of output lines.
    flat_lines = [line for chunk_result in per_chunk for line in chunk_result]

    out_file = make_tmp_file(prefix="region_coverage", suffix=".bed")
    for line in flat_lines:
        out_file.write(line)
    out_file.close()

    return open(out_file.name)
def get_ceas_records(inputfile=None, outputfile=None, show_tables=False, target_table='GeneTable'):
    """Convert a CEAS sqlite file back into a flat file.

    :param inputfile: the sqlite file (possibly gzipped) produced by CEAS.
    :param outputfile: destination for the tab-separated records.
    :param show_tables: if True, only print the table names and exit.
    :param target_table: the table whose records should be dumped.
    """
    # ----------------------------------------------------------------------
    # load the CEAS file (gunzip to a temporary file if needed)
    # ----------------------------------------------------------------------
    if inputfile.name.endswith('gz'):
        tmp_file = make_tmp_file(prefix='ceas_gunzip', suffix='.txt')
        with gzip.open(inputfile.name, 'rb') as f_in:
            with open(tmp_file.name, 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)
        inputfile = open(tmp_file.name)

    conn = sqlite3.connect(inputfile.name)
    cursor = conn.cursor()

    # ----------------------------------------------------------------------
    # A func to get the list of tables
    # ----------------------------------------------------------------------
    def get_tables(cursor):
        """Return the names of all tables declared in the sqlite file."""
        out_list = list()
        cursor.execute('SELECT name from sqlite_master where type= "table"')
        for rec in cursor.fetchall():
            out_list += [rec[0]]
        return out_list

    # ----------------------------------------------------------------------
    # Get table list
    # ----------------------------------------------------------------------
    tables = get_tables(cursor)

    # ----------------------------------------------------------------------
    # To show table
    # ----------------------------------------------------------------------
    if show_tables:
        for tab in tables:
            outputfile.write(tab + "\n")
        sys.exit()

    # ----------------------------------------------------------------------
    # loop through records
    # Each line contains:
    # chrom,name,strand,txStart,txEnd,cdsStart,
    # cdsEnd,exonCount,exonStarts,exonEnds,name
    # ----------------------------------------------------------------------

    # Check table exists. This also guards the string interpolation below:
    # target_table is only ever one of the names actually present in the db.
    if target_table not in tables:
        message('Table is undefined', type="ERROR")

    # BUGFIX: the original code iterated cursor.execute() once and then
    # called fetchall() inside that loop, which consumed the first record
    # in the outer iterator and never wrote it (the first row of the table
    # was silently dropped). A single loop over the cursor emits every
    # row exactly once.
    for rec in cursor.execute('SELECT * FROM % s' % target_table):
        out_list = []
        for elemnt in rec:
            out_list += [str(elemnt)]
        outputfile.write("\t".join(out_list) + "\n")
def get_tx_seq(inputfile=None,
               outputfile=None,
               genome=None,
               with_introns=False,
               delete_version=False,
               del_chr=False,
               separator="",
               no_rev_comp=False,
               label="",
               sleuth_format=True,
               explicit=True,
               assembly="bla"):
    """
    Description: Get transcripts sequences in fasta format from a GTF file.

    :param inputfile: the input GTF stream/file.
    :param outputfile: the output fasta file.
    :param genome: a list of fasta file objects (merged if several).
    :param with_introns: keep introns in the extracted sequences.
    :param delete_version: strip the trailing '.N' version suffix from
        gene and transcript ids.
    :param del_chr: strip the 'chr' prefix from chromosome names.
    :param separator: field separator for non-sleuth headers.
    :param no_rev_comp: do NOT reverse-complement minus-strand sequences.
    :param label: comma-separated keys to put in non-sleuth headers.
    :param sleuth_format: write headers in the format expected by sleuth.
    :param explicit: write 'key=value' pairs in non-sleuth headers.
    :param assembly: assembly name inserted in the sleuth header.
    """
    # -----------------------------------------------------------
    # Check chromosomes in fasta file
    # -----------------------------------------------------------
    genome_chr_list = []

    message("%d fasta files found." % len(genome))

    as_gz_ext = [True for x in genome if x.name.endswith(".gz")]
    if any(as_gz_ext):
        message("Genome in gz format is not currently supported.",
                type="ERROR")

    if len(genome) == 1:
        message("Checking fasta file chromosome list")
        genome = genome[0]
        # Collect chromosome names from the fasta headers ('>' lines).
        # NOTE(review): the 'with' block closes/consumes this handle; the
        # file is addressed by name (genome.name) afterwards.
        with genome as genome_file:
            for i in genome_file:
                if i.startswith(">"):
                    i = i.rstrip("\n")
                    genome_chr_list += [i[1:]]
    else:
        # Several fasta files: concatenate them into one temporary file.
        message("Merging fasta files")
        tmp_genome = make_tmp_file(prefix="genome", suffix=".fa")
        with tmp_genome as tg:
            for curr_file in genome:
                message("Merging %s" % curr_file.name)
                with curr_file as cf:
                    # 100 MiB copy buffer.
                    shutil.copyfileobj(cf, tg, 1024 * 1024 * 100)

        message("Checking fasta file chromosome list")
        genome = open(tmp_genome.name, "r")
        with genome as genome_file:
            for i in genome_file:
                if i.startswith(">"):
                    i = i.rstrip("\n")
                    genome_chr_list += [i[1:]]

    rev_comp = not no_rev_comp

    message("Chromosomes in fasta file: " + ",".join(genome_chr_list))

    # -----------------------------------------------------------
    # Read gtf
    # -----------------------------------------------------------
    gtf = GTF(inputfile)
    nb_tx_before = gtf.extract_data("transcript_id",
                                    as_list=True,
                                    no_na=True,
                                    nr=True)

    # -----------------------------------------------------------
    # Select genes falling in chrom defined in the fasta file
    # -----------------------------------------------------------
    message("Chromosomes in gtf file: " + ",".join(gtf.get_chroms(nr=True)))
    message("Selecting chromosome defined in the fasta file")

    gtf = gtf.select_by_key(key="seqid",
                            value=",".join(genome_chr_list))

    message("Chromosomes in gtf file: " + ",".join(gtf.get_chroms(nr=True)))

    if len(gtf) == 0:
        message("No genes were found on chromosomes defined in fasta file.",
                type="ERROR")

    nb_tx_after = gtf.extract_data("transcript_id",
                                   as_list=True,
                                   no_na=True,
                                   nr=True)
    if len(nb_tx_after) != len(nb_tx_before):
        # Warn about transcripts dropped by the chromosome filter
        # (id list truncated to 100 characters).
        diff = list(set(nb_tx_before) - set(nb_tx_after))
        message("Some transcripts had"
                " no corresponding chromosome"
                " in the fasta file: " + ",".join(diff)[0:100] + "...")

    message("Using genome file: " + genome.name)
    message("Retrieving fasta sequences from " + genome.name)
    fasta_seq = gtf.get_sequences(genome=genome.name,
                                  intron=with_introns,
                                  rev_comp=rev_comp)
    tx_gtf = gtf.select_by_key("feature", "transcript")

    if sleuth_format:
        # Header layout expected by sleuth:
        # >tx_id chromosome:assembly:chrom:start:end:1 gene:... biotypes
        tx_biotype = tx_gtf.extract_data("transcript_id,transcript_biotype",
                                         as_dict_of_lists=True,
                                         hide_undef=False)
        gn_biotype = tx_gtf.extract_data("gene_id,gene_biotype",
                                         as_dict_of_lists=True,
                                         hide_undef=False)

        for i in fasta_seq:
            gene_id = i.gene_id
            transcript_id = i.transcript_id
            chrom = i.chrom
            gn_bio = gn_biotype[i.gene_id][0]
            tx_bio = tx_biotype[i.transcript_id][0]

            if delete_version:
                transcript_id = re.sub('\.[0-9]+$', '', transcript_id)
                gene_id = re.sub('\.[0-9]+$', '', gene_id)
            if del_chr:
                chrom = chrom.replace('chr', '')

            header = " ".join([
                transcript_id,
                ":".join([
                    "chromosome", assembly, chrom,
                    str(i.start),
                    str(i.end), "1"
                ]), "gene:" + gene_id, "gene_biotype:" + gn_bio,
                "transcript_biotype:" + tx_bio
            ])

            outputfile.write(">" + header + "\n")
            outputfile.write(i.sequence + "\n")
    else:
        # Custom header: values of the requested keys, either as
        # 'key=value' pairs (explicit) or bare values.
        tx_info = tx_gtf.extract_data("transcript_id," + label,
                                      as_dict_of_lists=True,
                                      hide_undef=False)
        for i in fasta_seq:
            if not explicit:
                header = separator.join(tx_info[i.transcript_id])
            else:
                header = [
                    str(x[0]) + "=" + x[1] for x in zip(
                        label.split(","), tx_info[i.transcript_id])
                ]
                header = separator.join(header)

            outputfile.write(">" + header + "\n")
            outputfile.write(i.sequence + "\n")

    gc.disable()
    close_properly(outputfile, inputfile)
def great_reg_domains(inputfile=None,
                      outputfile=None,
                      go_id="GO:0003700",
                      species="hsapiens",
                      upstream=1000,
                      downstream=1000,
                      chrom_info=None,
                      distal=1000000,
                      mode='basal_plus_extension',
                      http_proxy=None,
                      https_proxy=None):
    """
    Given a GTF and a GO term, attempt to compute labeled regions using the
    GREAT 'association rule'.

    :param inputfile: GTF input (file object/path accepted by GTF()).
    :param outputfile: Output file object; BED6 lines are written to it.
    :param go_id: The GO term used to filter genes (None prints all genes).
    :param species: Ensembl species prefix used for the Biomart dataset.
    :param upstream: Basal domain extension upstream of the TSS.
    :param downstream: Basal domain extension downstream of the TSS.
    :param chrom_info: A file object with chromosome sizes (tab-separated).
    :param distal: Maximum distal extension of the regulatory domain.
    :param mode: Association rule; only 'basal_plus_extension' is supported.
    :param http_proxy: Proxy used by the Biomart queries.
    :param https_proxy: Proxy used by the Biomart queries.
    """

    # -------------------------------------------------------------------------
    # chrom_len will store the chromosome sizes.
    # -------------------------------------------------------------------------
    chrom_len = chrom_info_as_dict(chrom_info)

    # -------------------------------------------------------------------------
    # Read the GTF
    # -------------------------------------------------------------------------
    gtf = GTF(inputfile, check_ensembl_format=False)

    # -------------------------------------------------------------------------
    # Get the TSSs -- Extend them by upstream/downstream
    # -------------------------------------------------------------------------
    message("Defining basal regulatory domains.", type="INFO")
    basal_reg_bed = gtf.get_tss(name=['gene_id', 'gene_name']).slop(
        s=True, l=upstream, r=downstream, g=chrom_info.name).sort()

    basal_reg_bed_file = make_tmp_file(prefix='basal_reg', suffix='.bed')
    basal_reg_bed.saveas(basal_reg_bed_file.name)

    if mode == 'basal_plus_extension':
        # -------------------------------------------------------------------------
        # Search for upstream limits of each basal_reg_bed
        # Here we ignore overlapping basal_reg_bed as the way they
        # are proceded is not documented in GREAT to our knowledge
        # -------------------------------------------------------------------------
        message("Defining regulatory domains upstream regions.", type="INFO")

        regulatory_region_start = dict()
        regulatory_region_end = dict()
        chroms = dict()
        strands = dict()

        basal_reg_bed_upstream = basal_reg_bed.closest(
            basal_reg_bed,
            # Ignore features in B that overlap A
            io=True,
            # In addition to the closest feature in B report distance,
            # use negative distances to report upstream features.
            # Report distance with respect to A.
            # When A is on the - strand, "upstream" means B has a
            # higher(start, stop).
            D="a",
            # Ignore features in B that are downstream of features in A
            id=True,
            # How ties are handled. "first" Report the first tie
            t="first",
            # Require that the query and the closest hit have different names/gene_ids.
            N=True)

        basal_reg_bed_upstream_file = make_tmp_file(
            prefix='basal_reg_bed_upstream', suffix='.bed')
        basal_reg_bed_upstream.saveas(basal_reg_bed_upstream_file.name)

        for line in basal_reg_bed_upstream:
            gene_id = line.name
            strand = line.strand
            end = line.end
            start = line.start
            # Uniquify the key: the same gene may have several TSSs.
            gene_id = "|".join([gene_id, str(start), str(end), strand])
            chroms[gene_id] = line.chrom
            strands[gene_id] = strand

            if strand == '+':
                # if the feature chromosome in B is
                # '.' we have reached the start of the chr
                if line.fields[6] == '.':
                    regulatory_region_start[gene_id] = max(
                        0, line.start - distal)
                else:
                    padding = min(distal, abs(int(line.fields[12])) - 1)
                    regulatory_region_start[gene_id] = line.start - padding
            elif strand == '-':
                # if the feature chromosome in B is
                # '.' we have reached the end of the chr
                if line.fields[6] == '.':
                    regulatory_region_end[gene_id] = min(
                        int(chrom_len[line.chrom]), line.end + distal)
                else:
                    padding = min(distal, abs(int(line.fields[12])) - 1)
                    regulatory_region_end[gene_id] = line.end + padding
            else:
                message("Cannot process genes without strand", type="WARNING")
                message("Please check:" + gene_id, type="ERROR")

        # -------------------------------------------------------------------------
        # Search for downstream limits of each basal_reg_bed
        # Here we ignore overlapping basal_reg_bed as the way they
        # are proceded is not documented in GREAT to our knowledge
        # -------------------------------------------------------------------------
        message("Defining regulatory domains downstream regions.", type="INFO")

        basal_reg_bed_downstream = basal_reg_bed.closest(
            basal_reg_bed,
            # Ignore features in B that overlap A
            io=True,
            # In addition to the closest feature in B report distance,
            # use negative distances to report upstream features.
            # Report distance with respect to A.
            # When A is on the - strand, "upstream" means B has a
            # higher(start, stop).
            D="a",
            # Ignore features in B that are upstream of features in A
            iu=True,
            # How ties are handled. "first" Report the first tie
            t="first",
            # Require that the query and the closest hit have different names/gene_ids.
            N=True)

        # FIX: the prefix previously said 'basal_reg_bed_upstream', which
        # mislabeled the downstream debug file (copy-paste error).
        basal_reg_bed_downstream_file = make_tmp_file(
            prefix='basal_reg_bed_downstream', suffix='.bed')
        basal_reg_bed_downstream.saveas(basal_reg_bed_downstream_file.name)

        for line in basal_reg_bed_downstream:
            gene_id = line.name
            strand = line.strand
            end = line.end
            start = line.start
            gene_id = "|".join([gene_id, str(start), str(end), strand])
            chroms[gene_id] = line.chrom
            strands[gene_id] = strand

            if strand == '+':
                # if the feature chromosome in B is
                # '.' we have reached the start of the chr
                if line.fields[6] == '.':
                    regulatory_region_end[gene_id] = min(
                        int(chrom_len[line.chrom]), line.end + distal)
                else:
                    padding = min(distal, abs(int(line.fields[12])) - 1)
                    regulatory_region_end[gene_id] = line.end + padding
            elif strand == '-':
                if line.fields[6] == '.':
                    regulatory_region_start[gene_id] = max(
                        0, line.start - distal)
                else:
                    padding = min(distal, abs(int(line.fields[12])) - 1)
                    regulatory_region_start[gene_id] = max(
                        0, line.start - padding)
            else:
                message("Cannot process genes without strand", type="WARNING")
                message("Please check:" + gene_id, type="ERROR")
    else:
        # message(type="ERROR") is expected to abort; regulatory_region_start
        # is never reached undefined below.
        message(
            "Only 'basal_plus_extension' association rule is currently supported.",
            type='ERROR')

    # -------------------------------------------------------------------------
    # Print the regulatory regions of all genes
    # By default print all genes
    # -------------------------------------------------------------------------
    if go_id is None:
        for gene_id in regulatory_region_start:
            outlist = [chroms[gene_id],
                       str(regulatory_region_start[gene_id]),
                       str(regulatory_region_end[gene_id]),
                       gene_id.split("|")[0],
                       "0",
                       strands[gene_id]]
            outputfile.write("\t".join(outlist) + "\n")
    else:
        # -------------------------------------------------------------------------
        # Get the list of gene/transcript associated with a particular GO term
        # -------------------------------------------------------------------------
        message("Getting Gene Ontology annotations.")

        if not go_id.startswith("GO:"):
            go_id = "GO:" + go_id

        is_associated = set()

        bm = Biomart(http_proxy=http_proxy, https_proxy=https_proxy)
        bm.get_datasets('ENSEMBL_MART_ENSEMBL')

        if species + "_gene_ensembl" not in bm.datasets:
            message("Unknow dataset/species.", type="ERROR")

        bm.query({'query': XML.format(species=species, go=go_id)})

        for i in bm.response.content.decode().split("\n"):
            i = i.rstrip("\n")
            if i != '':
                is_associated.add(i)

        for gene_id in regulatory_region_start:
            gene_id_short = gene_id.split("|")[0]
            if gene_id_short in is_associated:
                outlist = [chroms[gene_id],
                           str(regulatory_region_start[gene_id]),
                           str(regulatory_region_end[gene_id]),
                           gene_id.split("|")[0],
                           "0",
                           strands[gene_id]]
                outputfile.write("\t".join(outlist) + "\n")
def coverage(
        inputfile=None,
        outputfile=None,
        bw_list=None,
        labels=None,
        pseudo_count=1,
        nb_window=1,
        ft_type="promoter",
        n_highest=None,
        downstream=1000,
        key_name="cov",
        zero_to_na=False,
        name_column=None,
        upstream=1000,
        chrom_info=None,
        nb_proc=1,
        matrix_out=False,
        stat='mean'):
    """
    Compute transcript coverage with one or several bigWig.

    :param inputfile: GTF or BED input (file object).
    :param outputfile: Output file object.
    :param bw_list: List of bigWig file objects (their .name is used).
    :param labels: Comma-separated labels, one per bigWig (optional).
    :param pseudo_count: Pseudo-count added to coverage values.
    :param nb_window: Number of bins per region.
    :param ft_type: Feature type used to derive regions from a GTF.
    :param n_highest: Number of top bins used to compute the score.
    :param downstream: Downstream slop applied to regions.
    :param key_name: unused here; kept for interface compatibility.
    :param zero_to_na: Convert zero values to NA.
    :param name_column: Comma-separated GTF keys used to name regions.
    :param upstream: Upstream slop applied to regions.
    :param chrom_info: File object with chromosome sizes.
    :param nb_proc: Number of processes for bw_cov_mp.
    :param matrix_out: If True, write a wide matrix instead of BED lines.
    :param stat: Statistic computed per region ('mean', ...).
    """

    # -------------------------------------------------------------------------
    # Create a list of labels.
    # Take user input in account
    # -------------------------------------------------------------------------
    bw_list = [x.name for x in bw_list]

    if len(bw_list) != len(set(bw_list)):
        message("Found the same bigwigs several times.", type="ERROR")

    message('Checking labels.')

    if labels is not None:
        labels = labels.split(",")
        # Ensure the number of labels is the same as the number of bw files.
        if len(labels) != len(bw_list):
            message("The number of labels should be the same as the number of"
                    " bigwig files.", type="ERROR")
        # Ensure labels are non-redondant
        if len(labels) > len(set(labels)):
            message("Labels must be unique.", type="ERROR")
    else:
        # Default labels: the bigWig base names without extension.
        labels = []
        for i in range(len(bw_list)):
            labels += [os.path.splitext(os.path.basename(bw_list[i]))[0]]

    # -------------------------------------------------------------------------
    # Check the number of windows
    # -------------------------------------------------------------------------
    if n_highest is None:
        n_highest = nb_window

    message('Number of bins: %d' % nb_window)
    message('N highest values: %d' % n_highest)

    if n_highest > nb_window:
        message('The number of window used for computing the score'
                ' (-n) can not be greater than the number of'
                ' windows (-w)', type="ERROR")
        sys.exit()

    # -------------------------------------------------------------------------
    # Check input file is in bed or GTF format
    # -------------------------------------------------------------------------
    message("Loading input file...")
    if inputfile.name == '<stdin>':
        # stdin cannot be sniffed by BedTool; assume GTF.
        gtf = GTF(inputfile.name)
        is_gtf = True
    else:
        region_bo = BedTool(inputfile.name)
        if len(region_bo) == 0:
            message("Unable to find requested regions", type="ERROR")
        if region_bo.file_type == 'gff':
            gtf = GTF(inputfile.name)
            is_gtf = True
        else:
            is_gtf = False

    # -------------------------------------------------------------------------
    # Get regions of interest
    # -------------------------------------------------------------------------
    # NOTE(review): assumes name_column is a non-None comma-separated string
    # (enforced by the caller) -- TODO confirm.
    name_column = name_column.split(",")

    if is_gtf:
        message("Getting regions of interest...")

        if ft_type.lower() == "intergenic":
            region_bo = gtf.get_intergenic(chrom_info,
                                           0,
                                           0).slop(s=True,
                                                   l=upstream,
                                                   r=downstream,
                                                   g=chrom_info.name).sort()
        elif ft_type.lower() == "intron":
            region_bo = gtf.get_introns().slop(s=True,
                                               l=upstream,
                                               r=downstream,
                                               g=chrom_info.name).sort()
        elif ft_type == "intron_by_tx":
            region_bo = gtf.get_introns(by_transcript=True,
                                        name=name_column,
                                        ).slop(s=True,
                                               l=upstream,
                                               r=downstream,
                                               g=chrom_info.name).sort()
        elif ft_type.lower() in ["promoter", "tss"]:
            region_bo = gtf.get_tss(name=name_column,
                                    ).slop(s=True,
                                           l=upstream,
                                           r=downstream,
                                           g=chrom_info.name).sort()
        elif ft_type.lower() in ["tts", "terminator"]:
            region_bo = gtf.get_tts(name=name_column).slop(
                s=True,
                l=upstream,
                r=downstream,
                g=chrom_info.name).sort()
        else:
            region_bo = gtf.select_by_key(
                "feature",
                ft_type,
                0).to_bed(name=name_column).slop(s=True,
                                                 l=upstream,
                                                 r=downstream,
                                                 g=chrom_info.name).sort()

        if len(region_bo) == 0:
            message("Unable to find requested regions", type="ERROR")
    else:
        region_bo = region_bo.slop(s=True,
                                   l=upstream,
                                   r=downstream,
                                   g=chrom_info.name).sort()

    region_bed = make_tmp_file(prefix="region", suffix=".bed")
    region_bo.saveas(region_bed.name)

    # -------------------------------------------------------------------------
    # Compute coverage
    # -------------------------------------------------------------------------
    result_bed = bw_cov_mp(bw_list=bw_list,
                           region_file=open(region_bed.name),
                           labels=labels,
                           bin_nb=nb_window,
                           pseudo_count=pseudo_count,
                           zero_to_na=zero_to_na,
                           nb_proc=nb_proc,
                           n_highest=n_highest,
                           stat=stat,
                           verbose=pygtftk.utils.VERBOSITY)

    if matrix_out:
        result_bed.close()

        df_first = pd.read_csv(result_bed.name, sep="\t", header=None)

        # FIX: DataFrame.ix was removed in pandas 1.0; select/reorder the
        # columns by label instead (columns are integer labels 0..5 here).
        df_first = df_first[[0, 1, 2, 3, 5, 4]]

        df_list = []

        for i in range(len(labels)):
            # create a sub data frame containing the coverage values of the
            # current bwig
            str_to_find = r"^" + labels[i] + r"\|"
            tmp_df = df_first[df_first[3].str.match(str_to_find)].copy()
            to_replace = r"^" + labels[i] + r"\|"
            tmp_df.iloc[:, 3] = tmp_df.iloc[:, 3].replace(to_replace,
                                                          r"",
                                                          regex=True)
            df_list += [tmp_df]

        df_final = df_list.pop(0)

        for i in df_list:
            # Add columns to df final by joining on
            # chrom, start, end, transcript_id, strand
            df_final = df_final.merge(i.iloc[:, list(range(6))],
                                      on=[0, 1, 2, 3, 5])

        df_final.columns = ["chrom",
                            "start",
                            "end",
                            "name",
                            "strand"] + labels

        df_final.to_csv(outputfile, sep="\t", index=False)
    else:
        nb_line = 0
        for i in result_bed:
            outputfile.write(i)
            nb_line += 1
        if nb_line == 0:
            message("No line available in output...", type="ERROR")

    gc.disable()
    close_properly(inputfile, outputfile)
def convergent(inputfile=None, outputfile=None, upstream=1500, downstream=1500, chrom_info=None): """ Find transcript with convergent tts. """ message("Using -u " + str(upstream) + ".") message("Using -d " + str(downstream) + ".") tx_to_convergent_nm = dict() dist_to_convergent = dict() tts_pos = dict() message("Loading GTF.") gtf = GTF(inputfile) message("Getting transcript coordinates.") tx_feat = gtf.select_by_key("feature", "transcript") message("Getting tts coordinates.") tts_bo = tx_feat.get_tts(name=["transcript_id", "gene_id"], sep="||") # get tts position for i in tts_bo: tx_id_ov, gn_id_ov = i.name.split("||") tts_pos[tx_id_ov] = int(i.start) message("Getting tts coordinates.") tts_region_bo = tts_bo.slop(s=True, l=upstream, r=downstream, g=chrom_info.name).cut([0, 1, 2, 3, 4, 5]) message("Intersecting...") tts_intersect_bo = tts_region_bo.intersect(tts_bo, wb=True, s=False, S=True) tmp_file = make_tmp_file("tts_slop", ".bed") tts_region_bo.saveas(tmp_file.name) tmp_file = make_tmp_file("tts_slop_intersection_with_tts_as_", ".bed") tts_intersect_bo.saveas(tmp_file.name) for i in tts_intersect_bo: tx_id_main, gene_id_main = i.fields[3].split("||") tx_id_ov, gn_id_ov = i.fields[9].split("||") if gene_id_main != gn_id_ov: if tx_id_main in tx_to_convergent_nm: dist = abs(tts_pos[tx_id_main] - tts_pos[tx_id_ov]) if dist < dist_to_convergent[tx_id_main]: dist_to_convergent[tx_id_main] = dist tx_to_convergent_nm[tx_id_main] = tx_id_ov else: dist = abs(tts_pos[tx_id_main] - tts_pos[tx_id_ov]) dist_to_convergent[tx_id_main] = dist tx_to_convergent_nm[tx_id_main] = tx_id_ov if len(tx_to_convergent_nm): gtf = gtf.add_attr_from_dict(feat="transcript", key="transcript_id", a_dict=tx_to_convergent_nm, new_key="convergent") gtf = gtf.add_attr_from_dict(feat="transcript", key="transcript_id", a_dict=dist_to_convergent, new_key="dist_to_convergent") gtf.write(outputfile, gc_off=True) close_properly(outputfile, inputfile)
def feature_size(inputfile=None, outputfile=None, ft_type="transcript", names="transcript_id", key_name='feature_size', separator="|", bed=False): """ Get the size and limits (start/end) of features enclosed in the GTF. If bed format is requested returns the limits zero-based half open and the size as a score. Otherwise output GTF file with 'feat_size' as a new key and size as value. """ message("Computing feature sizes.") gtf = GTF(inputfile) feat_list = gtf.get_feature_list(nr=True) + ['mature_rna'] if ft_type not in feat_list + ["*"]: message("Unable to find requested feature.", type="ERROR") names = names.split(",") if ft_type != 'mature_rna': if bed: bed_obj = gtf.select_by_key("feature", ft_type).to_bed(name=names, sep=separator, add_feature_type=True) for i in bed_obj: i.score = str(i.end - i.start) write_properly(chomp(str(i)), outputfile) else: tmp_file = make_tmp_file(prefix="feature_size", suffix=".txt") elmt = gtf.extract_data("feature,start,end", as_list_of_list=True, no_na=False, hide_undef=False) for i in elmt: if i[0] != ft_type and ft_type != "*": tmp_file.write("?\n") else: tmp_file.write(str(int(i[2]) - int(i[1]) + 1) + "\n") tmp_file.close() gtf.add_attr_column(tmp_file, key_name).write(outputfile, gc_off=True) else: tx_size = gtf.get_transcript_size() if bed: bed_obj = gtf.select_by_key("feature", 'transcript').to_bed( ['transcript_id'] + names, add_feature_type=False, sep=separator, more_name=['mature_rna']) for i in bed_obj: names = i.name.split(separator) tx_id = names.pop(0) i.score = tx_size[tx_id] i.name = separator.join(names) write_properly(chomp(str(i)), outputfile) else: if len(tx_size): gtf = gtf.add_attr_from_dict(feat="transcript", key="transcript_id", a_dict=tx_size, new_key=key_name) gtf.write(outputfile, gc_off=True) close_properly(outputfile, inputfile)
def get_midpoints(self): """Returns a bedtools object containing the midpoints of features. :Example: >>> from pygtftk import bedtool_extension >>> fromscratch1 = bedtool_extension.BedTool('chrX 0 100', from_string=True) >>> for i in fromscratch1.get_midpoints(): pass >>> assert i.start == 49 >>> assert i.end == 51 >>> fromscratch1 = bedtool_extension.BedTool('chrX 0 101', from_string=True) >>> for i in fromscratch1.get_midpoints(): pass >>> assert i.start == 50 >>> assert i.end == 51 """ message("Calling 'get_midpoints'.", type="DEBUG") midpoints_bed = make_tmp_file("Midpoints", ".bed") n = 1 for line in self: if line.name == ".": name = str(n) else: name = line.name if line.strand == ".": strand = "+" else: strand = line.strand if line.score == ".": score = "." else: score = line.score diff = line.end - line.start if diff % 2 != 0: # e.g 10-13 (zero based) -> 11-13 one based # mipoint is 12 (one-based) -> 11-12 (zero based) # e.g 949-1100 (zero based) -> 950-1100 one based # mipoint is 1025 (one-based) -> 1024-1025 (zero based) # floored division (python 2)... line.end = line.start + int(diff // 2) + 1 line.start = line.end - 1 else: # e.g 10-14 (zero based) -> 11-14 one based # mipoint is 12-13 (one-based) -> 11-13 (zero based) # e.g 9-5100 (zero based) -> 10-5100 one based # mipoint is 2555-2555 (one-based) -> 2554-2555 (zero based) # floored division (python 2)... # No real center. Take both line.start = line.start + int(diff // 2) - 1 line.end = line.start + 2 midpoints_bed.write("\t".join( [line.chrom, str(line.start), str(line.end), name, score, strand]) + "\n") n += 1 midpoints_bed.close() return BedTool(fn=midpoints_bed.name)
def tss_numbering(inputfile=None,
                  outputfile=None,
                  compute_dist=False,
                  key_name='tss_number',
                  key_name_dist='dist_to_first_tss',
                  add_nb_tss_to_gene=False,
                  gene_key='nb_tss'):
    """
    Computes the distance between TSS of gene transcripts.

    :param inputfile: GTF input (file object; Ensembl format is enforced).
    :param outputfile: Output GTF file object.
    :param compute_dist: Also add the distance to the gene's first TSS.
    :param key_name: Attribute key receiving the TSS number.
    :param key_name_dist: Attribute key receiving the distance to first TSS.
    :param add_nb_tss_to_gene: Also add the number of TSSs at gene level.
    :param gene_key: Gene-level attribute key for the TSS count.
    """

    gtf = GTF(inputfile, check_ensembl_format=True)

    # gene_id -> {transcript_id: tss_position}
    gn_tss_dist = defaultdict(dict)

    message("Getting TSSs.")
    tss = gtf.get_tss(name=["transcript_id"], as_dict=True)
    tx_to_gn = gtf.get_tx_to_gn()

    for k in tss:
        gn_id = tx_to_gn[k]
        gn_tss_dist[gn_id][k] = int(tss[k])

    # if_dict_of_dict is true, get_gn_to_tx() returns a dict of dict
    # that maps gene_id to transcript_id and transcript_id to TSS
    # numbering (1 for most 5', then 2...). For transcripts having
    # the same TSSs, the tss number will be the same.
    gn_to_tx_to_tss = gtf.get_gn_to_tx(as_dict_of_dict=True)

    message("Numbering TSSs.")

    tss_number_file = make_tmp_file(prefix='tx_to_tss_number', suffix='.txt')

    # gene_id -> highest TSS number seen (i.e. number of distinct TSSs).
    gn_how_many_tss = dict()

    for gn_id in gn_to_tx_to_tss:
        for tx_id in gn_to_tx_to_tss[gn_id]:
            tss_num = str(gn_to_tx_to_tss[gn_id][tx_id])
            tss_number_file.write(tx_id + "\t" + tss_num + "\n")

            if gn_id not in gn_how_many_tss:
                gn_how_many_tss[gn_id] = tss_num
            else:
                if int(tss_num) > int(gn_how_many_tss[gn_id]):
                    gn_how_many_tss[gn_id] = tss_num

    tss_number_file.close()

    gtf = gtf.add_attr_from_file(feat='transcript',
                                 key='transcript_id',
                                 new_key=key_name,
                                 inputfile=open(tss_number_file.name),
                                 has_header=False)

    if add_nb_tss_to_gene:

        gn_how_many_tss_file = make_tmp_file(prefix='gn_how_many_tss',
                                             suffix='.txt')

        for a_key, a_val in gn_how_many_tss.items():
            gn_how_many_tss_file.write(a_key + "\t" + a_val + "\n")

        gn_how_many_tss_file.close()

        gtf = gtf.add_attr_from_file(feat='gene',
                                     key='gene_id',
                                     new_key=gene_key,
                                     inputfile=open(gn_how_many_tss_file.name),
                                     has_header=False)

    if compute_dist:
        gn_to_tx_ordered_by_tss = gtf.get_gn_to_tx(ordered_5p=True)
        tss_dist_file = make_tmp_file(prefix='tx_tss_dist_to_first_tss',
                                      suffix='.txt')

        for gn_id in gn_to_tx_to_tss:
            # Transcripts ordered 5' -> 3'; the most 5' is the reference.
            tx_list = gn_to_tx_ordered_by_tss[gn_id]
            tx_first = tx_list.pop(0)
            # The first tss as distance 0 to the
            # first tss...
            tss_dist_file.write(tx_first + "\t0\n")
            for tx_id in tx_list:
                dist_to_first = abs(int(tss[tx_first]) - int(tss[tx_id]))
                tss_dist_file.write(tx_id + "\t" + str(dist_to_first) + "\n")
        tss_dist_file.close()

        gtf = gtf.add_attr_from_file(feat='transcript',
                                     key='transcript_id',
                                     new_key=key_name_dist,
                                     inputfile=open(tss_dist_file.name),
                                     has_header=False)

    gtf.write(outputfile, gc_off=True)
    close_properly(outputfile, inputfile)
def divergent(
        inputfile=None,
        outputfile=None,
        key_name=None,
        upstream=1500,
        downstream=1500,
        chrom_info=None,
        no_strandness=False,
        no_annotation=False):
    """
    Find transcript with divergent promoters.

    :param inputfile: GTF input (file object accepted by GTF()).
    :param outputfile: Output GTF file object.
    :param key_name: Attribute key for the divergent transcript (defaults
        to 'divergent'; the distance key derives from it).
    :param upstream: Upstream extension of the promoter window.
    :param downstream: Downstream extension of the promoter window.
    :param chrom_info: File object with chromosome sizes (used by slop).
    :param no_strandness: If True, also accept same-strand TSS hits.
    :param no_annotation: If True, only output the matching transcripts.
    """

    message("Using -u " + str(upstream) + ".")
    message("Using -d " + str(downstream) + ".")

    # tx_with_divergent: transcript_id -> closest divergent transcript.
    # dist_to_divergent: transcript_id -> distance between the two TSSs.
    # tss_pos: transcript_id -> TSS position.
    tx_with_divergent = dict()
    dist_to_divergent = dict()
    tss_pos = dict()

    message("Loading GTF.")
    gtf = GTF(inputfile)

    message("Getting transcript coordinates.")
    tx_feat = gtf.select_by_key("feature", "transcript")

    message("Getting tss coordinates.")
    # Names carry both ids so gene/transcript can be recovered after
    # the intersection ("||" is the field separator).
    tss_bo = tx_feat.get_tss(name=["transcript_id", "gene_id"], sep="||")

    # get tss position
    for i in tss_bo:
        tx_id_tss, gn_id_tss = i.name.split("||")
        tss_pos[tx_id_tss] = int(i.start)

    message("Getting promoter coordinates.")
    promoter_bo = tss_bo.slop(s=True,
                              l=upstream,
                              r=downstream,
                              g=chrom_info.name).cut([0, 1, 2, 3, 4, 5])

    message("Intersecting...")

    # S=True restricts hits to the opposite strand (divergence);
    # no_strandness lifts that restriction.
    if no_strandness:
        prom_with_tss_bo = promoter_bo.intersect(tss_bo,
                                                 wb=True,
                                                 s=False,
                                                 S=False)
    else:
        prom_with_tss_bo = promoter_bo.intersect(tss_bo,
                                                 wb=True,
                                                 s=False,
                                                 S=True)

    # Keep intermediate files for debugging purposes.
    tmp_file = make_tmp_file("promoter_slop", ".bed")
    promoter_bo.saveas(tmp_file.name)
    tmp_file = make_tmp_file("promoter_intersection_with_tss_as_", ".bed")
    prom_with_tss_bo.saveas(tmp_file.name)

    for i in prom_with_tss_bo:
        tx_id_tss, gn_id_tss = i.fields[9].split("||")
        tx_id_prom, gene_id_prom = i.fields[3].split("||")

        # Only consider hits from a different gene; keep the closest one.
        if gene_id_prom != gn_id_tss:
            if tx_id_prom in tx_with_divergent:
                dist = abs(tss_pos[tx_id_prom] - tss_pos[tx_id_tss])
                if dist < dist_to_divergent[tx_id_prom]:
                    dist_to_divergent[tx_id_prom] = dist
                    tx_with_divergent[tx_id_prom] = tx_id_tss
            else:
                dist = abs(tss_pos[tx_id_prom] - tss_pos[tx_id_tss])
                dist_to_divergent[tx_id_prom] = dist
                tx_with_divergent[tx_id_prom] = tx_id_tss

    if not no_annotation:
        if key_name is None:
            key_name = "divergent"
            key_name_dist = "dist_to_divergent"
        else:
            key_name_dist = "dist_" + key_name

        if len(tx_with_divergent):
            gtf = gtf.add_attr_from_dict(feat="transcript",
                                         key="transcript_id",
                                         a_dict=tx_with_divergent,
                                         new_key=key_name)
            gtf = gtf.add_attr_from_dict(feat="transcript",
                                         key="transcript_id",
                                         a_dict=dist_to_divergent,
                                         new_key=key_name_dist)
        gtf.write(outputfile, gc_off=True)
    else:
        # Only keep transcripts that do have a divergent partner.
        gtf.select_by_key("transcript_id",
                          ",".join(list(tx_with_divergent.keys()))).write(
            outputfile, gc_off=True)

    close_properly(outputfile, inputfile)
def rm_dup_tss(inputfile=None, outputfile=None): """If several transcripts of a gene share the same tss, select only one.""" # ---------------------------------------------------------------------- # Get the TSS # ---------------------------------------------------------------------- gtf = GTF(inputfile) tss_bo = gtf.get_tss(["gene_id", "transcript_id"]) # ---------------------------------------------------------------------- # Sort the file by name (4th col) to ensure reproducibility between calls. # ---------------------------------------------------------------------- with open(tss_bo.fn) as f: lines = [line.split('\t') for line in f] tmp_file = make_tmp_file(prefix="tss_sorted_by_tx_id", suffix=".bed") for line in sorted(lines, key=operator.itemgetter(3)): tmp_file.write('\t'.join(line)) tmp_file.close() tss_bo = BedTool(tmp_file.name) # ---------------------------------------------------------------------- # Get the list of non redundant TSSs # ---------------------------------------------------------------------- gene_dict = defaultdict(dict) to_delete = [] message("Looking for redundant TSS (gene-wise).") for line in tss_bo: tss = line.start name = line.name gene_id, tx_id = name.split("|") if gene_id in gene_dict: if tss not in gene_dict[gene_id]: gene_dict[gene_id][tss] = tx_id else: to_delete += [tx_id] else: gene_dict[gene_id][tss] = tx_id message("Deleted transcripts: " + ",".join(to_delete[1:min(10, len(to_delete))]) + "...", type="DEBUG") # ---------------------------------------------------------------------- # Write # ---------------------------------------------------------------------- gtf.select_by_key("feature", "gene", invert_match=True).select_by_key( "transcript_id", ",".join(to_delete), invert_match=True).write(outputfile, gc_off=True)