def annotate_peaks(peaks, ref_path):
    """Peak-to-gene annotation strategy:
    1. if a peak overlaps the promoter region (-1kb, +100) of any TSS, call it a promoter peak
    2. if a peak is within 200kb of the closest TSS, and it is not a promoter peak, call it a distal peak
    3. if a peak overlaps a transcript, and it is not a promoter nor a distal peak of the gene,
       call it a distal peak (this step is optional)
    4. otherwise, call it an intergenic peak
    """
    ref_mgr = ReferenceManager(ref_path)
    tss = BedTool(ref_mgr.tss_track)

    # If tss.bed contains a 7th column (gene type), apply the filter. Otherwise use all TSS sites.
    if tss.field_count() == 7:
        tss_filtered = tss.filter(lambda x: x[6] in TRANSCRIPT_ANNOTATION_GENE_TYPES).saveas()
    else:
        df_tss = tss.to_dataframe()
        df_tss['gene_type'] = '.'
        tss_filtered = BedTool.from_dataframe(df_tss).saveas()

    # Including transcripts.bed is optional
    if ref_mgr.transcripts_track is None:
        transcripts_filtered = BedTool([])
    else:
        transcripts = BedTool(ref_mgr.transcripts_track)
        if transcripts.field_count() == 7:
            transcripts_filtered = transcripts.filter(lambda x: x[6] in TRANSCRIPT_ANNOTATION_GENE_TYPES).saveas()
        else:
            df_tx = transcripts.to_dataframe()
            df_tx['gene_type'] = '.'
            transcripts_filtered = BedTool.from_dataframe(df_tx).saveas()

    # Run bedtools closest for peaks against the filtered TSS, group by peak and
    # summarize annotations from selected columns.
    peaks_nearby_tss = peaks.closest(tss_filtered, D='b', g=ref_mgr.fasta_index) \
                            .groupby(g=[1, 2, 3], c=[7, 11], o=['collapse']).saveas()

    results = []
    peaks_nearby_tss_butno_tx = peaks_nearby_tss.intersect(transcripts_filtered, v=True).saveas()

    # Avoid an error when no peaks overlap with any transcripts
    if len(peaks_nearby_tss_butno_tx) < len(peaks_nearby_tss):
        peaks_nearby_tss_and_tx = peaks_nearby_tss \
            .intersect(transcripts_filtered, wa=True, wb=True) \
            .groupby(g=[1, 2, 3, 4, 5], c=[9], o=['distinct'])

        for peak in peaks_nearby_tss_and_tx:
            results.append(get_peak_nearby_genes(peak))

    for peak in peaks_nearby_tss_butno_tx:
        results.append(get_peak_nearby_genes(peak))

    return results
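# A minimal sketch of the four-step decision rule from the docstring above, applied
# to a single peak. The inputs (`distance_to_tss`, `overlaps_promoter`,
# `overlaps_transcript`) are hypothetical values that would be derived from the
# closest/intersect results; the 200kb cutoff comes straight from the docstring.
def classify_peak(distance_to_tss, overlaps_promoter, overlaps_transcript):
    if overlaps_promoter:                # step 1: promoter peak
        return "promoter"
    if abs(distance_to_tss) <= 200_000:  # step 2: distal peak near a TSS
        return "distal"
    if overlaps_transcript:              # step 3: optional distal call via transcript overlap
        return "distal"
    return "intergenic"                  # step 4: fallback

# e.g. classify_peak(-5_000, False, False) -> "distal"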
def xstream(a, b, distance, updown, out):
    """
    Find all things in b that are within distance of a
    in the given direction (up- or down-stream).
    """
    direction = dict(u="l", d="r")[updown[0]]
    kwargs = {'sw': True, direction: distance}

    if "l" in kwargs:
        kwargs["r"] = 0
    else:
        kwargs["l"] = 0

    a = BedTool(a).saveas()

    kwargs['stream'] = True
    c = a.window(b, **kwargs)
    afields = a.field_count()

    seen = collections.defaultdict(set)
    for feat in c:
        key = "\t".join(feat[:afields])
        # keep track of all the feature names that overlap this one
        seen[key].update((feat[afields + 3],))

    # the entries that did appear in the window
    for row in seen:
        out.write(row + "\t" + ",".join(sorted(seen[row])) + "\n")

    # write the entries that did not appear in the windowed BED
    for row in a:
        key = "\t".join(row[:afields])
        if key in seen:
            continue
        out.write(str(row) + "\t.\n")

    out.flush()
    assert len(BedTool(out.name)) == len(a)
def _iter_pairwise_connections(
        clusterable_bedtool: pybedtools.BedTool,
        min_reciprocal_overlap: float,
        min_sample_overlap: float = 0,
        is_carrier: Mapping[Text, numpy.ndarray] = MappingProxyType({})
) -> Iterator[Tuple[Text, Text]]:
    """
    Iterate over pairs of variant intervals that meet the minimum requirement for
    reciprocal overlap. Exclude self-overlaps. Optionally impose a minimum Jaccard
    index for carrier samples.

    Parameters
    ----------
    clusterable_bedtool: BedTool
        bed object with intervals that may overlap each other
    min_reciprocal_overlap: float
        minimum reciprocal overlap for two intervals to be connected
    min_sample_overlap: float (default=0)
        minimum Jaccard index of carrier samples for two intervals to be connected
    is_carrier: Mapping[Text, numpy.ndarray]
        map from variant ID to carrier status (boolean array, True/False for each sample)

    Yields
    ------
    variant_id_1, variant_id_2: Tuple[Text, Text]
        successive pairs of variant IDs that meet the overlap requirements
    """
    # Cluster intervals based on reciprocal overlap
    if len(clusterable_bedtool) == 0:
        return
    overlap_bedtool = clusterable_bedtool.intersect(
        clusterable_bedtool, f=min_reciprocal_overlap, r=True,
        wa=True, wb=True, sorted=True, nonamecheck=True
    )

    num_1_fields = clusterable_bedtool.field_count()
    name_1_field = name_field
    sv_type_1_field = sv_type_field
    name_2_field = num_1_fields + name_field
    sv_type_2_field = num_1_fields + sv_type_field

    if min_sample_overlap > 0:
        for overlap in overlap_bedtool:
            fields = overlap.fields
            if fields[sv_type_1_field] != fields[sv_type_2_field]:
                continue  # only cluster same sv_type
            name_1 = fields[name_1_field]
            name_2 = fields[name_2_field]
            if name_1 != name_2 and jaccard_index(
                    is_carrier[name_1], is_carrier[name_2]) >= min_sample_overlap:
                yield name_1, name_2
    else:
        for overlap in overlap_bedtool:
            fields = overlap.fields
            if fields[sv_type_1_field] != fields[sv_type_2_field]:
                continue  # only cluster same sv_type
            name_1 = fields[name_1_field]
            name_2 = fields[name_2_field]
            if name_1 != name_2:
                yield name_1, name_2
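# `jaccard_index` is referenced above but not defined in this snippet. A minimal
# sketch for boolean carrier arrays, assuming both arrays cover the same sample set:
import numpy

def jaccard_index(carriers_a: numpy.ndarray, carriers_b: numpy.ndarray) -> float:
    # |A & B| / |A | B|, with the empty/empty case defined as 0.0
    union = numpy.count_nonzero(carriers_a | carriers_b)
    if union == 0:
        return 0.0
    return numpy.count_nonzero(carriers_a & carriers_b) / union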
def get_annotation_gene_types(args):
    """Return the gene types used to filter gene/transcript annotations."""
    ref_mgr = ReferenceManager(args.reference_path)
    tss = BedTool(ref_mgr.tss_track)
    if tss.field_count() == 7:
        return TRANSCRIPT_ANNOTATION_GENE_TYPES
    else:
        return None
def extract_ge_folchange_per_peak(peaks, tables, closestMapping, features, IdColumn, hm):
    """
    Update the values of the input matrix by appending the requested values
    from the given tables when closest genes have been found.
    """
    # keyMap_closest: peak_key -> gene_id
    # peak_keys: cols 1-7 from BED format (1-based index)
    Peaks = BedTool(peaks)
    Peaks = Peaks.sort()
    field_count = Peaks.field_count()
    keyMap_closest = keymap_from_closest_genes(closestMapping, peaks, field_count)
    __update_matrix_values(peaks, keyMap_closest, tables, features, IdColumn, hm)
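# `keymap_from_closest_genes` is not defined in this snippet; a minimal sketch of
# what the comments describe (assumptions: `closest_output` is bedtools-closest
# output, the b-file is BED6+, so the closest gene's name sits 3 columns past the
# end of the peak fields).
def keymap_from_closest_genes(closest_output, peaks, field_count):
    keymap = {}
    for feat in closest_output:
        key = "\t".join(feat.fields[:field_count])        # the peak's own columns
        gene_id = feat.fields[field_count + 3]            # name column of closest gene (assumed)
        keymap.setdefault(key, []).append(gene_id)
    return keymap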
def add_closest(aname, bname):
    a, b = BedTool(aname), BedTool(bname)
    afields = a.field_count()
    c = a.closest(b, d=True)
    get_name = gen_get_name(b, afields)

    dbed = open(BedTool._tmp(), "w")
    # keep the name and distance
    seen_by_line = collections.defaultdict(list)
    for feat in c:
        key = "\t".join(feat[:afields])
        seen_by_line[key].append([feat[-1], get_name(feat)])

    for key, dist_names in seen_by_line.items():
        if len(dist_names) > 0:
            assert len(set([d[0] for d in dist_names])) == 1
            names = ",".join(sorted(set(d[1] for d in dist_names)))
            new_line = "\t".join([key] + [names] + [dist_names[0][0]])
            dbed.write(new_line + "\n")
    dbed.close()

    d = BedTool(dbed.name)
    assert len(d) == len(a)
    return d
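# `gen_get_name` is used above but not defined here. A plausible minimal sketch:
# return a callable that extracts the name column of the b-side feature from a
# line of `closest` output (assumption: when b has >= 4 fields, its name column
# sits 3 columns past the start of the b block; otherwise fall back to coordinates).
def gen_get_name(b, afields):
    bfields = b.field_count()
    if bfields < 4:
        # b has no name column; fall back to the b-side coordinates
        return lambda feat: "%s:%s-%s" % tuple(feat[afields:afields + 3])
    return lambda feat: feat[afields + 3]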
def _make_target_bed(self, bed_fpath, work_dir, output_dir, is_debug,
                     padding=None, fai_fpath=None, genome=None, reannotate=False):
    clean_target_bed_fpath = intermediate_fname(work_dir, bed_fpath, 'clean')
    if not can_reuse(clean_target_bed_fpath, bed_fpath):
        debug()
        debug('Cleaning target BED file...')
        bed = BedTool(bed_fpath)
        if bed.field_count() > 4:
            bed = bed.cut(range(4))
        bed = bed \
            .filter(lambda x: x.chrom and not any(
                x.chrom.startswith(e) for e in ['#', ' ', 'track', 'browser'])) \
            .remove_invalid()
        with file_transaction(work_dir, clean_target_bed_fpath) as tx:
            bed.saveas(tx)
        debug('Saved to ' + clean_target_bed_fpath)
        verify_file(clean_target_bed_fpath, is_critical=True)

    sort_target_bed_fpath = intermediate_fname(work_dir, clean_target_bed_fpath, 'sorted')
    if not can_reuse(sort_target_bed_fpath, clean_target_bed_fpath):
        debug()
        debug('Sorting target BED file...')
        sort_target_bed_fpath = sort_bed(clean_target_bed_fpath,
                                         output_bed_fpath=sort_target_bed_fpath,
                                         fai_fpath=fai_fpath)
        debug('Saved to ' + sort_target_bed_fpath)
        verify_file(sort_target_bed_fpath, is_critical=True)

    if genome in ebl.SUPPORTED_GENOMES:
        ann_target_bed_fpath = intermediate_fname(work_dir, sort_target_bed_fpath, 'ann_plus_features')
        if not can_reuse(ann_target_bed_fpath, sort_target_bed_fpath):
            debug()
            if BedTool(sort_target_bed_fpath).field_count() == 3 or reannotate:
                debug('Annotating target BED file and collecting overlapping genome features')
                overlap_with_features(sort_target_bed_fpath, ann_target_bed_fpath, work_dir=work_dir,
                                      genome=genome, extended=True, reannotate=reannotate, only_canonical=True)
            else:
                debug('Overlapping with genomic features:')
                overlap_with_features(sort_target_bed_fpath, ann_target_bed_fpath, work_dir=work_dir,
                                      genome=genome, extended=True, only_canonical=True)
            debug('Saved to ' + ann_target_bed_fpath)
            verify_file(ann_target_bed_fpath, is_critical=True)
    else:
        ann_target_bed_fpath = sort_target_bed_fpath

    final_clean_target_bed_fpath = intermediate_fname(work_dir, ann_target_bed_fpath, 'clean')
    if not can_reuse(final_clean_target_bed_fpath, ann_target_bed_fpath):
        bed = BedTool(ann_target_bed_fpath).remove_invalid()
        with file_transaction(work_dir, final_clean_target_bed_fpath) as tx:
            bed.saveas(tx)
        verify_file(final_clean_target_bed_fpath, is_critical=True)

    self.bed_fpath = final_clean_target_bed_fpath
    self.bed = BedTool(self.bed_fpath)

    self.capture_bed_fpath = add_suffix(join(output_dir, basename(bed_fpath)), 'clean_sorted_ann')
    if not can_reuse(self.capture_bed_fpath, self.bed_fpath):
        with file_transaction(work_dir, self.capture_bed_fpath) as tx:
            self.get_capture_bed().saveas(tx)

    gene_key_set, gene_key_list = get_genes_from_bed(bed_fpath)
    self.gene_keys_set = gene_key_set
    self.gene_keys_list = gene_key_list
    self.regions_num = self.get_capture_bed().count()

    self._make_qualimap_bed(work_dir)
    if padding:
        self._make_padded_bed(work_dir, fai_fpath, padding)
def mk_matrix(inputfile=None,
              outputfile=None,
              bigwiglist=None,
              ft_type=None,
              pseudo_count=0,
              upstream=1000,
              downstream=1000,
              bin_around_frac=0.1,
              chrom_info=None,
              bin_nb=100,
              nb_proc=None,
              labels=None,
              no_stranded=False,
              zero_to_na=False):
    """
    Description: Create a matrix to be used by the 'profile' and 'heatmap' commands.
    """

    # -------------------------------------------------------------------------
    # Check argument consistency
    # -------------------------------------------------------------------------

    if ft_type in ['single_nuc', 'promoter', 'tts']:
        region_size = upstream + downstream + 1
        if region_size < bin_nb:
            message("The region (-u/-d) needs to be extended given the number "
                    "of bins (--bin-nb)",
                    type="ERROR")

    # -------------------------------------------------------------------------
    # Check that the output file name does not end with .zip
    # -------------------------------------------------------------------------

    if outputfile.name.endswith(".zip"):
        outfn = outputfile.name.replace(".zip", "")
        outputfile = open(outfn, "w")

    # -------------------------------------------------------------------------
    # Check that the input file is in BED or GTF format
    # -------------------------------------------------------------------------

    message("Loading input file...")
    if inputfile.name == '<stdin>':
        gtf = GTF(inputfile.name)
        is_gtf = True
        if ft_type == 'user_regions':
            message("--ft-type can not be set to user_regions"
                    " when a gtf is provided.",
                    type="ERROR")
    else:
        try:
            region_bo = BedTool(inputfile.name)
            len(region_bo)
        except IndexError:
            message("Unable to read the input file. Check format",
                    type="ERROR")
        if len(region_bo) == 0:
            message("Unable to find requested regions",
                    type="ERROR")

        if region_bo.file_type == 'gff':
            message('Loading the GTF file.')
            gtf = GTF(inputfile.name)
            is_gtf = True
        else:
            is_gtf = False
            if ft_type != 'user_regions' and ft_type != 'single_nuc':
                message("Set --ft-type to 'user_regions' or 'single_nuc'"
                        " when using an input bed file.",
                        type="ERROR")
            # Check that the strand is provided and
            # that it is located in the right column
            # (not checked by BedTool...).
            if region_bo.field_count() < 6:
                if not no_stranded:
                    message("Strand is undefined. Use -nst.", type="ERROR")
            else:
                region_name = dict()
                for i in region_bo:
                    if region_name.get(i.name, None) is None:
                        region_name[i.name] = 1
                    else:
                        message("Regions in bed file should have "
                                "unique identifier (col 4).",
                                type="ERROR")
                    if i.strand[0] not in ['.', '+', '-']:
                        message("Strand should be one of '+', '-' or '.'.",
                                type="ERROR")
                    if ft_type == 'single_nuc':
                        if i.end - i.start != 1:
                            message("Region length should be 1 nucleotide "
                                    "long when 'single_nuc' is set. Use 'user_regions'.",
                                    type="ERROR")
                    elif ft_type == 'user_regions':
                        if i.end - i.start == 1:
                            message("Region length should not be 1 nucleotide "
                                    "long when 'user_regions' is set. Use 'single_nuc'.",
                                    type="ERROR")

    # -------------------------------------------------------------------------
    # Create a list of labels for the diagrams.
    # Take user input into account.
    # -------------------------------------------------------------------------

    message('Checking labels.')

    if labels is not None:
        labels = labels.split(",")
        # Ensure the number of labels is the same as the number of bw files.
        if len(labels) != len(bigwiglist):
            message("The number of labels should be the same as the number of"
                    " bigwig files.", type="ERROR")
        # Ensure labels are non-redundant
        if len(labels) > len(set(labels)):
            message("Labels must be unique.", type="ERROR")
    else:
        labels = []
        for i in range(len(bigwiglist)):
            labels += [os.path.splitext(os.path.basename(bigwiglist[i].name))[0]]

    # -------------------------------------------------------------------------
    # Get the requested transcript lines in bed format.
    # Transcripts are restricted to those found on chromosomes
    # declared in the bigwig files.
    # -------------------------------------------------------------------------

    message('Getting the list of chromosomes declared in bigwig files.')
    bw_chrom = list()
    for i in bigwiglist:
        bw_chrom += list(pyBigWig.open(i.name).chroms().keys())

    bed_col = [0, 1, 2, 3, 4, 5]

    if is_gtf:
        message('Selecting chromosomes declared in bigwig from gtf.')
        # Note: the original flattened code re-assigned tmp from an unfiltered
        # select_by_key call, discarding the seqid restriction; keep only the
        # filtered selection, as the comment above intends.
        tmp = gtf.select_by_key("feature", "transcript").select_by_key(
            "seqid", ",".join(bw_chrom))

        tmp_tx_name = tmp.extract_data("transcript_id", as_list=True)

        # If several transcript records are associated with
        # the same transcript_id, raise an error.
        if len(tmp_tx_name) > len(set(tmp_tx_name)):
            message('Transcripts should have a unique identifier.',
                    type="ERROR")

        message('Selecting requested regions.')

        # ---------------------------------------------------------------------
        # Slop tss and promoters.
        # No need if a transcript was requested (it will be flanked by upstream
        # and downstream regions later on).
        # ---------------------------------------------------------------------

        if ft_type == 'transcript':
            message("Getting transcript boundaries (input gtf).")
            main_region_bo = tmp.to_bed(name=["transcript_id"])
        elif ft_type == 'promoter':
            message("Getting promoter regions [-%d,+%d]." % (upstream, downstream))
            main_region_bo = tmp.get_tss(name=["transcript_id"]).slop(
                s=True, l=upstream, r=downstream, g=chrom_info.name)
        elif ft_type == 'tts':
            main_region_bo = tmp.get_tts(name=["transcript_id"]).slop(
                s=True, l=upstream, r=downstream, g=chrom_info.name)
    else:
        message("Loading regions")
        if ft_type == 'user_regions':
            main_region_bo = BedTool(inputfile.name).cut(bed_col)
        elif ft_type == 'single_nuc':
            main_region_bo = BedTool(inputfile.name).cut(bed_col).slop(
                s=True, l=upstream, r=downstream, g=chrom_info.name)
        else:
            message("Unknown method.")

    # Save for traceability
    main_region_bed = make_tmp_file(prefix="region" + ft_type, suffix=".bed")
    main_region_bo.saveas(main_region_bed.name)

    # -------------------------------------------------------------------------
    # Print a header in the output file
    # -------------------------------------------------------------------------

    message("Preparing comments")
    comments = "#"
    comments += "ft_type:" + ft_type + ";"
    comments += "from:" + str(upstream) + ";"
    comments += "to:" + str(downstream) + ";"
    comments += "labels:" + ",".join(labels) + ";"

    # -------------------------------------------------------------------------
    # Compute coverage of requested region.
    # Each worker will send a file.
    # -------------------------------------------------------------------------

    outputfile_list = {}
    message("Using %d bins for main region." % bin_nb)

    tmp_file = bw_profile_mp(in_bed_file=main_region_bed.name,
                             nb_proc=nb_proc,
                             big_wig=[x.name for x in bigwiglist],
                             bin_nb=bin_nb,
                             pseudo_count=pseudo_count,
                             stranded=not no_stranded,
                             type="main",
                             labels=labels,
                             outputfile=outputfile.name,
                             zero_to_na=zero_to_na,
                             verbose=pygtftk.utils.VERBOSITY)

    outputfile_list["main"] = tmp_file

    # -------------------------------------------------------------------------
    # If a transcript was requested we must process flanking regions.
    # We need to retrieve coverage of the promoter [-upstream, 0]
    # as the transcript coverage window size will depend on transcript length.
    # For the promoter, the length of the windows will be fixed.
    # -------------------------------------------------------------------------

    if ft_type in ['transcript', 'user_regions']:
        # Number of bins for TTS and TSS
        around_bin_nb = int(round(bin_nb * bin_around_frac))
        if around_bin_nb < 1:
            around_bin_nb = 1

        if upstream > 0:
            if ft_type == 'transcript':
                message("Getting promoter (using %d bins)." % around_bin_nb)
                ups_region_bo = tmp.get_tss(name=["transcript_id"]).slop(
                    s=True, l=upstream, r=-1, g=chrom_info.name).cut(bed_col)
            else:
                message("Getting upstream regions (%d bins)." % around_bin_nb)
                ups_region_bo = main_region_bo.flank(
                    s=True, l=upstream, r=0, g=chrom_info.name)

            upstream_bed_file = make_tmp_file(prefix="upstream_region" + ft_type,
                                              suffix=".bed")
            ups_region_bo.saveas(upstream_bed_file.name)

            tmp_file = bw_profile_mp(in_bed_file=upstream_bed_file.name,
                                     nb_proc=nb_proc,
                                     big_wig=[x.name for x in bigwiglist],
                                     bin_nb=around_bin_nb,
                                     pseudo_count=pseudo_count,
                                     stranded=not no_stranded,
                                     type="upstream",
                                     labels=labels,
                                     outputfile=outputfile.name,
                                     zero_to_na=zero_to_na,
                                     verbose=pygtftk.utils.VERBOSITY)

            outputfile_list["upstream"] = tmp_file

        if downstream > 0:
            if ft_type == 'transcript':
                message("Getting TTS (using %d bins)." % around_bin_nb)
                dws_region_bo = tmp.get_tts(name=["transcript_id"]).slop(
                    s=True, l=-1, r=downstream, g=chrom_info.name).cut(bed_col)
            else:
                message("Getting downstream regions (%d bins)." % around_bin_nb)
                dws_region_bo = main_region_bo.flank(
                    s=True, l=0, r=downstream, g=chrom_info.name)

            dws_bed_file = make_tmp_file(prefix="downstream_region" + ft_type,
                                         suffix=".bed")
            dws_region_bo.saveas(dws_bed_file.name)

            tmp_file = bw_profile_mp(in_bed_file=dws_bed_file.name,
                                     nb_proc=nb_proc,
                                     big_wig=[x.name for x in bigwiglist],
                                     bin_nb=around_bin_nb,
                                     pseudo_count=pseudo_count,
                                     stranded=not no_stranded,
                                     type="downstream",
                                     labels=labels,
                                     outputfile=outputfile.name,
                                     zero_to_na=zero_to_na,
                                     verbose=pygtftk.utils.VERBOSITY)

            outputfile_list["downstream"] = tmp_file

    # -------------------------------------------------------------------------
    # Merge files using pandas
    # -------------------------------------------------------------------------

    message("Reading (pandas): " + outputfile_list["main"].name, type="DEBUG")
    df_main = pd.read_csv(outputfile_list["main"].name, sep="\t")
    # Save start and end; they will be re-joined later.
    df_copy = df_main[['bwig', 'chrom', 'gene', 'strand', 'start', 'end']]

    df_start = df_main.pop('start')
    df_end = df_main.pop('end')

    if "upstream" in outputfile_list:
        message("Merging upstream file")
        message("Reading (pandas): " + outputfile_list["upstream"].name, type="DEBUG")
        df_up = pd.read_csv(outputfile_list["upstream"].name, sep="\t")
        df_up = df_up.drop(['start', 'end'], axis=1)
        df_main = df_up.merge(df_main.loc[:, df_main.columns],
                              on=['bwig', 'chrom', 'gene', 'strand'])

    if "downstream" in outputfile_list:
        message("Merging downstream file")
        message("Reading (pandas): " + outputfile_list["downstream"].name, type="DEBUG")
        df_dws = pd.read_csv(outputfile_list["downstream"].name, sep="\t")
        df_dws = df_dws.drop(['start', 'end'], axis=1)
        df_main = df_main.merge(df_dws.loc[:, df_dws.columns],
                                on=['bwig', 'chrom', 'gene', 'strand'])

    # Re-join start and end.
    df_main = df_main.merge(df_copy.loc[:, df_copy.columns],
                            on=['bwig', 'chrom', 'gene', 'strand'])
    df_start = df_main.pop('start')
    df_end = df_main.pop('end')
    df_main.insert(2, 'start', df_start)
    df_main.insert(3, 'end', df_end)

    message("Writing to file")
    outputfile.close()

    with open(outputfile.name, 'a') as f:
        f.write(comments + "\n")
        df_main.to_csv(f, sep="\t", index=False, mode='a',
                       columns=df_main.columns, na_rep='NA')

    # -------------------------------------------------------------------------
    # Compress
    # -------------------------------------------------------------------------

    message("Compressing")
    path = os.path.abspath(outputfile.name)
    filename = os.path.basename(path)
    message("filename: " + filename, type="DEBUG")
    zip_filename = filename + '.zip'
    message("zip_filename: " + zip_filename, type="DEBUG")
    zip_path = os.path.join(os.path.dirname(path), zip_filename)
    message("zip_path: " + zip_path, type="DEBUG")

    with zipfile.ZipFile(zip_path, 'w', allowZip64=True) as zf:
        zf.write(filename=path, arcname=filename)

    for i in outputfile_list:
        message("deleting " + outputfile_list[i].name)
        os.remove(outputfile_list[i].name)
    os.remove(outputfile.name)

    gc.disable()
    close_properly(inputfile, outputfile)
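# Toy illustration of the merge step above (hypothetical column values): the
# upstream/main frames share the (bwig, chrom, gene, strand) key and each
# contributes its own bin columns to the merged matrix.
import pandas as pd

df_up = pd.DataFrame({'bwig': ['s1'], 'chrom': ['chr1'], 'gene': ['tx1'],
                      'strand': ['+'], 'u0': [0.5], 'u1': [0.7]})
df_main = pd.DataFrame({'bwig': ['s1'], 'chrom': ['chr1'], 'gene': ['tx1'],
                        'strand': ['+'], 'b0': [1.0], 'b1': [2.0]})
merged = df_up.merge(df_main, on=['bwig', 'chrom', 'gene', 'strand'])
print(merged.columns.tolist())
# ['bwig', 'chrom', 'gene', 'strand', 'u0', 'u1', 'b0', 'b1']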
def __call__(self, string):
    # ---------------------------------------------------------------
    # Check file extension
    # ---------------------------------------------------------------

    fasta_format_1 = r'(\.[Ff][Aa][Ss][Tt][Aa]$)|(\.[Ff][Nn][Aa]$)'
    fasta_format_2 = r'|(\.[Ff][Aa]$)|(\.[Ff][Aa][Ss]$)|(\.[Ff][Ff][Nn]$)|(\.[Ff][Rr][Nn]$)'
    fasta_regexp = fasta_format_1 + fasta_format_2
    fasta_regexp_gz = re.sub(r"\$", r"\.[Gg][Zz]$", fasta_regexp)
    bed_regexp = r'\.[Bb][Ee][Dd][3456]{0,1}$'
    bed_regexp_gz = re.sub(r"\$", r"\.[Gg][Zz]$", bed_regexp)
    gtf_regexp = r'\.[Gg][Tt][Ff]$'
    gtf_regexp_gz = re.sub(r"\$", r"\.[Gg][Zz]$", gtf_regexp)
    txt_regexp = r'(\.[Tt][Xx][Tt]$)|(\.[Cc][Ss][Vv]$)|(\.[Dd][Ss][Vv]$)|(\.[Tt][Aa][Bb]$)|(\.[Tt][Ss][Vv]$)'
    txt_regexp_gz = re.sub(r"\$", r"\.[Gg][Zz]$", txt_regexp)
    bigwig_regexp = r'(\.[Bb][Ww]$)|(\.[Bb][Ii][Gg][Ww][Ii][Gg]$)'
    zip_regexp = r'\.[Zz][Ii][Pp]$'
    pdf_regexp = r'\.[Pp][Dd][Ff]$'

    ext2regexp = {'bed': bed_regexp,
                  'bed.gz': bed_regexp_gz,
                  'gtf': gtf_regexp,
                  'gtf.gz': gtf_regexp_gz,
                  'fasta': fasta_regexp,
                  'fasta.gz': fasta_regexp_gz,
                  'txt': txt_regexp,
                  'txt.gz': txt_regexp_gz,
                  'bigwig': bigwig_regexp,
                  'zip': zip_regexp,
                  'pdf': pdf_regexp}

    # Set verbosity system-wide because, depending on
    # command line argument order, VERBOSITY (-V) can
    # be evaluated later...
    if '-V' in sys.argv:
        sys_args = ' '.join(sys.argv)
        verbosity_val = re.search('-V ?([01234])?', sys_args)
        # group(1) may be absent if -V is given without a level
        if verbosity_val and verbosity_val.group(1):
            pygtftk.utils.VERBOSITY = int(verbosity_val.group(1))
        else:
            pygtftk.utils.VERBOSITY = 0

    match = False
    if isinstance(self.file_ext, str):
        extension_list = [self.file_ext]
    else:
        extension_list = list(self.file_ext)

    for this_ext in extension_list:
        if re.search(ext2regexp[this_ext], string):
            match = True
            break

    if not match:
        message('Not a valid filename extension: ' + string, type="WARNING")
        message('Extension expected: ' + ext2regexp[this_ext], type="ERROR")
        sys.exit()

    # ---------------------------------------------------------------
    # Check directory
    # ---------------------------------------------------------------

    outputdir = os.path.dirname(os.path.abspath(string))
    if not os.path.exists(outputdir):
        if 'w' in self._mode:
            message("Directory not found. Creating.", type="WARNING")
            os.makedirs(outputdir)

    # ---------------------------------------------------------------
    # Check format
    # ---------------------------------------------------------------

    # If bed3, bed4 or bed5, convert to bed6
    if self._mode == 'r':
        if self.file_ext == 'bed':
            message("Checking BED file format (" + string + ").", type="INFO")
            try:
                file_bo = BedTool(string)
                nb_line = len(file_bo)
            except Exception:
                msg = "Unable to load file: " + string + "."
                message(msg, type="ERROR")
                sys.exit()

            if nb_line == 0:
                msg = "It seems that file " + string + " is empty."
                message(msg, type="ERROR")
                sys.exit()

            if file_bo.file_type != 'bed':
                msg = "File {f} is not a valid bed file."
                msg = msg.format(f=string)
                message(msg, type="ERROR")
                sys.exit()

            region_nb = 0
            field_count = file_bo.field_count()

            if field_count != 6:
                message("Converting to bed6 format (" + string + ").", type="WARNING")
                tmp_file = make_tmp_file(prefix="bed6_", suffix=".bed")
                for record in file_bo:
                    region_nb += 1
                    if field_count < 4:
                        name = 'region_' + str(region_nb)
                    else:
                        name = record.name
                    fields = record.fields[0:3]
                    fields += [name, '0', '.']
                    tmp_file.write("\t".join(fields) + "\n")
                close_properly(tmp_file)
                string = tmp_file.name  # we will work with string

    if 'w' in self._mode:
        self._mode = 'w'

    return super(FormattedFile, self).__call__(string)
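# Quick standalone check of the case-insensitive extension regexes defined above
# (bed_regexp copied from FormattedFile.__call__): ".bed" with an optional 3-6
# suffix matches, anything beyond that does not.
import re

bed_regexp = r'\.[Bb][Ee][Dd][3456]{0,1}$'
for fname in ("peaks.bed", "peaks.BED4", "peaks.bedgraph"):
    print(fname, bool(re.search(bed_regexp, fname)))
# peaks.bed True / peaks.BED4 True / peaks.bedgraph False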
def annotate(input_bed_fpath, output_fpath, work_dir, genome=None,
             reannotate=True, high_confidence=False,
             only_canonical=False, coding_only=False,
             short=False, extended=False, is_debug=False, **kwargs):
    debug('Getting features from storage')
    features_bed = ba.get_all_features(genome)
    if features_bed is None:
        critical('Genome ' + genome + ' is not supported. Supported: ' + ', '.join(ba.SUPPORTED_GENOMES))

    if genome:
        fai_fpath = reference_data.get_fai(genome)
        chr_order = reference_data.get_chrom_order(genome)
    else:
        fai_fpath = None
        chr_order = bed_chrom_order(input_bed_fpath)

    input_bed_fpath = sort_bed(input_bed_fpath, work_dir=work_dir, chr_order=chr_order, genome=genome)

    ori_bed = BedTool(input_bed_fpath)
    ori_col_num = ori_bed.field_count()
    reannotate = reannotate or ori_col_num == 3
    pybedtools.set_tempdir(safe_mkdir(join(work_dir, 'bedtools')))
    ori_bed = BedTool(input_bed_fpath)

    if high_confidence:
        features_bed = features_bed.filter(ba.high_confidence_filter)
    if only_canonical:
        features_bed = features_bed.filter(ba.get_only_canonical_filter(genome))
    if coding_only:
        features_bed = features_bed.filter(ba.protein_coding_filter)
    # unique_tx_by_gene = find_best_tx_by_gene(features_bed)

    info('Extracting features from Ensembl GTF')
    features_bed = features_bed.filter(
        lambda x: x[ba.BedCols.FEATURE] in ['exon', 'CDS', 'stop_codon', 'transcript'])
    # x[ebl.BedCols.ENSEMBL_ID] == unique_tx_by_gene[x[ebl.BedCols.GENE]])

    info('Overlapping regions with Ensembl data')
    if is_debug:
        ori_bed = ori_bed.saveas(join(work_dir, 'bed.bed'))
        debug(f'Saved regions to {ori_bed.fn}')
        features_bed = features_bed.saveas(join(work_dir, 'features.bed'))
        debug(f'Saved features to {features_bed.fn}')
    annotated = _annotate(ori_bed, features_bed, chr_order, fai_fpath, work_dir, ori_col_num,
                          high_confidence=False, reannotate=reannotate, is_debug=is_debug, **kwargs)

    full_header = [ba.BedCols.names[i] for i in ba.BedCols.cols]
    add_ori_extra_fields = ori_col_num > 3
    if not reannotate and ori_col_num == 4:
        add_ori_extra_fields = False  # no need to report the original gene field if we are not re-annotating

    info('Saving annotated regions...')
    total = 0
    with file_transaction(work_dir, output_fpath) as tx:
        with open(tx, 'w') as out:
            header = full_header[:6]
            if short:
                header = full_header[:4]
            if extended:
                header = full_header[:-1]
            if add_ori_extra_fields:
                header.append(full_header[-1])

            if extended:
                out.write('## ' + ba.BedCols.names[ba.BedCols.TX_OVERLAP_PERCENTAGE] +
                          ': part of region overlapping with transcripts\n')
                out.write('## ' + ba.BedCols.names[ba.BedCols.EXON_OVERLAPS_PERCENTAGE] +
                          ': part of region overlapping with exons\n')
                out.write('## ' + ba.BedCols.names[ba.BedCols.CDS_OVERLAPS_PERCENTAGE] +
                          ': part of region overlapping with protein coding regions\n')
            out.write('\t'.join(header) + '\n')

            for full_fields in annotated:
                fields = full_fields[:6]
                if short:
                    fields = full_fields[:4]
                if extended:
                    fields = full_fields[:-1]
                if add_ori_extra_fields:
                    fields.append(full_fields[-1])
                out.write('\t'.join(map(_format_field, fields)) + '\n')
                total += 1

    debug('Saved ' + str(total) + ' total annotated regions')
    return output_fpath
    for k2, i in d.items():
        l.append("%s:\t%s:\t%s" % (k1, k2, str(i).strip()))
    return "\n".join(l)


chimeras = BedTool(args.path)
if len(chimeras) == 0:
    sys.stderr.write("input file '%s' is empty\n" % args.path)
    sys.exit()
exons = BedTool(args.exons)
offset = chimeras.field_count()

chimeras_vs_exons = chimeras.intersect(exons, s=args.stranded, wao=True)

first = chimeras_vs_exons[0]
curname, cur_pair_number = first.name.split("|")
intersections = defaultdict(dict)
intervals = OrderedDict()
intervals[int(cur_pair_number)] = first
intersections[first[offset + 3]][int(cur_pair_number)] = list2interval(first[offset:])

for i in chimeras_vs_exons[1:]:
    name, pair_number = i.name.split("|")
    if name == curname:
        if pair_number == cur_pair_number:
def bw_coverage(inputfile=None,
                out_file=None,
                bw_list=None,
                pseudo_count=1,
                score=None,
                bin_nb=1,
                n_highest=None,
                nb_proc=1,
                verbose=True):
    """
    Compute transcript coverage with one or several bigWig.
    -------------------------------------------------------
    Uses bx-python as interface to kent utilities.
    """

    # Check that the score expression is well formed
    if not re.search(r"^[b\d\/\*\+\-\(\)\.]+$", score):
        sys.stderr.write("Score should contain the following characters: "
                         "b0, b1 (...) and operators +, ., -, *, /, **, (, ).")
        sys.exit(0)

    # Check that the score to compute fits with
    # the number of input bigWigs
    bw_list = bw_list.split(",")
    bwig_in_score = re.finditer(r"b\d+", score)
    bwig_expected_in_score = ["b" + str(x) for x in range(len(bw_list))]
    for i in bwig_in_score:
        if i.group(0) not in bwig_expected_in_score:
            sys.stderr.write("The indicated column (" + i.group(0) + ") was not found.")
            sys.exit(0)

    # Check the number of windows
    if n_highest is None:
        n_highest = bin_nb

    if verbose:
        sys.stderr.write("Number of bins: " + str(bin_nb) + "\n")
        sys.stderr.write("N highest values: " + str(n_highest) + "\n")

    if n_highest > bin_nb:
        sys.stderr.write("The number of windows used for computing the score"
                         " (-n) can not be greater than the number of"
                         " windows (-w)")
        sys.exit()

    # Check that the input file is in bed6 format
    region_bo = BedTool(inputfile.name)
    if region_bo.field_count() != 6:
        sys.stderr.write("Bed file should be in bed6 format. "
                         "Use '.' if strand is undefined.\n")
        sys.exit()

    tokens = intervals(range(len(BedTool(inputfile.name))), nb_proc)
    pool = multiprocessing.Pool(nb_proc)
    coverage_list = pool.map_async(big_wig_summary_worker,
                                   zip(tokens,
                                       repeat(bw_list),
                                       repeat(inputfile.name),
                                       repeat(bin_nb),
                                       repeat(pseudo_count),
                                       repeat(n_highest),
                                       repeat(verbose))).get(9999999)

    if False in coverage_list:
        sys.stderr.write("Aborting...")
        sys.exit()

    # Flatten the list of lists
    coverage_list = [item for sublist in coverage_list for item in sublist]

    # Prepare a DataFrame to collect the results
    dataframe = pd.DataFrame(columns=None)
    if verbose:
        sys.stderr.write("Retrieving results.\n")
    for i in coverage_list:
        dataframe.loc[i[0], i[1] + str(i[2])] = float(i[3])

    if verbose:
        sys.stderr.write("Computing score.\n")
    dataframe = dataframe.eval(score)
    dataframe.to_csv(out_file, sep="\t", header=False)

    close_properly(inputfile, out_file)
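# Toy illustration of how the "score" expression is evaluated: b0, b1, ... name
# per-bigWig coverage columns, and the expression string is passed straight to
# pandas DataFrame.eval() (hypothetical values below).
import pandas as pd

df = pd.DataFrame({'b0': [10.0, 4.0], 'b1': [2.0, 2.0]})
print(df.eval("b0 / (b1 + 1)"))  # row-wise score, here [3.333..., 1.333...]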
    i2 = Interval(interval.chrom, start, stop, interval.name, interval.score, interval.strand)
    if interval.strand == '-':
        return i2, i1
    else:
        return i1, i2


coverage = coverage2dict(args.coverage)
genome = SeqIO.to_dict(SeqIO.parse(args.genome, "fasta"))
transcripts = BedTool(args.transcripts)
phages = BedTool(args.phages)

regions = BedTool(args.path)
regions = BedTool([x for x in regions if float(x.score) > args.zscore])
OFFSET = regions.field_count()

up_downs = [get_upstream_downstream(x, genome, args.length) for x in transcripts]
upstreams = BedTool([x[0] for x in up_downs if x[0]])
downstreams = BedTool([x[1] for x in up_downs if x[1]])

phaged_regions = [regions.intersect(b=phages, u=True, f=0.5),
                  regions.intersect(b=phages, v=True, f=0.5)]
print([len(x) for x in phaged_regions])

already_discovered = set()
transcriptome_region_dict = defaultdict(list)
ph_names = ['phage', 'non-phage']
exons = []
for interval in BedTool(args.gff3):
    if 'ID' in interval.attrs and interval.attrs['ID'].split(':')[0] == 'gene':
        curname = interval.attrs['gene_id']
        enames = set()
    if interval[2] == 'exon':
        if interval.name not in enames:
            enames.add(interval.name)
            interval.name = curname
            exons.append(gff2bed(interval))

# Get an intersection between circles and expms
bed = BedTool(args.path)
offset = bed.field_count()
intersection = bed.intersect(b=exons, s=True, wao=True)

curname = ''
cexons = []
for interval in intersection:
    if curname == interval.name:
        cexons.append(tuple(interval[offset:offset + 6]))
    else:
        if curname:
            get_exons(cinterval, cexons)
        cinterval = interval
        cexons = [tuple(interval[offset:offset + 6])]
        curname = interval.name
else:
    # for/else: flush the last group once the loop completes
    get_exons(cinterval, cexons)