def read_aligner_output(rm_out, gtffile, is_stranded, tmp_dir, resume, call_all):
    """
    Use bedtools to get transcripts/genes with multi-mapped reads.
    Returns a list of transcripts/genes.
    """
    if not (resume and os.path.isfile(tmp_dir + '/gtf2multireads.bed')):
        rm_bed = pybedtools.BedTool(rm_out)
        gtf = pybedtools.BedTool(gtffile)
        gtf_bed_rm = gtf.intersect(
            rm_bed, s=True, u=True) if is_stranded else gtf.intersect(rm_bed, u=True)
        gtf_bed_rm.saveas(tmp_dir + '/gtf2multireads.bed')
        pybedtools.cleanup()
    tid_list = []
    if call_all:
        gtf_to_read = gtffile
    else:
        gtf_to_read = tmp_dir + '/gtf2multireads.bed'
    with open(gtf_to_read, 'r') as f:
        for line in f:
            ele = line.rstrip().split('\t')
            gene_id = ele[3]
            gene_chr, gene_start, gene_end = ele[0], int(ele[1]), int(ele[2])
            gene_strand = ele[5]
            tid_list.append(
                [gene_id, gene_chr, gene_strand, gene_start, gene_end])
    print_time_stamp('Read transcripts with multi-reads: ' + str(len(tid_list)))
    return tid_list
def calculate_coverage(bamfile_name, output_dir):
    os.makedirs(f'{output_dir}/tmp', exist_ok=True)
    pybedtools.set_tempdir(f'{output_dir}/tmp')
    bed = pybedtools.BedTool(bamfile_name)
    df = bed.genome_coverage(dz=True).to_dataframe(names=['contig', 'pos', 'depth'])
    pybedtools.cleanup()
    return df
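# --- Usage sketch (not from the original source) ----------------------------
# A minimal, hedged example of calling calculate_coverage() defined above.
# The BAM path and output directory are placeholder assumptions; the function
# itself must already be in scope along with its os/pybedtools/pandas imports.
bam_path = "sample.sorted.bam"      # assumed: coordinate-sorted, indexed BAM
out_dir = "coverage_out"            # assumed output directory

depth_df = calculate_coverage(bam_path, out_dir)

# Example follow-up: mean per-base depth for each contig.
print(depth_df.groupby("contig")["depth"].mean().head())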
def test_cleanup():
    """
    make sure the tempdir and cleanup work
    """
    assert os.path.abspath(pybedtools.get_tempdir()) == os.path.abspath('.')

    # make a fake tempfile, not created during this pybedtools session
    testfn = 'pybedtools.TESTING.tmp'
    os.system('touch %s' % testfn)
    assert os.path.exists(testfn)

    # make some temp files
    a = pybedtools.BedTool(os.path.join(testdir, 'data', 'a.bed'))
    b = pybedtools.BedTool(os.path.join(testdir, 'data', 'b.bed'))
    c = a.intersect(b)

    # after standard cleanup, c's fn should be gone but the fake one still
    # there...
    pybedtools.cleanup(verbose=True)
    assert os.path.exists(testfn)
    assert not os.path.exists(c.fn)

    # Unless we force the removal of all temp files.
    pybedtools.cleanup(remove_all=True)
    assert not os.path.exists(testfn)

    # a.fn and b.fn better be there still!
    assert os.path.exists(a.fn)
    assert os.path.exists(b.fn)
def GetRatioGenome():
    pulldown = pysam.Samfile(args.pulldown)
    control = pysam.Samfile(args.control)
    if args.tot_pulldown is not None:
        tot_pulldown = args.tot_pulldown
        # likely args.tot_control; the original assigned the control BAM path here
        tot_control = args.tot_control
    else:
        tot_pulldown = pulldown.mapped + pulldown.unmapped
        tot_control = control.mapped + control.unmapped
    print >> sys.stderr, "Total number of reads in pulldown sample: %d" % (
        tot_pulldown)
    print >> sys.stderr, "Total number of reads in control sample: %d" % (
        tot_control)
    # get spike-in reads within bed file
    pulldown_bam = pybedtools.BedTool(args.pulldown)
    control_bam = pybedtools.BedTool(args.control)
    spike_pulldown = pulldown_bam.intersect(args.pos, f=0.5).count()
    spike_control = control_bam.intersect(args.pos, f=0.5).count()
    print >> sys.stderr, "Total number of reads mapped in spike-in in pulldown sample: %d" % (
        spike_pulldown)
    print >> sys.stderr, "Total number of reads mapped in spike-in in control sample: %d" % (
        spike_control)
    ratio = float(spike_control) / float(spike_pulldown) * float(
        tot_pulldown) / float(tot_control)
    print >> sys.stderr, "Ratio is %.6f" % (ratio)
    pulldown.close()
    control.close()
    pybedtools.cleanup()
def vcf_to_df_worker(arg):
    """Convert CANVAS vcf to a dict, single thread"""
    canvasvcf, exonbed, i = arg
    logging.debug("Working on job {}: {}".format(i, canvasvcf))
    samplekey = op.basename(canvasvcf).split(".")[0].rsplit("_", 1)[0]
    d = {"SampleKey": samplekey}

    exons = BedTool(exonbed)
    cn = parse_segments(canvasvcf)
    overlaps = exons.intersect(cn, wao=True)
    gcn_store = {}
    for ov in overlaps:
        # Example of ov.fields:
        # [u'chr1', u'11868', u'12227', u'ENSG00000223972.5',
        #  u'ENST00000456328.2', u'transcribed_unprocessed_pseudogene',
        #  u'DDX11L1', u'.', u'-1', u'-1', u'.', u'0']
        gene_name = "|".join((ov.fields[6], ov.fields[3], ov.fields[5]))
        if gene_name not in gcn_store:
            gcn_store[gene_name] = defaultdict(int)

        cn = ov.fields[-2]
        if cn == ".":
            continue
        cn = int(cn)
        if cn > 10:
            cn = 10
        amt = int(ov.fields[-1])
        gcn_store[gene_name][cn] += amt

    for k, v in sorted(gcn_store.items()):
        v_mean, v_median = counter_mean_and_median(v)
        d[k + ".avgcn"] = v_mean
        d[k + ".medcn"] = v_median
    cleanup()
    return d
def main():
    """
    Third quick example from the documentation -- count reads in introns and
    exons, in parallel
    """
    ap = argparse.ArgumentParser(prog=os.path.basename(sys.argv[0]),
                                 usage=__doc__)
    ap.add_argument('--gff', required=True,
                    help='GFF or GTF file containing annotations')
    ap.add_argument('--bam', required=True,
                    help='BAM file containing reads to be counted')
    ap.add_argument('--stranded', action='store_true',
                    help='Use strand-specific merging and overlap. '
                         'Default is to ignore strand')
    ap.add_argument('--no-parallel', dest='noparallel', action='store_true',
                    help='Disables parallel computation')
    ap.add_argument('-o', '--output',
                    help='Optional file to which results will be written; '
                         'default is stdout')
    ap.add_argument('-v', '--verbose', action='store_true',
                    help='Verbose (goes to stderr)')
    args = ap.parse_args()

    gff = args.gff
    bam = args.bam
    stranded = args.stranded
    parallel = not args.noparallel

    # Some GFF files have invalid entries -- like chromosomes with negative
    # coords or features of length = 0. This line removes them and saves the
    # result in a tempfile
    g = pybedtools.BedTool(gff).remove_invalid().saveas()

    # Decide which version of map to use. If parallel, we only need 3
    # processes.
    pool = multiprocessing.Pool(processes=3)

    # Get separate files for introns and exons in parallel (if specified)
    featuretypes = ('intron', 'exon')
    introns, exons = pool.map(subset_featuretypes, featuretypes)

    # Perform some genome algebra to get unique and shared regions
    exon_only = exons.subtract(introns).merge().remove_invalid().saveas()
    intron_only = introns.subtract(exons).merge().remove_invalid().saveas()
    intron_and_exon = exons\
        .intersect(introns).merge().remove_invalid().saveas()

    # Do intersections with BAM file in parallel
    features = (exon_only, intron_only, intron_and_exon)
    results = pool.map(count_reads_in_features, features)

    labels = ('       exon only:', '     intron only:', 'intron and exon:')

    for label, reads in zip(labels, results):
        print('%s %s' % (label, reads))

    pybedtools.cleanup(verbose=False)
def run_spades_parallel(bam=None, spades=None, bed=None, work=None, pad=SPADES_PAD, nthreads=1, chrs=[],
                        max_interval_size=SPADES_MAX_INTERVAL_SIZE,
                        timeout=SPADES_TIMEOUT, isize_min=ISIZE_MIN, isize_max=ISIZE_MAX,
                        svs_to_assemble=SVS_ASSEMBLY_SUPPORTED,
                        stop_on_fail=False, max_read_pairs=EXTRACTION_MAX_READ_PAIRS):
    pybedtools.set_tempdir(work)

    logger.info("Running SPAdes on the intervals in %s" % bed)
    if not bed:
        logger.info("No BED file specified")
        return None, None

    bedtool = pybedtools.BedTool(bed)
    total = bedtool.count()

    chrs = set(chrs)
    all_intervals = [interval for interval in bedtool] if not chrs else [interval for interval in bedtool if
                                                                         interval.chrom in chrs]
    selected_intervals = filter(partial(should_be_assembled, max_interval_size=max_interval_size,
                                        svs_to_assemble=svs_to_assemble), all_intervals)
    ignored_intervals = filter(partial(shouldnt_be_assembled, max_interval_size=max_interval_size,
                                       svs_to_assemble=svs_to_assemble), all_intervals)

    pool = multiprocessing.Pool(nthreads)
    assembly_fastas = []
    for i in xrange(nthreads):
        intervals = [interval for (j, interval) in enumerate(selected_intervals) if (j % nthreads) == i]
        kwargs_dict = {"intervals": intervals, "bam": bam, "spades": spades, "work": "%s/%d" % (work, i),
                       "pad": pad, "timeout": timeout, "isize_min": isize_min, "isize_max": isize_max,
                       "stop_on_fail": stop_on_fail, "max_read_pairs": max_read_pairs}
        pool.apply_async(run_spades_single, kwds=kwargs_dict,
                         callback=partial(run_spades_single_callback, result_list=assembly_fastas))

    pool.close()
    pool.join()

    logger.info("Merging the contigs from %s" % (str(assembly_fastas)))
    assembled_fasta = os.path.join(work, "spades_assembled.fa")
    with open(assembled_fasta, "w") as assembled_fd:
        for line in fileinput.input(assembly_fastas):
            assembled_fd.write("%s\n" % (line.strip()))

    if os.path.getsize(assembled_fasta) > 0:
        logger.info("Indexing the assemblies")
        pysam.faidx(assembled_fasta)
    else:
        logger.error("No assembly generated")
        assembled_fasta = None

    ignored_bed = None
    if ignored_intervals:
        ignored_bed = os.path.join(work, "ignored.bed")
        pybedtools.BedTool(ignored_intervals).each(add_breakpoints).saveas(ignored_bed)

    pybedtools.cleanup(remove_all=True)

    return assembled_fasta, ignored_bed
def get_sequence(reference_fasta, coordinates, strand):
    """Takes coordinates and returns sequence
    bed_coor is space separated"""
    bed_coor = pybedtools.BedTool(coordinates, from_string=True)
    fasta = pybedtools.example_filename(reference_fasta)
    seq = bed_coor.sequence(fi=fasta)
    seq_str = open(seq.seqfn, 'r').read()
    pybedtools.cleanup(remove_all=True)
    return seq_str.replace('>', '').split('\n')[0:-1]
def getMatching(table, name, exonBed, intermediate_file_dir):
    table.to_csv(os.path.join(intermediate_file_dir, 'junc.bed'),
                 sep='\t', index=None, header=False)
    # Load the junction BED just written above; pybedtools.example_bedtool()
    # only resolves files bundled with pybedtools' example data, so BedTool()
    # is used here.
    bed = pybedtools.BedTool(os.path.join(intermediate_file_dir, 'junc.bed'))
    OverlapExon = bed.intersect(exonBed, wb=True)
    OverlapExon.saveas(os.path.join(intermediate_file_dir, 'OverlapExon.tsv'))
    matching = pd.read_csv(os.path.join(intermediate_file_dir, 'OverlapExon.tsv'), sep='\t',
                           names=[name + '_chr', name + '_start', name + '_end', 'juncID',
                                  name + '_strand', name + '_fb',
                                  name + 'gene_chromosome', name + 'gene_start', name + 'gene_end',
                                  name + 'gene_strand', name + '_gene_id', name + '_gene',
                                  name + 'gene_type'])
    pybedtools.cleanup(remove_all=True)
    return matching
def bootstrapRandom(num):
    # print(num)
    global file1, file2, genome_chrom_sizes
    shuffled_file1 = file1.shuffle(g=genome_chrom_sizes, chrom=True)
    shuffled_file2 = file2.shuffle(g=genome_chrom_sizes, chrom=True)
    f1_sorted = shuffled_file1.sort()
    f2_sorted = shuffled_file2.sort()
    shuffled_result = f1_sorted.jaccard(f2_sorted)
    pybedtools.cleanup()
    return shuffled_result['jaccard']
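# --- Bootstrap driver sketch (not from the original source) -----------------
# Hedged illustration of how bootstrapRandom() above could be driven to build
# a null Jaccard distribution. The BED paths, chrom.sizes file, and iteration
# count are assumptions; the function relies on these module-level globals.
import pybedtools

file1 = pybedtools.BedTool("peaks_A.bed").sort()          # assumed input
file2 = pybedtools.BedTool("peaks_B.bed").sort()          # assumed input
genome_chrom_sizes = "hg38.chrom.sizes"                   # assumed genome file

observed = file1.jaccard(file2)["jaccard"]                # observed statistic
null_scores = [bootstrapRandom(i) for i in range(100)]    # shuffled replicates
p_emp = sum(s >= observed for s in null_scores) / len(null_scores)
print("observed=%.4f empirical p=%.3f" % (observed, p_emp))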
def main():
    args = argparser()
    forwardsites = findSites(args.input, args.reference, "+")
    revsites = findSites(args.input, args.reference, "-")
    try:
        with open(args.output, 'wb') as of:
            for l in forwardsites + revsites:
                of.write(l + "\n")
    except TypeError:
        for l in forwardsites + revsites:
            args.output.write(l + "\n")
    pybedtools.cleanup(remove_all=True)
def on_epoch_end(self, epoch, logs=None):
    """ monitor PR """
    x_val, y_val = self.validation_data[0], self.validation_data[1]
    predictions = self.model.predict(x_val)
    au_prc = average_precision_score(y_val, predictions)
    print("\nau-PRC:", au_prc)
    self.val_auprc.append(au_prc)
    # Tmp bedfiles take up a huge amount of disk space,
    # so clean up after every 5 epochs.
    print(epoch)
    if (epoch + 1) % 5 == 0:
        pybedtools.cleanup(verbose=0)
def preprocess_bam_to_bed(bam, output):
    '''
    Given local bam file, convert reads to set of 101bp intervals and output
    as bed file. Filter for reads that pass the interval size check.
    '''
    # convert bam to bed
    vprint("Converting bam to bed...")
    bam = BedTool(bam)
    bed = bam.bam_to_bed()

    # filter intervals
    vprint("Filter reads by size...")
    bed_chunk_iter = bed.to_dataframe(chunksize=10000000)  # chunk large file
    chunks = []
    for chunk in bed_chunk_iter:
        keep = (
            chunk[["start", "end"]]
            .swifter.progress_bar(enable=True, desc=bam)
            .apply(lambda row: is_valid_interval(row["start"], row["end"]), axis=1)
        )
        chunks.append(chunk[keep])
    bed_df = pd.concat(chunks)

    # 101bp interval for input
    vprint("Define 101bp intervals...")
    bed_df["end"] = (
        bed_df["start"].swifter.progress_bar(enable=True).apply(define_interval)
    )
    bed_df["name"] = "-"

    # remove duplicates
    vprint("Drop duplicate intervals...")
    bed_df.drop_duplicates(inplace=True)

    # TODO extraneous chromosomes?
    vprint("Remove extra chromosomes...")
    chromosomes = list(range(1, 23))
    chromosomes.append('X')
    chromosomes.append('Y')
    chromosomes = [f'chr{c}' for c in chromosomes]
    bed_df = bed_df.loc[bed_df['chrom'].isin(chromosomes)]

    # Save result
    vprint(f"Saving {bed_df.shape[0]} intervals...")
    BedTool.from_dataframe(bed_df).moveto(output)

    # cleanup tmp files
    pybedtools.cleanup(remove_all=True)
    vprint("Done.")
def find_variant_fragments(query_fp, fragment_fp, output_fp, db_url, table):
    """ Create table of variant fragment IDs. """
    fragment_bed = pybedtools.BedTool(fragment_fp)
    desc = 'Finding variant fragments...'
    bar_format = '{desc}: {n_fmt} {unit}'
    t = tqdm(total=0, unit='variants', desc=desc, disable=False,
             bar_format=bar_format)
    chunksize = 200000
    cols = ['chr', 'frag_id', 'id', 'variant_id']
    pybedtools.set_tempdir(os.path.dirname(output_fp))
    db = create_engine(db_url, echo=False)
    idx = 1
    for query_df in pd.read_csv(query_fp, sep='\t', compression='gzip',
                                chunksize=chunksize,
                                usecols=['chr', 'variant_pos', 'variant_id']):
        query_df = query_df.rename(columns={'variant_pos': 'end'})
        query_df['id'] = range(idx, idx + len(query_df))
        query_df['start'] = query_df['end'] - 1
        query_bed = pybedtools.BedTool.from_dataframe(
            query_df[['chr', 'start', 'end', 'id', 'variant_id']])
        df = fragment_bed.intersect(query_bed, wb=True)
        df = df.to_dataframe(names=[
            'frag_chr', 'frag_start', 'frag_end', 'frag_id',
            'chrom', 'start', 'end', 'id', 'variant_id'
        ])
        cols = ['frag_id', 'chrom', 'id']
        mode = 'w' if t.total == 0 else 'a'
        header = 'True' if t.total == 0 else None
        df[cols].to_csv(output_fp, sep='\t', header=header, mode=mode)
        if table:
            if_exists = 'replace' if t.total == 0 else 'append'
            df[cols].to_sql(table, con=db, if_exists=if_exists, index=False)
        idx += len(query_df)
        t.total += len(query_df)
        t.update(len(query_df))
    t.close()
    pybedtools.cleanup(remove_all=True)
    if not table:
        return
    create_index(table, db)
def find_gene_fragments(query_fp, fragment_fp, output_fp, db_url, table):
    """ Create table of gene fragment IDs. """
    fragment_bed = pybedtools.BedTool(fragment_fp)
    desc = 'Finding gene fragments...'
    bar_format = '{desc}: {n_fmt} {unit}'
    t = tqdm(total=0, unit='genes', desc=desc, disable=False,
             bar_format=bar_format)
    chunksize = 2000
    pybedtools.set_tempdir(os.path.dirname(output_fp))
    db = create_engine(db_url, echo=False)
    for query_df in pd.read_csv(query_fp, sep='\t', compression='infer',
                                chunksize=chunksize):
        # query_df.columns = ['id', 'chr', 'start', 'end', 'gene', 'gencode_id']
        query_bed = pybedtools.BedTool.from_dataframe(
            query_df[['chrom', 'start', 'end', 'id', 'gencode_id']])
        # print(query_bed)
        df = fragment_bed.intersect(query_bed, wb=True)
        df = df.to_dataframe(names=[
            'frag_chr', 'frag_start', 'frag_end', 'frag_id',
            'chrom', 'start', 'end', 'id', 'gencode_id'
        ])
        cols = ['frag_id', 'chrom', 'id']
        mode = 'w' if t.total == 0 else 'a'
        header = 'True' if t.total == 0 else None
        df[cols].to_csv(output_fp, sep='\t', header=header, mode=mode,
                        index=False)
        if table:
            if_exists = 'replace' if t.total == 0 else 'append'
            df[cols].to_sql(table, con=db, if_exists=if_exists, index=False)
        t.total = len(query_df)
        t.update(len(query_df))
    t.close()
    pybedtools.cleanup(remove_all=True)
    if not table:
        return
    create_index(table, db)
def retrieve_peaks(peak_file, peak_kwd, group_name, seq_dict):
    peaks = pybedtools.BedTool(peak_file)
    num_files = len(seq_dict[peak_kwd][group_name].keys())
    print_time("{} batches to process".format(num_files), start_time)

    for file in seq_dict[peak_kwd][group_name].keys():
        if __check_exist(file):
            print_time("{} exists -- skip".format(file), start_time)
            continue

        # Initialize output signal.
        sample_index, start, stop = __file_attribute(
            seq_dict[peak_kwd][group_name][file])
        signal = np.empty((stop - start, args.sequence_length + 3))
        signal[:] = np.NaN
        signal[:, -3:-1] = sample_index

        # Find peaks that overlap with each sample sequence.
        for k in tqdm(range(start, stop)):
            ks = k - start
            signal[ks, -1] = k
            signal[ks, :args.sequence_length] = 0
            sample = Sample(seq_dict["input"][k])
            entry = "{} {} {}".format(sample.chrom, sample.start, sample.stop)
            a = pybedtools.BedTool(entry, from_string=True)
            apeaks = a.intersect(peaks)
            for p in apeaks:
                s = p.start - sample.start
                t = p.stop - sample.start
                signal[ks, s:t] = 1
            if (k + 1) % 1000 == 0:
                pybedtools.cleanup(remove_all=True)

        # Save batch data file to disk.
        np.savez_compressed(
            file,
            group_name=group_name,
            peak_type=peak_kwd,
            start=start,
            stop=stop,
            data=signal,
        )
        print_time("{} targets saved in {}".format(peak_kwd, file), start_time)
def retrieve_signal(
    peak_file, bigWig_file, seq_dict, tmp_file, group_name, assay_type
):
    peaks = pybedtools.BedTool(peak_file)
    num_samples = len(seq_dict["input"])
    writer = FeatureWriter(
        args.batch_size,
        args.sequence_length,
        num_samples,
        group_name,
        assay_type,
    )
    for k in tqdm(seq_dict["input"].keys()):
        # Initialize signal track.
        signal = np.zeros(args.sequence_length)

        # Construct BedTool input from sample sequence location.
        sample = Sample(seq_dict["input"][k])
        entry = "{} {} {}".format(sample.chrom, sample.start, sample.stop)
        a = pybedtools.BedTool(entry, from_string=True)

        # Retrieve sample bigwig signal that fall within peak regions.
        apeaks = a.intersect(peaks)
        for p in apeaks:
            cmd = "bigWigToBedGraph -chrom={} -start={} -end={} {} {}".format(
                sample.chrom, p.start, p.stop, bigWig_file, tmp_file
            )
            check_call(cmd, shell=True)
            with open(tmp_file, "rb") as wigs:
                for line in wigs:
                    record = line.strip().decode("utf-8").split("\t")
                    s = int(record[1]) - sample.start
                    t = int(record[2]) - sample.start
                    signal[s:t] = float(record[3])

        # Write signal track to disk.
        writer.write_feature(
            signal, k, seq_dict["input"][k][assay_type][group_name]
        )

        # Clean up tmp files generated by pybedtools.
        if (k + 1) % 1000 == 0:
            pybedtools.cleanup(remove_all=True)
    return writer
def run_spades_parallel(bam=None, spades=None, bed=None, work=None, pad=SPADES_PAD, nthreads=1, chrs=[],
                        max_interval_size=50000,
                        timeout=SPADES_TIMEOUT, isize_min=ISIZE_MIN, isize_max=ISIZE_MAX,
                        disable_deletion_assembly=False, stop_on_fail=False):
    pybedtools.set_tempdir(work)

    bedtool = pybedtools.BedTool(bed)
    total = bedtool.count()

    chrs = set(chrs)
    all_intervals = [interval for interval in bedtool] if not chrs else [interval for interval in bedtool if
                                                                         interval.chrom in chrs]
    selected_intervals = filter(partial(should_be_assembled, disable_deletion_assembly=disable_deletion_assembly),
                                all_intervals)
    ignored_intervals = filter(partial(shouldnt_be_assembled, disable_deletion_assembly=disable_deletion_assembly),
                               all_intervals)

    pool = multiprocessing.Pool(nthreads)
    assembly_fastas = []
    for i in xrange(nthreads):
        intervals = [interval for (j, interval) in enumerate(selected_intervals) if (j % nthreads) == i]
        kwargs_dict = {"intervals": intervals, "bam": bam, "spades": spades, "work": "%s/%d" % (work, i),
                       "pad": pad, "timeout": timeout, "isize_min": isize_min, "isize_max": isize_max,
                       "stop_on_fail": stop_on_fail}
        pool.apply_async(run_spades_single, kwds=kwargs_dict,
                         callback=partial(run_spades_single_callback, result_list=assembly_fastas))

    pool.close()
    pool.join()

    logger.info("Merging the contigs from %s" % (str(assembly_fastas)))
    assembled_fasta = os.path.join(work, "spades_assembled.fa")
    with open(assembled_fasta, "w") as assembled_fd:
        for line in fileinput.input(assembly_fastas):
            assembled_fd.write("%s\n" % (line.strip()))

    logger.info("Indexing the assemblies")
    pysam.faidx(assembled_fasta)

    ignored_bed = None
    if ignored_intervals:
        ignored_bed = os.path.join(work, "ignored.bed")
        pybedtools.BedTool(ignored_intervals).each(add_breakpoints).saveas(ignored_bed)

    pybedtools.cleanup(remove_all=True)

    return assembled_fasta, ignored_bed
def bin_frag(outdir, bin_bed, all_frag_bed, fragcount_bed):
    logging.info("output directory: %s", outdir)
    logging.info("Bin file: %s", bin_bed)
    logging.info("All restriction fragments bed file: %s", all_frag_bed)
    logging.info("Number of fragdata files: %d", len(fragcount_bed))

    os.mkdir(outdir)

    # open bins file
    bins = pbt.BedTool(bin_bed)
    logging.info("read in %8d bins", len(bins))

    # open all frag file
    all_frag = pbt.BedTool(all_frag_bed)
    logging.info("read in %8d restriction fragments", len(all_frag))

    # match up bins with restriction fragments
    # TODO: stats on the result
    bins_with_any_frag = count_frags_per_bin(bins, all_frag)
    logging.info("bins that contained any fragments: %d", len(bins_with_any_frag))

    make_bedgraph_files(fragcount_bed, bins_with_any_frag, outdir)

    # cleanup
    pbt.cleanup(remove_all=True)
def extract_target_genes_transcripts(dicoNiourk):
    if dicoNiourk["target"] != "":
        print("\x1b[0;38;2;" + dicoNiourk["color"]["light1"] + "m")
        sys.stdout.write("\033[F")
        dicoNiourk["spinner"].text = " • Extract target genes"
        dicoNiourk["spinner"].start()
        # Find intersection between gff and target bed
        bed = BedTool(dicoNiourk["target"])
        genes = BedTool(dicoNiourk["refseq_gff"])
        dicoNiourk["target_gene"] = {}  # id to name
        dicoNiourk["target_transcript"] = {}  # id to name
        dico_intersect_transcript = {}
        # Search gff exon intersections
        for intersect_elem in genes + bed:
            if intersect_elem.fields[2] == "exon":
                exon = dicoNiourk["db_gff"][intersect_elem.attrs["ID"]]
                # retrieve corresponding transcript
                for rna in dicoNiourk["db_gff"].parents(exon, featuretype='mRNA', order_by='start'):
                    try:
                        dico_intersect_transcript[rna] += 1
                    except:
                        dico_intersect_transcript[rna] = 1
        #***** FILTER transcripts whose coding exons are all in the target *****#
        for rna in dico_intersect_transcript:
            # retrieve parent gene
            gene = list(dicoNiourk["db_gff"].parents(rna, featuretype='gene', order_by='start'))[0]
            cds_start = list(dicoNiourk["db_gff"].children(rna, featuretype='CDS', order_by='start'))[0].start
            cds_end = list(dicoNiourk["db_gff"].children(rna, featuretype='CDS', order_by='start'))[-1].end
            # Count coding exons
            nb_coding_exons = 0
            for exon in dicoNiourk["db_gff"].children(rna, featuretype='exon', order_by='start'):
                if exon.end > cds_start and exon.start < cds_end:
                    nb_coding_exons += 1
            # Filter transcripts and genes
            if dico_intersect_transcript[rna] >= nb_coding_exons:
                dicoNiourk["target_transcript"][rna.attributes["Name"][0]] = rna.id
                for gene in dicoNiourk["db_gff"].parents(rna, featuretype='gene', order_by='start'):
                    dicoNiourk["target_gene"][gene.attributes["Name"][0]] = gene.id
        dicoNiourk["spinner"].stop()
        printcolor(" • " + str(len(dicoNiourk["target_gene"])) + " genes/" + str(len(dicoNiourk["target_transcript"])) + " rnas extracted\n", "0", dicoNiourk["color"]["light1"], None, dicoNiourk["color"]["bool"])
        cleanup(remove_all=True)  # delete created temp files
def Jaccard_stats(bed_fname1, bed_fname2, genome):
    """Compute Jaccard index.

    Parameters
    ----------
    bed_fname1 : string
        Name of file with CNV calls from experiment 1 in BED format.
    bed_fname2 : string
        Name of file with CNV calls from experiment 2 in BED format.
    genome : string
        Name of genome file.

    Returns
    -------
    jacc_idx : float
        Jaccard index.
    """
    exp1 = pybedtools.BedTool(bed_fname1).merge()
    exp2 = pybedtools.BedTool(bed_fname2).merge()
    res = exp1.jaccard(exp2, sorted=True, g=genome)
    pybedtools.cleanup(remove_all=True)
    jacc_idx = res['jaccard']
    return jacc_idx
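# --- Example call (not from the original source) -----------------------------
# Hedged usage of Jaccard_stats() above. The CNV call files and the genome
# file (two-column <chrom><TAB><size> listing, as used by bedtools -g) are
# placeholder assumptions; jaccard(sorted=True) expects position-sorted input.
ji = Jaccard_stats("cnv_calls_exp1.bed", "cnv_calls_exp2.bed", "hg38.genome")
print("Jaccard index between the two call sets: %.4f" % ji)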
def MultiThreadRun(index, iboolDict, args, kwargs):
    if iboolDict['bam']:
        tempFile = tempfile.NamedTemporaryFile(suffix='.tmp', prefix='pybedtools.tempfile',
                                               dir=args.temp, delete=True)
    if iboolDict['both']:
        peakBed = pybedtools.BedTool(args.bed[index]).sort()
        bamFile = args.bam[index]
        inputBedDict = BamToBed(bamFile, peakBed, tempFile.name, args)
    elif iboolDict['bed']:
        inputBedDict = RebuildBed(args.bed[index], args.method, args.extend)
    else:
        peakBed = None
        bamFile = args.bam[index]
        inputBedDict = BamToBed(bamFile, peakBed, tempFile.name, args)
    ## retrieve bin-value relationships
    sampleName = args.name[index]
    binValDict = RunMetagene(inputBedDict, args, kwargs)
    ## Deletes all temp files from the current session
    pybedtools.cleanup(verbose=False, remove_all=False)
    if iboolDict['bam']:
        tempFile.close()
    return [sampleName, binValDict]
def parallel_generate_sc_intervals(bams, chromosomes, skip_bed, workdir, num_threads=1, min_avg_base_qual=SC_MIN_AVG_BASE_QUAL, min_mapq=SC_MIN_MAPQ, min_soft_clip=SC_MIN_SOFT_CLIP, pad=SC_PAD, min_support_ins=MIN_SUPPORT_INS, min_support_frac_ins=MIN_SUPPORT_FRAC_INS, max_intervals=MAX_INTERVALS, max_nm=SC_MAX_NM, min_matches=SC_MIN_MATCHES, isize_mean=ISIZE_MEAN, isize_sd=ISIZE_SD, svs_to_softclip=SVS_SOFTCLIP_SUPPORTED, overlap_ratio=OVERLAP_RATIO, mean_read_length=MEAN_READ_LENGTH, mean_read_coverage=MEAN_READ_COVERAGE, min_ins_cov_frac=MIN_INS_COVERAGE_FRAC, max_ins_cov_frac=MAX_INS_COVERAGE_FRAC): func_logger = logging.getLogger( "%s-%s" % (parallel_generate_sc_intervals.__name__, multiprocessing.current_process())) if not os.path.isdir(workdir): func_logger.info("Creating directory %s" % workdir) os.makedirs(workdir) if not chromosomes: func_logger.info("Chromosome list unspecified. Inferring from the BAMs") for bam in bams: bamfile = pysam.Samfile(bam, "rb") chromosomes += list(bamfile.references) bamfile.close() chromosomes = sorted(list(set(chromosomes))) func_logger.info("Chromosome list inferred as %s" % (str(chromosomes))) if not chromosomes: func_logger.error("Chromosome list empty") return None merge_max_dist = -int(1 * pad) func_logger.info("SVs to soft-clip: %s" % (svs_to_softclip)) pool = multiprocessing.Pool(num_threads) bed_files = [] for index, (bam, chromosome) in enumerate(itertools.product(bams, chromosomes)): process_workdir = os.path.join(workdir, str(index)) if not os.path.isdir(process_workdir): os.makedirs(process_workdir) args_list = [bam, chromosome, process_workdir] kwargs_dict = {"min_avg_base_qual": min_avg_base_qual, "min_mapq": min_mapq, "min_soft_clip": min_soft_clip, "pad": pad, "min_support_ins": min_support_ins, "min_support_frac_ins": min_support_frac_ins, "max_nm": max_nm, "min_matches": min_matches, "isize_mean": isize_mean, "isize_sd": isize_sd, "svs_to_softclip": svs_to_softclip, "merge_max_dist": merge_max_dist, "mean_read_length": mean_read_length, "mean_read_coverage": mean_read_coverage, "min_ins_cov_frac": min_ins_cov_frac, "max_ins_cov_frac": max_ins_cov_frac} pool.apply_async(generate_sc_intervals, args=args_list, kwds=kwargs_dict, callback=partial(generate_sc_intervals_callback, result_list=bed_files)) pool.close() pool.join() # Remove empty BED files, which can cause merging issues with pybedtools bed_files = [bed_file for bed_file in bed_files if os.path.exists(bed_file) and os.path.getsize(bed_file) > 0] func_logger.info("Following BED files will be merged: %s" % (str(bed_files))) if not bed_files: func_logger.warn("No intervals generated") return None pybedtools.set_tempdir(workdir) bedtool = pybedtools.BedTool(bed_files[0]) for bed_file in bed_files[1:]: bedtool = bedtool.cat(pybedtools.BedTool(bed_file), postmerge=False) bedtool = bedtool.sort().moveto(os.path.join(workdir, "all_intervals.bed")) func_logger.info("Selecting the top %d intervals based on normalized read support" % max_intervals) top_intervals_all_cols_file = os.path.join(workdir, "top_intervals_all_cols.bed") if bedtool.count() <= max_intervals: bedtool = bedtool.saveas(top_intervals_all_cols_file) else: # Sample the top intervals top_fraction_cutoff = \ sorted([find_coverage_frac(interval.fields[7], interval.fields[6]) for interval in bedtool], reverse=True)[ max_intervals - 1] func_logger.info("Normalized read support threshold: %0.3f" % top_fraction_cutoff) bedtool = bedtool.filter(lambda x: find_coverage_frac(x.fields[7],x.fields[6]) >= 
top_fraction_cutoff).moveto( top_intervals_all_cols_file) # Filter out the extra column added to simplify life later on bedtool = bedtool.cut(xrange(6)).saveas(os.path.join(workdir, "top_intervals.bed")) interval_bed = os.path.join(workdir, "intervals.bed") if skip_bed: skip_bedtool = pybedtools.BedTool(skip_bed) sc_skip_bed = os.path.join(workdir, "sc_metasv.bed") if "INS" in svs_to_softclip: skip_bedtool = skip_bedtool.each(partial(add_INS_padding,pad=pad)).saveas(sc_skip_bed) nonsc_skip_bed = os.path.join(workdir, "non_sc_metasv.bed") func_logger.info( "Merging %d features with %d features from %s" % (bedtool.count(), skip_bedtool.count(), skip_bed)) nonsc_skip_bedtool = skip_bedtool.filter(lambda x: x.name.split(',')[1] not in svs_to_softclip).saveas(nonsc_skip_bed) sc_skip_bedtool = skip_bedtool.filter(lambda x: x.name.split(',')[1] in svs_to_softclip).saveas(interval_bed) if len(sc_skip_bedtool) > 0: bedtool = bedtool.cat(sc_skip_bedtool, postmerge=False) bedtool = bedtool.sort() bedtool = merge_for_each_sv(bedtool,c="4",o="collapse",svs_to_softclip=svs_to_softclip, overlap_ratio=overlap_ratio, reciprocal_for_2bp=True, d=merge_max_dist) bedtool = bedtool.each(partial(fix_merged_fields,inter_tools=True)).sort().moveto(interval_bed) if len(nonsc_skip_bedtool) > 0: bedtool = bedtool.cat(nonsc_skip_bedtool, postmerge=False).sort().moveto(interval_bed) func_logger.info("After merging with %s %d features" % (skip_bed, bedtool.count())) else: bedtool = bedtool.saveas(interval_bed) pybedtools.cleanup(remove_all=True) return bedtool.fn
def main(): opt_parser = ArgumentParser( description="Annotate genomic intervals with RefSeq or Ensembl databases.", prog="region_analysis.py") opt_parser.add_argument('-i', '--input', action='store', help='Input region file must assume the first 3 columns contain (chr, start, end)') opt_parser.add_argument('-d', '--database', action='store', help='Choose database: refseq(default) or ensembl', default='refseq') opt_parser.add_argument('-r', '--rhead', action='store_true', help='Whether the input file contains column header', default=False) opt_parser.add_argument('-g', '--genome', action='store', help='Choose genome: mm10(default)', default='mm10') opt_parser.add_argument('-rv', '--RAver', action='store', help='Version of Region Analysis databases, default is the newest', default=None) opt_parser.add_argument('-v', '--version', action='store_true', help='Version of Region_Analysis package') options = opt_parser.parse_args() if options.version: sys.stdout.write("Region_Analysis Version: %s\n" % regionanalysis.packageinfo.__version__) opt_parser.print_help() return 0 module_dir = os.path.dirname(os.path.realpath(regionanalysis.__file__)) # db_path = os.path.join(module_dir, "database/") input_file_name = options.input anno_db = options.database rhead = options.rhead genome = options.genome rv = options.RAver if (input_file_name is None) or (len(input_file_name) == 0): opt_parser.error( "Please assign proper input file!\n--help will show the help information.") genome_info = regionanalysis.annotationdb.getAnnoDBPath( module_dir, genome, anno_db, rv) try: if genome_info is None: raise SystemExit db_path = genome_info["path"] except SystemExit: if rv is None: sys.stderr.write("%s not in the genome database!\n" % genome) return 1 else: sys.stderr.write("%s, RAver %s not in the genome database!\n" % (genome, rv)) return 1 # create a tmp bed file with index column. in_f = file(input_file_name) # filter the comment lines input_filtered = [ line for line in in_f if not (line.lstrip().startswith("#") or len(line.strip())==0)] # if there is header, store it and remove it from the query BED. if rhead: headlineL = input_filtered[0].strip().split("\t") del input_filtered[0] # add index column to the bed lines input_indexed = ['%s\t%d\n' % (line.strip(), i) for i, line in enumerate(input_filtered)] in_f.close() # read all annotations into a dictionary, for the further output. anno_bed = os.path.join( db_path, genome + "." + anno_db + ".biotype_region_ext.bed") try: if not os.path.exists(anno_bed): raise SystemExit except SystemExit: sys.stderr.write("%s genome not properly installed!\n" % genome) return 1 # use saveas() to convert the BedTool objects to file-based objects, # so they could be used multiple times. # When debug, we may use saveas("tss.tmp"), and the output of bedtools # could be saved. pybedtools.set_tempdir("./") anno = pybedtools.BedTool(anno_bed).saveas() gd = pybedtools.BedTool( os.path.join(db_path, genome + "_geneDesert.bed")).saveas() pc = pybedtools.BedTool( os.path.join(db_path, genome + "_pericentromere.bed")).saveas() st = pybedtools.BedTool( os.path.join(db_path, genome + "_subtelomere.bed")).saveas() # load the input intervals to be annotated. try: input_bed = pybedtools.BedTool( "".join(input_indexed), from_string=True).saveas() except: sys.stderr.write("Error in input file! Please check the format!") return 1 list_input = [x.fields[:] for x in input_bed] col_no_input = input_bed.field_count() # get the midpoint of the intervals. 
# there is a bug in midpoint function of pybedtools 0.6.3, so here an alternative function was used. # input_bed_mid = input_bed.each(pybedtools.featurefuncs.midpoint).saveas() input_bed_mid = pybedtools.BedTool( "".join([regionanalysis.analysis.midpoint(x) for x in input_indexed]), from_string=True).saveas() # intersectBed with annotations. input_GB = input_bed_mid.intersect(anno, wao=True).saveas() list_GB = [x.fields[:] for x in input_GB] input_gd = input_bed_mid.intersect(gd, c=True, f=0.5).saveas() list_gd = [x.fields[col_no_input + 0] for x in input_gd] input_pc = input_bed_mid.intersect(pc, c=True, f=0.5).saveas() list_pc = [x.fields[col_no_input + 0] for x in input_pc] input_st = input_bed_mid.intersect(st, c=True, f=0.5).saveas() list_st = [x.fields[col_no_input + 0] for x in input_st] # groupby the intersectBed results based on the index column. input_idx = key = lambda s: s[col_no_input - 1] GB_dict = {} for key, GB_hits in groupby(list_GB, key=input_idx): GB_dict[key] = list(v for v in GB_hits) output_file_best = file(input_file_name + ".annotated", "w") output_file = file(input_file_name + ".full.annotated", "w") output_file_json = file(input_file_name + ".full.annotated.json", "w") # Output the header. if rhead: output_file.write("\t".join( headlineL + ["GName", "TName", "Strand", "TSS", "TES", "Feature", "D2TSS", "Biotype", "GeneSymbol"]) + "\n") output_file_best.write("\t".join( headlineL + ["GName", "TName", "Strand", "TSS", "TES", "Feature", "D2TSS", "Biotype", "GeneSymbol"]) + "\n") # write to the output: input.bed.annotated, input.bed.full.annotated. json_dict = {} for i in range(0, len(input_bed)): output_lineL = list_input[i][:-1] # original input line json_dict[str(i)] = {} json_dict[str(i)]["query_interval"] = output_lineL formatted, best_hit = regionanalysis.analysis.getBestHit( anno_db, col_no_input, GB_dict[str(i)], list_gd[i], list_st[i], list_pc[i]) output_file_best.write("\t".join(output_lineL + best_hit) + "\n") json_dict[str(i)]["best_hit"] = best_hit for j in formatted: output_file.write("\t".join(output_lineL + j) + "\n") json_dict[str(i)]["all_hits"] = formatted output_file_best.close() output_file.close() json.dump(json_dict, output_file_json, sort_keys=True, indent=2) output_file_json.close() pybedtools.cleanup() return 0
def teardown(): pybedtools.cleanup(remove_all=True)
def main(): parser=OptionParser() parser.add_option('-i', '--inputBAM', dest='inputBAM', help='Aligned BAM from zUMI filtering+mapping steps with cell barcode and umi barcode correction.') parser.add_option('-c', '--config', dest='config', help='A configuration file for required files and parameters.') parser.add_option('-e', '--experiment', dest='experiment', help='Experiment name.') parser.add_option('-o', '--outputDir', dest='outputDir', default='ss3rnaseq', help='The output directory for the experiment.') parser.add_option('-p', '--process', dest='process', default=8, help='The number of processes for parallel computing.') parser.add_option('-s', '--species', dest='species', default='hg38', help='The species under study.') parser.add_option("-P", "--Preprocess", action="store_true", dest='preprocess', help="Preprocess the input BAM for downstream analysis.") parser.add_option("-Q", "--Quantification", action="store_true", dest='quantification', help="Run isoform reconstruction and quantification.") (op, args) = parser.parse_args() inputBAM = op.inputBAM conf = op.config experiment = op.experiment outdir = op.outputDir nprocess = int(op.process) if op.species == 'hg38' or op.species == 'hg19': species = 'hsa' elif op.species == 'mm9' or op.species == 'mm10': species = 'mmu' config = configparser.ConfigParser() config.read(conf) conf_data = config._sections if not os.path.exists(outdir): os.makedirs(outdir) if not os.path.exists('%s/%s' %(outdir, species)): os.makedirs('%s/%s' %(outdir, species)) if not os.path.exists('%s/%s/%s' %(outdir, species, experiment)): os.makedirs('%s/%s/%s' %(outdir, species, experiment)) umi_file_prefix = 'UBfix.sort.bam' if op.preprocess: print('Preprocessing on input BAM ...') preDir = os.path.join(outdir, species, experiment, "preprocess") if not os.path.exists(preDir): os.makedirs(preDir) cmd = 'samtools sort -m %s -O bam -@ %s -o %s/%s %s' %(conf_data['preprocess']['memory'], nprocess, preDir, re.sub(umi_file_prefix,'UBfix.coordinateSorted.bam',os.path.basename(inputBAM)), inputBAM) os.system(cmd) cmd = 'samtools view -b -q 255 %s/%s > %s/%s' %(preDir, re.sub(umi_file_prefix,'UBfix.coordinateSorted.bam',os.path.basename(inputBAM)), preDir, re.sub(umi_file_prefix,'UBfix.coordinateSorted_unique.bam',os.path.basename(inputBAM))) os.system(cmd) cmd = 'samtools view -h %s/%s | awk \'$12 != "NH:i:1"\' | samtools view -bS - > %s/%s' %(preDir, re.sub(umi_file_prefix,'UBfix.coordinateSorted.bam',os.path.basename(inputBAM)), preDir, re.sub(umi_file_prefix,'UBfix.coordinateSorted_multi.bam',os.path.basename(inputBAM))) os.system(cmd) os.system('samtools index %s/%s' %(preDir, re.sub(umi_file_prefix,'UBfix.coordinateSorted_unique.bam',os.path.basename(inputBAM)))) os.system('samtools index %s/%s' %(preDir, re.sub(umi_file_prefix,'UBfix.coordinateSorted_multi.bam',os.path.basename(inputBAM)))) if op.quantification: print('Collect informative reads per gene...') in_bam_uniq = '%s/%s' %(os.path.join(outdir, species, experiment, "preprocess"), re.sub(umi_file_prefix,'UBfix.coordinateSorted_unique.bam',os.path.basename(inputBAM))) in_bam_multi = '%s/%s' %(os.path.join(outdir, species, experiment, "preprocess"), re.sub(umi_file_prefix,'UBfix.coordinateSorted_multi.bam',os.path.basename(inputBAM))) out_path = os.path.join(outdir, species, experiment, "expression_%s" %(conf_data['annotation']['gtf_source'])) if not os.path.exists(out_path): os.makedirs(out_path) sys_tmp_dir = '%s/.tmp' %(out_path) if not os.path.exists(sys_tmp_dir): os.makedirs(sys_tmp_dir) 
pybedtools.set_tempdir(sys_tmp_dir) pybedtools.cleanup(remove_all=True) fetch_gene_reads(in_bam_uniq, in_bam_multi, conf_data, op.species, out_path) print('Build reference isoforms...') ref = build_reference(conf_data, out_path) print('Start isoform reconstruction and quantification...') get_isoforms(conf_data, out_path, ref)
def generate_sc_intervals(bam, chromosome, workdir, min_avg_base_qual=SC_MIN_AVG_BASE_QUAL, min_mapq=SC_MIN_MAPQ,
                          min_soft_clip=SC_MIN_SOFT_CLIP, max_soft_clip=SC_MAX_SOFT_CLIP, pad=SC_PAD,
                          min_support=MIN_SUPPORT, max_isize=1000000000, min_support_frac=MIN_SUPPORT_FRAC):
    func_logger = logging.getLogger("%s-%s" % (generate_sc_intervals.__name__, multiprocessing.current_process()))

    if not os.path.isdir(workdir):
        func_logger.error("Working directory %s doesn't exist" % workdir)
        return None

    func_logger.info("Generating candidate insertion intervals from %s for chromsome %s" % (bam, chromosome))
    pybedtools.set_tempdir(workdir)

    unmerged_intervals = []
    start_time = time.time()
    try:
        sam_file = pysam.Samfile(bam, "rb")
        for aln in sam_file.fetch(reference=chromosome):
            if abs(aln.tlen) > max_isize:
                continue
            if not is_good_candidate(aln, min_avg_base_qual=min_avg_base_qual, min_mapq=min_mapq,
                                     min_soft_clip=min_soft_clip, max_soft_clip=max_soft_clip):
                continue
            interval = get_interval(aln, pad=pad)
            soft_clip_location = sum(interval) / 2
            strand = "-" if aln.is_reverse else "+"
            name = "%d,%s" % (soft_clip_location, strand)
            unmerged_intervals.append(
                pybedtools.Interval(chromosome, interval[0], interval[1], name=name, score="1", strand=strand))

        if not unmerged_intervals:
            sam_file.close()
            func_logger.warn("No intervals generated")
            return None

        unmerged_bed = os.path.join(workdir, "unmerged.bed")
        bedtool = pybedtools.BedTool(unmerged_intervals).sort().moveto(unmerged_bed)
        func_logger.info("%d candidate reads" % (bedtool.count()))

        merged_bed = os.path.join(workdir, "merged.bed")
        bedtool = bedtool.merge(c="4,5", o="collapse,sum", d=-500).moveto(merged_bed)
        func_logger.info("%d merged intervals" % (bedtool.count()))

        filtered_bed = os.path.join(workdir, "filtered_merged.bed")
        bedtool = bedtool.filter(lambda x: int(x.score) >= min_support).each(
            partial(merged_interval_features, bam_handle=sam_file)).moveto(filtered_bed)
        func_logger.info("%d filtered intervals" % (bedtool.count()))

        # Now filter based on coverage
        coverage_filtered_bed = os.path.join(workdir, "coverage_filtered_merged.bed")
        bedtool = bedtool.filter(lambda x: float(x.fields[6]) * min_support_frac <= float(x.score)).moveto(
            coverage_filtered_bed)
        func_logger.info("%d coverage filtered intervals" % (bedtool.count()))

        sam_file.close()
    except Exception as e:
        func_logger.error('Caught exception in worker thread')

        # This prints the type, value, and stack trace of the
        # current exception being handled.
        traceback.print_exc()

        print()
        raise e

    pybedtools.cleanup(remove_all=True)

    func_logger.info("Generated intervals in %g seconds" % (time.time() - start_time))

    return coverage_filtered_bed
import pybedtools

pybedtools.set_tempdir('.')
a = pybedtools.BedTool('a.bed')
a.intersect(a)
pybedtools.cleanup(verbose=True)
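# --- Variation (not from the original source) --------------------------------
# A minimal sketch of the same session wrapped in try/finally so the per-session
# temp files are removed even if an operation fails; 'a.bed' and 'b.bed' are
# placeholder inputs.
import pybedtools

pybedtools.set_tempdir('.')
try:
    a = pybedtools.BedTool('a.bed')   # placeholder input
    b = pybedtools.BedTool('b.bed')   # placeholder input
    print(a.intersect(b, u=True).count())
finally:
    # Remove temp files created during this session, even on error.
    pybedtools.cleanup(verbose=True)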
def teardown_module():
    if os.path.exists(test_tempdir):
        os.system('rm -r %s' % test_tempdir)
    pybedtools.cleanup()
def main(): # READING ARGUMENTS parser = argparse.ArgumentParser( description='Arguments for Activity from Contacts') parser.add_argument('--enhancers', type=str, help='Enhancer BedFile with the activity reported') parser.add_argument('--tss', type=str, help='TSS BedFile') parser.add_argument('--promoters', type=str, help='activity for promoter regions') parser.add_argument( '--hic', type=str, help='Hi-C regularized counts', default='../data/external/K562_filteredRegularized_contactCount.tsv') parser.add_argument('--bincoord', type=str, help='Coordinates for bins', default='../data/external/K562_filteredBins.bed') parser.add_argument('--chain', type=str, help='Chain file for coordinate liftover', default='../data/external/hg38ToHg19.over.chain') parser.add_argument('--chromap', type=str, help='Chromosome mappping file', default='../data/external/GRCh37_UCSC2ensembl.txt') parser.add_argument('-p', type=int, help='Cores to use during processing', default=1) parser.add_argument('--scaler', type=int, help='Values to multiply for re-scaling', default=100) parser.add_argument('--closer', type=int, help='Cutoff for enhancer vecinity', default=5_000_000) parser.add_argument('--gamma', type=int, help='Gamma powerlaw parameter', default=-0.7764771175681618) parser.add_argument('--scaleparam', type=int, help='Scale powerlaw parameter', default=10.787505424121239) parser.add_argument('--mindist', type=int, help='Minimum distance for powerlaw', default=1_000_000) parser.add_argument('--promlength', type=int, help='Promoter length', default=500) parser.add_argument('--cutoff', type=int, help='Cutoff probability', default=0) parser.add_argument('--outfile', type=str, help='Output file name') args = parser.parse_args() # ASSIGNING ARGUMENTS TO VARIABLES enhancer_bedfile = args.enhancers tss_bedfile = args.tss promoters_bedfile = args.promoters hic_file = args.hic num_cores = args.p chromosome_mapping = args.chromap coord_conversion = args.chain filtered_coords = args.bincoord SCALER_VALUE = args.scaler CLOSER_CUTOFF = args.closer SCALE_PL = args.scaleparam GAMMA_PL = args.gamma DISTMIN_PL = args.mindist PROMOTER_LEN = args.promlength CUTOFF_POSITIVE = args.cutoff output = args.outfile ''' Reading file from input ''' print('Reading files...') tss_df = pd.read_csv( tss_bedfile, sep='\t', header=None, names=['chr', 'start', 'end', 'gene_id', 'score', 'strand']) promoters_df = pd.read_csv(promoters_bedfile, sep='\t') enhancer_df = pd.read_csv(enhancer_bedfile, sep='\t') # For some reason, the indexing is faster when read from csv. 
hic_counts = pd.read_csv(hic_file, sep='\t') filtered_bins = pybedtools.BedTool(filtered_coords) lift_file = LiftOver(coord_conversion) chromap_file = pd.read_csv(chromosome_mapping, sep='\t', header=None, names=['chromosome', 'ensembl_chr'], index_col='chromosome') try: enhancer_df = enhancer_process(enhancer_info=enhancer_df, filtered_bincoord=filtered_bins, coordinate_liftover=lift_file, chromosome_relation=chromap_file) tss_df = tss_process(tss_info=tss_df, filtered_bincoord=filtered_bins, coordinate_liftover=lift_file, chromosome_relation=chromap_file) tss_enhancer_intersected = intersect_elements( tss_intersect=tss_df, enhancer_intersect=enhancer_df, closer_value=CLOSER_CUTOFF) rescaled_data = rescale_rows(processed_df=tss_df, s_value=SCALER_VALUE, regularized_counts=hic_counts, num_p=num_cores) denom_dict, tss_enhancer_newinf = calculate_denominator( enhancer_tss_info=tss_enhancer_intersected, promoter_info=promoters_df, scaled_counts=rescaled_data, gamma_powerlaw=GAMMA_PL, scale_powerlaw=SCALE_PL, s_value=SCALER_VALUE, distance_min=DISTMIN_PL, promoter_length=PROMOTER_LEN, num_p=num_cores) calculate_abc(enhancer_tss_info=tss_enhancer_newinf, denominator_values=denom_dict, num_p=num_cores, positive_cutoff=CUTOFF_POSITIVE, output_name=output) finally: pybedtools.cleanup()
def teardown_module():
    if os.path.exists(tempdir):
        os.system("rm -r %s" % tempdir)
    pybedtools.cleanup()
                                           bed=True, stream=True).count()

    # Decide which version of map to use. If parallel, we only need 3
    # processes.
    pool = multiprocessing.Pool(processes=3)

    # Get separate files for introns and exons in parallel (if specified)
    featuretypes = ('intron', 'exon')
    introns, exons = pool.map(subset_featuretypes, featuretypes)

    # Perform some genome algebra to get unique and shared regions
    exon_only = exons.subtract(introns).merge().remove_invalid().saveas()
    intron_only = introns.subtract(exons).merge().remove_invalid().saveas()
    intron_and_exon = exons.intersect(introns).merge().remove_invalid().saveas()

    # Do intersections with BAM file in parallel
    features = (exon_only, intron_only, intron_and_exon)
    results = pool.map(count_reads_in_features, features)

    labels = ('       exon only:', '     intron only:', 'intron and exon:')

    for label, reads in zip(labels, results):
        print '%s %s' % (label, reads)

    pybedtools.cleanup(verbose=False)
def mergeVCFfiles(chrom, A, B, file1, file2):
    ## Write headers to a file. Needed later for creating a correct VCF of the intersected files.
    header1 = gzip.open((file1 + chrom + '.vcf.gz'), 'r')
    header2 = gzip.open((file2 + chrom + '.vcf.gz'), 'r')
    headerFile = (path + 'HEADER_' + A + '_' + B + '_.vcf.gz')
    f = gzip.open(headerFile, 'ab')
    for line in header1:
        if line[0:2] != '##':
            break
        f.write(line)
    header1.close()
    for line in header2:
        if line[0:2] != '##':
            break
        f.write(line)
    header2.close()  # close the second header handle (the original called header1.close() twice)
    f.write('#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO')
    f.close()

    ## Intersects files -LOJ
    a = BedTool((file1 + chrom + '.vcf.gz'))
    b = BedTool((file2 + chrom + '.vcf.gz'))
    a_b = a.intersect(b, header=True, loj=True)
    ab = a_b.saveas((path + 'LOJ_' + A + '_' + B + '_chr' + chrom + '.vcf.gz'))
    print (ab.fn)
    cleanup(remove_all=True)

    ## Intersects files -V
    a = BedTool((file1 + chrom + '.vcf.gz'))
    b = BedTool((file2 + chrom + '.vcf.gz'))
    b_a = b.intersect(a, header=True, v=True)
    ba = b_a.saveas((path + 'V_' + A + '_' + B + '_chr' + chrom + '.vcf.gz'))
    print (ba.fn)
    cleanup(remove_all=True)

    ## CAT LOJ file and -v File
    LOJ = path + 'LOJ_' + A + '_' + B + '_chr' + chrom + '.vcf.gz'
    V = path + 'V_' + A + '_' + B + '_chr' + chrom + '.vcf.gz'
    out = path + 'concated_' + A + '_' + B + '_chr' + chrom + '.vcf.gz'
    os.system(("vcf-concat " + LOJ + " " + V + " | gzip -c > " + out))
    # print "ok"

    ## correct to vcf, merge equal type samples
    out2 = 'stage2/unsorted_' + A + '_' + B + '_chr' + chrom + '.vcf.gz'
    import correctToVCF
    goVCF = correctToVCF.CorrectToVCF()
    goVCF.writeHeader(headerFile, out2)
    goVCF.correctFile(out, out2)

    ## sort the VCF file
    os.system(("vcf-sort " + out2 + "| gzip -c > " + ("germlineVCF/" + A + '_' + B + '_chr' + chrom + '.vcf.gz')))
    cleanup(remove_all=True)
# special case of deletions - if the deleted area ("ispan") is a TE, more likely a TE insertion in the reference possible_te_reference_insertions = long_indel_intra_calls.pair_to_bed(tes, type="ispan", f=.75).saveas() filtered_possible_te_reference_insertions = possible_te_reference_insertions.filter(bedpe_reciprocal_overlap_ispan_filter, 0.75).saveas() print "POSSIBLE_TE_INSERTIONS_IN_REFERENCE\tALL\t" + str(len(filtered_possible_te_reference_insertions)) log.write("POSSIBLE_TE_INSERTIONS_IN_REFERENCE\tALL\t" + str(len(filtered_possible_te_reference_insertions)) + "\n") save_output(master_out_bed, filtered_possible_te_reference_insertions, output_dir, "possible_te_reference_insertions", sample_name, "POSSIBLE_TE_INSERTIONS_IN_REFERENCE", seg_dups, cent_tel) # deletions that match population variants if common_deletions is not None: common_deletions = long_indel_intra_calls.pair_to_bed(common_deletions, type="ispan", f=common_deletion_overlap_pct).saveas() filtered_possible_common_deletions = common_deletions.filter(bedpe_reciprocal_overlap_ispan_filter, common_deletion_overlap_pct).saveas() print "COMMON_DELETIONS\tALL\t" + str(len(filtered_possible_common_deletions)) log.write("COMMON_DELETIONS\tALL\t" + str(len(filtered_possible_common_deletions)) + "\n") save_output(master_out_bed, filtered_possible_common_deletions, output_dir, "possible_common_deletions", sample_name, "COMMON_DELETIONS", seg_dups, cent_tel) # insertions are shorter than the fragment size short_indel_intra_calls = expected_orientation.filter(bedpe_lt_length_filter, insert_size).saveas() print "INSERTIONS\tALL\t" + str(len(short_indel_intra_calls)) log.write("INSERTIONS\tALL\t" + str(len(short_indel_intra_calls)) + "\n") save_output(master_out_bed, short_indel_intra_calls, output_dir, "insertions", sample_name, "INSERTIONS", seg_dups, cent_tel) # inversions are what's left unexpected_orientation = intra_calls.filter(expected_orientation_filter, matches=False).saveas() print "INVERSION\tALL\t" + str(len(unexpected_orientation)) log.write("INVERSION\tALL\t" + str(len(unexpected_orientation)) + "\n") save_output(master_out_bed, unexpected_orientation, output_dir, "inversions", sample_name, "INVERSIONS", seg_dups, cent_tel) pybedtools.cleanup() log.close()
def generate_sc_intervals(bam, chromosome, workdir, min_avg_base_qual=SC_MIN_AVG_BASE_QUAL, min_mapq=SC_MIN_MAPQ, min_soft_clip=SC_MIN_SOFT_CLIP, pad=SC_PAD, min_support_ins=MIN_SUPPORT_INS, max_considered_isize=1000000000, min_support_frac_ins=MIN_SUPPORT_FRAC_INS, max_nm=SC_MAX_NM, min_matches=SC_MIN_MATCHES, isize_mean=ISIZE_MEAN, isize_sd=ISIZE_SD, svs_to_softclip=SVS_SOFTCLIP_SUPPORTED, overlap_ratio=OVERLAP_RATIO,merge_max_dist=-int(1*SC_PAD), mean_read_length=MEAN_READ_LENGTH, mean_read_coverage=MEAN_READ_COVERAGE, min_ins_cov_frac=MIN_INS_COVERAGE_FRAC, max_ins_cov_frac=MAX_INS_COVERAGE_FRAC, num_sd = 2): func_logger = logging.getLogger("%s-%s" % (generate_sc_intervals.__name__, multiprocessing.current_process())) if not os.path.isdir(workdir): func_logger.error("Working directory %s doesn't exist" % workdir) return None func_logger.info("Generating candidate intervals from %s for chromsome %s" % (bam, chromosome)) pybedtools.set_tempdir(workdir) min_isize = isize_mean - num_sd * isize_sd max_isize = isize_mean + num_sd * isize_sd unmerged_intervals = [] start_time = time.time() ignore_none = True try: sam_file = pysam.Samfile(bam, "rb") for aln in sam_file.fetch(reference=str(chromosome)): if abs(aln.tlen) > max_considered_isize: continue if not is_good_candidate(aln, min_avg_base_qual=min_avg_base_qual, min_mapq=min_mapq, min_soft_clip=min_soft_clip, max_nm=max_nm, min_matches=min_matches): continue interval = get_interval(aln, pad=pad) soft_clip_location = sum(interval) / 2 strand = "-" if aln.is_reverse else "+" svtype = infer_svtype(aln, min_isize, max_isize) if svtype == "CTX;INS": # TODO : Should be fixed to handle CTX svtype = "INS" if svtype == "DUP;ITX": # TODO : Should be fixed to handle ITX svtype = "DUP" soft_clip_tuple = find_softclip(aln) if not soft_clip_tuple: continue soft_clip, dist_L_end, dist_R_end = soft_clip_tuple other_bp = find_other_bp(aln,isize_mean, svtype, soft_clip, dist_L_end, dist_R_end, soft_clip_location) if other_bp is None: continue name = "%d,%d,%s" % (soft_clip_location, other_bp, strand) if ignore_none and svtype == "NONE": continue if svtype not in svs_to_softclip: continue unmerged_intervals.append( pybedtools.Interval(chromosome, interval[0], interval[1], name=name, score="1", strand=strand, otherfields=[svtype])) if not unmerged_intervals: sam_file.close() func_logger.warn("No intervals generated") return None unmerged_bed = os.path.join(workdir, "unmerged.bed") bedtool = pybedtools.BedTool(unmerged_intervals).sort().moveto(unmerged_bed) func_logger.info("%d candidate reads" % (bedtool.count())) bedtool_lr={"L":bedtool.filter(lambda x: int(x.name.split(",")[0])<=int(x.name.split(",")[1])).sort(), "R":bedtool.filter(lambda x: int(x.name.split(",")[0])>int(x.name.split(",")[1])).sort()} bp_merged_intervals = [] for k_bt,bt in bedtool_lr.iteritems(): merged_bed = os.path.join(workdir, "merged_%s.bed"%k_bt) m_bt=merge_for_each_sv(bt,c="4,5,6,7",o="collapse,sum,collapse,collapse", svs_to_softclip=svs_to_softclip,d=merge_max_dist, reciprocal_for_2bp=False, sv_type_field = [6,0]) m_bt = m_bt.moveto(merged_bed) func_logger.info("%d merged intervals with left bp support" % (m_bt.count())) # Check if the other break point also can be merged for the merged intervals (for 2bp SVs) for interval in m_bt: sv_type = interval.fields[6].split(',')[0] if len(set(interval.fields[6].split(',')))!=1: func_logger.warn("More than one svtypes: %s",(str(interval))) if sv_type == "INS": bp_merged_intervals.append(interval) else: name_fields_0 = 
interval.name.split(',') other_bps = map(lambda x:int(name_fields_0[3*x+1]), range(len(name_fields_0)/3)) if (min(other_bps)+2*pad-max(other_bps))>(-merge_max_dist): bp_merged_intervals.append(interval) continue other_bp_bedtool=bt.filter(lambda x: x.name in interval.name and x.fields[6]==sv_type).each(partial(generate_other_bp_interval,pad=pad)).sort().merge(c="4,5,6,7", o="collapse,sum,collapse,collapse", d=merge_max_dist) if len(other_bp_bedtool)==1: bp_merged_intervals.append(interval) else: for intvl in other_bp_bedtool: bp_merged_intervals.extend(bt.filter(lambda x: x.name in intvl.name and x.fields[6]==sv_type).sort().merge(c="4,5,6,7", o="collapse,sum,collapse,collapse", d=merge_max_dist)) bp_merged_bed = os.path.join(workdir, "bp_merged.bed") bedtool=pybedtools.BedTool(bp_merged_intervals).each(partial(add_other_bp_fields,pad=pad)).sort().moveto(bp_merged_bed) func_logger.info("%d BP merged intervals" % (bedtool.count())) filtered_bed = os.path.join(workdir, "filtered.bed") bedtool = bedtool.filter(lambda x: int(x.score) >= MIN_SUPPORT_SC_ONLY).each( partial(merged_interval_features, bam_handle=sam_file)).moveto( filtered_bed) func_logger.info("%d filtered intervals" % (bedtool.count())) # Now filter based on coverage coverage_filtered_bed = os.path.join(workdir, "coverage_filtered.bed") bedtool = bedtool.filter(lambda x: (x.fields[3].split(",")[1]!="INS" or ((min_ins_cov_frac*mean_read_coverage)<=(float(x.fields[6])/abs(x.start-x.end+1)*mean_read_length)<=(max_ins_cov_frac*mean_read_coverage)))).moveto(coverage_filtered_bed) func_logger.info("%d coverage filtered intervals" % (bedtool.count())) thr_sv={"INS":min_support_frac_ins, "INV":MIN_SUPPORT_FRAC_INV, "DEL":MIN_SUPPORT_FRAC_DEL, "DUP": MIN_SUPPORT_FRAC_DUP} # Add number of neighbouring reads that support SC bedtool=bedtool.each(partial(add_neighbour_support,bam_handle=sam_file, min_mapq=min_mapq, min_soft_clip=min_soft_clip, max_nm=max_nm, min_matches=min_matches, skip_soft_clip=False, isize_mean=isize_mean, min_isize=min_isize, max_isize=max_isize)).sort().moveto(coverage_filtered_bed) neigh_coverage_filtered_bed = os.path.join(workdir, "neigh_filtered.bed") bedtool = bedtool.each(partial(filter_low_frac_support,thr_sv=thr_sv)).each( partial(filter_low_neigh_read_support_INS,min_support_ins=min_support_ins)).sort().moveto( neigh_coverage_filtered_bed) func_logger.info("%d neighbour support filtered intervals" % (bedtool.count())) # For 2bp SVs, the interval will be the cover of two intervals on the BP full_filtered_bed = os.path.join(workdir, "full_neigh_filtered.bed") bedtool = bedtool.each(partial(get_full_interval,pad=pad)).sort().moveto(full_filtered_bed) func_logger.info("%d full filtered intervals" % (bedtool.count())) # Now merge on full intervals merged_full_filtered_bed = os.path.join(workdir, "merged_full.bed") if bedtool.count()>0: bedtool=merge_for_each_sv(bedtool,c="4,5,6,7,9",o="collapse,collapse,collapse,collapse,collapse", svs_to_softclip=svs_to_softclip, overlap_ratio=overlap_ratio, reciprocal_for_2bp=True, sv_type_field = [3,1], d=merge_max_dist) bedtool = bedtool.each(partial(fix_merged_fields,inter_tools=False)).each(partial(fine_tune_bps,pad=pad)) bedtool = bedtool.filter(lambda x: x.score != "-1").sort().moveto(merged_full_filtered_bed) func_logger.info("%d merged full intervals" % (bedtool.count())) sam_file.close() except Exception as e: func_logger.error('Caught exception in worker thread') # This prints the type, value, and stack trace of the # current exception being handled. 
traceback.print_exc() print() raise e pybedtools.cleanup(remove_all=True) func_logger.info("Generated intervals in %g seconds for region %s" % ((time.time() - start_time), chromosome)) return merged_full_filtered_bed
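# The merging above boils down to a simple pybedtools pattern: one record per
# candidate soft-clipped read, sort, then merge nearby records while collapsing
# names and summing read support. A minimal sketch with invented coordinates,
# names and SV types (the real code additionally splits by breakpoint side and SV type):
import pybedtools

# columns: chrom, start, end, "soft_clip_pos,other_bp,strand", support, strand, svtype
candidate_reads = """
chr1 1000 1200 1100,1500,+ 1 + DEL
chr1 1050 1250 1150,1600,+ 1 + DEL
chr1 5000 5200 5100,5100,- 1 - INS
""".strip()

bedtool = pybedtools.BedTool(candidate_reads, from_string=True).sort()
# d=10 merges records within 10 bp of each other; generate_sc_intervals() passes a
# negative distance (merge_max_dist) so that intervals must actually overlap.
merged = bedtool.merge(c="4,5,6,7", o="collapse,sum,collapse,collapse", d=10)
print(merged)
pybedtools.cleanup()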
chr1 1 1 Region_A 2 +
chr1 2 2 Region_B 1 +
chr1 3 3 Region_C 3 +
chr1 4 5
""".strip()
    b = pybedtools.BedTool(bed, from_string=True).intervals
    result = b.all_hits(Interval('chr1', 3, 3))
    assert len(result) == 1, len(result)


def test_missing_files():
    """
    previously this would crash the interpreter due to an exit(1) call
    in bedFile.cpp
    """
    a = pybedtools.BedTool('chrA 1 10', from_string=True).saveas(
        "this_file_should_raise_BEDTools_Error")
    result = list(iter(a))
    os.unlink(a.fn)
    from pybedtools.cbedtools import BedToolsFileError

    def crashes():
        list(iter(a))

    assert_raises(BedToolsFileError, crashes)


if __name__ == "__main__":
    unittest.main()
    pybedtools.cleanup(remove_all=True)
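# For reference, the core call those tests exercise, in isolation: all_hits()
# returns every interval of a BedTool that overlaps a query Interval. Toy data:
import pybedtools
from pybedtools import Interval

toy = pybedtools.BedTool("""
chr1 10 100 featA 0 +
chr1 50 200 featB 0 -
""".strip(), from_string=True)

hits = toy.all_hits(Interval('chr1', 60, 70))  # strand is ignored by default
assert len(hits) == 2
pybedtools.cleanup()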
def main(): options = get_options() # pangenome dictionary rd = {} if options.roary is not None: roary = pd.read_table(options.roary, sep=',', low_memory=False) roary.set_index('Gene', inplace=True) # Drop the other info columns roary.drop(list(roary.columns[:13]), axis=1, inplace=True) roary.reset_index(inplace=True) for strain in roary.columns[1:]: for x, y, in roary.set_index( strain)['Gene'].dropna().to_dict().items(): if str(x) == 'nan': continue # be aware of paralogs for g in x.split('\t'): rd[g] = y # tmp file locations remaining_tmp = options.tmp_prefix + "/remaining_kmers.txt" remaining_next_tmp = options.tmp_prefix + "/remaining_kmers_next.txt" remaining_fa_tmp = options.tmp_prefix + "/remaining_kmers.fa" remaining_fa_next_tmp = options.tmp_prefix + "/remaining_kmers_next.fa" pybedtools.helpers.set_tempdir(options.tmp_prefix) # read references and drafts into list references = [] with open(options.references, 'r') as reference_files: for reference in reference_files: (fa, gff, ref) = reference.rstrip().split() references.append((fa, gff, ref)) output_file = open(options.output, 'w') # Open seer results # seer_remaining = seer_results seer_remaining = open(options.kmers, 'r') header = seer_remaining.readline() # Write out kmer fasta file, keep track of count kmers_remaining = 0 with open(remaining_fa_tmp, 'w') as kmer_fa: for kmer in seer_remaining: kmers_remaining += 1 kmer_fa.write(">" + str(kmers_remaining) + "\n") kmer_fa.write(kmer.split("\t")[0] + "\n") seer_remaining.seek(0) seer_remaining.readline() # for each reference, then draft ref_id = 0 for reference in references: (ref_fa, ref_gff, ref_type) = reference ref_id += 1 # print number of kmers remaining. if zero, break if kmers_remaining == 0: break sys.stderr.write(str(kmers_remaining) + " kmers remain\n") if ref_type == "ref": sys.stderr.write("Reference " + str(ref_id) + "\n") else: sys.stderr.write("Draft reference " + str(ref_id) + "\n") # index reference sequence bwa_index(ref_fa) if ref_type == "ref": bwa_algorithms = ["mem", "fastmap"] elif ref_type == "draft": bwa_algorithms = ["fastmap"] else: bwa_algorithms = ["fastmap"] sys.stderr.write("Unknown reference type " + ref_type + " for " + ref_fa + ". 
Assuming draft\n") # Fix ref annotation tmp_bed = tempfile.NamedTemporaryFile(prefix=options.tmp_prefix + "/") subprocess.run("gff2bed < " + ref_gff + " > " + tmp_bed.name, shell=True, check=True) ref_annotation = pybedtools.BedTool(tmp_bed.name) filtered_ref = ref_annotation.filter( lambda x: True if x[7] == "CDS" else False).saveas('tmp_bed') ref_annotation = pybedtools.BedTool('tmp_bed') for bwa_algorithm in bwa_algorithms: next_seer_remaining = open(remaining_next_tmp, 'w') next_fasta_remaining = open(remaining_fa_next_tmp, 'w') # run bwa mem -k 8 for ref, bwa fastmap for draft of remaining.fa new_idx = 0 kmer_lines = [] map_pos = {} mapped_kmers = bwa_iter(ref_fa, remaining_fa_tmp, bwa_algorithm) with tempfile.NamedTemporaryFile('w', prefix=options.tmp_prefix + "/") as query_bed: kmer_idx = 0 for mapping, kmer_line in zip(mapped_kmers, seer_remaining): if mapping.mapped: kmers_remaining -= 1 kmer_lines.append(kmer_line.rstrip()) map_pos[kmer_idx] = [] for hit_idx, (contig, start, end, strand) in enumerate(mapping.positions): map_pos[kmer_idx].append(contig + ":" + str(start) + "-" + str(end)) query_bed.write('\t'.join([ contig, str(start), str(end), str(kmer_idx) + "_" + str(hit_idx), '0', strand ]) + "\n") kmer_idx += 1 else: # if unmapped write to seer_remaining and remaining.fa next_seer_remaining.write(kmer_line) new_idx += 1 next_fasta_remaining.write(">" + str(new_idx) + "\n") next_fasta_remaining.write( kmer_line.split("\t")[0] + "\n") query_bed.flush() query_interval = pybedtools.BedTool(query_bed.name) sorted_query = query_interval.sort() in_genes = extract_genes(query_interval.intersect( b=ref_annotation, s=False, stream=True, wb=True), rd, id=options.id) up_genes = extract_genes(sorted_query.closest(b=ref_annotation, s=False, D="ref", iu=True, stream=True), rd, id=options.id) down_genes = extract_genes(sorted_query.closest( b=ref_annotation, s=False, D="ref", id=True, stream=True), rd, id=options.id) pybedtools.cleanup() # delete the bed file for kmer_idx, kmer_line in enumerate(kmer_lines): annotations = [] for hit_idx, hit in enumerate(map_pos[kmer_idx]): annotation = hit + ";" if kmer_idx in down_genes and hit_idx in down_genes[ kmer_idx]: annotation += down_genes[kmer_idx][hit_idx] annotation += ";" if kmer_idx in in_genes and hit_idx in in_genes[ kmer_idx]: annotation += in_genes[kmer_idx][hit_idx] annotation += ";" if kmer_idx in up_genes and hit_idx in up_genes[ kmer_idx]: annotation += up_genes[kmer_idx][hit_idx] annotations.append(annotation) output_file.write( "\t".join([kmer_line, ",".join(annotations)]) + "\n") # Clean up seer_remaining.close() next_seer_remaining.close() next_fasta_remaining.close() os.rename(remaining_next_tmp, remaining_tmp) os.rename(remaining_fa_next_tmp, remaining_fa_tmp) # Open next kmer file seer_remaining = open(remaining_tmp, 'r') tmp_bed.close() os.remove('tmp_bed') sys.stderr.write(str(kmers_remaining) + " kmers remain unannotated\n")
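# The annotation core of the loop above is three bedtools operations: intersect to
# find k-mers inside features, and two closest calls for the nearest feature on
# either side. A minimal sketch with toy coordinates (the real query and annotation
# BEDs carry more columns plus the pangenome gene names):
import pybedtools

annotation = pybedtools.BedTool("""
chr1 100 500 gene1 0 +
chr1 900 1500 gene2 0 -
""".strip(), from_string=True)

queries = pybedtools.BedTool("""
chr1 120 151 kmer0_0 0 +
chr1 600 631 kmer1_0 0 +
""".strip(), from_string=True).sort()

# k-mers falling inside an annotated feature; wb=True keeps the feature columns
in_genes = queries.intersect(b=annotation, wb=True)
# nearest feature on one side only: D="ref" reports signed distances relative to
# the reference, iu=True ignores upstream hits, id=True ignores downstream hits
down = queries.closest(b=annotation, D="ref", iu=True)
up = queries.closest(b=annotation, D="ref", id=True)

print(in_genes)
pybedtools.cleanup()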
def cleanBedTool(tempPath):
    # do best to erase temporary bedtool files if necessary
    # (tempPath argument must have been created with initBedTool())
    assert "TempBedTool_" in tempPath
    pybedtools.cleanup(remove_all=True)
    runShellCommand("rm -rf %s" % tempPath)
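# cleanBedTool() presumes a matching setup helper, initBedTool(), which is
# referenced in the comment but not shown. A guess at its shape: create a
# uniquely named scratch directory and point pybedtools at it.
import tempfile
import pybedtools

def initBedTool():
    # hypothetical counterpart to cleanBedTool()
    tempPath = tempfile.mkdtemp(prefix="TempBedTool_")
    pybedtools.set_tempdir(tempPath)
    return tempPath

tempPath = initBedTool()
bt = pybedtools.BedTool("chr1 1 100", from_string=True).sort()  # temp file lives under tempPath
cleanBedTool(tempPath)  # the "TempBedTool_" assertion passes, directory is removed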
def score_edits(annotated_edits_file, bg_edits_file, output_file, conf, gene_positions_dict, genome_fa, flank, chrom_sizes_file, rdd): """ 1. Reads and filters our (annotated) editing site (fg). The "name" (edit_frac) MUST contain the edited,totalcov for each site. 2. Creates a bedtools interval (interval) containing gene coordinates. 3. Subsets our (annotated) editing list (fg) to get only the edits across one gene, for every gene. If a gene has no edit sites, pass. A. For this step, we're relying on what's been annotated by annotator. So we are only counting edits that are unambiguously assigned (edits at a given position that overlaps multiple genes in the same region are not counted). 4. Filter out bg_edits_file from fg_edits_file. 5. Open a window centered around every edit site. 6. Intersect with all edits from (3) to collect all edits that exist within the window. 7. Add up all the edited-reads and total-reads across edited sites and calculate the "edit/editedc" fraction. 8. Calculate the coverage across all C's in each window """ chrom_sizes_dict = create_chrom_sizes_dict(chrom_sizes_file) # (1) Reads and filters our (annotated) editing site (fg). This file MUST have a conf value in the 4th column. fg = read_and_filter_editing_sites(annotated_edits_file, conf) progress = trange(len(set(fg['gene_id']))) all_scores_df = pd.DataFrame(columns=[ 'chrom', 'start', 'end', 'name', 'score', 'strand', 'edit_coverage', 'editable_coverage', 'edited_over_edited_c', 'all_c_coverage', 'edited_over_all_c' ]) all_scores = [] for gene_id in set(fg['gene_id']): try: # (2) Creates a bedtools interval (interval) containing gene coordinates. interval = pybedtools.create_interval_from_list([ gene_positions_dict[gene_id]['chrom'], gene_positions_dict[gene_id]['start'], gene_positions_dict[gene_id]['end'], gene_id, '0', gene_positions_dict[gene_id]['strand'], ]) # (3) Subsets our (annotated) editing list (fg) to get only the edits across one gene, for every gene. fg_sites_in_region = fg[fg['gene_id'] == gene_id] if fg_sites_in_region.shape[0] >= 1: # thickStart = edited # fg_sites_in_region.loc[:, 'thickStart'] = fg_sites_in_region[ 'edit_frac'].apply(lambda x: int(x.split(',')[0])) # thickEnd = total coverage # fg_sites_in_region.loc[:, 'thickEnd'] = fg_sites_in_region[ 'edit_frac'].apply(lambda x: int(x.split(',')[1])) fg_sites_in_region.loc[:,'name'] = fg_sites_in_region.loc[:,'gene_id'] + \ "|" + fg_sites_in_region.loc[:,'region'] fg_sites_in_region = fg_sites_in_region[[ 'chrom', 'start', 'end', 'name', 'conf', 'strand', 'thickStart', 'thickEnd' ]] # (4) Filter out bg_edits_file from fg_edits_file. fg_prefiltered_sites_bedtool = pybedtools.BedTool.from_dataframe( fg_sites_in_region) if bg_edits_file is not None: bg_sites_bedtool = pybedtools.BedTool(bg_edits_file) fg_sites_bedtool = fg_prefiltered_sites_bedtool.sort( ).intersect(bg_sites_bedtool.sort(), s=True, v=True) else: fg_sites_bedtool = fg_prefiltered_sites_bedtool if len( fg_sites_bedtool ) > 0: # If the background file totally removes all edits from the foreground file, we might get an EmptyDataFrame # (5) Open a window centered around every edit site. fg_windows_bedtool = create_window_intervals( fg_sites_bedtool, flank, chrom_sizes_dict) # (6) Intersect with all edits from (3) to collect all edits that exist within the window. 
intersected_edits = fg_windows_bedtool.intersect( fg_sites_bedtool, s=True, wa=True, loj=True).to_dataframe(names=[ 'chrom', 'start', 'end', 'name', 'score', 'strand', 'edit_chrom', 'edit_start', 'edit_end', 'edit_name', 'edit_score', 'edit_strand', 'edit_coverage', 'editable_coverage' ]) # (7) Add up all the edited-reads and total-reads across edited sites and calculate the "edit/editedc" fraction. summed_confs = pd.DataFrame( intersected_edits.groupby([ 'chrom', 'start', 'end', 'name', 'score', 'strand' ])['edit_score'].sum()).reset_index() # blockCount is the "number of reads supporting an edit site" summed_edits = pd.DataFrame( intersected_edits.groupby([ 'chrom', 'start', 'end', 'name', 'score', 'strand' ])['edit_coverage'].sum()).reset_index() # editable_coverage (blockSizes) is the "total number of reads at the edited site" summed_total_coverage = pd.DataFrame( intersected_edits.groupby([ 'chrom', 'start', 'end', 'name', 'score', 'strand' ])['editable_coverage'].sum()).reset_index() df = pd.merge(summed_edits, summed_total_coverage, how='outer', left_on=[ 'chrom', 'start', 'end', 'name', 'score', 'strand' ], right_on=[ 'chrom', 'start', 'end', 'name', 'score', 'strand' ]) df['edited_over_edited_c'] = df['edit_coverage'] / df[ 'editable_coverage'] # (8) Calculate the coverage across all C's in each window df['all_c_coverage'] = df.apply(get_total_c_coverage, args=( rdd, genome_fa, ), axis=1) df['edited_over_all_c'] = df['edit_coverage'] / df[ 'all_c_coverage'] # reorder columns to match df = df[[ 'chrom', 'start', 'end', 'name', 'score', 'strand', 'edit_coverage', 'editable_coverage', 'edited_over_edited_c', 'all_c_coverage', 'edited_over_all_c' ]] all_scores.append(df) # all_scores = pd.concat([all_scores, df]) pybedtools.cleanup() except KeyError as e: pass progress.update(1) for score_df in all_scores: all_scores_df = pd.concat([all_scores_df, score_df]) all_scores_df.sort_values(by=['chrom', 'start', 'end', 'strand']).to_csv( output_file, sep='\t', index=False, header=True)
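# Steps (5)-(6) compress to a slop-then-intersect pattern. A sketch using slop()
# as a stand-in for create_window_intervals() (the real helper may center and
# clip windows differently), with invented edit sites:
import pybedtools

# columns 7-8 hold edited reads (thickStart) and total coverage (thickEnd)
sites = pybedtools.BedTool("""
chr1 100 101 geneA|3utr 0.95 + 4 20
chr1 140 141 geneA|3utr 0.90 + 2 10
""".strip(), from_string=True)

# (5) open a window of +/- flank bp around each site, clipped to chromosome ends
windows = sites.slop(b=50, genome={"chr1": (0, 1000)})
# (6) pull every edit site that falls inside each window back alongside it
overlaps = windows.intersect(sites, s=True, wa=True, loj=True)
print(overlaps)
pybedtools.cleanup()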
def teardown(): pybedtools.cleanup()
def teardown():
    # always run this!
    pybedtools.cleanup(remove_all=True)
def run_age_parallel(intervals_bed=None, reference=None, assembly=None, pad=AGE_PAD, age=None, age_workdir=None, timeout=AGE_TIMEOUT, keep_temp=False, assembly_tool="spades", chrs=[], nthreads=1, min_contig_len=AGE_MIN_CONTIG_LENGTH, max_region_len=AGE_MAX_REGION_LENGTH, sv_types=[], min_del_subalign_len=MIN_DEL_SUBALIGN_LENGTH, min_inv_subalign_len=MIN_INV_SUBALIGN_LENGTH, age_window = AGE_WINDOW_SIZE): func_logger = logging.getLogger("%s-%s" % (run_age_parallel.__name__, multiprocessing.current_process())) if not os.path.isdir(age_workdir): func_logger.info("Creating %s" % age_workdir) os.makedirs(age_workdir) if assembly: if not os.path.isfile("%s.fai" % assembly): func_logger.info("Assembly FASTA wasn't indexed. Will attempt to index now.") pysam.faidx(assembly) func_logger.info("Loading assembly contigs from %s" % assembly) with open(assembly) as assembly_fd: if assembly_tool == "spades": contigs = [SpadesContig(line[1:]) for line in assembly_fd if line[0] == '>'] elif assembly_tool == "tigra": contigs = [TigraContig(line[1:]) for line in assembly_fd if line[0] == '>'] else: contigs = [] chrs = set(chrs) sv_types = set(sv_types) contig_dict = {contig.sv_region.to_tuple(): [] for contig in contigs if (len( chrs) == 0 or contig.sv_region.chrom1 in chrs) and contig.sequence_len >= min_contig_len and contig.sv_region.length() <= max_region_len and ( len(sv_types) == 0 or contig.sv_type in sv_types)} func_logger.info("Generating the contig dictionary for parallel execution") small_contigs_count = 0 for contig in contigs: if contig.sv_region.length() > max_region_len: func_logger.info("Too large SV region length: %d > %d" % (contig.sv_region.length(),max_region_len)) continue if (len(chrs) == 0 or contig.sv_region.chrom1 in chrs) and (len(sv_types) == 0 or contig.sv_type in sv_types): if contig.sequence_len >= min_contig_len: contig_dict[contig.sv_region.to_tuple()].append(contig) else: small_contigs_count += 1 region_list = sorted(contig_dict.keys()) nthreads = min(nthreads, len(region_list)) if nthreads == 0: func_logger.warning("AGE not run since no contigs found") return None func_logger.info("Will process %d regions with %d contigs (%d small contigs ignored) using %d threads" % ( len(region_list), sum([len(value) for value in contig_dict.values()]), small_contigs_count, nthreads)) pybedtools.set_tempdir(age_workdir) pool = multiprocessing.Pool(nthreads) breakpoints_beds = [] for i in xrange(nthreads): region_sublist = [region for (j, region) in enumerate(region_list) if (j % nthreads) == i] kwargs_dict = {"intervals_bed": intervals_bed, "region_list": region_sublist, "contig_dict": contig_dict, "reference": reference, "assembly": assembly, "pad": pad, "age": age, "age_workdir": age_workdir, "timeout": timeout, "keep_temp": keep_temp, "myid": i, "min_del_subalign_len": min_del_subalign_len, "min_inv_subalign_len": min_inv_subalign_len, "age_window" : age_window} pool.apply_async(run_age_single, args=[], kwds=kwargs_dict, callback=partial(run_age_single_callback, result_list=breakpoints_beds)) pool.close() pool.join() func_logger.info("Finished parallel execution") func_logger.info("Will merge the following breakpoints beds %s" % (str(breakpoints_beds))) pybedtools.cleanup(remove_all=True) if not breakpoints_beds: return None bedtool = pybedtools.BedTool(breakpoints_beds[0]) for bed_file in breakpoints_beds[1:]: bedtool = bedtool.cat(pybedtools.BedTool(bed_file), postmerge=False) bedtool = bedtool.moveto(os.path.join(age_workdir, "breakpoints_unsorted.bed")) merged_bed = 
os.path.join(age_workdir, "breakpoints.bed") bedtool.sort().saveas(merged_bed) return merged_bed
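# The fan-out/collect machinery above relies on apply_async() callbacks running
# in the parent process, so a plain list can gather the per-worker BED paths.
# A self-contained sketch of that pattern (worker and collect are stand-ins for
# run_age_single() and run_age_single_callback()):
import multiprocessing
from functools import partial

def worker(region_list, myid=0):
    # pretend to process the regions and write one BED per worker
    return "work/breakpoints_%d.bed" % myid if region_list else None

def collect(result, result_list):
    # runs in the parent process after each worker finishes
    if result is not None:
        result_list.append(result)

if __name__ == "__main__":
    regions = list(range(10))
    nthreads = 2
    breakpoints_beds = []
    pool = multiprocessing.Pool(nthreads)
    for i in range(nthreads):
        sublist = [r for j, r in enumerate(regions) if j % nthreads == i]
        pool.apply_async(worker, args=[sublist], kwds={"myid": i},
                         callback=partial(collect, result_list=breakpoints_beds))
    pool.close()
    pool.join()
    print(breakpoints_beds)  # e.g. ['work/breakpoints_0.bed', 'work/breakpoints_1.bed']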
def format_hartwig(mutation_file, cnvs_file, purity_file, outfile): # load files and preformat them df, cnv_bed, purity_score, gender = load_files(mutation_file, cnvs_file, purity_file) # this is the sample column lastcol = list(df.columns)[-1] # get total reads df_reads = df.apply(get_reads, axis=1, args=([lastcol])) # select whether we have SNVs or others df_reads['len_alt'] = df_reads['ALT'].str.len() # number of characters in ref df_reads['len_ref'] = df_reads['REF'].str.len() # first classification between SNV and others df_reads['TYPE'] = df_reads.apply(lambda x: 'SNV' if ( (x['len_alt'] == 1) and (x['len_ref'] == 1) and (x['ALT'] != '-') and (x['REF'] != '-')) else 'INDEL', axis=1) df_reads['pos-1'] = df_reads['POS'] - 1 # get the triplet df_reads['TRIPLET'] = df_reads.apply( lambda x: hg19(x['CHROM'], x['pos-1'], 3), axis=1) df_reads['EXTENDED'] = df_reads.apply( lambda x: hg19(x['CHROM'], int(x['POS']) - 2, 5), axis=1) snv_df = df_reads[df_reads['TYPE'] != 'INDEL'] snv_df['CLASS'] = 'SNV' snv_df['VARIANT_CLASS'] = snv_df.apply(create_snv_class, axis=1) # classify indels indel_df = df_reads[df_reads['TYPE'] == 'INDEL'] indels = indels_classification(indel_df) columns = indels.columns df_reads_merged = pd.concat([snv_df, indels], sort=True) df_reads_merged = df_reads_merged[columns] # assing the name of the sample df_reads_merged['sample'] = lastcol # create bed file mut_bed = BedTool.from_dataframe(df_reads_merged[[ 'CHROM', 'pos-1', 'POS', 'ref_reads', 'var_reads', 'VAF', 'total_reads', 'REF', 'ALT', 'sample', 'TYPE', 'CLASS', 'VARIANT_CLASS', 'TRIPLET', 'EXTENDED' ]]) # Remove unmappable regions mapped = get_mappable_regions(mut_bed) # intersect with CN data out = mapped.intersect(cnv_bed, wao=True) # merge to dataframe merge = out.to_dataframe(names=[ 'CHROM', 'POS-1', 'POS', 'REF_COUNTS', 'VAR_COUNTS', 'VAF', 'TOTAL_READS', 'REF', 'ALT', 'SAMPLE', 'TYPE', 'CLASS', 'VARIANT_CLASS', 'TRIPLET', 'EXTENDED', 'c1', 'p1', 'p2', 'MAJOR_CN_TEMP', 'actual_Baf', 'overlapp' ]) # get the normal copy number values sex_chrom = ('Y', 'X') # get normal CN in the chromosome merge['NORMAL_CN'] = merge['CHROM'].apply( lambda x: 1 if x in sex_chrom and gender == "MALE" else 2) # add the purity score we got from PURPLE merge['PURITY'] = purity_score merge['GENDER'] = gender # get number of CNAs, if no overlapp then get the normal count merge['TOTAL_CN'] = merge.apply(get_major_cn, axis=1) # formula of allele specific copy number according to hartwig's people merge['MAJOR_CN'] = round(merge['actual_Baf'] * merge['TOTAL_CN']).astype(int) merge['MINOR_CN'] = round( (1 - merge['actual_Baf']) * merge['TOTAL_CN']).astype(int) merge['CHROM'] = merge['CHROM'].apply(lambda x: 'chr{}'.format(x)) # save files merge.dropna()[[ 'CHROM', 'POS', 'REF', 'ALT', 'TRIPLET', 'EXTENDED', 'CLASS', 'VARIANT_CLASS', 'SAMPLE', 'MAJOR_CN', 'MINOR_CN', 'TOTAL_CN', 'NORMAL_CN', 'VAR_COUNTS', 'REF_COUNTS', 'GENDER', 'PURITY' ]].to_csv(outfile, sep='\t', index=False, header=True, compression='gzip') # clean BedTools temp files pybedtools.cleanup()
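# The copy-number join above is a from_dataframe / intersect(wao=True) /
# to_dataframe round trip. A toy version with far fewer columns than the real
# pipeline carries (column names here are invented):
import pandas as pd
import pybedtools
from pybedtools import BedTool

muts = pd.DataFrame({"CHROM": ["1", "1"], "pos-1": [999, 4999], "POS": [1000, 5000],
                     "REF": ["C", "A"], "ALT": ["T", "G"]})
cnvs = pd.DataFrame({"chrom": ["1"], "start": [0], "end": [10000],
                     "major_cn": [2], "baf": [0.5]})

mut_bed = BedTool.from_dataframe(muts)
cnv_bed = BedTool.from_dataframe(cnvs)

# wao=True keeps every mutation and appends the overlapping CNV segment
# (placeholder columns plus a 0 bp overlap when there is none)
out = mut_bed.intersect(cnv_bed, wao=True)
merge = out.to_dataframe(names=["CHROM", "POS-1", "POS", "REF", "ALT",
                                "c1", "p1", "p2", "MAJOR_CN_TEMP", "actual_Baf",
                                "overlapp"])
print(merge)
pybedtools.cleanup()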
def parallel_generate_sc_intervals(bams, chromosomes, skip_bed, workdir, num_threads=1, min_avg_base_qual=SC_MIN_AVG_BASE_QUAL, min_mapq=SC_MIN_MAPQ, min_soft_clip=SC_MIN_SOFT_CLIP, max_soft_clip=SC_MAX_SOFT_CLIP, pad=SC_PAD, min_support=MIN_SUPPORT, min_support_frac=MIN_SUPPORT_FRAC, max_intervals=MAX_INTERVALS): func_logger = logging.getLogger( "%s-%s" % (parallel_generate_sc_intervals.__name__, multiprocessing.current_process())) if not os.path.isdir(workdir): func_logger.info("Creating directory %s" % workdir) os.makedirs(workdir) if not chromosomes: func_logger.info("Chromosome list unspecified. Inferring from the BAMs") for bam in bams: bamfile = pysam.Samfile(bam, "rb") chromosomes += list(bamfile.references) bamfile.close() chromosomes = sorted(list(set(chromosomes))) func_logger.info("Chromosome list inferred as %s" % (str(chromosomes))) if not chromosomes: func_logger.error("Chromosome list empty") return None pool = multiprocessing.Pool(num_threads) bed_files = [] for index, (bam, chromosome) in enumerate(itertools.product(bams, chromosomes)): process_workdir = os.path.join(workdir, str(index)) if not os.path.isdir(process_workdir): os.makedirs(process_workdir) args_list = [bam, chromosome, process_workdir] kwargs_dict = {"min_avg_base_qual": min_avg_base_qual, "min_mapq": min_mapq, "min_soft_clip": min_soft_clip, "max_soft_clip": max_soft_clip, "pad": pad, "min_support": min_support, "min_support_frac": min_support_frac} pool.apply_async(generate_sc_intervals, args=args_list, kwds=kwargs_dict, callback=partial(generate_sc_intervals_callback, result_list=bed_files)) pool.close() pool.join() func_logger.info("Following BED files will be merged: %s" % (str(bed_files))) if not bed_files: func_logger.warn("No intervals generated") return None pybedtools.set_tempdir(workdir) bedtool = pybedtools.BedTool(bed_files[0]) for bed_file in bed_files[1:]: bedtool = bedtool.cat(pybedtools.BedTool(bed_file), postmerge=False) bedtool = bedtool.moveto(os.path.join(workdir, "all_intervals.bed")) func_logger.info("Selecting the top %d intervals based on normalized read support" % max_intervals) top_intervals_all_cols_file = os.path.join(workdir, "top_intervals_all_cols.bed") if bedtool.count() <= max_intervals: bedtool = bedtool.saveas(top_intervals_all_cols_file) else: # Sample the top intervals top_fraction_cutoff = sorted([float(interval.score) / float(interval.fields[6]) for interval in bedtool], reverse=True)[max_intervals-1] bedtool = bedtool.filter(lambda x: float(x.score) / float(x.fields[6]) >= top_fraction_cutoff).moveto(top_intervals_all_cols_file) # Filter out the extra column added to simplify life later on bedtool = bedtool.cut(xrange(6)).saveas(os.path.join(workdir, "top_intervals.bed")) if skip_bed: skip_bedtool = pybedtools.BedTool(skip_bed) func_logger.info( "Merging %d features with %d features from %s" % (bedtool.count(), skip_bedtool.count(), skip_bed)) bedtool = skip_bedtool.cat(bedtool, postmerge=False).sort() func_logger.info("After merging with %s %d features" % (skip_bed, bedtool.count())) bedtool = bedtool.saveas(os.path.join(workdir, "intervals.bed")) pybedtools.cleanup(remove_all=True) return bedtool.fn
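# The top-interval selection above picks a cutoff equal to the max_intervals-th
# largest support/coverage ratio, then keeps everything at or above it (ties can
# push the final count slightly past max_intervals). In isolation, with toy data:
import pybedtools

# column 5 (score) = soft-clip support, column 7 = a normalizing read count
bedtool = pybedtools.BedTool("""
chr1 100 200 sc 10 + 20
chr1 300 400 sc 4 + 40
chr1 500 600 sc 30 + 40
""".strip(), from_string=True)

max_intervals = 2
ratios = sorted((float(iv.score) / float(iv.fields[6]) for iv in bedtool), reverse=True)
cutoff = ratios[max_intervals - 1]
top = bedtool.filter(lambda x: float(x.score) / float(x.fields[6]) >= cutoff).saveas()
print(top)  # keeps the two intervals with ratios 0.75 and 0.5
pybedtools.cleanup()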
def filter_by_covr(cov_filename, cov_site_min, cov_gene_min, gtffile, genomic_regions, tmp_dir): """ Wrapper for filtering nodes. Filter based on minimum unique read counts and minimum gene expression. """ node_list = [ chr_strand + ':' + str(start) + ':' + str(end) for chr_strand in genomic_regions for start, end in genomic_regions[chr_strand] ] covfile = pysam.Samfile(cov_filename) gene_preserved = deque() site_preserved = deque() if cov_site_min > 0: k = 0 for chr_strand in genomic_regions: chr, strand = chr_strand.split(':') print_time_stamp('filtering site: ' + chr_strand) for start, end in genomic_regions[chr_strand]: k += 1 if not k % 10000: print_time_stamp('filtering site count: ' + str(k) + '/' + str(len(node_list))) node = chr_strand + ':' + str(start) + ':' + str(end) num_reads = sum([ 1 for x in covfile.fetch(chr, int(start), int(end)) if x.pos > start and x.pos < end ]) if num_reads >= cov_site_min: site_preserved.add(node) else: site_preserved = set(node_list) if cov_gene_min > 0: genomic_regions_list = [ (chr_strand.split(':')[0], int(start), int(end), chr_strand + ':' + ':'.join([str(start), str(end)]), 'X', chr_strand.split(':')[1]) for chr_strand in genomic_regions for start, end in genomic_regions[chr_strand] ] genomic_regions_bed = pybedtools.BedTool(genomic_regions_list) gtf = pybedtools.BedTool(gtffile) overlap_transcripts = genomic_regions_bed.intersect(gtf, wo=True, s=True) overlap_transcripts.saveas(tmp_dir + '/genomic_regions.gtf.bed') total = len(overlap_transcripts) pybedtools.cleanup() del overlap_transcripts del gtf del genomic_regions_list cov_scale = sum([ int(x.split('\t')[2]) for x in pysam.idxstats(cov_filename).split('\n') if len(x) > 0 ]) / 1000000.0 #gene_fpkm=read_cufflinks('/u/home/f/frankwoe/scratch/Ule_RNAseq_hnRNPC/cufflinks_output_star/genes.fpkm_tracking') gene_rpkm = {} k = 0 f = open(tmp_dir + '/genomic_regions.gtf.bed', 'r') for ele in f: line = ele.split() k += 1 if not k % 10000: print_time_stamp('filtering gene RPKM: ' + str(k) + '/' + str(total)) node = line[3] #if not node in site_preserved: # continue gene_id = line[9] #RPKM = gene_fpkm[gene_id] if gene_id in gene_fpkm else 0 if gene_id in gene_rpkm: RPKM = gene_rpkm[gene_id] else: chr, start, end = line[6], line[7], line[8] transcript_count = covfile.count(chr, int(start), int(end)) block_sizes = [int(x) for x in line[16].split(',') if x != ''] gene_len = sum(block_sizes) / 1000.0 RPKM = transcript_count / cov_scale / gene_len gene_rpkm[gene_id] = RPKM if RPKM >= cov_gene_min: gene_preserved.append(node) gene_preserved = set(gene_preserved) f.close() else: gene_preserved = set(node_list) return gene_preserved.intersection(site_preserved)
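# The RPKM used in the gene-expression filter is reads in the gene region,
# divided by total mapped reads in millions, divided by gene length in kb.
# A sketch of that arithmetic (the real code reuses one open BAM handle and
# takes gene length from the summed block sizes of the 12-column BED line):
import pysam

def region_rpkm(bamfile_name, chrom, start, end, gene_len_bp):
    cov_scale = sum(int(line.split("\t")[2])
                    for line in pysam.idxstats(bamfile_name).split("\n")
                    if line.strip()) / 1000000.0  # mapped reads, in millions
    bam = pysam.Samfile(bamfile_name)
    transcript_count = bam.count(chrom, start, end)
    bam.close()
    return transcript_count / cov_scale / (gene_len_bp / 1000.0)

# rpkm = region_rpkm("sample.bam", "chr1", 10000, 15000, gene_len_bp=3000)  # hypothetical BAM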
if options.snpformat == "VCFID": snpid = str(line_vcf[2]) else: snpid = str(line_vcf[0].lstrip("chr")) + ":" + str( line_vcf[1]) + ":" + str(line_vcf[3]) + ":" + str( line_vcf[4]) if snpid in allsnplist: counts = findcarriers(line_vcf, options.gtfield, options.snpformat, sampleindices, options.maxAC, options.maxAF, options.minAN) if counts[2] > 0: count_table[snpid] = [ snpid, counts[0], counts[1], counts[2] ] pybedtools.cleanup() #Generate output counts outfile = open(options.outfilename, "w") outfile.write( "#GENE\tCASE_COUNT_HET\tCASE_COUNT_CH\tCASE_COUNT_HOM\tCASE_TOTAL_AC\n") snpfile = open(options.snpfilename, "r") for line_s1 in snpfile: line_s = line_s1.rstrip('\n').split('\t') if line_s[0][0] != "#": genesnplist = list(set(line_s[1].split(','))) counts = calculatecount(genesnplist, count_table) outfile.write(line_s[0] + "\t" + str(counts[0]) + "\t" + str(counts[1]) + "\t" + str(counts[2]) + "\t" + str(counts[3]) + '\n') outfile.close()
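# calculatecount() itself is not shown above; the output loop only assumes it
# sums the per-variant carrier counts over every SNP assigned to a gene,
# skipping SNPs absent from count_table. A guess at that aggregation (the
# compound-het column is left at zero here, since its logic is not visible):
def calculate_gene_counts(genesnplist, count_table):
    het = hom = total_ac = 0
    for snpid in genesnplist:
        if snpid not in count_table:
            continue
        _, n_het, n_hom, n_ac = count_table[snpid]
        het += n_het
        hom += n_hom
        total_ac += n_ac
    return [het, 0, hom, total_ac]

counts_by_snp = {"1:12345:A:G": ["1:12345:A:G", 2, 1, 4]}
print(calculate_gene_counts(["1:12345:A:G", "1:99999:C:T"], counts_by_snp))  # [2, 0, 1, 4]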
## retrieve bin-value relationships sampleName = args.name[index] binValDict = RunMetagene(inputBedDict, args, kwargs) ## Deletes all temp files from the current session pybedtools.cleanup(verbose=False, remove_all=False) if iboolDict['bam']: tempFile.close() return [sampleName, binValDict] # main program if __name__ == '__main__': ## setting temporary dir for pybedtools pybedtools.set_tempdir(args.temp) if args.deltmp: pybedtools.cleanup(verbose=False, remove_all=True) ## judge arguments FileExist([args.anno], 'anno') if args.reverse and args.strand: sys.exit('--reverse and --strand should not be True at the same time!') if args.gene != 'protein_coding' and args.feature in [ 'coding', 'utr5', 'cds', 'utr3' ]: sys.exit( '--feature should not be ["coding", "utr5", "cds", "utr3"] when --gene is not "protein_coding"!' ) ## bam and bed judgement iboolDict = defaultdict(bool) ibool = 0 if bool(args.bed): FileExist(args.bed, 'bed')
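# RunMetagene() and its bin logic are not shown, so as a stand-in, here is the
# shape of a per-sample worker in the spirit of the block above: compute one
# value per bin, clean up only this session's pybedtools temp files, and return
# a labelled result (the binning below is a placeholder, not the real method):
import pybedtools

def metagene_worker(sample_name, bed_file, bin_count=100):
    bt = pybedtools.BedTool(bed_file).sort()
    bin_val_dict = {i: 0.0 for i in range(bin_count)}
    for idx, interval in enumerate(bt):
        bin_val_dict[idx % bin_count] += interval.length
    pybedtools.cleanup(verbose=False, remove_all=False)  # this session's files only
    return [sample_name, bin_val_dict]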