def run(parser):
    args = parser.parse_args()
    reffile = args.reference
    if args.reference == 'hg19':
        reffile = './hg19.tss.bed'
    elif args.reference == 'mm10':
        reffile = './mm10.tss.bed'
    ref = pb.BedTool(reffile)
    meth = pb.BedTool(args.bedfile)
    methtss = ref.window(meth, l=args.upstream, r=args.downstream).groupby(
        g=[1, 2, 3, 4, 5, 6], c=9, o=['mean'])
    with open(args.RNAseq) as f:
        lines = f.readlines()
    dic = {}
    for line in lines:
        t = line.strip().split()
        dic[t[0]] = [float(t[1])]
    for m in methtss:
        if m[3] in dic:  # field 3 holds the gene name
            dic[m[3]].append(float(m[-1]))
    plt.figure()
    rexp = []
    mlevel = []
    for d in dic:
        if len(dic[d]) != 2:
            continue
        rexp.append(dic[d][0])
        mlevel.append(dic[d][1])
    rexp = np.array(rexp)
    mlevel = np.array(mlevel)
    pos = np.where(rexp > -100)
    mlevel = mlevel[pos]
    rexp = rexp[pos]
    # log10-transform the gene expression values
    rexp = np.log(rexp + 1) / np.log(10)
    max_exp = np.max(rexp)
    min_exp = np.nanmin(rexp)
    max_mlevel = np.max(mlevel)
    min_mlevel = np.min(mlevel)
    plt.scatter(mlevel, rexp, c='b', alpha=0.1)
    plt.ylim(min_exp * 1.03, max_exp * 1.03)
    plt.xlim(min_mlevel * 1.05, max_mlevel * 1.05)
    plt.xlabel(args.xaxislabel)
    plt.ylabel(args.yaxislabel)
    spearman, p1 = spearmanr(rexp, mlevel)
    pearson, p2 = pearsonr(rexp, mlevel)
    geneNum = len(rexp)
    from decimal import Decimal
    s1 = ('Spearman correlation Coefficient: '
          + str(Decimal(str(spearman)).quantize(Decimal('0.000')))
          + ' p-value: ' + str(Decimal(str(p1)).quantize(Decimal('0.000'))))
    s2 = ('Pearson correlation Coefficient: '
          + str(Decimal(str(pearson)).quantize(Decimal('0.000')))
          + ' p-value: ' + str(Decimal(str(p2)).quantize(Decimal('0.000'))))
    s3 = 'Total Genes: ' + str(geneNum)
    plt.text(min_mlevel, max_exp * 0.9, s3)
    plt.text(min_mlevel, max_exp * 1.1, s2)
    plt.text(min_mlevel, max_exp * 1.05, s1)
    plt.savefig(args.output + '.pdf')
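# For context: run() above parses args.RNAseq as a two-column, whitespace-
# separated table of gene name and expression value, and matches field 3 of
# the windowed TSS BED against those names. A minimal sketch of a toy input
# file (gene names here are hypothetical):
with open('toy_expression.txt', 'w') as toy_f:
    toy_f.write('GeneA\t5.2\n')
    toy_f.write('GeneB\t0.0\n')
    toy_f.write('GeneC\t13.7\n')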
def run_age_parallel(intervals_bed=None, reference=None, assembly=None, pad=AGE_PAD,
                     age=None, age_workdir=None, timeout=AGE_TIMEOUT, keep_temp=False,
                     assembly_tool="spades", chrs=[], nthreads=1,
                     min_contig_len=AGE_MIN_CONTIG_LENGTH,
                     max_region_len=AGE_MAX_REGION_LENGTH, sv_types=[]):
    func_logger = logging.getLogger(
        "%s-%s" % (run_age_parallel.__name__, multiprocessing.current_process()))

    if not os.path.isdir(age_workdir):
        func_logger.info("Creating %s" % age_workdir)
        os.makedirs(age_workdir)

    if not os.path.isfile("%s.fai" % assembly):
        func_logger.info("Assembly FASTA wasn't indexed. Will attempt to index now.")
        pysam.faidx(assembly)

    func_logger.info("Loading assembly contigs from %s" % assembly)
    with open(assembly) as assembly_fd:
        if assembly_tool == "spades":
            contigs = [SpadesContig(line[1:]) for line in assembly_fd if line[0] == '>']
        elif assembly_tool == "tigra":
            contigs = [TigraContig(line[1:]) for line in assembly_fd if line[0] == '>']
        else:
            # guard against an unrecognized tool leaving `contigs` unbound
            raise ValueError("Unknown assembly tool: %s" % assembly_tool)

    chrs = set(chrs)
    sv_types = set(sv_types)
    contig_dict = {
        contig.sv_region.to_tuple(): []
        for contig in contigs
        if (len(chrs) == 0 or contig.sv_region.chrom1 in chrs)
        and contig.sequence_len >= min_contig_len
        and contig.sv_region.length() <= max_region_len
        and (len(sv_types) == 0 or contig.sv_type in sv_types)
    }

    func_logger.info("Generating the contig dictionary for parallel execution")
    small_contigs_count = 0
    for contig in contigs:
        if contig.sv_region.length() > max_region_len:
            continue
        if (len(chrs) == 0 or contig.sv_region.chrom1 in chrs) and (
                len(sv_types) == 0 or contig.sv_type in sv_types):
            if contig.sequence_len >= min_contig_len:
                contig_dict[contig.sv_region.to_tuple()].append(contig)
            else:
                small_contigs_count += 1

    region_list = sorted(contig_dict.keys())
    nthreads = min(nthreads, len(region_list))

    func_logger.info(
        "Will process %d regions with %d contigs (%d small contigs ignored) using %d threads"
        % (len(region_list), sum([len(value) for value in contig_dict.values()]),
           small_contigs_count, nthreads))

    pybedtools.set_tempdir(age_workdir)
    pool = multiprocessing.Pool(nthreads)

    breakpoints_beds = []
    for i in range(nthreads):
        # round-robin split of the regions across workers
        region_sublist = [region for (j, region) in enumerate(region_list)
                          if (j % nthreads) == i]
        kwargs_dict = {"intervals_bed": intervals_bed, "region_list": region_sublist,
                       "contig_dict": contig_dict, "reference": reference,
                       "assembly": assembly, "pad": pad, "age": age,
                       "age_workdir": age_workdir, "timeout": timeout,
                       "keep_temp": keep_temp, "myid": i}
        pool.apply_async(run_age_single, args=[], kwds=kwargs_dict,
                         callback=partial(run_age_single_callback,
                                          result_list=breakpoints_beds))

    pool.close()
    pool.join()

    func_logger.info("Finished parallel execution")
    func_logger.info("Will merge the following breakpoints beds %s" % (str(breakpoints_beds)))
    pybedtools.cleanup(remove_all=True)

    if not breakpoints_beds:
        return None

    bedtool = pybedtools.BedTool(breakpoints_beds[0])
    for bed_file in breakpoints_beds[1:]:
        bedtool = bedtool.cat(pybedtools.BedTool(bed_file), postmerge=False)

    merged_bed = os.path.join(age_workdir, "breakpoints.bed")
    bedtool.sort().saveas(merged_bed)

    return merged_bed
def _save_dict(bed, out_fname, val_index=None): """Save data from dict to BED file.""" sites = pybedtools.BedTool(_iter_bed_dict(bed, val_index=val_index)).saveas() sites1 = sites.sort().saveas(out_fname) return sites1
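# _save_dict() above delegates to an _iter_bed_dict() generator that is not
# shown in this snippet. A minimal sketch of the assumed contract -- the dict
# layout (keyed by (chrom, start, end, strand) with a list of values) is a
# guess for illustration, not the library's actual structure:
def _iter_bed_dict_sketch(bed, val_index=None):
    for (chrom, start, end, strand), vals in sorted(bed.items()):
        score = vals if val_index is None else vals[val_index]
        yield pybedtools.create_interval_from_list(
            [chrom, str(start), str(end), '.', str(score), strand])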
import random import numpy import pybedtools import operator from operator import itemgetter from pybedtools import BedTool count_list = [] total_list = [] average_list = [] sorted_list = [] token_list = [] print('Loading variants.\n') b = pybedtools.BedTool('cosmicchr1.bed').sort() print('Loaded ' + str(b.count())) num_trials = 10 max_rand_shift = 1000 pre_test = "" print('Loading matches.\n') with open("matchestest.txt", "r") as m: matches = m.readlines() for line in matches: tokens = line.split('\t') # chromo = tokens[0] # start = int(tokens[1]) # end=int(tokens[2]) # name=tokens[3] #pre_test = pre_test + str(token_list)
def main():
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('-v', '--variants', required=True, help='Default VCF')
    parser.add_argument('-r', '--RDtest')
    parser.add_argument('-b', '--BAFtest')
    parser.add_argument('-s', '--SRtest')
    parser.add_argument('-p', '--PEtest')
    parser.add_argument('--batch-list', type=argparse.FileType('r'))
    parser.add_argument('--segdups', required=True)
    parser.add_argument('--rmsk', required=True)
    parser.add_argument('--fam')
    parser.add_argument('-d', '--bed', action='store_true', default=False)
    parser.add_argument('fout')
    args = parser.parse_args()

    if args.bed:
        # argparse always defines the attribute (None when the flag is
        # absent), so test the value rather than hasattr()
        if args.batch_list is None:
            raise Exception('batch list must be specified when passing a bed')
        variants = open(args.variants)
        dtypes = 'RD BAF'.split()
    else:
        variants = pysam.VariantFile(args.variants)
        dtypes = 'PE SR RD BAF'.split()

    metadata = process_metadata(variants, args.bed, args.batch_list)

    # Calculate segdup coverage
    bt = pbt.BedTool.from_dataframe(metadata['chrom start end'.split()])
    segdups = pbt.BedTool(args.segdups)
    cov = bt.coverage(segdups).to_dataframe()
    metadata['poor_region_cov'] = cov.thickStart

    # Check if endpoints are in repeat-masked sequence
    starts = metadata['chrom start end name'.split()].copy()
    starts['end'] = starts['start'] + 1
    ends = metadata['chrom start end name'.split()].copy()
    ends['start'] = ends['end'] - 1
    endpoints = pd.concat([starts, ends])
    bt = pbt.BedTool.from_dataframe(endpoints)
    rmsk = pbt.BedTool(args.rmsk)
    sect = bt.intersect(rmsk, u=True)
    rmasked_names = [i.fields[3] for i in sect.intervals]
    metadata['rmsk'] = metadata.name.isin(rmasked_names)

    metadata = metadata.set_index('name')

    evidence = deque()
    for dtype in dtypes:
        dtable = getattr(args, dtype + 'test')
        if dtable is None:
            continue
        df = pd.read_table(dtable)
        df = preprocess(df, dtype)
        df = df.rename(columns=lambda c: dtype + '_' + c if c != 'name' else c)
        df = df.set_index('name')
        evidence.append(df)

    evidence = list(evidence)
    evidence = metadata.join(evidence, how='outer', sort=True)
    evidence = evidence.reset_index().rename(columns={'index': 'name'})

    has_petest = (args.PEtest is not None)
    has_srtest = (args.SRtest is not None)
    if not args.bed and has_petest and has_srtest:
        evidence = add_pesr(evidence)

    # Replace infinite log-pvals
    LOG_CEIL = 300
    evidence = evidence.replace(np.inf, LOG_CEIL)

    evidence = evidence.reindex(columns=make_columns())
    evidence.to_csv(args.fout, index=False, sep='\t', na_rep='NA')
def get_aligned_reads_from_multi_mp(obj, nproc, passed_cells): global my_read_dict global my_intersect_all global my_ex_coord global my_uniq_r_bclist my_intersect_all = None my_read_dict = None my_ex_coord = None my_uniq_r_bclist = obj.uniq_r_bclist.copy() samfile = pysam.AlignmentFile(obj.in_bam_multi, "rc") try: r_iterator = samfile.fetch(obj.chrom, int(obj.start), int(obj.end)) except: return obj rcds = np.array([[r_idx, x.to_dict(), x.get_blocks()] for r_idx, x in enumerate(r_iterator) if x.flag in obj.strand_flags[obj.strand] and list(filter(regx1.match, x.to_dict()['tags']))[0].replace( 'BC:Z:', '') in passed_cells]) pool = mp.Pool(processes=nproc) func = partial(_make_dict2_mp, obj.chrom, obj.strand, obj.gene) read_dict_list = pool.map(func, rcds, chunksize=1) pool.close() my_read_dict = {} tmp = [ my_read_dict.update(elemt) for elemt in read_dict_list if elemt is not None ] # fast!! df = [my_read_dict[r_idx]['r_blocks'] for r_idx in my_read_dict.keys()] samfile.close() if len(df) == 0: return obj pd.concat(df, axis=0).to_csv('%s/.tempDir/_%s_multi_reads_blocks.bed' % (obj.outdir, obj.gene), index=False, sep="\t", header=False) read_bed = pybedtools.BedTool('%s/.tempDir/_%s_multi_reads_blocks.bed' % (obj.outdir, obj.gene)) tmp = obj.ex_bed.intersect(read_bed, wa=True, wb=True) if os.stat(tmp.fn).st_size == 0: return obj my_intersect_all = tmp.to_dataframe() read_idx_list = list(set(my_intersect_all.iloc[:, 9].values)) my_ex_coord = ','.join( obj.exons.apply(lambda x: '%s-%s' % (x[1], x[2]), axis=1).values) pool = mp.Pool(processes=nproc, initializer=_initialize_make_list_aligned) aligned_reads = pool.map(_make_list_aligned_reads_mp, read_idx_list, chunksize=1) pool.close() colnames = [ 'name', 'flag', 'ref_name', 'ref_pos', 'map_quality', 'cigar', 'next_ref_name', 'next_ref_pos', 'length', 'seq', 'qual', 'tags', 'read_mapped_position', 'geneid', 'Exon_Index', 'Category', 'BC', 'UB', 'exon_coordinates' ] obj.multi_aligned_reads = pd.DataFrame(aligned_reads, columns=colnames).drop_duplicates() obj.multi_aligned_reads.insert(19, 'MapFlag', 'multi') return obj
def get_bedtools_features(strFileName): btFeatures = pbt.BedTool(strFileName) return btFeatures
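# Usage sketch for the thin wrapper above (the input file name is hypothetical):
peaks = get_bedtools_features("peaks.bed")
for interval in peaks:
    print(interval.chrom, interval.start, interval.end)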
def parallel_genotype_intervals(intervals_file=None, bam=None, workdir=None, nthreads=1,
                                chromosomes=[], window=DEFAULT_GT_WINDOW,
                                isize_mean=DEFAULT_GT_ISIZE_MEAN,
                                isize_sd=DEFAULT_GT_ISIZE_SD,
                                normal_frac_threshold=DEFAULT_GT_NORMAL_FRAC):
    func_logger = logging.getLogger("%s-%s" % (parallel_genotype_intervals.__name__,
                                               multiprocessing.current_process()))

    if workdir and not os.path.isdir(workdir):
        os.makedirs(workdir)

    chromosomes = set(chromosomes)
    start_time = time.time()

    bedtool = pybedtools.BedTool(intervals_file)
    selected_intervals = [interval for interval in bedtool
                          if not chromosomes or interval.chrom in chromosomes]
    nthreads = min(len(selected_intervals), nthreads)
    # ceiling division; // keeps the slice bounds integral under Python 3
    intervals_per_process = (len(selected_intervals) + nthreads - 1) // nthreads

    pool = multiprocessing.Pool(nthreads)
    genotyped_beds = []
    for i in range(nthreads):
        process_workdir = os.path.join(workdir, str(i))
        if not os.path.isdir(process_workdir):
            os.makedirs(process_workdir)
        process_intervals = pybedtools.BedTool(
            selected_intervals[i * intervals_per_process:(i + 1) * intervals_per_process]
        ).saveas(os.path.join(process_workdir, "ungenotyped.bed"))
        kwargs_dict = {"intervals_file": process_intervals.fn, "bam": bam,
                       "workdir": process_workdir, "window": window,
                       "isize_mean": isize_mean, "isize_sd": isize_sd,
                       "normal_frac_threshold": normal_frac_threshold}
        pool.apply_async(genotype_intervals, kwds=kwargs_dict,
                         callback=partial(genotype_intervals_callback,
                                          result_list=genotyped_beds))

    pool.close()
    pool.join()

    func_logger.info("Following BED files will be merged: %s" % (str(genotyped_beds)))

    if not genotyped_beds:
        func_logger.warn("No intervals generated")
        return None

    pybedtools.set_tempdir(workdir)
    bedtool = pybedtools.BedTool(genotyped_beds[0])
    for bed_file in genotyped_beds[1:]:
        bedtool = bedtool.cat(pybedtools.BedTool(bed_file), postmerge=False)

    bedtool = bedtool.sort().moveto(os.path.join(workdir, "genotyped.bed"))

    func_logger.info("Finished parallel genotyping of %d intervals in %g minutes"
                     % (len(selected_intervals), (time.time() - start_time) / 60.0))

    return bedtool.fn
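# The even split above relies on ceiling division, which keeps the slice
# bounds integral. A small worked example with round numbers:
n_intervals, n_threads = 10, 4
per_process = (n_intervals + n_threads - 1) // n_threads  # -> 3
chunks = [list(range(n_intervals))[i * per_process:(i + 1) * per_process]
          for i in range(n_threads)]
# chunks == [[0, 1, 2], [3, 4, 5], [6, 7, 8], [9]]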
def binned_stats(self, in_fname, nbins, split=False, **args): rpkm = args.get("rpkm", False) readlength = self.read_length() fragmentsize = self.fragmentsize if not fragmentsize: fragmentsize = readlength total_reads = 1 if rpkm: total_reads = self.count() / 1000000.0 ret = [] count = 1 # Only use a BedTool if really necessary, as BedTools does not close open files # on object deletion if self.ftype == "bam": in_track = SimpleBed(in_fname) else: in_track = pybedtools.BedTool(in_fname) #extend = fragmentsize - readlength for feature, min_strand, plus_strand in self.fetch_to_counts(in_track): binsize = (feature.end - feature.start) / float(nbins) row = [] min_strand = [x - (fragmentsize - readlength) for x in min_strand] bin_start = feature.start while int(bin_start + 0.5) < feature.end: num_reads = 0 i = 0 c = 0 while i < len(min_strand) and min_strand[i] <= int( bin_start + binsize + 0.5): if min_strand[i] + fragmentsize <= int(bin_start + binsize + 0.5): c += 1 num_reads += 1 i += 1 min_strand = min_strand[c:] i = 0 c = 0 while i < len(plus_strand) and plus_strand[i] <= int( bin_start + binsize + 0.5): if plus_strand[i] + fragmentsize <= int(bin_start + binsize + 0.5): c += 1 num_reads += 1 i += 1 plus_strand = plus_strand[c:] if rpkm: per_kb = num_reads * (1000.0 / binsize) row.append(per_kb / total_reads) else: row.append(num_reads) bin_start += binsize if feature.strand == "-": row = row[::-1] ret.append([feature.chrom, feature.start, feature.end] + row) count += 1 del in_track if split: return ret else: return ["\t".join([str(x) for x in r]) for r in ret]
        index=False, compression="gzip")
    return None


###################################### MAIN ######################################

snake_log_obj = snakemake.log  # a snakemake.io.Log object
# Redirect stdout to the Snakemake log file; calling str() on the Log object
# is the only way found to get the log file path.
sys.stdout = open(str(snake_log_obj), "w")

out_dir = snakemake.params['out_dir']
out_prefix = snakemake.params['run_prefix']
annot_per_geneset = False
chromosome = snakemake.params['chromosome']
annotations = snakemake.params['annotations']
all_genes = snakemake.params['all_genes']
bimfile = '{}.{}.bim'.format(snakemake.params['bfile'], chromosome)

dict_of_beds = {}
for name_annot in annotations:
    dict_of_beds[name_annot] = pybedtools.BedTool('{}/{}.{}.bed'.format(
        out_dir + '/bed', out_prefix, name_annot))

make_annot_file_per_chromosome(chromosome, dict_of_beds, out_dir, out_prefix,
                               annot_per_geneset, bimfile, all_genes)

print("Make annot script is done!")
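# The driver above runs as a Snakemake `script:`; it expects a rule of roughly
# this shape (all paths, wildcards and parameter values are hypothetical):
#
# rule make_annot:
#     output:
#         "{out_dir}/{prefix}.COMBINED_ANNOT.{chromosome}.annot.gz"
#     log:
#         "logs/make_annot.{chromosome}.log"
#     params:
#         out_dir=OUT_DIR,
#         run_prefix=PREFIX,
#         chromosome="{chromosome}",
#         annotations=ANNOTATIONS,
#         all_genes=False,
#         bfile=BFILE_PREFIX
#     script:
#         "make_annot.py"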
def make_annot_file_per_chromosome(chromosome, dict_of_beds, out_dir, out_prefix,
                                   annot_per_geneset, bimfile, all_genes):
    """
    Input
        chromosome: integer (1..22)

    *OBS* this function RELIES on MANY GLOBAL scope VARIABLES
    """
    # TODO: pass variables to the function instead
    ### make annot file
    print('making annot files for chromosome {}'.format(chromosome))
    df_bim = pd.read_csv(bimfile,
                         delim_whitespace=True,
                         usecols=[0, 1, 2, 3],
                         names=['CHR', 'SNP', 'CM', 'BP'])
    # (Pdb) df_bim.head()
    #    CHR          SNP        CM       BP
    # 0   21  rs146134162 -0.908263  9412099
    # 1   21  rs578050168 -0.908090  9412377
    # 2   21  rs527616997 -0.907297  9413645
    # 3   21  rs544748596 -0.906578  9414796
    # 4   21  rs528236937 -0.906500  9414921

    # iter_bim = [['chr'+str(x1), x2, x2] for (x1, x2) in np.array(df_bim[['CHR', 'BP']])]
    # ^ Python3 (but not Python2.7) gives the following error when calling
    #   "bimbed = BedTool(iter_bim)" in make_annot_file_per_chromosome():
    # /tools/anaconda/3-4.4.0/lib/python3.6/site-packages/pybedtools/cbedtools.pyx in pybedtools.cbedtools.IntervalIterator.__next__()
    # /tools/anaconda/3-4.4.0/lib/python3.6/site-packages/pybedtools/cbedtools.pyx in pybedtools.cbedtools.create_interval_from_list()
    # /tools/anaconda/3-4.4.0/lib/python3.6/site-packages/pybedtools/cbedtools.pyx in pybedtools.cbedtools.isdigit()
    # AttributeError: 'numpy.int64' object has no attribute 'isdigit'
    # SOLUTION: convert everything to strings --> ['chr'+str(x1), str(x2), str(x2)]
    # print(df_bim.head) --> useful for debugging
    # print(bimfile)
    iter_bim = [['chr' + str(x1), str(x2), str(x2)]
                for (x1, x2) in np.array(df_bim[['CHR', 'BP']])]
    bimbed = pybedtools.BedTool(iter_bim)
    counter = 1  # just to print status messages
    list_df_annot = []
    for name_annotation in sorted(dict_of_beds):  # we sort to make output more consistent.
        print("CHR={} | annotation={}, #{}/#{}".format(chromosome, name_annotation,
                                                       counter, len(dict_of_beds)))
        bed_for_annot = dict_of_beds[name_annotation]  # get bed
        # (Pdb) len(bed_for_annot)
        # 66
        # (Pdb) bed_for_annot.head()
        # chr1	51619935	52185000
        # chr1	70410488	70871303
        # chr1	85584164	86243933
        # chr1	202948059	203355877
        # chr10	43851792	44270066
        # chr10	75681524	76110821
        # chr10	76769912	77191206
        # chr10	120663598	121138345
        # chr11	118030300	118469926
        # chr12	21454715	21871342
        annotbed = bimbed.intersect(bed_for_annot, wb=True)
        # PT NOTE: this finds SNPs in the bim file that OVERLAP with the annotation bed (gene)
        # chr22	24008141	24008141	chr22	24008021	24210630	ENSG00000250479	0.03038823367
        # chr22	24008403	24008403	chr22	24008021	24210630	ENSG00000250479	0.03038823367
        # chr22	24008409	24008409	chr22	24008021	24210630	ENSG00000250479	0.03038823367
        # chr22	24008465	24008465	chr22	24008021	24210630	ENSG00000250479	0.03038823367
        # chr22	24008495	24008495	chr22	24008021	24210630	ENSG00000250479	0.03038823367
        # chr22	24008497	24008497	chr22	24008021	24210630	ENSG00000250479	0.03038823367
        # chr22	24008503	24008503	chr22	24008021	24210630	ENSG00000250479	0.03038823367
        # chr22	24008699	24008699	chr22	24008021	24210630	ENSG00000250479	0.03038823367
        # chr22	24008773	24008773	chr22	24008021	24210630	ENSG00000250479	0.03038823367

        # annotbed = bed_for_annot.intersect(bimbed, wb=True)
        # PT NOTE: this finds the positions/intervals in the annotation bed (gene) that
        # OVERLAP with the bim file. Only the part of the record where the intersection
        # occurred is reported.
        # *IMPORTANT*: bimbed.intersect(bed_for_annot) and bed_for_annot.intersect(bimbed)
        # DO NOT return the same positions. However, they do return the same number of
        # 'intersected features'. That is, the returned BedTool objects have the same length.
        # bed_for_annot.intersect(bimbed) returns features that span two bp
        # (e.g. start=24008140, end=24008142), whereas bimbed.intersect(bed_for_annot)
        # returns features that span a single bp (start=24008141, end=24008141).
        # Use bed_for_annot.intersect(bimbed, wb=True) to understand this behavior better:
        # chr22	24008140	24008142	ENSG00000250479	0.03038823367	chr22	24008141	24008141
        # chr22	24008402	24008404	ENSG00000250479	0.03038823367	chr22	24008403	24008403
        # chr22	24008408	24008410	ENSG00000250479	0.03038823367	chr22	24008409	24008409
        # chr22	24008464	24008466	ENSG00000250479	0.03038823367	chr22	24008465	24008465
        # chr22	24008494	24008496	ENSG00000250479	0.03038823367	chr22	24008495	24008495
        # chr22	24008496	24008498	ENSG00000250479	0.03038823367	chr22	24008497	24008497
        # chr22	24008502	24008504	ENSG00000250479	0.03038823367	chr22	24008503	24008503
        # chr22	24008698	24008700	ENSG00000250479	0.03038823367	chr22	24008699	24008699
        # chr22	24008772	24008774	ENSG00000250479	0.03038823367	chr22	24008773	24008773

        ### DOCS .intersect()
        # The intervals reported are NOT the original gene intervals, but rather a refined
        # interval reflecting solely the portion of each original gene interval that
        # overlapped with the SNPs.
        # The -wa (write A) and -wb (write B) options allow one to see the original records
        # from the A and B files that overlapped. As such, it not only shows you where the
        # intersections occurred, it shows you what intersected.
        # SEE MORE HERE: http://quinlanlab.org/tutorials/bedtools/bedtools.html
        # SEE https://daler.github.io/pybedtools/intersections.html
        # SEE https://daler.github.io/pybedtools/autodocs/pybedtools.bedtool.BedTool.intersect.html#pybedtools.bedtool.BedTool.intersect

        ### Extract data from annotbed before deleting annotbed.fn.
        ### These iterations REQUIRE the tmp bed file (annotbed.fn) to exist.
        ### SEE cbedtools.IntervalFile or cbedtools.IntervalIterator.
        ### Since the iterables in annotbed (x.start or x.fields[7]) are all strings (or
        ### integers), they are immutable objects, and Python will insert the value of the
        ### object/string, not a reference, in the list comprehension.
        ### CONCLUSION: pass-by-value of immutable objects means we can SAFELY DELETE
        ### annotbed.fn after extracting this data.
        ### REF Parameter Passing for Mutable & Immutable Objects:
        ### https://medium.com/@tyastropheus/tricky-python-ii-parameter-passing-for-mutable-immutable-objects-10e968cbda35
        bp = [x.start for x in annotbed]
        # PT NOTE: make a list of all bp positions for the overlapping SNPs. All features,
        # no matter what the file type, have chrom, start, stop, name, score, and strand
        # attributes.
        annotation_value = [x.fields[7] for x in annotbed]
        # Returns a list of strings: the 'score' column, which is column 7 in 0-based
        # column indexing. *OBS*: x.fields[7] is a string.

        ### pybedtools cleanup V1: delete all pybedtools session files [does not work - see below]
        ### KEEP THIS AS A WIKI/EXPLANATION
        ### REF 1 Pybedtools design principles: https://daler.github.io/pybedtools/topical-design-principles.html
        ### REF 2 https://daler.github.io/pybedtools/autodocs/pybedtools.helpers.cleanup.html#pybedtools.helpers.cleanup
        # Using BedTool instances typically has the side effect of creating temporary files
        # on disk: every BEDTools operation results in a new temporary file.
        # Temporary files may be created in order to run BEDTools programs, so BedTool
        # objects must always point to a file on disk.
        # Temporary files are stored in /tmp by default, and have the form /tmp/pybedtools.*.tmp.
        # By default, at exit all temp files created during the session will be deleted.
        # However, if Python does not exit cleanly (e.g., from a bug in client code), then
        # the temp files will not be deleted.
        # print("CHR={} | annotation={}, #{}/#{}. Doing pybedtools cleanup...".format(chromosome, name_annotation, counter, len(dict_of_beds)))
        # pybedtools.cleanup(verbose=True)  # force deletion of all temp files from the current session.
        # ---> YOU CANNOT CLEAN UP FILES at this point because it REMOVES ALL tmp files in dict_of_beds.
        # ---> e.g. you get the exception: pybedtools.cbedtools.BedToolsFileError: /tmp/pybedtools.izu3ifzg.tmp does not exist

        ### pybedtools cleanup V2: delete the current annotbed (specific to a chromosome and annotation)
        # We need to clean up files because a lot of tmp bed files are written to
        # pybedtools.get_tempdir(). Tmp bed files can take up to 200 MB per file; the file
        # size depends on the number of genes in the annotation.
        # So inputs with "raw SEMs" annotations, where each annotation contains all genes in
        # the dataset (all genes have a non-zero SEM value), will generate large tmp bed files.
        # >>900 GB of storage is used if running 2-4 parallel processes of
        # make_annot_file_per_chromosome() and ~1500 annotations.
        # Summary of storage use for this function without forced clean-up:
        # N_files = N_annotations * N_parallel_procs, e.g. 1500 annotations * 4 procs
        # * 0.2 GB per file = 1200 GB.
        # By default, tmp bed files would only be cleaned up after completing this function.
        # OUR SOLUTION: after the BEDTools intersect operation, we no longer need the tmp
        # file (the annotbed object lives in Python memory). Force removal of the tmp bed
        # file specific to a chromosome and annotation.
        # NOTE: deleting a tmp file will not cause any problems later on for pybedtools'
        # automatic cleanup. I checked the source code.
        os.remove(annotbed.fn)

        ### Create data frame
        df_annot_overlap_bp = pd.DataFrame({'BP': bp, name_annotation: annotation_value})
        # FINUCANE ORIG: df_int = pd.DataFrame({'BP': bp, 'ANNOT': 1})
        #          BP  blue
        # 0  34605531     1
        # 1  34605604     1
        # 2  34605644     1
        # 3  34605778     1
        # 4  34606634     1
        # 5  34606840     1
        # 6  34607223     1
        df_annot = pd.merge(df_bim, df_annot_overlap_bp, how='left', on='BP')
        # *IMPORTANT*: how='left' --> the resulting data frame includes ALL SNPs from the bim file.
        # ^ how="left": use only keys from the left frame, PRESERVE KEY ORDER
        # (Pdb) df_annot.head()
        #    CHR          SNP        CM       BP  blue
        # 0   21  rs146134162 -0.908263  9412099   NaN
        # 1   21  rs578050168 -0.908090  9412377   NaN
        # 2   21  rs527616997 -0.907297  9413645   NaN
        # 3   21  rs544748596 -0.906578  9414796   NaN
        # 4   21  rs528236937 -0.906500  9414921   NaN
        df_annot = df_annot[[name_annotation]]
        # Get rid of all columns but name_annotation. Important: return a 1-column data
        # frame (and not a series, which would lose the column name).
        # df[[name_annotation]] or df.loc[:, [name_annotation]] --> returns dataframe
        # df[name_annotation] or df.loc[:, name_annotation] --> returns series
        df_annot.fillna(0, inplace=True)
        # SNPs not in df_annot_overlap_bp have NA values in name_annotation.
        # Do the data type conversion AFTER .fillna() to avoid problems with NA (float)
        # that cannot be converted to int.
        df_annot[name_annotation] = df_annot[name_annotation].astype(float)
        list_df_annot.append(df_annot)
        if annot_per_geneset == True:  # write one annot file per annotation per chromosome
            file_out_annot = "{}/{}.{}.{}.annot.gz".format(
                out_dir, out_prefix, name_annotation, chromosome)
            # output filename: ${prefix}.${chr}.annot.gz
            df_annot.to_csv(file_out_annot, sep="\t", index=False, compression="gzip")
        counter += 1
        # if counter == 4: break

    print("CHR={} | Concatenating annotations...".format(chromosome))
    df_annot_combined = pd.concat(list_df_annot, axis='columns')
    # Stack horizontally (there is no joining on indexes, just stacking).
    # *IMPORTANT*: since we did pd.merge(df_bim, df_annot_overlap_bp) with how='left',
    # we know that ALL dfs in list_df_annot contain ALL SNPs in df_bim and that the order
    # of the SNPs is preserved.
    # ALTERNATIVELY, if you don't want 'thin-annot' (i.e. adding 'CHR','SNP','CM','BP' columns):
    # df_annot_combined = pd.concat([df_bim]+list_df_annot, axis='columns')  # stack horizontally

    # print("CHR={} | Calculating standard deviation for annotations...".format(chromosome))
    # df_annot_sd = pd.DataFrame(df_annot_combined.drop(columns=["CHR", "SNP", "CM", "BP"]).std(), columns=["sd"])
    # df_annot_sd.index.name = "annotation"
    # df_annot_sd["n"] = df_annot.shape[1]  # number of SNPs in the data frame. This makes it easier to calculate the combined standard deviation across chromosomes later.
    # file_out_annot_combined_sd = "{}/{}.{}.{}.annot_sd".format(args.out_dir, args.out_prefix, "COMBINED_ANNOT", chromosome)
    # df_annot_sd.to_csv(file_out_annot_combined_sd, sep="\t", index=True)
    ### Output file
    ### annotation	sd	n
    ### antiquewhite3	0.16847050545855485	5
    ### blue1	0.1197907423131066	5
    ### chocolate	0.0	5

    print("CHR={} | Writing annotations...".format(chromosome))
    if all_genes == True:
        file_out_annot_combined = "{}/all_genes_in_{}.{}.annot.gz".format(
            out_dir, out_prefix, chromosome)
    else:
        file_out_annot_combined = "{}/{}.{}.{}.annot.gz".format(
            out_dir, out_prefix, "COMBINED_ANNOT", chromosome)

    df_annot_combined.to_csv(file_out_annot_combined, sep="\t", index=False,
                             compression="gzip")

    return None
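# A quick sanity check on the thin-annot output written above: every
# .annot.gz must contain exactly one row per SNP in the chromosome's bim
# file, in bim order. The file names below are hypothetical.
import pandas as pd

df_check = pd.read_csv("out/run.COMBINED_ANNOT.21.annot.gz", sep="\t")
n_snps = sum(1 for _ in open("genotypes.21.bim"))
assert len(df_check) == n_snps, "annot rows must match bim SNP count"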
def test_cleaned_intersect(): x = pybedtools.BedTool( """ chr1 1 10 1 chr1 20 30 2 chr1 100 120 3 """, from_string=True, ) y = pybedtools.BedTool( """ chr1 2 7 4 chr1 110 120 5 chr1 200 210 6 """, from_string=True, ) z = pybedtools.BedTool( """ chr1 25 40 7 chr1 190 205 8 chr1 1000 1001 9 """, from_string=True, ) # Two-way test # x2, y2 = pybedtools.contrib.venn_maker.cleaned_intersect([x, y]) # x should be the same -- 1, 2, 3 # y should have 1, 3, 6 assert x2 == fix(""" chr1 1 10 chr1 20 30 chr1 100 120 """) assert y2 == fix(""" chr1 1 10 chr1 100 120 chr1 200 210""") # Three-way test # x3, y3, z3 = pybedtools.contrib.venn_maker.cleaned_intersect([x, y, z]) # x should be the same -- 1, 2, 3 # y should have 1, 3, 6 # z should have 2, 6 assert x3 == fix(""" chr1 1 10 chr1 20 30 chr1 100 120 """) assert y3 == fix(""" chr1 1 10 chr1 100 120 chr1 200 210""") assert z3 == fix(""" chr1 20 30 chr1 200 210 chr1 1000 1001""") try: pybedtools.helpers._check_for_R() print( pybedtools.contrib.venn_maker.venn_maker( beds=[x, y, z], names=["x", "y", "z"], figure_filename="out.tiff", additional_args=[ "euler.d=TRUE", "scaled=TRUE", 'fill=c("red","blue", "orange")', ], run=True, )) except ValueError: sys.stderr.write("R installation not found; skipping test") if os.path.exists("out.tiff"): os.unlink("out.tiff")
def get_feature_locations(self, limit_genes=False, flush_cashe=False): """ Gets locations of genic features, five prime sites, 3 prime sites, poly a sites stop codons start codons and tss based off annotated gtf _db file _db - _db handle generated by gtf utils returns dict of bedfiles { five_prime_ends : bedtool three_prime_ends poly_a_sites stop_codons transcription_start_sites } """ transcriptome = { "five_prime_ends" : [], "three_prime_ends" : [], "poly_a_sites" : [], "stop_codons" : [], "start_codons" : [], "transcription_start_sites" : []} region_and_species = os.path.join(self._regions_dir, self._species) try: if flush_cashe: raise ValueError return {region : pybedtools.BedTool("%s_%s.bed" % (region_and_species, region)) for region in transcriptome} except ValueError: pass for i, gene_id in enumerate(self._db.features_of_type('gene')): if i % 2000 == 0: print "processed %d genes" % (i) if i == 2000 and limit_genes: break gene = { "five_prime_ends": [], "three_prime_ends": [], "poly_a_sites": [], "stop_codons": [], "start_codons": [], "transcription_start_sites": []} try: for exon in self._db.children(gene_id, featuretype='exon'): exon_start = copy.deepcopy(exon) exon_start.start = exon.start + 1 exon_stop = copy.deepcopy(exon) exon_stop.start = exon_stop.stop exon_stop.stop += 1 if exon.strand == "-": exon_start, exon_stop = exon_stop, exon_start gene['five_prime_ends'].append(exon_start) gene['three_prime_ends'].append(exon_stop) #transcript vs mRNA need to look at the difference for transcript in self._db.children(gene_id, featuretype=self._feature_names['transcript']): transcript_start = copy.deepcopy(transcript) transcript_start.stop = transcript.start + 1 transcript_stop = copy.deepcopy(transcript) transcript_stop.start = transcript_stop.stop transcript_stop.stop += 1 if transcript.strand == "-": transcript_start, transcript_stop = transcript_stop, transcript_start gene['poly_a_sites'].append(transcript_stop) gene['transcription_start_sites'].append(transcript_start) if self._species == "ce10": #need to generalize later for transcript in self._db.children(gene_id, featuretype=self._feature_names['transcript']): try: cds = list(self._db.children(transcript, featuretype='CDS')) first_cds, last_cds = cds[0], cds[-1] if first_cds.strand == '-': first_cds, last_cds = last_cds, first_cds start_codon = first_cds start_codon.stop = first_cds.start + 1 gene['start_codons'].append(start_codon) stop_codon = last_cds stop_codon.start = stop_codon.stop stop_codon.stop = stop_codon.stop + 1 gene['stop_codons'].append(stop_codon) except: pass else: #for hg19 and mm9 gencode for start_codon in self._db.children(gene_id, featuretype='start_codon'): start_codon.stop = start_codon.start + 1 gene['start_codons'].append(start_codon) for stop_codon in self._db.children(gene_id, featuretype='stop_codon'): stop_codon.start = stop_codon.stop stop_codon.stop = stop_codon.stop + 1 gene['stop_codons'].append(stop_codon) except IndexError: pass gene_id = gene_id.attributes[self._feature_names['gene_id']] for region in gene: transcriptome[region] += self._merge_and_rename_regions(gene[region], gene_id) for name, intervals in transcriptome.items(): transcriptome[name] = pybedtools.BedTool(map(self._to_bed, intervals)).\ remove_invalid().sort().each(self._fix_chrom).saveas("%s_%s.bed" % (region_and_species, name)) return transcriptome
def get_genomic_regions(self, prox_size=500, limit_genes=False, flush_cashe=False):
    """
    Returns bedtools of all non-overlapping regions in the genome:
    exons, cds, 3' utrs and 5' utrs

    _species - string of the _species to analyze
    _db - _db handle generated by gtf utils

    Potential off-by-one bug here, need to examine more closely
    """
    region_and_species = os.path.join(self._regions_dir, self._species)
    regions = ["genes",
               "five_prime_utrs",
               "three_prime_utrs",
               "cds",
               "exons",
               "introns",
               "proxintron",
               "distintron",
               ]

    try:
        if flush_cashe:
            raise ValueError

        results = {}
        for region in regions:
            if region in ["proxintron", "distintron"]:
                results[region] = pybedtools.BedTool("%s_%s%d.bed" % (region_and_species, region, prox_size))
            else:
                results[region] = pybedtools.BedTool("%s_%s.bed" % (region_and_species, region))
        return results
    except ValueError as e:
        print e
        pass

    three_prime_utrs = []
    five_prime_utrs = []
    cds = []
    exons = []
    dist_introns = []
    prox_introns = []
    gene_list = []
    introns = []

    for i, gene in enumerate(self._feature_hash.keys()):
        gene_list.append(self._feature_hash[gene]['gene'])

        if i % 2000 == 0:
            print "processed %d genes" % (i)
        if i == 2000 and limit_genes:
            break

        gene_cds, gene_dist_introns, gene_exons, gene_five_prime_utrs, gene_introns, gene_prox_introns, gene_three_prime_utrs = self._gene_regions(gene)
        three_prime_utrs += gene_three_prime_utrs
        five_prime_utrs += gene_five_prime_utrs
        cds += gene_cds
        exons += gene_exons
        dist_introns += gene_dist_introns
        prox_introns += gene_prox_introns
        introns += gene_introns

    # make exons and introns
    results = {"genes": gene_list,
               "five_prime_utrs": five_prime_utrs,
               "three_prime_utrs": three_prime_utrs,
               "cds": cds,
               "proxintron": prox_introns,
               "distintron": dist_introns,
               "exons": exons,
               "introns": introns}

    for name, intervals in results.items():
        intervals = pybedtools.BedTool(map(self._to_bed, intervals)).remove_invalid().sort().each(self._fix_chrom)

        if name in ["proxintron", "distintron"]:
            results[name] = intervals.saveas(region_and_species + "_%s%d.bed" % (name, prox_size))
        else:
            results[name] = intervals.saveas(region_and_species + "_%s.bed" % (name))

    return results
def make_normalization(segmentation, normalization):
    """
    Make normalization file for RNAmaps (for given segmentation).

    Parameters
    ----------
    segmentation : str
        Segmentation file.
    normalization : str
        Output txt file with normalization.

    Returns
    -------
    str
        Path to file with normalizations.

    """
    iCount.logger.log_inputs(LOGGER)

    data = {}  # Container for normalization data

    def add_entry(start_type, stop_type, start_len, stop_len, strand):
        """Add normalization entry in ``data``."""
        if strand == '-':
            start_type, stop_type = stop_type, start_type
            start_len, stop_len = stop_len, start_len
        # Cut long segments to some manageable size:
        start_len = start_len if start_len < RNA_WINDOW_SIZE else RNA_WINDOW_SIZE
        stop_len = stop_len if stop_len < RNA_WINDOW_SIZE else RNA_WINDOW_SIZE
        rna_map_type = '{}-{}'.format(start_type, stop_type)
        # Left side:
        segments = data.setdefault(rna_map_type, {}).setdefault(-start_len, 0)
        data[rna_map_type][-start_len] = segments + 1
        # Right side:
        segments = data.setdefault(rna_map_type, {}).setdefault(stop_len - 1, 0)
        data[rna_map_type][stop_len - 1] = segments + 1

    LOGGER.info('Reading segmentation to internal format...')
    # pylint: disable=protected-access
    chroms = set()
    for segment in pybedtools.BedTool(segmentation):
        chroms.add(segment.chrom)
    chroms_strands = [(chrom, strand) for chrom in chroms for strand in ('+', '-')]
    for (chrom, strand) in chroms_strands:
        LOGGER.debug("Processing chromosome %s...", chrom)
        last_intergenic = None  # Store last intergenic segment.
        # Store segments with the highest stop coordinate (there can be more than one):
        last_segments = []
        chrom_content = iCount.genomes.segment._prepare_segmentation(
            segmentation, chrom, strand=strand)

        # Iterate through all genes in the given chromosome/strand, sorted by start position:
        for gene_content in sorted(chrom_content.values(),
                                   key=lambda x: x['gene_segment'].start):
            gene_segment = gene_content.pop('gene_segment')

            # If an intergenic region is found, add entries from all
            # segments that stop where the intergenic region starts.
            if gene_segment[2] == 'intergenic':
                last_intergenic = gene_segment
                for seg in last_segments:
                    add_entry(seg[2], 'intergenic', len(seg), len(gene_segment), strand)
            else:
                # Iterate by ascending transcript coordinate:
                for transcript_content in sorted(gene_content.values(),
                                                 key=lambda x: x[0].start):
                    transcript_segment = transcript_content.pop(0)

                    # Update list "last_segments", if necessary:
                    if not last_segments or last_segments[0].stop < transcript_segment.stop:
                        last_segments = [transcript_content[-1]]
                    elif last_segments[0].stop == transcript_segment.stop:
                        last_segments.append(transcript_content[-1])

                    # If the transcript starts where an intergenic region ends, add an
                    # entry for this as well (guard against no intergenic segment seen yet):
                    if last_intergenic is not None and \
                            last_intergenic.stop == transcript_content[0].start:
                        add_entry('intergenic', transcript_content[0][2],
                                  len(last_intergenic), len(transcript_content[0]), strand)

                    # This is the "normal" case - add entries for all segments in transcript:
                    for seg1, seg2 in zip(transcript_content, transcript_content[1:]):
                        add_entry(seg1[2], seg2[2], len(seg1), len(seg2), strand)

                    # Consider also exon-exon junctions:
                    exons = [seg for seg in transcript_content if seg[2] in EXON_TYPES]
                    if len(exons) > 1:
                        for exon1, exon2 in zip(exons, exons[1:]):
                            add_entry(exon1[2], exon2[2], len(exon1), len(exon2), strand)

    # Data must be transformed: consider the whole segment length for normalization, not
    # just the last nucleotide. Example:
    # data_before = {-10: 1, -5: 1, 10: 2}
    # data_after = {-10: 1, -9: 1 ... -6: 1, -5: 2, -4: 2 ... -1: 2, 0: 2, 1: 2 ... 9: 2, 10: 2}
    LOGGER.info('Flattening normalization data...')
    for rna_map_type, distances in data.items():
        cumulative = 0
        for i in range(min(distances.keys()), 0):
            cumulative += data[rna_map_type].get(i, 0)
            data[rna_map_type][i] = cumulative
        cumulative = 0
        for i in range(max(distances.keys()) + 1)[::-1]:
            cumulative += data[rna_map_type].get(i, 0)
            data[rna_map_type][i] = cumulative

    # Write to file:
    LOGGER.info('Writing normalization to file')
    with open(normalization, 'wt') as nfile:
        print('\t'.join(['RNAmap_type', 'distance', 'segments']), file=nfile)
        for rna_map_type, distances in sorted(data.items()):
            for distance, segments in sorted(distances.items()):
                print('\t'.join(map(str, [rna_map_type, distance, segments])), file=nfile)
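# Reading the normalization table back is straightforward; a sketch with
# pandas (the file name is hypothetical, columns as written above):
import pandas as pd

norm = pd.read_csv("normalization.tsv", sep="\t")
per_type = {name: grp.set_index("distance")["segments"]
            for name, grp in norm.groupby("RNAmap_type")}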
output_file = sys.argv[3].strip()
# sample name
sample = sys.argv[4].strip()

bands = []
with open(cyto_bed, "r") as f:
    for line in f:
        line_arr = line.replace("chr", "").strip().split()
        bands.append(line_arr[0] + line_arr[3])

df = pd.DataFrame(bands, columns=["cytoBand"])
df = df.set_index("cytoBand")
df[sample] = 0.0

a = pybedtools.BedTool(cyto_bed)
b = pybedtools.BedTool(seg_file)
a.intersect(b, wao=True).saveas("intersected_seg_file.cns")

data = dict()
with open("intersected_seg_file.cns", "r") as f:
    for line in f:
        line_arr = line.strip().split()
        # Get the cytoband and seg part
        key = "\t".join(line_arr[:5]).replace("chr", "")
        value = "\t".join(line_arr[5:])
        # If a cytoband has more than one segment, we can access it because
        # it is being stored as a list; which works even if there's only one
        if key in data:
            data[key].append(value)
        else:
            data[key] = [value]
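# The same accumulation idiom can be written with collections.defaultdict,
# which makes the membership test disappear (behaviorally equivalent sketch):
from collections import defaultdict

data = defaultdict(list)
with open("intersected_seg_file.cns") as f:
    for line in f:
        line_arr = line.strip().split()
        data["\t".join(line_arr[:5]).replace("chr", "")].append(
            "\t".join(line_arr[5:]))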
def tophat_map(gtf, out_dir, prefix, fastq, thread, bw=False, scale=False, gtf_flag=1):
    '''
    1. Map reads with TopHat2
    2. Extract unmapped reads
    3. Create BigWig file if needed
    '''
    # tophat2 mapping
    print('Map reads with TopHat2...')
    tophat_cmd = 'tophat2 -g 1 --microexon-search -m 2 '
    if gtf_flag:
        tophat_cmd += '-G %s ' % gtf
    tophat_cmd += '-p %s -o %s ' % (thread, out_dir + '/tophat')
    tophat_cmd += '%s/bowtie2_index/%s ' % (out_dir, prefix) + ','.join(fastq)
    tophat_cmd += ' 2> %s/tophat.log' % out_dir
    print('TopHat2 mapping command:')
    print(tophat_cmd)
    return_code = os.system(tophat_cmd) >> 8
    if return_code:
        sys.exit('Error: cannot map reads with TopHat2!')
    # extract unmapped reads
    print('Extract unmapped reads...')
    unmapped_bam = pybedtools.BedTool('%s/tophat/unmapped.bam' % out_dir)
    unmapped_bam.bam_to_fastq(fq='%s/tophat/unmapped.fastq' % out_dir)
    # create BigWig file if requested; only complain about the missing
    # converter when a BigWig was actually asked for
    if bw:
        if which('bedGraphToBigWig') is not None:
            print('Create BigWig file...')
            map_bam_fname = '%s/tophat/accepted_hits.bam' % out_dir
            # index bam if the index does not exist
            if not os.path.isfile(map_bam_fname + '.bai'):
                pysam.index(map_bam_fname)
            map_bam = pysam.AlignmentFile(map_bam_fname, 'rb')
            # extract chrom size file
            chrom_size_fname = '%s/tophat/chrom.size' % out_dir
            with open(chrom_size_fname, 'w') as chrom_size_f:
                for seq in map_bam.header['SQ']:
                    chrom_size_f.write('%s\t%s\n' % (seq['SN'], seq['LN']))
            if scale:  # scale to HPB
                mapped_reads = map_bam.mapped
                for read in map_bam:
                    read_length = read.query_length
                    break
                s = 1000000000.0 / mapped_reads / read_length
            else:
                s = 1
            map_bam = pybedtools.BedTool(map_bam_fname)
            bedgraph_fname = '%s/tophat/accepted_hits.bg' % out_dir
            with open(bedgraph_fname, 'w') as bedgraph_f:
                for line in map_bam.genome_coverage(bg=True, g=chrom_size_fname,
                                                    scale=s, split=True):
                    value = str(int(float(line[3]) + 0.5))
                    bedgraph_f.write('\t'.join(line[:3]) + '\t%s\n' % value)
            bigwig_fname = '%s/tophat/accepted_hits.bw' % out_dir
            return_code = os.system('bedGraphToBigWig %s %s %s'
                                    % (bedgraph_fname, chrom_size_fname,
                                       bigwig_fname)) >> 8
            if return_code:
                sys.exit('Error: cannot convert bedGraph to BigWig!')
        else:
            print('Could not find bedGraphToBigWig, skipping this step!')
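# The scale factor above (the source's "HPB", presumably hits per billion
# mapped bases) works out to 1e9 / (mapped_reads * read_length). Worked
# example with round numbers:
mapped_reads = 20000000  # 20M mapped reads
read_length = 100        # 100 bp reads
s = 1000000000.0 / mapped_reads / read_length
# s == 0.5, so a raw genome-coverage value of 10 is written as 5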
def _merge_target_information(samples): metrics_dir = utils.safe_makedir("metrics") out_file = os.path.abspath(os.path.join(metrics_dir, "target_info.yaml")) if utils.file_exists(out_file): return samples genomes = set(dd.get_genome_build(data) for data in samples) coverage_beds = set(dd.get_coverage(data) for data in samples) original_variant_regions = set( dd.get_variant_regions_orig(data) for data in samples) data = samples[0] info = {} # Reporting in MultiQC only if the genome is the same across all samples if len(genomes) == 1: info["genome_info"] = { "name": dd.get_genome_build(data), "size": sum([ c.size for c in ref.file_contigs(dd.get_ref_file(data), data["config"]) ]), } # Reporting in MultiQC only if the target is the same across all samples vcr_orig = None if len(original_variant_regions) == 1 and list( original_variant_regions)[0] is not None: vcr_orig = list(original_variant_regions)[0] vcr_clean = bedutils.clean_file(vcr_orig, data) info["variants_regions_info"] = { "bed": vcr_orig, "size": sum( len(x) for x in pybedtools.BedTool( dd.get_variant_regions_merged(data))), "regions": pybedtools.BedTool(vcr_clean).count(), } gene_num = annotate.count_genes(vcr_clean, data) if gene_num is not None: info["variants_regions_info"]["genes"] = gene_num else: info["variants_regions_info"] = { "bed": "callable regions", } # Reporting in MultiQC only if the target is the same across samples if len(coverage_beds) == 1: cov_bed = list(coverage_beds)[0] if cov_bed not in [None, "None"]: if vcr_orig and vcr_orig == cov_bed: info["coverage_bed_info"] = info["variants_regions_info"] else: clean_bed = bedutils.clean_file(cov_bed, data, prefix="cov-", simple=True) info["coverage_bed_info"] = { "bed": cov_bed, "size": pybedtools.BedTool(cov_bed).total_coverage(), "regions": pybedtools.BedTool(clean_bed).count(), } gene_num = annotate.count_genes(clean_bed, data) if gene_num is not None: info["coverage_bed_info"]["genes"] = gene_num else: info["coverage_bed_info"] = info["variants_regions_info"] coverage_intervals = set(data["config"]["algorithm"]["coverage_interval"] for data in samples) if len(coverage_intervals) == 1: info["coverage_interval"] = list(coverage_intervals)[0] if info: with open(out_file, "w") as out_handle: yaml.safe_dump(info, out_handle) return samples
def get_aligned_reads(self, n_read_limit, passed_cells): samfile = pysam.AlignmentFile(self.in_bam_uniq, "rc") try: r_iterator = samfile.fetch(self.chrom, int(self.start), int(self.end)) except: return None nreads = len([ r_idx for r_idx, x in enumerate(r_iterator) if x.flag in self.strand_flags[self.strand] ]) if nreads > n_read_limit: return self.gene r_iterator = samfile.fetch(self.chrom, int(self.start), int(self.end)) read_dict = { r_idx: _make_dict(x, self.chrom, self.strand, self.gene, r_idx) for r_idx, x in enumerate(r_iterator) if x.flag in self.strand_flags[self.strand] and list(filter(regx1.match, x.to_dict()['tags']))[0].replace( 'BC:Z:', '') in passed_cells } samfile.close() df = [read_dict[r_idx]['r_blocks'] for r_idx in read_dict.keys()] if len(df) == 0: return None pd.concat(df, axis=0).to_csv('%s/.tempDir/_%s_reads_blocks.bed' % (self.outdir, self.gene), index=False, sep="\t", header=False) read_bed = pybedtools.BedTool('%s/.tempDir/_%s_reads_blocks.bed' % (self.outdir, self.gene)) tmp = self.ex_bed.intersect(read_bed, wa=True, wb=True) if os.stat(tmp.fn).st_size == 0: return None intersect_all = tmp.to_dataframe() read_idx_list = list(set(intersect_all.iloc[:, 9].values)) ex_coord = ','.join( self.exons.apply(lambda x: '%s-%s' % (x[1], x[2]), axis=1).values) aligned_reads = [ _make_list_aligned_reads2(r_idx, read_dict, intersect_all, ex_coord) for r_idx in read_idx_list ] colnames = [ 'name', 'flag', 'ref_name', 'ref_pos', 'map_quality', 'cigar', 'next_ref_name', 'next_ref_pos', 'length', 'seq', 'qual', 'tags', 'read_mapped_position', 'geneid', 'Exon_Index', 'Category', 'BC', 'UB', 'exon_coordinates' ] self.uniq_aligned_reads = pd.DataFrame( aligned_reads, columns=colnames).drop_duplicates() self.uniq_r_bclist = list( set( self.uniq_aligned_reads.apply(lambda x: '%s+%s' % (x['BC'], x['UB']), axis=1).values)) self.uniq_aligned_reads.insert(19, 'MapFlag', 'unique') return None
def postprocess(work, reference, pred_vcf_file, output_vcf, candidates_vcf, ensemble_tsv, tumor_bam, min_len, postprocess_max_dist, long_read, lr_pad, lr_chunck_size, lr_chunck_scale, lr_snp_min_af, lr_ins_min_af, lr_del_min_af, lr_match_score, lr_mismatch_penalty, lr_gap_open_penalty, lr_gap_ext_penalty, pass_threshold, lowqual_threshold, msa_binary, num_threads): logger = logging.getLogger(postprocess.__name__) logger.info("----------------------Postprocessing-----------------------") if not os.path.exists(work): os.mkdir(work) candidates_preds = os.path.join(work, "candidates_preds.vcf") ensembled_preds = os.path.join(work, "ensembled_preds.vcf") pred_vcf = pybedtools.BedTool(pred_vcf_file) pred_vcf.window(candidates_vcf, w=5, v=True).saveas(ensembled_preds) pred_vcf.window(candidates_vcf, w=5, u=True).saveas(candidates_preds) logger.info("Extract targets") postprocess_pad = 1 if not long_read else 10 extract_postprocess_targets(candidates_preds, min_len, postprocess_max_dist, postprocess_pad) no_resolve = os.path.join(work, "candidates_preds.no_resolve.vcf") target_vcf = os.path.join(work, "candidates_preds.resolve_target.vcf") target_bed = os.path.join(work, "candidates_preds.resolve_target.bed") resolved_vcf = os.path.join(work, "candidates_preds.resolved.vcf") logger.info("Resolve targets") if not long_read: resolve_variants(tumor_bam, resolved_vcf, reference, target_vcf, target_bed, num_threads) else: work_lr_indel_realign = os.path.join(work, "work_lr_indel_realign") if os.path.exists(work_lr_indel_realign): shutil.rmtree(work_lr_indel_realign) os.mkdir(work_lr_indel_realign) ra_resolved_vcf = os.path.join(work, "candidates_preds.ra_resolved.vcf") long_read_indelrealign(work_lr_indel_realign, tumor_bam, None, ra_resolved_vcf, target_bed, reference, num_threads, lr_pad, lr_chunck_size, lr_chunck_scale, lr_snp_min_af, lr_del_min_af, lr_ins_min_af, lr_match_score, lr_mismatch_penalty, lr_gap_open_penalty, lr_gap_ext_penalty, msa_binary) resolve_scores(tumor_bam, ra_resolved_vcf, target_vcf, resolved_vcf) all_no_resolve = concatenate_files([no_resolve, ensembled_preds], os.path.join(work, "no_resolve.vcf")) logger.info("Merge vcfs") merged_vcf = os.path.join(work, "merged_preds.vcf") merge_post_vcfs(reference, resolved_vcf, all_no_resolve, merged_vcf, pass_threshold, lowqual_threshold) add_vcf_info(work, reference, merged_vcf, candidates_vcf, ensemble_tsv, output_vcf, pass_threshold, lowqual_threshold) logger.info("Output NeuSomatic prediction at {}".format(output_vcf)) logger.info("Postprocessing is Done.") return output_vcf
def get_intersect_bed_ix(reference_bed, query_bed, just_names=True, araport11_file=None):
    ## here query_bed is either a file or a pandas dataframe
    ## we can rely on bedops -- very fast and efficient
    # https://www.biostars.org/p/319840/
    if isinstance(query_bed, str):
        if os.path.isfile(query_bed):
            queryBed = pybed.BedTool(query_bed)
        else:
            # previously fell through and left queryBed unbound
            raise FileNotFoundError("query bed file not found: %s" % query_bed)
    elif isinstance(query_bed, pd.DataFrame):
        query_bed.iloc[:, 1] = query_bed.iloc[:, 1].astype(int)
        query_bed.iloc[:, 2] = query_bed.iloc[:, 2].astype(int)
        queryBed = pybed.BedTool.from_dataframe(query_bed.iloc[:, [0, 1, 2]])
    elif isinstance(query_bed, pybed.bedtool.BedTool):
        queryBed = query_bed
    else:
        raise NotImplementedError(
            "either input a bed file or pandas dataframe for query")
    if isinstance(reference_bed, str):
        if os.path.isfile(reference_bed):
            refBed = pybed.BedTool(reference_bed)
        elif just_names:
            reference_bed_df = identify_positions_given_names(reference_bed,
                                                              araport11_file)
            refBed = pybed.BedTool.from_dataframe(reference_bed_df.iloc[:, [0, 1, 2]])
        else:
            # previously fell through and left refBed unbound
            raise FileNotFoundError("reference bed file not found: %s" % reference_bed)
    elif isinstance(reference_bed, pd.DataFrame):
        reference_bed.iloc[:, 1] = reference_bed.iloc[:, 1].astype(int)
        reference_bed.iloc[:, 2] = reference_bed.iloc[:, 2].astype(int)
        refBed = pybed.BedTool.from_dataframe(reference_bed.iloc[:, [0, 1, 2]])
    elif isinstance(reference_bed, pybed.bedtool.BedTool):
        refBed = reference_bed
    else:
        raise NotImplementedError(
            "either input a bed file or pandas dataframe for reference")
    # Append a 0-based row index as an extra column so intersect hits can be
    # mapped back to rows of the inputs.
    f_newrefBed = open(refBed.fn + ".new.tmp", 'w')
    cmd_out = Popen(''' awk '{ print $0 "\t" NR-1 }' ''' + refBed.fn,
                    shell=True, stdout=f_newrefBed)
    cmd_out.wait()
    f_newrefBed.close()
    newRefBed = pybed.BedTool(refBed.fn + ".new.tmp")
    f_newqueryBed = open(queryBed.fn + ".new.tmp", 'w')
    cmd_out = Popen(''' awk '{ print $0 "\t" NR-1 }' ''' + queryBed.fn,
                    shell=True, stdout=f_newqueryBed)
    cmd_out.wait()
    f_newqueryBed.close()
    newqueryBed = pybed.BedTool(queryBed.fn + ".new.tmp")
    ## Just taking first three columns for bedtools
    unionBed = newRefBed.intersect(newqueryBed, wa=True, wb=True)
    if unionBed.count() == 0:  ## Return if there are no matching lines.
        return None
    unionBed = unionBed.to_dataframe()
    unionBed.columns = np.array(['ref_chr', 'ref_start', 'ref_end', 'ref_ix',
                                 'query_chr', 'query_start', 'query_end', 'query_ix'])
    return unionBed  ## ref_ix / query_ix are the 0-based row indices appended above
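# Usage sketch for get_intersect_bed_ix() with two in-memory DataFrames
# (toy coordinates; pybed is the pybedtools alias used above):
import pandas as pd

ref_df = pd.DataFrame({0: ['Chr1', 'Chr1'], 1: [100, 500], 2: [200, 600]})
query_df = pd.DataFrame({0: ['Chr1'], 1: [150], 2: [180]})
hits = get_intersect_bed_ix(ref_df, query_df)
# hits.ref_ix / hits.query_ix are 0-based row indices into each input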
def add_vcf_info(work, reference, merged_vcf, candidates_vcf, ensemble_tsv, output_vcf, pass_threshold, lowqual_threshold): merged_vcf = pybedtools.BedTool(merged_vcf) candidates_vcf = pybedtools.BedTool(candidates_vcf) ensemble_candids_vcf = [] if ensemble_tsv: ensemble_candids_vcf = os.path.join(work, "ensemble_candids.vcf") with open(ensemble_tsv) as e_f: with open(ensemble_candids_vcf, "w") as c_f: c_f.write("##fileformat=VCFv4.2\n") c_f.write( "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE\n" ) for line in e_f: if "T_REF_FOR" in line: header = line.strip().split() chrom_id = header.index("CHROM") pos_id = header.index("POS") ref_id = header.index("REF") alt_id = header.index("ALT") dp_id = header.index("T_DP") ref_fw_id = header.index("T_REF_FOR") ref_rv_id = header.index("T_REF_REV") alt_fw_id = header.index("T_ALT_FOR") alt_rv_id = header.index("T_ALT_REV") continue fields = line.strip().split() chrom = fields[chrom_id] pos = fields[pos_id] ref = fields[ref_id] alt = fields[alt_id] dp = int(fields[dp_id]) ro_fw = int(fields[ref_fw_id]) ro_rv = int(fields[ref_rv_id]) ao_fw = int(fields[alt_fw_id]) ao_rv = int(fields[alt_rv_id]) ro = ro_fw + ro_rv ao = ao_fw + ao_rv af = np.round(ao / float(ao + ro + 0.0001), 4) c_f.write("\t".join( map(str, [ chrom, pos, ".", ref, alt, ".", ".", ".", "GT:DP:RO:AO:AF", ":".join( map(str, ["0/1", dp, ro, ao, af])) ])) + "\n") ensemble_candids_vcf = pybedtools.BedTool(ensemble_candids_vcf) in_candidates = merged_vcf.window(candidates_vcf, w=5) notin_candidates = merged_vcf.window(candidates_vcf, w=5, v=True) in_ensemble = merged_vcf.window(ensemble_candids_vcf, w=5) notin_any = notin_candidates.window(ensemble_candids_vcf, w=5, v=True) chroms_order = get_chromosomes_order(reference=reference) with pysam.FastaFile(reference) as rf: chroms = rf.references scores = {} tags_info = {} for s_e, dd in [0, in_candidates], [1, in_ensemble]: for x in dd: tag = "-".join([str(chroms_order[x[0]]), x[1], x[3], x[4]]) scores[tag] = [x[5], x[6], x[7], x[9]] if tag not in tags_info: tags_info[tag] = [] info = x[19].split(":") dp, ro, ao = map(int, info[1:4]) af = float(info[4]) is_same = x[1] == x[11] and x[3] == x[13] and x[4] == x[14] is_same_type = np.sign(len(x[3]) - len(x[13])) == np.sign( len(x[4]) - len(x[14])) dist = abs(int(x[1]) - int(x[11])) len_diff = abs((len(x[3]) - len(x[13])) - (len(x[4]) - len(x[14]))) tags_info[tag].append( [~is_same, ~is_same_type, dist, len_diff, s_e, dp, ro, ao, af]) fina_info_tag = {} for tag, hits in tags_info.iteritems(): hits = sorted(hits, key=lambda x: x[0:5]) fina_info_tag[tag] = hits[0][5:] for x in notin_any: tag = "-".join([str(chroms_order[x[0]]), x[1], x[3], x[4]]) fina_info_tag[tag] = [0, 0, 0, 0] scores[tag] = [x[5], x[6], x[7], x[9]] tags = sorted(fina_info_tag.keys(), key=lambda x: map(int, x.split("-")[0:2])) with open(output_vcf, "w") as o_f: o_f.write("##fileformat=VCFv4.2\n") o_f.write("##NeuSomatic Version={}\n".format(__version__)) o_f.write( "##FORMAT=<ID=SCORE,Number=1,Type=Float,Description=\"Prediction probability score\">\n" ) o_f.write( "##FILTER=<ID=PASS,Description=\"Accept as a higher confidence somatic mutation calls with probability score value at least {}\">\n" .format(pass_threshold)) o_f.write( "##FILTER=<ID=LowQual,Description=\"Less confident somatic mutation calls with probability score value at least {}\">\n" .format(lowqual_threshold)) o_f.write( "##FILTER=<ID=REJECT,Description=\"Rejected as a confident somatic mutation with probability score value below {}\">\n" 
.format(lowqual_threshold)) o_f.write( "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n") o_f.write( "##FORMAT=<ID=DP,Number=1,Type=Integer,Description=\"Read Depth in the tumor\">\n" ) o_f.write( "##FORMAT=<ID=RO,Number=1,Type=Integer,Description=\"Reference allele observation count in the tumor\">\n" ) o_f.write( "##FORMAT=<ID=AO,Number=A,Type=Integer,Description=\"Alternate allele observation count in the tumor\">\n" ) o_f.write( "##FORMAT=<ID=AF,Number=1,Type=Float,Description=\"Allele fractions of alternate alleles in the tumor\">\n" ) o_f.write( "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE\n") for tag in tags: chrom_id, pos, ref, alt = tag.split("-") qual, filter_, score, gt = scores[tag] dp, ro, ao, af = fina_info_tag[tag] info_field = "{};DP={};RO={};AO={};AF={};".format( score, dp, ro, ao, af) gt_field = "{}:{}:{}:{}:{}".format(gt, dp, ro, ao, af) o_f.write("\t".join( map(str, [ chroms[int(chrom_id)], str(pos), ".", ref, alt, qual, filter_, info_field, "GT:DP:RO:AO:AF", gt_field ])) + "\n")
def assign_to_regions(tool,
                      clusters=None,
                      assigned_dir=".",
                      species="hg19",
                      nrand=3,
                      data_dir=DATA_DIR):
    """
    Assigns each cluster to a genic region and saves all generated bed and
    fasta files for future analysis.

    tool - a bed tool (each line representing a cluster)
    clusters - name of cluster file (optional)
    assigned_dir - location to save files in
    species - str species to segment
    nrand - int number of times to shuffle for the null hypothesis
    """
    if clusters is None:
        clusters, ext = os.path.splitext(os.path.basename(tool.fn))

    bedtracks = {}
    regions, assigned_regions = regions_generator()

    short_species = species.split("_")[0]
    if short_species == "GRCh38":
        short_species = "hg38"

    for region in regions:
        bedtracks[region] = pybedtools.BedTool(
            os.path.join(data_dir, "regions", "%s_%s.bed" % (species, region)))

    # creates the basics of the bed dict
    bed_dict = {'all': {'rand': {}}}

    genes = pybedtools.BedTool(
        os.path.join(data_dir, "regions", "%s_genes.bed" % (species)))
    offsets = get_offsets_bed12(tool)

    if tool.field_count() <= 5:
        tool = tool.sort().merge().saveas()
    elif 6 <= tool.field_count() < 8:
        # Hack to get around not having a gene name assigned by the peak
        # caller; due to overlapping genes this won't be perfect
        tool = tool.sort().merge(
            s=True, c="4,5,6",
            o="collapse,collapse,collapse").each(fix_strand_v26).saveas()
    else:
        # Clipper output; this is the ideal case
        tool = tool.sort().merge(
            s=True, c="4,5,6,7,8",
            o="collapse,collapse,collapse,min,min").each(
                fix_strand_v26).saveas()

    remaining_clusters = adjust_offsets(tool, offsets)

    for region in regions:
        remaining_clusters, overlapping = intersection(remaining_clusters,
                                                       b=bedtracks[region])

        # if for some reason there isn't a peak in the region skip it
        if len(overlapping) == 0:
            continue

        # sets up the bed dict for this region
        bed_dict[region] = {
            'real': overlapping.sort(stream=True).saveas(),
            'rand': {}
        }

        no_overlapping_count = len(remaining_clusters)
        overlapping_count = len(bed_dict[region]['real'])

        if 'real' not in bed_dict['all']:
            bed_dict['all']['real'] = bed_dict[region]['real']
        else:
            bed_dict['all']['real'] = bed_dict['all']['real'].cat(
                bed_dict[region]['real'], stream=True,
                postmerge=False).saveas()

        # saves offsets so after shuffling the offsets can be readjusted
        offset_dict = get_offsets_bed12(bed_dict[region]['real'])

        for i in range(nrand):
            random_intervals = bed_dict[region]['real'].shuffle(
                genome=short_species, incl=bedtracks[region].fn).sort()
            random_intervals = fix_shuffled_strand(random_intervals,
                                                   bedtracks[region].fn)
            random_intervals = adjust_offsets(random_intervals, offset_dict)
            bed_dict[region]['rand'][i] = random_intervals.saveas()

            if i not in bed_dict['all']['rand']:
                bed_dict['all']['rand'][i] = bed_dict[region]['rand'][i]
            else:
                bed_dict['all']['rand'][i] = bed_dict['all']['rand'][i].cat(
                    bed_dict[region]['rand'][i], stream=True, postmerge=False)

        # if there are no more clusters to assign stop trying
        if no_overlapping_count == 0:
            break

    if len(remaining_clusters) > 0:
        bed_dict['uncatagorized'] = {
            'real': remaining_clusters.sort(stream=True).saveas()
        }

    bed_dict = save_bedtools(bed_dict, clusters, assigned_dir)

    return bed_dict
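# A minimal usage sketch (hypothetical paths; assumes DATA_DIR/regions
# contains the "<species>_<region>.bed" tracks this function expects).
# The region keys of the returned dict come from regions_generator();
# 'exon' below is illustrative only.
clusters_tool = pybedtools.BedTool("my_clusters.bed")
assigned = assign_to_regions(clusters_tool,
                             clusters="my_clusters",
                             assigned_dir="assigned/",
                             species="hg19",
                             nrand=3)
# assigned['exon']['real'] would hold the clusters assigned to exons;
# assigned['exon']['rand'][i] the i-th shuffled control set.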
def generate_fastafile_frombed(ref, bed):
    """Extract strand-aware, name-labelled sequences for the intervals in
    `bed` from the reference fasta `ref`; returns the path to the
    temporary fasta produced by pybedtools."""
    bedfile = pybedtools.BedTool(bed)
    fasta = pybedtools.BedTool(ref)
    bedfile = bedfile.sequence(fi=fasta, s=True, name=True)
    return bedfile.seqfn
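# A minimal usage sketch (file names are hypothetical).
seq_path = generate_fastafile_frombed("hg19.fa", "variant-sites.bed")
with open(seq_path) as fh:
    print(fh.read()[:200])  # peek at the first extracted records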
def _has_larger_regions(f):
    # `max_size` is assumed to be a module-level constant defined elsewhere
    return any(r.stop - r.start > max_size for r in pybedtools.BedTool(f))
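# A minimal usage sketch (assumes `max_size` lives at module level, as the
# helper above requires; the 1 Mb threshold and BED path are illustrative).
max_size = 1000000
if _has_larger_regions("calls.bed"):
    print("calls.bed contains at least one region longer than %d bp" % max_size)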
def main():
    args = parser_args(sys.argv[1:])

    if args.temp:
        pybedtools.helpers.set_tempdir(args.temp)

    if not args.vcf and not args.zippedvcf:
        print("ERROR. VCF required, please use -v or -vz")
        sys.exit(2)
    if args.zippedvcf and not args.zippedvcf.endswith(".gz"):
        print("ERROR. --vz used with non-gzipped file (must end with '.gz')")
        sys.exit(2)
    if args.vcf and args.vcf.endswith(".gz"):
        print("ERROR. --vz option should be used, gzipped file detected")
        sys.exit(2)
    if not args.reference and not args.zippedreference:
        print("ERROR. Reference fasta required, please use -r or -rz")
        sys.exit(2)

    if not os.path.exists(args.output):
        os.makedirs(args.output)
    if not os.path.exists(args.intermediate):
        os.makedirs(args.intermediate)

    # Go through the exon-boundaries bed file and generate a new file,
    # $INTERMED/splice-site.bed, containing the organism's canonical
    # splice-site coordinates.
    canonical_splicesites_bedfile = os.path.join(args.intermediate,
                                                 "splice-site.bed")
    if os.path.exists(canonical_splicesites_bedfile):
        print("Bed file containing splice sites is already generated. Moving on!")
    else:
        print("Generating a splicing bed file using exon boundaries...")
        start = time.time()
        generate_splicingbed_withexonbound(canonical_splicesites_bedfile,
                                           args.chrlens, args.bed)
        end = time.time()
        print("Finished generating. Time taken: %s" % (end - start))

    # Find the subset of the VCF which contains non-coding variants;
    # use this VCF for NovaSplice calculations.
    print("Subsetting VCF to noncoding variants and SNPs")
    start = time.time()
    pybedtools.cleanup()
    vcfheader_file = os.path.join(args.output, "vcf-header")
    if args.vcf:
        writevcfheader(vcfheader_file, args.vcf, False)
        vcf = pybedtools.BedTool(args.vcf)
    else:
        writevcfheader(vcfheader_file, args.zippedvcf, True)
        vcf = pybedtools.BedTool(args.zippedvcf)
    exon_bounds = pybedtools.BedTool(args.bed).sort()
    subset_vcf_location = os.path.join(args.output,
                                       "coding-excludedvariants.vcf.gz")
    subsetvcf = vcf.intersect(exon_bounds, v=True,
                              sorted=True).filter(filter_snps_only)
    merge_two_files(subsetvcf, vcfheader_file, subset_vcf_location)
    end = time.time()
    print("Finished subsetting. Time taken: %s" % (end - start))

    # Find the closest upstream/downstream canonical splice sites for each
    # non-coding variant in the subsetted VCF.
    print("Finding closest canonical splice sites to each non-coding variant")
    start = time.time()
    pybedtools.cleanup()
    subset_vcf = pybedtools.BedTool(subset_vcf_location)
    canon_bed = pybedtools.BedTool(canonical_splicesites_bedfile)
    subset_vcf.closest(canon_bed, D="b", id=True, io=True,
                       output=os.path.join(args.output, "close-up.bed"))
    subset_vcf.closest(canon_bed, D="b", iu=True, io=True,
                       output=os.path.join(args.output, "close-down.bed"))
    end = time.time()
    print("Finished generating. Time taken: %s" % (end - start))

    # DONOR SITES
    # For every variant, compute the set of 9 possible donor sites containing
    # that variant. The output is a bed file with 9 entries per variant.
    variantsite_location = os.path.join(args.output,
                                        "variant-site-donorsites.bed")
    print("Generating a variant bed file from vcf...")
    start = time.time()
    generate_variantbedfile_fromclosest(
        os.path.join(args.output, "close-down.bed"), variantsite_location,
        True)
    end = time.time()
    print("Finished generating. Time taken: %s" % (end - start))

    # Extract the sequence of every variant site and store it in a file.
    # Note that this file is not saved anywhere except $TMP and is based
    # solely on the reference.
    print("Generating a fasta file from variant bed file...")
    start = time.time()
    if args.reference:
        fastaref = generate_fastafile_frombed(args.reference,
                                              variantsite_location)
    else:
        fastaref = generate_fastafile_frombed(args.zippedreference,
                                              variantsite_location)
    end = time.time()
    print("Finished generating. Time taken: %s" % (end - start))

    # Mutate the fasta file with the variants found in the vcf file
    print("Mutating fasta file per SNPs in VCF...")
    start = time.time()
    generate_variantfastafile(
        fastaref, os.path.join(args.output, "variant-site-donorsites.fa"))
    end = time.time()
    print("Finished generating. Time taken: %s" % (end - start))

    # Create a dictionary relating each variant to the score of the closest
    # upstream canonical splice site.
    print("Scoring canonical and novel splice-sites...")
    start = time.time()
    if args.reference:
        nametoscore = extract_canonical_score(
            os.path.join(args.output, 'close-down.bed'), args.output,
            args.reference, True)
    else:
        nametoscore = extract_canonical_score(
            os.path.join(args.output, 'close-down.bed'), args.output,
            args.zippedreference, True)
    with open(os.path.join(args.output, args.libraryname), 'w') as out:
        out.write("Novel SS\tNovel Score\tScore before variant\t"
                  "Closest Canonical Score\tLocation of closest canonical ss\n")
    compare_scores(os.path.join(args.output, "variant-site-donorsites.fa"),
                   nametoscore, args.percent, args.output, True,
                   args.libraryname)
    end = time.time()
    print("Finished generating. Time taken: %s" % (end - start))

    print("Finished analysis for donor sites. Now doing same for acceptor sites.")

    # ACCEPTOR SITES
    variantsite_location = os.path.join(args.output,
                                        "variant-site-acceptorsites.bed")
    print("Generating a variant bed file from vcf...")
    start = time.time()
    generate_variantbedfile_fromclosest(
        os.path.join(args.output, "close-up.bed"), variantsite_location,
        False)
    end = time.time()
    print("Finished generating. Time taken: %s" % (end - start))

    print("Generating a fasta file from variant bed file...")
    start = time.time()
    if args.reference:
        fastaref = generate_fastafile_frombed(args.reference,
                                              variantsite_location)
    else:
        fastaref = generate_fastafile_frombed(args.zippedreference,
                                              variantsite_location)
    end = time.time()
    print("Finished generating. Time taken: %s" % (end - start))

    print("Mutating fasta file per SNPs in VCF...")
    start = time.time()
    generate_variantfastafile(
        fastaref, os.path.join(args.output, "variant-site-acceptorsites.fa"))
    end = time.time()
    print("Finished generating. Time taken: %s" % (end - start))

    print("Scoring canonical and novel splice-sites...")
    start = time.time()
    if args.reference:
        nametoscore = extract_canonical_score(
            os.path.join(args.output, 'close-up.bed'), args.output,
            args.reference, False)
    else:
        nametoscore = extract_canonical_score(
            os.path.join(args.output, 'close-up.bed'), args.output,
            args.zippedreference, False)
    compare_scores(os.path.join(args.output, "variant-site-acceptorsites.fa"),
                   nametoscore, args.percent, args.output, False,
                   args.libraryname)
    end = time.time()
    print("Finished generating. Time taken: %s" % (end - start))

    print("Finished! Removing intermediate files now...")
    to_remove = []
    to_remove += glob.glob(os.path.join(args.output, '*.bed'))
    to_remove += glob.glob(os.path.join(args.output, '*.vcf.gz'))
    to_remove += glob.glob(os.path.join(args.output, '*.fa'))
    for path in to_remove:
        os.remove(path)
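# A hedged invocation sketch. The short flags follow the error messages in
# main() (-v/-vz for the VCF, -r/-rz for the reference); the long option
# names are assumptions about parser_args, and every path, including the
# script name, is hypothetical.
#
#   python novasplice.py -v variants.vcf -r hg19.fa \
#       --bed exons.bed --chrlens hg19.chrom.sizes \
#       --output results/ --intermediate intermed/ \
#       --percent 5 --libraryname novel_splice_sites.tsv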
def run_age_single(intervals_bed=None,
                   region_list=[],
                   contig_dict={},
                   reference=None,
                   assembly=None,
                   pad=AGE_PAD,
                   age=None,
                   age_workdir=None,
                   timeout=AGE_TIMEOUT,
                   keep_temp=False,
                   myid=0):
    thread_logger = logging.getLogger(
        "%s-%s" % (run_age_single.__name__, multiprocessing.current_process()))

    bedtools_intervals = []
    intervals_bedtool = pybedtools.BedTool(intervals_bed)

    assembly_fasta = pysam.Fastafile(assembly)
    reference_fasta = pysam.Fastafile(reference)

    breakpoints_bed = None

    thread_logger.info("Will process %d intervals" % (len(region_list)))
    try:
        for region in region_list:
            bedtools_interval = pybedtools.Interval(region[0], region[1],
                                                    region[3])
            matching_intervals = [
                interval for interval in intervals_bedtool
                if (interval.start == bedtools_interval.start and
                    interval.end == bedtools_interval.end and
                    interval.chrom == bedtools_interval.chrom)
            ]
            if not matching_intervals:
                thread_logger.info("Matching interval not found for %s" %
                                   (str(bedtools_interval)))
                matching_interval = bedtools_interval
            else:
                matching_interval = matching_intervals[0]
            thread_logger.info("Matching interval %s" %
                               (str(matching_interval)))

            if region not in contig_dict:
                continue
            if not contig_dict[region]:
                continue

            region_object = SVRegion(region[0], region[1], region[2],
                                     region[3])
            if region_object.pos1 - pad < 0:
                thread_logger.error(
                    "Region too close to start of chromosome. Skipping.")
                continue

            reference_sequence = reference_fasta.fetch(
                reference=region_object.chrom1,
                start=region_object.pos1 - pad,
                end=region_object.pos2 + pad)

            region_name = "%s.%d.%d" % (region_object.chrom1,
                                        region_object.pos1,
                                        region_object.pos2)
            ref_name = os.path.join(age_workdir, "%s.ref.fa" % region_name)

            thread_logger.info("Writing the ref sequence for region %s" %
                               region_name)
            with open(ref_name, "w") as file_handle:
                file_handle.write(">{}.ref\n{}".format(region_name,
                                                       reference_sequence))

            age_records = []
            thread_logger.info("Processing %d contigs for region %s" %
                               (len(contig_dict[region]), str(region_object)))
            for contig in contig_dict[region]:
                thread_logger.info(
                    "Writing the assembled sequence %s of length %s" %
                    (contig.raw_name, contig.sequence_len))
                if contig.sequence_len * region_object.length() >= 100000000:
                    thread_logger.info(
                        "Skipping contig because AGE problem is large")
                    continue
                contig_sequence = assembly_fasta.fetch(contig.raw_name)

                prefix = get_age_file_prefix(contig)
                asm_name = os.path.join(age_workdir, "%s.as.fa" % prefix)
                out = os.path.join(age_workdir, "%s.age.out" % prefix)
                err = os.path.join(age_workdir, "%s.age.err" % prefix)

                with open(asm_name, "w") as file_handle:
                    file_handle.write(">{}.as\n{}".format(region_name,
                                                          contig_sequence))

                age_cmd = "%s %s -both -go=-6 %s %s >%s 2>%s" % (
                    age, "-inv" if contig.sv_type == "INV" else "-indel",
                    ref_name, asm_name, out, err)
                execute_cmd = "timeout %ds %s" % (timeout, age_cmd)
                retcode = run_cmd(execute_cmd, thread_logger, None, None)

                if retcode == 0:
                    age_record = AgeRecord(out)
                    if len(age_record.inputs) == 2:
                        age_record.contig = contig
                        age_records.append(age_record)
                    else:
                        thread_logger.error(
                            "Number of inputs != 2 in age output file %s. Skipping."
                            % out)

                if not keep_temp:
                    os.remove(asm_name)
                    os.remove(err)

            unique_age_records = get_unique_age_records(age_records)

            thread_logger.info("Unique %d AGE records for region %s" %
                               (len(unique_age_records), str(region_object)))
            for age_record in unique_age_records:
                thread_logger.info(str(age_record))

            sv_types = list(
                set([
                    age_record.contig.sv_type
                    for age_record in unique_age_records
                ]))
            if len(sv_types) != 1:
                thread_logger.error(
                    "Some problem. Mixed SV types for this interval %s" %
                    (str(sv_types)))
            else:
                sv_type = sv_types[0]
                thread_logger.info("Processing region of type %s" % sv_type)
                breakpoints, info_dict = process_age_records(
                    unique_age_records, sv_type=sv_type, pad=pad)
                bedtools_fields = matching_interval.fields
                if len(breakpoints) == 1 and sv_type == "INS":
                    bedtools_fields += map(str, [
                        breakpoints[0][0], breakpoints[0][0] + 1,
                        breakpoints[0][1]
                    ])
                elif len(breakpoints) == 2 and (sv_type == "DEL" or
                                                sv_type == "INV"):
                    bedtools_fields += map(
                        str, breakpoints + [breakpoints[1] - breakpoints[0]])
                else:
                    bedtools_fields += map(
                        str, [bedtools_fields[1], bedtools_fields[2], -1])
                bedtools_fields.append(base64.b64encode(json.dumps(info_dict)))
                thread_logger.info("Writing out fields %s" %
                                   (str(bedtools_fields)))
                bedtools_intervals.append(
                    pybedtools.create_interval_from_list(bedtools_fields))

            if not keep_temp:
                os.remove(ref_name)
    except Exception:
        thread_logger.error('Caught exception in worker thread')
        # Print the type, value, and stack trace of the exception being handled
        traceback.print_exc()
        raise

    assembly_fasta.close()
    reference_fasta.close()

    thread_logger.info("Writing %d intervals" % (len(bedtools_intervals)))
    if bedtools_intervals:
        breakpoints_bed = os.path.join(age_workdir,
                                       "%d_breakpoints.bed" % myid)
        pybedtools.BedTool(bedtools_intervals).saveas(breakpoints_bed)

    return breakpoints_bed
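# A minimal usage sketch (paths and the AGE binary location are hypothetical).
# Region tuples are (chrom1, pos1, chrom2, pos2), i.e. contig.sv_region.to_tuple(),
# and must match the keys of contig_dict as built by run_age_parallel above.
# breakpoints = run_age_single(intervals_bed="intervals.bed",
#                              region_list=[("1", 1000, "1", 5000)],
#                              contig_dict={("1", 1000, "1", 5000): contigs},
#                              reference="hg19.fa",
#                              assembly="spades_contigs.fa",
#                              age="/opt/age/age_align",
#                              age_workdir="age_work")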
def venn_mpl(a, b, c, colors=None, outfn='out.png', labels=None):
    """
    *a*, *b*, and *c* are filenames to BED-like files.

    *colors* is a list of matplotlib colors for the Venn diagram circles.

    *outfn* is the resulting output file.  This is passed directly to
    fig.savefig(), so you can supply extensions of .png, .pdf, or whatever
    your matplotlib installation supports.

    *labels* is a list of labels to use for each of the files; by default
    the labels are ['a','b','c']
    """
    try:
        import matplotlib.pyplot as plt
        from matplotlib.patches import Circle
    except ImportError:
        sys.stderr.write(
            'matplotlib is required to make a Venn diagram with %s\n' %
            os.path.basename(sys.argv[0]))
        sys.exit(1)

    a = pybedtools.BedTool(a)
    b = pybedtools.BedTool(b)
    c = pybedtools.BedTool(c)

    if colors is None:
        colors = ['r', 'b', 'g']

    radius = 6.0
    center = 0.0
    offset = radius / 2

    if labels is None:
        labels = ['a', 'b', 'c']

    circle_a = Circle(xy=(center - offset, center + offset),
                      radius=radius,
                      edgecolor=colors[0],
                      label=labels[0])
    circle_b = Circle(xy=(center + offset, center + offset),
                      radius=radius,
                      edgecolor=colors[1],
                      label=labels[1])
    circle_c = Circle(xy=(center, center - offset),
                      radius=radius,
                      edgecolor=colors[2],
                      label=labels[2])

    fig = plt.figure(facecolor='w')
    ax = fig.add_subplot(111)

    for circle in (circle_a, circle_b, circle_c):
        circle.set_facecolor('none')
        circle.set_linewidth(3)
        ax.add_patch(circle)

    ax.axis('tight')
    ax.axis('equal')
    ax.set_axis_off()

    kwargs = dict(horizontalalignment='center')

    # Unique to A
    ax.text(center - 2 * offset, center + offset, str((a - b - c).count()),
            **kwargs)

    # Unique to B
    ax.text(center + 2 * offset, center + offset, str((b - a - c).count()),
            **kwargs)

    # Unique to C
    ax.text(center, center - 2 * offset, str((c - a - b).count()), **kwargs)

    # A and B not C
    ax.text(center, center + 2 * offset - 0.5 * offset,
            str((a + b - c).count()), **kwargs)

    # A and C not B
    ax.text(center - 1.2 * offset, center - 0.5 * offset,
            str((a + c - b).count()), **kwargs)

    # B and C not A
    ax.text(center + 1.2 * offset, center - 0.5 * offset,
            str((b + c - a).count()), **kwargs)

    # all
    ax.text(center, center, str((a + b + c).count()), **kwargs)

    ax.legend(loc='best')
    fig.savefig(outfn)
    plt.close(fig)
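# A minimal usage sketch (BED file names are hypothetical).
venn_mpl('tf_peaks.bed', 'histone_peaks.bed', 'dnase_peaks.bed',
         colors=['r', 'b', 'g'],
         outfn='peak_overlap.png',
         labels=['TF', 'Histone', 'DNase'])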
print('\\begin{tabular}{ | l | l | l | p{5cm} |}')
print('\\hline')
print('Sample name & Num. peaks repl1 & Num. peaks repl2 & Num. peaks merged \\\\ \\hline \\hline')
print('\\multicolumn{4}{|c|}{Narrow peaks} \\\\ \\hline')

for sample_name in samples:
    # files for replicates
    repl1 = np + sample_name + '_repl1_FE.bw'
    repl2 = np + sample_name + '_repl2_FE.bw'

    # Optional: for correlation on BED files of peaks, merge the two files into one
    bed_repl1 = np + sample_name + '_repl1_summits.bed'
    bed_repl2 = np + sample_name + '_repl2_summits.bed'
    bed_merged = np + sample_name + '_merged.bed'

    bd_rp1 = pb.BedTool(bed_repl1)
    bd_rp2 = pb.BedTool(bed_repl2)
    bd_mrg = bd_rp1.cat(bd_rp2)
    bd_mrg.slop(l=1000, r=1000, genome='mm10').merge().saveas(bed_merged)

    msg = "\\verb|{}| & {} & {} & {} \\\\ \\hline".format(
        sample_name, str(bd_rp1.count()), str(bd_rp2.count()),
        str(bd_mrg.count()))
    print(msg)

print('\\hline \\multicolumn{4}{|c|}{Broad peaks} \\\\ \\hline')
for sample_name in samples:
    # files for replicates
    repl1 = bp + sample_name + '_repl1_FE.bw'
    repl2 = bp + sample_name + '_repl2_FE.bw'
                                        average_very_good_coverage > 0) else 1))
    return pybedtools.create_interval_from_list(fields)


def add_coverage_information(in_bed, bam):
    return in_bed.each(partial(annotate_coverage, bam=bam))


# the temp dir must exist before pybedtools can use it
if not os.path.isdir(args.tmpdir):
    os.makedirs(args.tmpdir)
pybedtools.set_tempdir(args.tmpdir)

with open(args.in_bed, 'r') as f:
    header = f.readline()
header = header.strip()

in_bed = pybedtools.BedTool(args.in_bed)
out_bed = in_bed

logger.info("Initial feature count %d" % (out_bed.count()))

bed_fields = header.split('\t')

if args.rmask_bed:
    out_bed = annotate_bed(out_bed, pybedtools.BedTool(args.rmask_bed))
    logger.info("Feature count after rmask %d" % (out_bed.count()))
    bed_fields += ["OVERLAPS_RMASK"]

if args.segdups_bed: