def test_cleanup():
    """
    Verify temp-file bookkeeping: cleanup() removes only files created in this
    session, while cleanup(remove_all=True) also removes stray pybedtools temp
    files; input files are never touched.
    """
    assert os.path.abspath(pybedtools.get_tempdir()) == os.path.abspath('.')

    # Fake a leftover temp file from some *other* pybedtools session.
    fake_tmp = 'pybedtools.TESTING.tmp'
    os.system('touch %s' % fake_tmp)
    assert os.path.exists(fake_tmp)

    # Create a genuine temp file via an intersection.
    bed_a = pybedtools.BedTool(os.path.join(testdir, 'data', 'a.bed'))
    bed_b = pybedtools.BedTool(os.path.join(testdir, 'data', 'b.bed'))
    result = bed_a.intersect(bed_b)

    # Standard cleanup removes only this session's temp files; the fake
    # leftover survives.
    pybedtools.cleanup(verbose=True)
    assert os.path.exists(fake_tmp)
    assert not os.path.exists(result.fn)

    # Forced cleanup also sweeps up the leftover.
    pybedtools.cleanup(remove_all=True)
    assert not os.path.exists(fake_tmp)

    # The original input files must survive both cleanups.
    assert os.path.exists(bed_a.fn)
    assert os.path.exists(bed_b.fn)
def test_stream():
    """
    Stream and file-based equality, both whole-file and Interval by Interval.

    Uses an unwriteable temp dir to prove that stream=True results are never
    written to disk.
    """
    a = pybedtools.example_bedtool('a.bed')
    b = pybedtools.example_bedtool('b.bed')
    c = a.intersect(b)

    # make an unwriteable dir...
    orig_tempdir = pybedtools.get_tempdir()
    if os.path.exists('unwriteable'):
        os.system('rm -rf unwriteable')
    os.system('mkdir unwriteable')
    os.system('chmod -w unwriteable')

    # ...set that to the new tempdir
    pybedtools.set_tempdir('unwriteable')

    # this should really not be written anywhere
    d = a.intersect(b, stream=True)
    # streamed vs file-based BedTools cannot be compared with ==
    assert_raises(NotImplementedError, c.__eq__, d)
    d_contents = d.fn.read()
    # BUGFIX: close the file handle instead of leaking it.
    with open(c.fn) as fh:
        c_contents = fh.read()
    assert d_contents == c_contents

    # reconstruct d and check Interval-by-Interval equality
    pybedtools.set_tempdir('unwriteable')
    d = a.intersect(b, stream=True)
    for i, j in zip(c, d):
        assert str(i) == str(j)

    # Now do something similar with GFF files.
    a = pybedtools.example_bedtool('a.bed')
    f = pybedtools.example_bedtool('d.gff')

    # file-based
    pybedtools.set_tempdir(orig_tempdir)
    g1 = f.intersect(a)

    # streaming
    pybedtools.set_tempdir('unwriteable')
    g2 = f.intersect(a, stream=True)
    for i, j in zip(g1, g2):
        assert str(i) == str(j)

    # this was segfaulting at one point, just run to make sure
    g3 = f.intersect(a, stream=True)
    for i in iter(g3):
        # BUGFIX: Python-2-only `print i` statement -> print() call,
        # valid on both Python 2 and 3.
        print(i)

    # streamed cut() rows expose only the requested columns
    for row in f.cut(range(3), stream=True):
        row[0], row[1], row[2]
        assert_raises(IndexError, row.__getitem__, 3)

    pybedtools.set_tempdir(orig_tempdir)
    os.system('rm -fr unwriteable')
def cast(bedx, new_type):
    """
    Cast a flat BED file to an equal or narrower BED type.

    Writes the first ``new_type.cols`` columns of each line of ``bedx`` to a
    new temp file and returns it wrapped in ``new_type``.

    Raises ToolsException when ``bedx`` is not a flat BED type or when the
    requested cast would widen the file (e.g. BED3 -> BED6).
    """
    allowed_types = gqltypes.flat_bed_types
    if not type(bedx) in allowed_types:
        raise ToolsException('Type mismatch in CAST. ' +
                             bedx.name + ' not supported.',
                             'cast')

    # Each flat BED type may be cast only to itself or a narrower type.
    # BUGFIX: the original BED3 rule listed (BED3, BED3) — a duplicated
    # entry that was clearly meant to be just (BED3,).
    valid_targets = {
        gqltypes.BED12: (gqltypes.BED3, gqltypes.BED4,
                         gqltypes.BED6, gqltypes.BED12),
        gqltypes.BED6: (gqltypes.BED3, gqltypes.BED4, gqltypes.BED6),
        gqltypes.BED4: (gqltypes.BED3, gqltypes.BED4),
        gqltypes.BED3: (gqltypes.BED3,),
    }
    # Only types listed in the table are constrained, matching the original
    # if/elif chain (unlisted flat types fell through without a check).
    if type(bedx) in valid_targets and \
            new_type not in valid_targets[type(bedx)]:
        raise ToolsException('Type mismatch in CAST. Cannot cast from ' +
                             bedx.name + ' to ' + new_type.name,
                             'cast')

    end_range = new_type.cols
    new_file_name = get_temp_file_name(pybedtools.get_tempdir(),
                                       'cast',
                                       'tmp')
    new_file = new_type(new_file_name, True)
    add_tmp_file(new_file)

    # BUGFIX: context managers guarantee the handles are closed even if a
    # malformed line raises mid-copy (original leaked them on error).
    with open(bedx.val, 'r') as in_file, open(new_file_name, 'w') as out_file:
        for line in in_file:
            cols = line.rstrip().split('\t')
            out_file.write("\t".join(cols[:end_range]) + "\n")

    return new_file
def test_stream():
    """
    Stream and file-based equality, both whole-file and Interval by Interval.

    An unwriteable temp dir is used to guarantee streamed results never touch
    disk.
    """
    a = pybedtools.example_bedtool('a.bed')
    b = pybedtools.example_bedtool('b.bed')
    c = a.intersect(b)

    # make an unwriteable dir...
    orig_tempdir = pybedtools.get_tempdir()
    if os.path.exists('unwriteable'):
        os.system('rm -rf unwriteable')
    os.system('mkdir unwriteable')
    os.system('chmod -w unwriteable')

    # ...set that to the new tempdir
    pybedtools.set_tempdir('unwriteable')

    # this should really not be written anywhere
    d = a.intersect(b, stream=True)
    # comparing streamed with file-based BedTools is unsupported
    assert_raises(NotImplementedError, c.__eq__, d)
    d_contents = d.fn.read()
    # BUGFIX: close the handle instead of leaking it.
    with open(c.fn) as fh:
        c_contents = fh.read()
    assert d_contents == c_contents

    # reconstruct d and check Interval-by-Interval equality
    pybedtools.set_tempdir('unwriteable')
    d = a.intersect(b, stream=True)
    for i, j in zip(c, d):
        assert str(i) == str(j)

    # Now do something similar with GFF files.
    a = pybedtools.example_bedtool('a.bed')
    f = pybedtools.example_bedtool('d.gff')

    # file-based
    pybedtools.set_tempdir(orig_tempdir)
    g1 = f.intersect(a)

    # streaming
    pybedtools.set_tempdir('unwriteable')
    g2 = f.intersect(a, stream=True)
    for i, j in zip(g1, g2):
        assert str(i) == str(j)

    # this was segfaulting at one point, just run to make sure
    g3 = f.intersect(a, stream=True)
    for i in iter(g3):
        # BUGFIX: Python-2-only `print i` -> print() call (valid on 2 and 3).
        print(i)

    # streamed cut() rows expose only the requested columns
    for row in f.cut(range(3), stream=True):
        row[0], row[1], row[2]
        assert_raises(IndexError, row.__getitem__, 3)

    pybedtools.set_tempdir(orig_tempdir)
    os.system('rm -fr unwriteable')
def print_inputs_summary(self):
    """Echo the configured inputs and temp directories to stdout."""
    # Simple label/value pairs all share the same '<label>\tis:\t' format.
    for label, value in (
        ('VCF', self.vcf_file),
        ('annof', self.annof),
        ('outvcfname', self.outvcfname),
        ('get_inbtw_genes', self.get_genes_btw_bps),
        ('threads', self.threads),
    ):
        print(label + '\tis:\t', value)
    # Report both the live pybedtools tempdir and the configured one.
    print("pyBedtools Temporary Dir:\t{}".format(str(get_tempdir())))
    print("pyBedtools Temporary Dir:\t{}".format(str(self.tempdir)))
def getoptions():
    """Parse command-line options; create the split directory if requested."""
    desc = ("Intersect BED file with conservation bedGraph files "
            "and return conservation scores")
    parser = argparse.ArgumentParser(description=desc)
    parser.add_argument('bedfile', metavar='BED', nargs=1,
                        help="Input BED file")
    parser.add_argument('consfiles', metavar='BEDGRAPH', nargs='+',
                        help="Conservation bedGraph files to intersect. "
                             "e.g. could be all chr*.bedGraph.gz files")
    parser.add_argument('-s', '--summarize', action="store_true",
                        help="Summarize conservation scores by taking the "
                             "average per BED interval [%(default)s]")
    parser.add_argument('-c', '--cores', type=int, default=4,
                        help="Number of processing cores [%(default)s]")
    parser.add_argument('-d', '--cores2', type=int, default=2,
                        help="Number of processing cores for summary step. "
                             "[%(default)s]")
    parser.add_argument('-S', '--splitdir', type=str, default=None,
                        help="Directory to keep intersections separate for "
                             "each conservation file. e.g. chromosome-specific. "
                             "Output won't be written to stdout. [%(default)s]")
    # Current pybedtools tempdir is baked into the help text.
    temp_help = "set temp directory [{}]".format(pybedtools.get_tempdir())
    parser.add_argument('-t', '--temp', type=str, help=temp_help)
    parser.add_argument('-v', '--version', action='version',
                        version='%(prog)s ' + __version__)

    opts = parser.parse_args()
    if opts.splitdir and not os.path.exists(opts.splitdir):
        os.makedirs(opts.splitdir)
    return opts
def get_intersect_result(bed_pair):
    """
    Build a BED6 result file from a pairwise intersection.

    ``bed_pair`` is a 3-tuple (A, B, AB) where AB holds the ``intersectBed
    -wo``-style output of A vs B: A's columns, then B's columns.  For each
    line the overlapping interval (max start, min end) is written out with
    the 1-based line number stored in the score column as a back-pointer.

    Returns the result registered as a temp file, wrapped in gqltypes.BED6.
    """
    A = bed_pair[0]
    B = bed_pair[1]
    AB = bed_pair[2]

    R_file_name = get_temp_file_name(pybedtools.get_tempdir(),
                                     'unary_intersect_beds',
                                     'tmp')

    # B's columns start right after A's in each AB line.
    # (Replaces the old commented-out per-type if/elif chain.)
    offset = A.cols

    # BUGFIX: context managers close both handles even on malformed input
    # (the original leaked them on error); enumerate replaces the manual
    # curr_line counter.
    with open(AB.val, 'r') as in_file, open(R_file_name, 'w') as out_file:
        for curr_line, line in enumerate(in_file, start=1):
            cols = line.rstrip().split('\t')
            bed6_1 = cols[0:6]
            bed6_2 = cols[offset:offset + 6]
            bed6_r = [
                bed6_1[0],
                # overlap = intersection of the two intervals
                str(max(int(bed6_1[1]), int(bed6_2[1]))),
                str(min(int(bed6_1[2]), int(bed6_2[2]))),
                str(0),
                # line number pairs this row with the associated file
                str(curr_line),
                bed6_1[5],
            ]
            out_file.write("\t".join(bed6_r) + "\n")

    R = gqltypes.BED6(R_file_name, True)
    add_tmp_file(R)
    return R
def test_tempfile_management(self):
    """A file registered via add_tmp_file is removed by clear_tmp_files."""
    R_file_name = gqltools.get_temp_file_name(pybedtools.get_tempdir(),
                                              "unittest",
                                              "tmp")
    # BUGFIX: sys.maxint was removed in Python 3; sys.maxsize is the
    # portable equivalent.
    r = random.randint(1, sys.maxsize)
    # BUGFIX: context manager replaces open/write/close so the handle is
    # closed even if the write fails.
    with open(R_file_name, "w") as f:
        f.write(str(r))
    # test to see if the file was created
    self.assertTrue(os.path.isfile(R_file_name))
    R = gqltypes.BED6(R_file_name, True)
    gqltools.add_tmp_file(R)
    gqltools.clear_tmp_files()
    # clearing the registry must delete the file from disk
    self.assertEqual(os.path.isfile(R_file_name), False)
def test_call():
    """Exercise call_bedtools error paths and the bedtools-path override."""
    out_path = os.path.join(pybedtools.get_tempdir(), 'test.output')
    from pybedtools.helpers import call_bedtools, BEDToolsError

    # A misspelled program name must raise BEDToolsError.
    assert_raises(BEDToolsError, call_bedtools, *(['intersectBe'], out_path))

    bed = pybedtools.example_bedtool('a.bed')

    # momentarily redirect stderr to file so the error message doesn't spew
    # all over the place when testing
    saved_stderr = sys.stderr
    sys.stderr = open(bed._tmp(), 'w')
    #assert_raises(BEDToolsError, a.intersect, a=a.fn, b=a.fn, z=True)
    sys.stderr = saved_stderr

    # A bogus bedtools path makes program launch fail with OSError.
    pybedtools.set_bedtools_path('nonexistent')
    bed = pybedtools.example_bedtool('a.bed')
    assert_raises(OSError, bed.intersect, bed)

    # Restoring the default path makes intersect work again.
    pybedtools.set_bedtools_path()
    assert bed.intersect(bed, u=True) == bed
def test_call():
    """Exercise call_bedtools error paths and the bedtools-path override."""
    out_path = os.path.join(pybedtools.get_tempdir(), "test.output")
    from pybedtools.helpers import call_bedtools, BEDToolsError

    # A misspelled program name must raise BEDToolsError.
    with pytest.raises(BEDToolsError):
        call_bedtools(["intersectBe"], out_path)

    bed = pybedtools.example_bedtool("a.bed")

    # momentarily redirect stderr to file so the error message doesn't spew
    # all over the place when testing
    saved_stderr = sys.stderr
    sys.stderr = open(bed._tmp(), "w")
    sys.stderr = saved_stderr

    # With a bogus bedtools path, intersect must fail.
    pybedtools.set_bedtools_path("nonexistent")
    bed = pybedtools.example_bedtool("a.bed")
    with pytest.raises(NotImplementedError):
        bed.intersect(bed)

    # Restoring the default path makes intersect work again.
    pybedtools.set_bedtools_path()
    bed = pybedtools.example_bedtool("a.bed")
    assert bed.intersect(bed, u=True) == bed
def subset_conservation(bg, bd):
    """
    Perform intersectBed between BED file and chromosome-specific bedGraph
    file.

    Returns the filename of the intersection, or None when the BED file has
    no intervals on the bedGraph's chromosome.  The chromosome-subset temp
    file is always removed before returning.
    """
    # Load bedGraph
    bedgraph = pybedtools.BedTool(bg)
    chrom, track = get_chrom_from_file(bg)

    # Restrict the BED file to this chromosome
    per_chrom = subset_chrom(bd, chrom)

    if per_chrom.file_type != 'empty':
        assert len(per_chrom) > 0
        target = os.path.join(pybedtools.get_tempdir(),
                              'pybedtools.%s.%s.tmp' % (chrom, track))
        result_fn = per_chrom.intersect(bedgraph, wo=True, sorted=True,
                                        output=target).fn
    else:
        eprint("Skipping %s" % chrom)
        result_fn = None

    # The per-chromosome subset is no longer needed either way.
    os.remove(per_chrom.fn)
    return result_fn
def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam,
               dbsnp, scan_window_size, scan_maf, min_mapq,
               min_dp, max_dp, good_ao, min_ao,
               snp_min_af, snp_min_bq, snp_min_ao,
               ins_min_af, del_min_af, del_merge_min_af,
               ins_merge_min_af, merge_r,
               truth_vcf, tsv_batch_size, matrix_width, matrix_base_pad,
               min_ev_frac_per_col,
               ensemble_tsv, long_read, restart, first_do_without_qual,
               filter_duplicate,
               num_threads,
               scan_alignments_binary,):
    """
    Run the preprocessing pipeline: scan tumor/normal BAMs for candidate
    variants and generate the training/inference dataset under ``work``.

    Side effects: creates work subdirectories, redirects the pybedtools
    tempdir into ``work`` for the duration of the run (restored at the end),
    and writes candidate VCFs and dataset files to disk.

    Raises Exception when any input BAM or its .bai index is missing.
    """
    logger = logging.getLogger(preprocess.__name__)
    logger.info("----------------------Preprocessing------------------------")
    if restart or not os.path.exists(work):
        os.mkdir(work)

    # Keep pybedtools temp files inside the work dir; restored on exit.
    original_tempdir = pybedtools.get_tempdir()
    pybedtmp = os.path.join(work, "pybedtmp_preprocess")
    if not os.path.exists(pybedtmp):
        os.mkdir(pybedtmp)
    pybedtools.set_tempdir(pybedtmp)

    # Fail fast on missing inputs.
    if not os.path.exists(tumor_bam):
        logger.error("Aborting!")
        raise Exception("No tumor BAM file {}".format(tumor_bam))
    if not os.path.exists(normal_bam):
        logger.error("Aborting!")
        raise Exception("No normal BAM file {}".format(normal_bam))
    if not os.path.exists(tumor_bam + ".bai"):
        logger.error("Aborting!")
        raise Exception(
            "No tumor .bai index file {}".format(tumor_bam + ".bai"))
    if not os.path.exists(normal_bam + ".bai"):
        logger.error("Aborting!")
        raise Exception(
            "No normal .bai index file {}".format(normal_bam + ".bai"))

    # Optional ensemble info extracted to BED form.
    ensemble_bed = None
    if ensemble_tsv:
        ensemble_bed = os.path.join(work, "ensemble.bed")
        logger.info("Extract ensemble info.")
        if restart or not os.path.exists(ensemble_bed):
            ensemble_bed = extract_ensemble(work, ensemble_tsv)

    merge_d_for_short_read = 100
    candidates_split_regions = []
    dbsnp_regions_q = []
    ensemble_beds = []
    if not long_read and first_do_without_qual:
        # First pass ignores base qualities (snp_min_bq forced to -10000) to
        # quickly narrow down candidate regions for the quality-aware pass.
        logger.info("Scan tumor bam (first without quality scores).")
        work_tumor_without_q = os.path.join(work, "work_tumor_without_q")
        if restart or not os.path.exists(work_tumor_without_q):
            os.mkdir(work_tumor_without_q)
        filtered_candidates_vcf_without_q = os.path.join(
            work_tumor_without_q, "filtered_candidates.vcf")
        tumor_outputs_without_q = process_split_region(
            "tumor", work_tumor_without_q, region_bed, reference, mode,
            tumor_bam, dbsnp, scan_window_size, scan_maf, min_mapq,
            filtered_candidates_vcf_without_q, min_dp, max_dp,
            filter_duplicate,
            good_ao, min_ao,
            snp_min_af, -10000, snp_min_ao,
            ins_min_af, del_min_af, del_merge_min_af,
            ins_merge_min_af, merge_r,
            scan_alignments_binary, restart, num_threads,
            calc_qual=False,
            dbsnp_regions=[])
        tumor_counts_without_q, split_regions, \
            filtered_candidates_vcfs_without_q, dbsnp_regions_q = \
            tumor_outputs_without_q
        if ensemble_tsv:
            ensemble_beds = get_ensemble_beds(
                work, reference, ensemble_bed, split_regions,
                matrix_base_pad, num_threads)
        candidates_split_regions = extract_candidate_split_regions(
            work_tumor_without_q, filtered_candidates_vcfs_without_q,
            split_regions, ensemble_beds,
            reference, matrix_base_pad, merge_d_for_short_read)

    # Quality-aware tumor scan (restricted to candidate regions when the
    # first pass ran).
    work_tumor = os.path.join(work, "work_tumor")
    if restart or not os.path.exists(work_tumor):
        os.mkdir(work_tumor)
    filtered_candidates_vcf = os.path.join(
        work_tumor, "filtered_candidates.vcf")
    logger.info("Scan tumor bam (and extracting quality scores).")
    tumor_outputs = process_split_region(
        "tumor", work_tumor, region_bed, reference, mode,
        tumor_bam, dbsnp, scan_window_size, scan_maf, min_mapq,
        filtered_candidates_vcf, min_dp, max_dp,
        filter_duplicate,
        good_ao, min_ao,
        snp_min_af, snp_min_bq, snp_min_ao,
        ins_min_af, del_min_af, del_merge_min_af,
        ins_merge_min_af, merge_r,
        scan_alignments_binary, restart, num_threads,
        calc_qual=True,
        regions=candidates_split_regions,
        dbsnp_regions=dbsnp_regions_q)
    tumor_counts, split_regions, filtered_candidates_vcfs, _ = tumor_outputs

    if ensemble_tsv and not ensemble_beds:
        ensemble_beds = get_ensemble_beds(
            work, reference, ensemble_bed, split_regions,
            matrix_base_pad, num_threads)

    if (not long_read):
        candidates_split_regions = extract_candidate_split_regions(
            work_tumor, filtered_candidates_vcfs, split_regions,
            ensemble_beds,
            reference, matrix_base_pad, merge_d_for_short_read)

    # Fall back to the raw split regions when no candidates were found.
    if not candidates_split_regions:
        candidates_split_regions = split_regions

    # Normal BAM scan over the same candidate regions (fixed scan_maf=0.2,
    # no dbSNP, no output VCF).
    work_normal = os.path.join(work, "work_normal")
    if restart or not os.path.exists(work_normal):
        os.mkdir(work_normal)
    logger.info("Scan normal bam (and extracting quality scores).")
    normal_counts, _, _, _ = process_split_region(
        "normal", work_normal, region_bed, reference, mode, normal_bam,
        None, scan_window_size, 0.2, min_mapq, None, min_dp, max_dp,
        filter_duplicate,
        good_ao, min_ao, snp_min_af, snp_min_bq, snp_min_ao,
        ins_min_af, del_min_af, del_merge_min_af, ins_merge_min_af,
        merge_r,
        scan_alignments_binary, restart, num_threads,
        calc_qual=True,
        regions=candidates_split_regions,
        dbsnp_regions=[])

    # Per-region dataset generation; "done.txt" marks a completed region so
    # restarts can skip it.
    work_dataset = os.path.join(work, "dataset")
    if restart or not os.path.exists(work_dataset):
        os.mkdir(work_dataset)
    logger.info("Generate dataset.")
    for i, (tumor_count, normal_count, filtered_vcf,
            candidates_split_region) in enumerate(
            zip(tumor_counts, normal_counts, filtered_candidates_vcfs,
                candidates_split_regions)):
        logger.info("Dataset for region {}".format(candidates_split_region))
        work_dataset_split = os.path.join(work_dataset, "work.{}".format(i))
        if restart or not os.path.exists(
                "{}/done.txt".format(work_dataset_split)):
            if os.path.exists(work_dataset_split):
                shutil.rmtree(work_dataset_split)
            os.mkdir(work_dataset_split)
            generate_dataset_region(
                work_dataset_split, truth_vcf, mode, filtered_vcf,
                candidates_split_region, tumor_count, normal_count,
                reference,
                matrix_base_pad, merge_d_for_short_read)
            generate_dataset_region(
                work_dataset_split, truth_vcf, mode, filtered_vcf,
                candidates_split_region, tumor_count, normal_count,
                reference, matrix_width, matrix_base_pad,
                min_ev_frac_per_col, min_dp, num_threads,
                ensemble_beds[i] if ensemble_tsv else None,
                tsv_batch_size)

    # Clean up the scratch dir and restore the caller's tempdir.
    shutil.rmtree(pybedtmp)
    pybedtools.set_tempdir(original_tempdir)
    logger.info("Preprocessing is Done.")
def mergemin_bedn(bednfile):
    """
    Collapse each row of a BEDN file (the side-by-side columns of several
    intersected BED files) into a single interval: the latest start and the
    earliest end across the member files (i.e. the minimal common overlap).

    Names are joined with '::', the score is set to 0 and the strand is
    picked at random from the member strands.  Returns a BED6 result (BED3
    when every member was a BED3) registered as a temp file.
    """
    # BUGFIX: the membership test needs a container of types; the original
    # `type(x) in gqltypes.BEDN` tested membership in a class (TypeError).
    allowed_types = (gqltypes.BEDN,)
    if not (type(bednfile) in allowed_types):
        # BUGFIX: 'MEREGEMIN' typo in the error message.
        raise ToolsException('Type mismatch in MERGEMIN. ' +
                             bednfile.name + ' not supported.',
                             'mergemin_bedn')

    pybedtools.settings.KEEP_TEMPFILES = True

    # relative positions: starts 1, ends 2, name 3, score 4
    o_starts = []
    o_ends = []
    o_names = []
    o_scores = []
    o_strands = []

    # Record the absolute column index of each field of each member BED.
    bed_types = bednfile.types
    curr_offset = 0
    for bed_type in bed_types:
        o_starts.append(curr_offset + 1)
        o_ends.append(curr_offset + 2)
        # BUGFIX: the module is named gqltypes, not gql_types (NameError
        # in the original).
        if bed_type in (gqltypes.BED6, gqltypes.BED12):
            o_names.append(curr_offset + 3)
            o_scores.append(curr_offset + 4)
            o_strands.append(curr_offset + 5)
        if bed_type == gqltypes.BED4:
            o_names.append(curr_offset + 3)
        curr_offset += bed_type.cols

    R_file_name = get_temp_file_name(pybedtools.get_tempdir(),
                                     'mergemin',
                                     'tmp')

    # BUGFIX: context managers close both handles even on malformed input.
    with open(bednfile.val, 'r') as in_file, \
            open(R_file_name, 'w') as out_file:
        for line in in_file:
            cols = line.rstrip().split('\t')
            # Minimal common overlap: max of starts, min of ends.  The
            # bounds guard preserves the original's tolerance of short rows.
            out_row = (cols[0],
                       str(max(int(cols[i]) for i in o_starts
                               if i < len(cols))),
                       str(min(int(cols[i]) for i in o_ends
                               if i < len(cols))),
                       )
            # if o_names is empty, then all of the beds were BED3
            if len(o_names) > 0:
                out_row = out_row + (
                    "::".join(cols[i] for i in o_names if i < len(cols)),
                    str(0),
                    random.choice([cols[i] for i in o_strands
                                   if i < len(cols)]),
                )
            out_file.write("\t".join(out_row) + "\n")

    new_type = gqltypes.BED6
    if len(o_names) == 0:
        new_type = gqltypes.BED3
    R = new_type(R_file_name, True)
    add_tmp_file(R)
    return R
def filter_bedx(_N_list, filter_opts):
    """
    Filter the rows of one or more BED files by per-field predicates.

    ``filter_opts`` maps a field name to a list of tests; each 2-element
    test is (operator-string, value) applied to that row's field, other
    entries are taken as literal boolean-expression fragments.  Rows passing
    every option's combined expression are written to a new temp file, which
    is returned wrapped in the narrowest BED type present among the inputs.

    SECURITY NOTE(review): predicates are evaluated with eval() on strings
    built from file contents and caller-supplied operators — do not feed
    this untrusted input.
    """
    pybedtools.settings.KEEP_TEMPFILES=True
    allowed_types = gqltypes.bed_types
    N_list = make_mixed_list(_N_list, allowed_types,'FILTER')

    input_types = []
    for bed in N_list:
        input_types.append(type(bed))

    # Output type is the *narrowest* BED type among the inputs.
    output_type = ''
    if gqltypes.BED3 in input_types:
        output_type = gqltypes.BED3
    elif gqltypes.BED4 in input_types:
        output_type = gqltypes.BED4
    elif gqltypes.BED6 in input_types:
        output_type = gqltypes.BED6
    elif gqltypes.BED12 in input_types:
        output_type = gqltypes.BED12
    else:
        raise ToolsException(\
            'Output type could not be determined in FILTER.',\
            'filter_bedx')

    filter_file_name = get_temp_file_name(pybedtools.get_tempdir(), \
                                          'filter_bedx', \
                                          'tmp')
    filter_bedx=output_type(filter_file_name, True)
    add_tmp_file(filter_bedx)

    # NOTE(review): neither filter_file nor the per-bed handles are closed
    # before returning; output may be incompletely flushed until the handle
    # is garbage collected — confirm against callers.
    filter_file = open(filter_bedx.val, 'w')

    for bed in N_list:
        f = open(bed.val,'r')
        bed_type = gqltypes.source_type_map[bed.name]
        for line in f:
            cols = line.rstrip().split('\t')
            # A row is kept only if every option's expression evaluates True.
            keep_line = True
            for opt in filter_opts:
                bool_string = ""
                if not opt in bed_type.col:
                    raise ToolsException(\
                        'Invalid field for given filetype ' + \
                        'in FOREACH. ' + opt + ' and ' + bed_type.name,\
                        'filter_bedx')
                opt_col = bed_type.col[opt]
                for test in filter_opts[opt]:
                    if len(test)==2:
                        # (operator, value) pair: compare this row's field.
                        op=test[0]
                        val=test[1]
                        test=cols[opt_col]
                        # String comparisons need the field quoted.
                        if type(val) is str:
                            test='"'+str(test)+'"'
                        result = eval(str(test) + op + str(val))
                        bool_string = bool_string + str(result)
                    else:
                        # Literal fragment (presumably a connective such as
                        # ' and ' / ' or ' — TODO confirm against callers).
                        bool_string = bool_string + test[0]
                # Evaluate the accumulated boolean expression for this option.
                keep_line = keep_line & eval(bool_string)
            if keep_line:
                filter_file.write(line)
    return filter_bedx
def merge_beds(merge_type, _N_list, merge_opts):
    """
    Merge one or more BED files with bedtools merge / multiIntersectBed.

    ``merge_type`` selects the strategy: 'merge' (concatenate, sort and
    mergeBed), or 'flat' / 'min' / 'max' (per-file sort then
    multi_intersect, with the 'cluster' or 'merge' flag respectively).
    ``merge_opts`` maps GQL option names (distance/score/name/stranded) to
    bedtools keyword arguments.  Returns a BED3 result (BED6 when any
    option was supplied), registered as a temp file.
    """
    pybedtools.settings.KEEP_TEMPFILES=True
    allowed_types = gqltypes.bed_types
    N_list = make_mixed_list(_N_list, allowed_types,'MERGE')

    input_types = []
    for bed in N_list:
        input_types.append(type(bed))

    # Parse input arguments and add/modify default argumetns
    # Default args: GQL option name -> bedtools keyword.
    valid_args = {'distance':'d', \
                  'score':'scores', \
                  'name':'nms', \
                  'stranded':'s'}
    score_functions = { 'MIN':'min', 'MAX':'max', 'SUM':'sum', \
                        'MEAN':'mean', 'MEDIAN':'median', 'MODE':'mode', \
                        'ANITMODE':'antimode', 'COLLAPSE':'collapse', \
                        'COUNT':'count'}

    kwargs = {}
    for merge_opt in merge_opts:
        if not ( merge_opt in valid_args ):
            raise ToolsException('Invalid option in MERGE. ' + \
                                 merge_opt + ' not supported.',\
                                 'merge_beds')
        if merge_opt == 'score':
            if not ( merge_opts[ merge_opt ] in score_functions ) :
                raise ToolsException(\
                    'SCORE funciton not supported by MERGE. ' + \
                    merge_opts[ merge_opt ], 'merge_beds')
            else:
                kwargs[ valid_args[ merge_opt ] ] = \
                    score_functions[ merge_opts[ merge_opt ] ]
        elif merge_opt == 'stranded':
            # Strand-aware merging needs a strand column (BED6/BED12 only).
            if (gqltypes.BED3 in input_types ) or \
               ( gqltypes.BED4 in input_types) or \
               ( gqltypes.BED5 in input_types) :
                raise ToolsException(\
                    'Type mismatch in MERGE. Cannot match by ' + \
                    'strand with givne input types',\
                    'merge_beds')
            kwargs[ valid_args[ merge_opt ] ] = True
        elif (merge_opt == 'distance'):
            # DISTANCE only makes sense for plain mergeBed.
            if merge_type == 'flat':
                raise ToolsException('DISTANCE not supported for MERGEFLAT',\
                                     'merge_beds')
            elif merge_type == 'min':
                raise ToolsException('DISTANCE not supported for MERGEMIN',\
                                     'merge_beds')
            elif merge_type == 'max':
                raise ToolsException('DISTANCE not supported for MERGEMAX',\
                                     'merge_beds')
            else:
                kwargs[ valid_args[ merge_opt ] ] = merge_opts[ merge_opt ]
        elif (merge_opt == 'name'):
            if merge_opts[ merge_opt ] == 'COLLAPSE':
                kwargs[ valid_args[ merge_opt ] ] = True
            else :
                raise ToolsException(\
                    'NAME funciton not supported by MERGE. ' + \
                    merge_opts[ merge_opt ], 'merge_beds')

    # Any option upgrades the output from BED3 to BED6.
    output_type = gqltypes.BED3
    if (len(kwargs) > 0) :
        output_type = gqltypes.BED6

    # merge the file
    merge_bed = pybedtools.BedTool()
    if merge_type == 'merge':
        #{{{ combine files into one
        combo_file_name = get_temp_file_name(pybedtools.get_tempdir(), \
                                             'merge_beds', \
                                             'tmp')
        combo_file = open(combo_file_name, 'w')
        for bed in N_list:
            in_file = open(bed.val, 'r')
            for line in in_file:
                combo_file.write(line)
            in_file.close()
        combo_file.close()
        # NOTE(review): `input_type` is not defined anywhere in this
        # function — this line raises NameError when merge_type == 'merge'.
        # Presumably it should be the type of one of the inputs or
        # output_type; confirm before fixing.
        add_tmp_file(input_type(combo_file_name, True))
        # sort the combined file
        sorted_bed = pybedtools.BedTool(combo_file_name).sort()
        add_tmp_file(input_type(sorted_bed.fn, True))
        #}}}
        merged_bed = sorted_bed.merge(**kwargs)
    elif merge_type in ['flat','min','max'] :
        #{{{ sort each file, make list of files
        # make sure all the input files are sorted
        sorted_beds = []
        sorted_bed_files = []
        for bed in N_list:
            sorted_bed = pybedtools.BedTool(bed.val).sort()
            add_tmp_file( eval( 'gqltypes.'+ bed.name + \
                                '("' + sorted_bed.fn + '",True)' ) )
            sorted_beds.append(sorted_bed)
            sorted_bed_files.append(sorted_bed.fn)
        kwargs['gql'] = True
        kwargs['i'] = sorted_bed_files
        #}}}
        x = pybedtools.BedTool()
        if merge_type == 'flat':
            try:
                merged_bed = x.multi_intersect(**kwargs)
            except pybedtools.helpers.BEDToolsError as e:
                raise ToolsException('Error in MERGE. ' + e.msg,\
                                     'merge_beds')
        elif merge_type == 'min':
            # 'cluster' reports only regions common to all inputs.
            kwargs['cluster'] = True
            try:
                merged_bed = x.multi_intersect(**kwargs)
            except pybedtools.helpers.BEDToolsError as e:
                raise ToolsException('Error in MERGE. ' + e.msg,\
                                     'merge_beds')
        elif merge_type == 'max':
            # 'merge' reports the union of all inputs.
            kwargs['merge'] = True
            try:
                merged_bed = x.multi_intersect(**kwargs)
            except pybedtools.helpers.BEDToolsError as e:
                raise ToolsException('Error in MERGE. ' + e.msg,\
                                     'merge_beds')
        else:
            raise ToolsException('Supported by MERGE. ' + merge_type,\
                                 'merge_beds')

    result = output_type(merged_bed.fn, True)
    add_tmp_file(result)
    return result
def merge_bed_stack(out_stack):
    """
    Chain a stack of pairwise intersections (A, B, AB) back into one BEDN.

    The stack is walked top-down; each AB file's score column (cols[4])
    holds the 1-based line number of the matching row in the *next* file
    down, so rows are replicated/aligned via those back-pointers.  Columns
    are stripped of the pointer prefix (first 6 cols) and the trailing
    overlap-size column, then the per-file row lists are pasted side by
    side into the result file.

    Raises ToolsException when the aligned row lists end up with different
    lengths.  Returns a gqltypes.BEDN registered as a temp file.
    """
    out_rows = deque()
    next_rows = []
    next_row_i = 0
    offset = 6
    while ( len(out_stack) > 0 ):
        A,B,AB = out_stack.pop()
        curr_next_rows = []
        curr_out_rows = []
        curr_row = 1
        in_file = open(AB.val, 'r')
        for line in in_file:
            # How many times the previously-processed file referenced this
            # row; the first file processed takes each row exactly once.
            curr_row_takes = 0
            if len(next_rows) == 0:
                curr_row_takes = 1
            else:
                while ( (next_row_i < len(next_rows)) and \
                        ( next_rows[next_row_i] == curr_row ) ):
                    curr_row_takes += 1
                    next_row_i += 1
            for i in range(0, curr_row_takes):
                cols = line.rstrip().split('\t')
                # the score in the first entry give the line number is the
                # associated file that pairs with the current line
                curr_next_rows.append(int(cols[4]) )
                if len(out_stack) > 0:
                    # the 2nd and on entries have a pointer and a data entry,
                    # pointer entries are BED6, so take all but the first 6
                    # entries, also the last col is the size of the overlap,
                    # ignore it
                    curr_out_rows.append("\t".join(cols[6:-1]))
                else:
                    # the first entry tin the stack does not have a pointer
                    # entry, so take both
                    curr_out_rows.append("\t".join(cols[:-1]))
            curr_row+=1
        # This file's pointers drive row replication for the next one down.
        next_rows = curr_next_rows
        next_row_i = 0
        out_rows.appendleft(curr_out_rows)

    R_file_name = get_temp_file_name(pybedtools.get_tempdir(), \
                                     'unary_intersect_beds', \
                                     'tmp')

    # check to make sure all the out_rows are the same length
    same_size = True
    for i in range(0, len(out_rows) - 1):
        same_size = same_size and \
            ( len(out_rows[i]) == len(out_rows[i + 1]) )
    if not same_size:
        raise ToolsException('Unmached sizes in intersection', \
                             'merge_bed_stack')

    # Paste the aligned per-file rows side by side, tab-separated.
    out_file = open(R_file_name, 'w')
    for i in range(0, len(out_rows[0])):
        out_line = ''
        for j in range(0, len(out_rows)):
            if j != 0:
                out_line = out_line + '\t'
            out_line = out_line + (out_rows[j])[i]
        out_file.write(out_line + '\n')
    out_file.close()

    R = gqltypes.BEDN(R_file_name, True)
    add_tmp_file(R)
    return R
def postprocess(work, reference, pred_vcf_file, output_vcf, candidates_vcf,
                ensemble_tsv, tumor_bam, min_len,
                postprocess_max_dist, long_read,
                lr_pad, lr_chunk_size, lr_chunk_scale,
                lr_snp_min_af, lr_ins_min_af, lr_del_min_af,
                lr_match_score, lr_mismatch_penalty,
                lr_gap_open_penalty, lr_gap_ext_penalty,
                pass_threshold, lowqual_threshold,
                msa_binary, num_threads):
    """
    Run the postprocessing pipeline: split predictions into resolvable and
    non-resolvable sets, resolve variants (realignment for long reads),
    merge everything and annotate the final VCF.

    Side effects: creates files/dirs under ``work`` and redirects the
    pybedtools tempdir into ``work`` for the duration of the run (restored
    at the end).

    Returns the path of the final annotated VCF (``output_vcf``).
    """
    logger = logging.getLogger(postprocess.__name__)
    logger.info("----------------------Postprocessing-----------------------")
    if not os.path.exists(work):
        os.mkdir(work)

    # Keep pybedtools temp files inside the work dir; restored on exit.
    original_tempdir = pybedtools.get_tempdir()
    pybedtmp = os.path.join(work, "pybedtmp_postprocess")
    if not os.path.exists(pybedtmp):
        os.mkdir(pybedtmp)
    pybedtools.set_tempdir(pybedtmp)

    # Split predictions by proximity (+/- 5 bases) to the original
    # candidates: near-candidate calls vs ensemble-only calls.
    candidates_preds = os.path.join(work, "candidates_preds.vcf")
    ensembled_preds = os.path.join(work, "ensembled_preds.vcf")
    pred_vcf = pybedtools.BedTool(pred_vcf_file)
    pred_vcf.window(candidates_vcf, w=5, v=True).saveas(ensembled_preds)
    pred_vcf.window(candidates_vcf, w=5, u=True).saveas(candidates_preds)

    logger.info("Extract targets")
    postprocess_pad = 1 if not long_read else 10
    extract_postprocess_targets(
        candidates_preds, min_len, postprocess_max_dist, postprocess_pad)

    # File names produced by extract_postprocess_targets above.
    no_resolve = os.path.join(work, "candidates_preds.no_resolve.vcf")
    target_vcf = os.path.join(work, "candidates_preds.resolve_target.vcf")
    target_bed = os.path.join(work, "candidates_preds.resolve_target.bed")
    resolved_vcf = os.path.join(work, "candidates_preds.resolved.vcf")

    logger.info("Resolve targets")
    if not long_read:
        resolve_variants(tumor_bam, resolved_vcf,
                         reference, target_vcf, target_bed, num_threads)
    else:
        # Long reads: indel realignment first, then rescoring.
        work_lr_indel_realign = os.path.join(work, "work_lr_indel_realign")
        if os.path.exists(work_lr_indel_realign):
            shutil.rmtree(work_lr_indel_realign)
        os.mkdir(work_lr_indel_realign)
        ra_resolved_vcf = os.path.join(
            work, "candidates_preds.ra_resolved.vcf")
        long_read_indelrealign(work_lr_indel_realign, tumor_bam, None,
                               ra_resolved_vcf, target_bed,
                               reference, num_threads, lr_pad,
                               lr_chunk_size, lr_chunk_scale,
                               lr_snp_min_af,
                               lr_del_min_af,
                               lr_ins_min_af,
                               lr_match_score, lr_mismatch_penalty,
                               lr_gap_open_penalty,
                               lr_gap_ext_penalty, msa_binary)
        resolve_scores(tumor_bam, ra_resolved_vcf, target_vcf, resolved_vcf)

    # Everything that needed no resolution plus the ensemble-only calls.
    all_no_resolve = concatenate_files(
        [no_resolve, ensembled_preds],
        os.path.join(work, "no_resolve.vcf"))

    logger.info("Merge vcfs")
    merged_vcf = os.path.join(work, "merged_preds.vcf")
    merge_post_vcfs(reference, resolved_vcf,
                    all_no_resolve, merged_vcf,
                    pass_threshold, lowqual_threshold)
    add_vcf_info(work, reference, merged_vcf,
                 candidates_vcf, ensemble_tsv, output_vcf,
                 pass_threshold, lowqual_threshold)

    logger.info("Output NeuSomatic prediction at {}".format(output_vcf))

    # Clean up the scratch dir and restore the caller's tempdir.
    shutil.rmtree(pybedtmp)
    pybedtools.set_tempdir(original_tempdir)

    logger.info("Postprocessing is Done.")
    return output_vcf
def load_file(file_path, filetype_name):
    """
    Load one or more source files, locally or from the remote file server.

    ``file_path`` is first tried as a local glob; when nothing matches it is
    resolved through the configured name server and the remote files are
    fetched into temp files.  ``filetype_name`` is either 'auto' (sniff each
    file against every known source type) or an explicit type name that
    every file must match.

    Returns a single typed file object, or a gqltypes.BEDL bundling the
    files with their labels when more than one matched.

    NOTE(review): urllib.urlopen / urllib.urlretrieve are Python-2-only
    APIs (moved to urllib.request in Python 3) — confirm the target runtime.
    """
    # local files are not temp files, but remote files are
    is_remote = False
    return_files = []
    return_labels = []
    # attempt to get the files from the local path
    files = glob.glob(file_path)
    if len(files) == 0:
        # if nothing at the local path, then see if it is a remote path
        # if so, then fetch the files, store at temp path, and place
        # the temp file path in the files list
        is_remote = True
        #url = 'http://localhost/cgi-bin/name.py?path=' + file_path
        # retrieve the path from the name server
        try:
            url = config['fileserver'] + 'name.py?path=' + file_path
            json_response = urllib.urlopen(url)
            s = json_response.read()
            # server replies with a JSON list of (label, url) pairs
            remote_paths = json.loads(s)
            json_response.close()
        except Exception as e:
            raise ToolsException ('Error retrieving file',\
                                  'load_file')
        # fetch remote files
        for remote_path in remote_paths:
            tmp_file_path = get_temp_file_name(pybedtools.get_tempdir(), \
                                               'load', \
                                               'tmp')
            # first value is the label
            return_labels.append(remote_path[0])
            # second is the path
            urllib.urlretrieve(remote_path[1], tmp_file_path)
            files.append(tmp_file_path)
        # if there is not remote file at this url, then raise
        if len(remote_paths) == 0:
            raise ToolsException (\
                'No file(s) not found at ' + file_path, 'load_file')

    for f in files:
        if (filetype_name == 'auto') :
            type_found = False
            # loops through the types to see which one matches
            for source_type in gqltypes.source_types:
                if source_type.test_filetype(f):
                    type_found = True
                    # if the files is remote, then the temp paramater is true
                    # otherwise it is false
                    new_file = source_type(f, is_remote)
                    if is_remote:
                        add_tmp_file(new_file)
                    else:
                        # remote labels were collected previously
                        return_labels.append(os.path.basename(f))
                    return_files.append(new_file)
            if not type_found:
                raise ToolsException('Unknown filetype for:' + f,'load_file')
        else:
            # Explicit type requested: every file must validate against it.
            source_type = gqltypes.source_type_map[filetype_name]
            if source_type.test_filetype(f):
                # if the files is remote, then the temp paramater is true
                # otherwise it is false
                new_file = source_type(f, is_remote)
                if is_remote:
                    add_tmp_file(new_file)
                else:
                    # remote labels were collected previously
                    return_labels.append(os.path.basename(f))
                return_files.append(new_file)
            else:
                raise ToolsException('Filetype mismatch:' + f + \
                                     " does not appear to be " + \
                                     filetype_name, 'load_file')

    # Single match: return it bare; multiple: bundle into a labeled BEDL.
    if len(return_files) == 1:
        return return_files[0]
    else:
        return gqltypes.BEDL(return_files, return_labels)