def __init__(self, normal_bam_filename, tumor_bam_filename,
             reference_genome_filename, input_filename_base, segments_bed,
             min_depth=20, min_bqual=10, min_mqual=10, process_num=1):
    """Record the input file locations and read-filtering thresholds.

    An empty Data container is created; it is populated later by the
    conversion pipeline.
    """
    # Input/output file locations.
    self.normal_bam_filename = normal_bam_filename
    self.tumor_bam_filename = tumor_bam_filename
    self.reference_genome_filename = reference_genome_filename
    self.input_filename_base = input_filename_base
    self.segments_bed = segments_bed
    # Read-filtering thresholds used when counting reads per segment.
    self.min_depth = min_depth
    self.min_bqual = min_bqual
    self.min_mqual = min_mqual
    # Number of worker processes for the counting pool.
    self.process_num = process_num
    # Filled in by the load/count steps of the conversion.
    self.data = Data()
def __init__(self, normal_bam_filename, tumor_bam_filename,
             reference_genome_filename, input_filename_base, segments_bed,
             BICseq_bed_fileName_corrected, pkl_path="", max_copynumber=6,
             subclone_num=1, baseline_thred_LOH=0.3, baseline_thred_APM=0.01,
             min_depth=20, min_bqual=10, min_mqual=10, process_num=1):
    """Record input/output locations, model configuration and
    read-filtering thresholds, and create an empty Data container.

    pkl_path, when non-empty, allows a later convert() call to resume
    from a previously dumped pickle instead of recomputing.
    """
    # Input/output file locations.
    self.normal_bam_filename = normal_bam_filename
    self.tumor_bam_filename = tumor_bam_filename
    self.reference_genome_filename = reference_genome_filename
    self.input_filename_base = input_filename_base
    self.segments_bed = segments_bed
    self.BICseq_bed_fileName_corrected = BICseq_bed_fileName_corrected
    self.pkl_path = pkl_path
    # Model configuration.
    self.max_copynumber = max_copynumber
    self.subclone_num = subclone_num
    # Thresholds for baseline (diploid) segment selection.
    self.baseline_thred_LOH = baseline_thred_LOH
    self.baseline_thred_APM = baseline_thred_APM
    # Read-filtering thresholds used when counting reads per segment.
    self.min_depth = min_depth
    self.min_bqual = min_bqual
    self.min_mqual = min_mqual
    # Parenthesized single-argument print behaves identically under
    # Python 2 and Python 3 (the original py2-only statement form did not).
    print("process_num = {}".format(process_num))
    self.process_num = process_num
    # Filled in by the load/count steps of the conversion.
    self.data = Data()
class MixClone_Converter:
    """Convert paired normal/tumor BAM files plus a segments BED file into
    the MixClone input pickle.

    The pipeline loads segments, applies GC-bias correction (MCMC-based or
    interactive/visual), counts allele-specific reads per segment in
    parallel, writes a GC-corrected interval file for THetA, performs
    baseline (diploid) segment selection, and finally dumps the Data
    object as ``<input_filename_base>.MixClone.input.pkl``.
    """

    def __init__(self, normal_bam_filename, tumor_bam_filename,
                 reference_genome_filename, input_filename_base, segments_bed,
                 BICseq_bed_fileName_corrected, pkl_path="", max_copynumber=6,
                 subclone_num=1, baseline_thred_LOH=0.3,
                 baseline_thred_APM=0.01, min_depth=20, min_bqual=10,
                 min_mqual=10, process_num=1):
        """Record inputs, model configuration and filtering thresholds."""
        # Input/output file locations.
        self.normal_bam_filename = normal_bam_filename
        self.tumor_bam_filename = tumor_bam_filename
        self.reference_genome_filename = reference_genome_filename
        self.input_filename_base = input_filename_base
        self.segments_bed = segments_bed
        self.BICseq_bed_fileName_corrected = BICseq_bed_fileName_corrected
        self.pkl_path = pkl_path
        # Model configuration.
        self.max_copynumber = max_copynumber
        self.subclone_num = subclone_num
        # Thresholds for baseline (diploid) segment selection.
        self.baseline_thred_LOH = baseline_thred_LOH
        self.baseline_thred_APM = baseline_thred_APM
        # Read-filtering thresholds used when counting reads per segment.
        self.min_depth = min_depth
        self.min_bqual = min_bqual
        self.min_mqual = min_mqual
        # Parenthesized single-argument print behaves identically under
        # Python 2 and Python 3.
        print("process_num = {}".format(process_num))
        self.process_num = process_num
        self.data = Data()

    def convert(self, method, pkl_flag=False):
        """Run the full conversion pipeline.

        :param method: "auto" for MCMC-based GC correction,
                       "visual" for interactive correction.
        :param pkl_flag: when True and ``self.pkl_path`` is set, resume
                         from a previously dumped Data pickle instead of
                         recomputing segments/counts.
        """
        if pkl_flag and self.pkl_path != "":
            print("load pkl from")
            print(self.pkl_path)
            with open(self.pkl_path, 'rb') as infile:
                self.data = pkl.load(infile)
        else:
            self._load_segments()
            print("MixClone converter converting")
            if "auto" == method:
                self._MCMC_gccorrection()
            elif "visual" == method:
                self._visual_gccorrection()
            sys.stdout.flush()
            self._get_counts()
            self._output()
        # NOTE(review): statement grouping reconstructed from a
        # whitespace-mangled source — confirm whether _output() and
        # _baseline_selection() were originally inside the else branch.
        self._baseline_selection()
        data_file_name = self.input_filename_base + '.MixClone.input.pkl'
        with open(data_file_name, 'wb') as outfile:
            pkl.dump(self.data, outfile, protocol=2)

    def _MCMC_gccorrection(self):
        """
        The interception is irrelevant for correction, set as median.
        MCMCLM only returns the m and c, then correct the data here.
        """
        mcmclm = MCMCLM(self.data, 0, self.subclone_num, self.max_copynumber)
        m, c = mcmclm.run()
        print("MCMC slope = {}".format(m))
        self._correct(m, c)

    def _correct(self, slope, intercept):
        """Remove the fitted linear GC trend from the per-segment
        log read-count ratios, then back-transform tumor read counts.

        The median log-ratio (K) is added back so that the overall level
        is preserved; only the GC-dependent tilt is removed.
        """
        # List comprehensions instead of map(lambda...): identical result
        # on Python 2 and also correct on Python 3.
        x = np.array([seg.gc for seg in self.data.segments])
        y = np.array([np.log(seg.tumor_reads_num + 1) -
                      np.log(seg.normal_reads_num + 1)
                      for seg in self.data.segments])
        K = np.percentile(y, 50)
        A = slope * x + intercept
        y_corrected = y - A + K
        for i, seg in enumerate(self.data.segments):
            seg.tumor_reads_num = np.exp(
                y_corrected[i] + np.log(seg.normal_reads_num + 1)) - 1
        print("gc corrected, with slope = {0}, intercept = {1}".format(
            slope, intercept))

    def _visual_gccorrection(self):
        """Interactive GC correction: plot the GC stripe, let the user
        pick points, then correct with the resulting slope/intercept."""
        # NOTE(review): self.sampleNumber is never assigned by __init__ in
        # this file — this looks like it would raise AttributeError;
        # confirm the intended attribute (process_num?).
        gsp = GCStripePlot(self.data.segments, self.sampleNumber)
        print("total number: {}".format(self.data.seg_num))
        # Sampling then linear regression, poor performance
        # gsp.sampleln([i * 1000 for i in range(1, 9)], 100)
        gsp.plot()
        # todo trimed x, y position
        x, y, m, c = gsp.output()
        print("x, y, m, c")
        # Space-separated, matching the original `print x, y, m, c`.
        print("{0} {1} {2} {3}".format(x, y, m, c))
        self._correct(m, c)

    def _baseline_selection(self):
        """Select baseline (diploid) segments via LOH/APM fractions."""
        print("begin baseline selection..")
        self._get_LOH_frac()
        self._get_LOH_status()
        self._get_APM_frac()
        self._get_APM_status()
        self._compute_Lambda_S()

    def _get_APM_status(self):
        self.data.get_APM_status(self.baseline_thred_APM)

    def _get_LOH_status(self):
        self.data.get_LOH_status(self.baseline_thred_LOH,
                                 flag_runpreprocess=True)

    def _compute_Lambda_S(self):
        print("begin compute lambda s ..")
        self.data.compute_Lambda_S_LOH(self.max_copynumber,
                                       self.subclone_num,
                                       flag_runpreprocess=True)

    def _output(self):
        """Output the parameter for THetA.

        The Upper and Lower Boundaries for normal heuristic.
        The GC corrected interval_count_file.
        """
        with open(self.BICseq_bed_fileName_corrected, 'w') as interval_file:
            interval_file.write(
                "ID\tchrm\tstart\tend\ttumorCount\tnormalCount\tgc\n")
            for seg in self.data.segments:
                interval_file.write(
                    "{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\n".format(
                        seg.chrom_idx, seg.chrom_name, seg.start, seg.end,
                        seg.tumor_reads_num, seg.normal_reads_num, seg.gc))
        print("GC corrected interval file generated!")
        sys.stdout.flush()

    def _load_segmentsn(self):
        """Load normalized segments, passing both BAM handles through to
        the Data loader."""
        normal_bam = pysam.Samfile(self.normal_bam_filename, 'rb')
        tumor_bam = pysam.Samfile(self.tumor_bam_filename, 'rb')
        print('Loading normalized segments by {0}...'.format(
            self.segments_bed))
        sys.stdout.flush()
        self.data.load_segmentsn(normal_bam, tumor_bam, self.segments_bed)
        normal_bam.close()
        tumor_bam.close()

    def _load_segments(self):
        """Load segments (with GC content) from the segments BED file."""
        normal_bam = pysam.Samfile(self.normal_bam_filename, 'rb')
        tumor_bam = pysam.Samfile(self.tumor_bam_filename, 'rb')
        print('Loading segments with gc by {0}...'.format(self.segments_bed))
        sys.stdout.flush()
        # NOTE(review): the BAM handles opened above are not passed to this
        # call — load_segmentsn here reads only the BED file; confirm the
        # handles are still needed.
        # self.data.load_segments(normal_bam, tumor_bam, self.segments_bed)
        self.data.load_segmentsn(self.segments_bed)
        normal_bam.close()
        tumor_bam.close()

    def _get_counts(self):
        """Count allele-specific reads for every segment in parallel
        using a multiprocessing pool, then attach the results to each
        segment (paired_counts, BAF_counts)."""
        seg_num = self.data.seg_num
        process_num = self.process_num
        print("process_num = {}".format(process_num))
        # Never spawn more workers than there are segments.
        if process_num > seg_num:
            process_num = seg_num
        pool = Pool(processes=process_num)
        args_list = []
        for seg in self.data.segments:
            args_list.append((seg.name, seg.chrom_name, seg.chrom_idx,
                              seg.start, seg.end,
                              self.normal_bam_filename,
                              self.tumor_bam_filename,
                              self.reference_genome_filename,
                              self.min_depth, self.min_bqual,
                              self.min_mqual))
        counts_tuple_list = pool.map(process_by_segment, args_list)
        for j in range(0, seg_num):
            paired_counts_j, BAF_counts_j = counts_tuple_list[j]
            self.data.segments[j].paired_counts = paired_counts_j
            self.data.segments[j].BAF_counts = BAF_counts_j

    def _get_LOH_frac(self):
        self.data.get_LOH_frac()

    def _get_APM_frac(self):
        self.data.get_APM_frac()
class BamToDataConverter:
    """Convert paired normal/tumor BAM files plus a segments BED file into
    the MixClone input pickle.

    Simpler sibling of MixClone_Converter: loads segments, counts
    allele-specific reads per segment in parallel, computes LOH fractions,
    and dumps the Data object as
    ``<input_filename_base>.MixClone.input.pkl``.
    """

    def __init__(self, normal_bam_filename, tumor_bam_filename,
                 reference_genome_filename, input_filename_base,
                 segments_bed, min_depth=20, min_bqual=10, min_mqual=10,
                 process_num=1):
        """Record input file locations and read-filtering thresholds."""
        # Input/output file locations.
        self.normal_bam_filename = normal_bam_filename
        self.tumor_bam_filename = tumor_bam_filename
        self.reference_genome_filename = reference_genome_filename
        self.input_filename_base = input_filename_base
        self.segments_bed = segments_bed
        # Read-filtering thresholds used when counting reads per segment.
        self.min_depth = min_depth
        self.min_bqual = min_bqual
        self.min_mqual = min_mqual
        self.process_num = process_num
        self.data = Data()

    def convert(self):
        """Run the pipeline: load segments, count reads, compute LOH
        fractions, and pickle the resulting Data object."""
        self._load_segments()
        self._get_counts()
        self._get_LOH_frac()
        data_file_name = self.input_filename_base + '.MixClone.input.pkl'
        # Context manager guarantees the file handle is closed.
        with open(data_file_name, 'wb') as outfile:
            pkl.dump(self.data, outfile, protocol=2)

    def _load_segments(self):
        """Load segments from the BED file, reading counts from both BAMs."""
        normal_bam = pysam.Samfile(self.normal_bam_filename, 'rb')
        tumor_bam = pysam.Samfile(self.tumor_bam_filename, 'rb')
        print('Loading segments by {0}...'.format(self.segments_bed))
        sys.stdout.flush()
        self.data.load_segments(normal_bam, tumor_bam, self.segments_bed)
        normal_bam.close()
        tumor_bam.close()

    def _get_counts(self):
        """Count allele-specific reads for every segment in parallel
        using a multiprocessing pool, then attach the results to each
        segment (paired_counts, BAF_counts)."""
        seg_num = self.data.seg_num
        process_num = self.process_num
        # Never spawn more workers than there are segments.
        if process_num > seg_num:
            process_num = seg_num
        pool = Pool(processes=process_num)
        args_list = []
        for seg in self.data.segments:
            args_list.append((seg.name, seg.chrom_name, seg.chrom_idx,
                              seg.start, seg.end,
                              self.normal_bam_filename,
                              self.tumor_bam_filename,
                              self.reference_genome_filename,
                              self.min_depth, self.min_bqual,
                              self.min_mqual))
        counts_tuple_list = pool.map(process_by_segment, args_list)
        for j in range(0, seg_num):
            paired_counts_j, BAF_counts_j = counts_tuple_list[j]
            self.data.segments[j].paired_counts = paired_counts_j
            self.data.segments[j].BAF_counts = BAF_counts_j

    def _get_LOH_frac(self):
        self.data.get_LOH_frac()