Esempio n. 1
0
class BamToDataConverter:
    """Build a ``Data`` object from a normal/tumor BAM pair and pickle it.

    The conversion (see ``convert``) loads segments from a BED file, runs
    ``process_by_segment`` once per segment in a process pool, asks the
    ``Data`` object for its LOH fraction, and finally pickles the ``Data``
    object to ``<input_filename_base>.MixClone.input.pkl``.
    """

    def __init__(self, normal_bam_filename, tumor_bam_filename,
                 reference_genome_filename, input_filename_base, segments_bed,
                 min_depth=20, min_bqual=10, min_mqual=10, process_num=1):
        """Store the conversion settings and create an empty ``Data`` object.

        Args:
            normal_bam_filename: Path of the normal-sample BAM file.
            tumor_bam_filename: Path of the tumor-sample BAM file.
            reference_genome_filename: Path of the reference genome; only
                forwarded to ``process_by_segment``.
            input_filename_base: Prefix of the pickle file that ``convert``
                writes.
            segments_bed: Segments BED file handed to ``Data.load_segments``.
            min_depth: Forwarded unchanged to ``process_by_segment``
                (presumably a minimum read-depth threshold -- confirm there).
            min_bqual: Forwarded unchanged to ``process_by_segment``
                (presumably a minimum base quality -- confirm there).
            min_mqual: Forwarded unchanged to ``process_by_segment``
                (presumably a minimum mapping quality -- confirm there).
            process_num: Upper bound on the number of worker processes.
        """
        self.normal_bam_filename = normal_bam_filename
        self.tumor_bam_filename = tumor_bam_filename
        self.reference_genome_filename = reference_genome_filename
        self.input_filename_base = input_filename_base
        self.segments_bed = segments_bed

        self.min_depth = min_depth
        self.min_bqual = min_bqual
        self.min_mqual = min_mqual
        self.process_num = process_num

        self.data = Data()

    def convert(self):
        """Run the whole pipeline and pickle ``self.data`` to disk."""
        self._load_segments()
        self._get_counts()
        self._get_LOH_frac()

        data_file_name = self.input_filename_base + '.MixClone.input.pkl'
        # ``with`` closes the file even when pickling fails half-way.
        with open(data_file_name, 'wb') as outfile:
            pkl.dump(self.data, outfile, protocol=2)

    def _load_segments(self):
        """Open both BAM files and let ``self.data`` load the BED segments."""
        normal_bam = pysam.Samfile(self.normal_bam_filename, 'rb')
        try:
            tumor_bam = pysam.Samfile(self.tumor_bam_filename, 'rb')
            try:
                # Single-argument parenthesised form prints the same text
                # under both the Python 2 statement and the Python 3 function.
                print('Loading segments by {0}...'.format(self.segments_bed))
                sys.stdout.flush()
                self.data.load_segments(normal_bam, tumor_bam,
                                        self.segments_bed)
            finally:
                tumor_bam.close()
        finally:
            normal_bam.close()

    def _get_counts(self):
        """Fill ``paired_counts`` and ``BAF_counts`` of every segment.

        One argument tuple per segment is mapped over ``process_by_segment``
        in a pool of at most ``self.process_num`` workers (never more workers
        than segments). Does nothing when there are no segments.
        """
        seg_num = self.data.seg_num
        if seg_num == 0:
            # Pool(processes=0) raises ValueError, and there is no work anyway.
            return

        process_num = min(self.process_num, seg_num)
        segments = self.data.segments

        args_list = []
        for j in range(seg_num):
            segment = segments[j]
            args_list.append((
                segment.name, segment.chrom_name, segment.chrom_idx,
                segment.start, segment.end,
                self.normal_bam_filename, self.tumor_bam_filename,
                self.reference_genome_filename,
                self.min_depth, self.min_bqual, self.min_mqual,
            ))

        pool = Pool(processes=process_num)
        try:
            counts_tuple_list = pool.map(process_by_segment, args_list)
        finally:
            # Always release the worker processes, even if a task failed.
            pool.close()
            pool.join()

        # pool.map keeps input order, so result j belongs to segment j.
        for j, (paired_counts_j, BAF_counts_j) in enumerate(counts_tuple_list):
            segments[j].paired_counts = paired_counts_j
            segments[j].BAF_counts = BAF_counts_j

    def _get_LOH_frac(self):
        """Delegate the LOH-fraction computation to ``self.data``."""
        self.data.get_LOH_frac()