Ejemplo n.º 1
0
    def toAlignmentHit(self, querySequence, contig):
        """Build an AlignmentHit from this AmosTileMessage.

        The message carries neither the read nor the target sequence, so
        the query sequence and the contig record must be supplied by the
        caller.  The target sequence is not preserved; a run of 'A'
        characters stands in for it.
        """
        alignedSeq = self.toAlignedSequence(querySequence)

        hit = AlignmentHit()
        hit.query_id = alignedSeq.seqName
        hit.target_id = "%s_contig" % contig['iid']

        # The clear range gives the query extent; a reversed pair of
        # coordinates indicates a reverse-strand alignment.
        clearRange = [int(c) for c in self['clr'].split(',')]
        hit.query_start = min(clearRange)
        hit.query_end = max(clearRange)

        hit.target_start = alignedSeq.getAlignedStart()
        hit.target_end = alignedSeq.getAlignedEnd()

        alignedQuery = str(alignedSeq)
        # Placeholder for the target alignment -- real contig bases are
        # not available in the message.
        contigAlignedSeq = "A" * (hit.target_end - hit.target_start)

        if clearRange[0] < clearRange[1]:
            hit.target_strand = "+"
            hit.alignedQuery = alignedQuery
            hit.alignedTarget = contigAlignedSeq
        else:
            hit.target_strand = "-"
            hit.alignedQuery = revcomp(alignedQuery)
            hit.alignedTarget = revcomp(contigAlignedSeq)
        hit.query_strand = "+"  # for HDF5, query strand is always positive

        return hit
Ejemplo n.º 2
0
    def toAlignedSequence(self, sequence):
        """Build an AlignedSequence from this AmosTileMessage.

        The message does not embed the read bases, so the caller passes
        the sequence in.  A reversed clear range ('clr') marks a
        reverse-strand read, in which case the sequence is
        reverse-complemented before being wrapped.
        """
        clr = [int(c) for c in self['clr'].split(',')]
        # NOTE(review): using max() as the aligned length assumes the
        # clear range is anchored at 0 -- confirm against AMOS spec.
        end = max(clr[0], clr[1])

        if clr[1] < clr[0]:
            # Reversed clear range: reverse-complement the read.
            sequence = revcomp(sequence)

        alignedSeq = AlignedSequence(sequence)
        off = int(self['off'])
        alignedSeq.setAlignedStart(off)
        alignedSeq.setAlignedEnd(off + end)
        alignedSeq.seqName = self['src']

        # Gap positions in the message are 1-based; shift to 0-based
        # before inserting.  Each gap widens the aligned span by one.
        gaps = []
        if self['gap'] != "":
            gaps = [int(g) - 1 for g in self['gap'].split(' ')]
            alignedSeq.insertGaps(gaps, offset=off)
        alignedSeq.setAlignedEnd(alignedSeq.getAlignedEnd() + len(gaps))

        return alignedSeq
Ejemplo n.º 3
0
def _ComparePulseDataWorker(args):
    """Multiprocessing worker: compare pulse kinetics between the
    'Modified' and 'Control' datasets over one reference window.

    args is a single packed tuple (window_start, window_end,
    cmp_filenames, ref_infos, reference_sequence, consensus_calls, opts)
    because this function is dispatched through Pool.map_async, which
    passes exactly one argument per job.

    Returns a dict mapping each metric name in opts.metrics to the path
    of a bz2-compressed CSV of per-reference-position statistics for
    this window.
    """
    window_start, window_end, cmp_filenames, ref_infos, reference_sequence, consensus_calls, opts = args
    logging.debug("_ComparePulseDataWorker started on %s:%d..%d in %s" % (ref_infos['Modified'].fullName, window_start, window_end, cmp_filenames['Modified']))
    
    # Clear the daemon flag inherited from the Pool so this worker may
    # itself create child processes/threads (the collator is threaded).
    current_process().daemon = False
    
    # One alignment-context filter per condition, built from CLI options.
    my_filters = {}
    for condition in 'Modified', 'Control':
        my_filters[condition] = StandardAlnContextFilter(min_alignment_length=opts.min_length,
                                                       min_alignment_zscore=opts.min_z_score,
                                                       anchor=opts.anchor,
                                                       restrict_to_movies = opts.movies[condition],
                                                       restrict_by_strand = opts.strand)
    
    # pulse_kinetics_values[condition][metric] -> per-position values.
    pulse_kinetics_values = {'Modified': {}, 'Control': {}}
    kc_stats = {}
    
    # Collect per-position kinetic values for every requested metric,
    # restricted to this worker's [window_start, window_end] slice.
    for metric in opts.metrics:
        collator = AlignedPulseKineticsCollator([cmp_filenames['Modified'], cmp_filenames['Control']],
                                                [ref_infos['Modified'], ref_infos['Control']],
                                                ref_start=window_start,
                                                ref_end=window_end,
                                                pulse_metric=metric,
                                                template_length=opts.template_length,
                                                aln_context_filters=[my_filters['Modified'], my_filters['Control']],
                                                num_threads=opts.num_threads,
                                                normalize_kinetics=opts.normalize_kinetics,
                                                target_coverage_ceiling=opts.target_coverage_ceiling)
        for condition in 'Modified', 'Control':
            pulse_kinetics_values[condition][metric] = collator.getMetricValues(cmp_filenames[condition], ref_infos[condition])

    # Wrap the raw values in the statistics helper used below.
    for condition in 'Modified', 'Control':
        kc_stats[condition] = KineticContextStatistics(pulse_kinetics_values[condition])
    
    # One temporary bz2 CSV per metric; the paths are returned to the
    # parent process, which concatenates them afterwards.
    worker_csv_out_filenames = {}
    for metric in opts.metrics:
        worker_csv_out_filenames[metric] = tempfile.mkstemp(prefix=metric+"_worker"+str(os.getpid())+"_", suffix='.csv.bz2')[1]
        csv_fh = bz2.BZ2File(worker_csv_out_filenames[metric], 'w')
        csv_writer = csv.writer(csv_fh)
        header = ['RefPosn', 'NucleotideFwd', 'NucleotideRev', 'RefNucleotideFwd', 'RefNucleotideRev',
                  'ModifiedMeanFwd', 'ModifiedMedianFwd', 'ModifiedStdevFwd', 'ModifiedSpreadLoFwd', 'ModifiedSpreadHiFwd', 'ModifiedCoverageFwd',
                  'ModifiedMeanRev', 'ModifiedMedianRev', 'ModifiedStdevRev', 'ModifiedSpreadLoRev', 'ModifiedSpreadHiRev', 'ModifiedCoverageRev',
                  'ControlMeanFwd', 'ControlMedianFwd', 'ControlStdevFwd', 'ControlSpreadLoFwd', 'ControlSpreadHiFwd', 'ControlCoverageFwd',
                  'ControlMeanRev', 'ControlMedianRev', 'ControlStdevRev', 'ControlSpreadLoRev', 'ControlSpreadHiRev', 'ControlCoverageRev',
                  'LLRFwd', 'LLRRev', 'PvalueFwd', 'PvalueRev', 'OfInterestFwd', 'OfInterestRev']
        
        csv_writer.writerow(header)
        row_buffer = []
        
        # Format floats to 8 decimal places for output, except the
        # p-value columns, which keep full precision.
        def _preprocess_row(row):
            for i in range(len(row)):
                if isinstance(row[i], float) and header[i] != 'PvalueFwd' and header[i] != 'PvalueRev':
                    row[i] = "%.8f" % row[i]
            return row
        
        # True: spread columns are mean +/- standard error;
        # False: mean +/- standard deviation.
        use_se = True
        
        ctrl_posns = pulse_kinetics_values['Control'][metric].keys()
        # Only positions present in BOTH conditions are reported.
        for ref_pos in sorted(pulse_kinetics_values['Modified'][metric].keys()):
            if ref_pos not in ctrl_posns:
                # TODO: collect stats on matching vs. no-match positions
                #logging.info("Reference position %d in modified dataset but not in control dataset" % ref_pos)
                continue

            # Consensus base (and its reverse complement) at this
            # position; 'N' when the position is out of range.
            try:
                my_nt = consensus_calls[ref_pos].upper()
                my_nt_rc = revcomp(consensus_calls[ref_pos]).upper()
            except IndexError:
                my_nt, my_nt_rc = 'N', 'N'
            
            # Reference base, when a reference sequence was provided.
            my_ref_nt, my_ref_nt_rc = 'N', 'N'
            if reference_sequence:
                my_ref_nt = reference_sequence[ref_pos].upper()
                my_ref_nt_rc = revcomp(reference_sequence[ref_pos]).upper()
            
            my_report_row = [str(ref_pos), my_nt, my_nt_rc, my_ref_nt, my_ref_nt_rc]
            
            # Summary statistics per condition and strand; the order of
            # extend() calls below must match the header columns above.
            my_stats = {'Modified': {'+': {}, '-': {}}, 'Control': {'+': {}, '-': {}}}
            for condition in 'Modified', 'Control':
                for strand in '+', '-':
                    my_stats[condition][strand]['mean'] = kc_stats[condition].meanMetric(metric, ref_positions=ref_pos, trim=opts.exp_trim, coverage_ceiling=opts.coverage_ceiling, strand=strand)
                    my_stats[condition][strand]['median'] = kc_stats[condition].medianMetric(metric, ref_positions=ref_pos, trim=opts.exp_trim, coverage_ceiling=opts.coverage_ceiling, strand=strand)
                    my_stats[condition][strand]['stdev'] = kc_stats[condition].stdevMetric(metric, ref_positions=ref_pos, trim=opts.exp_trim, coverage_ceiling=opts.coverage_ceiling, strand=strand)
                    #(m2_ci_lo_fwd, m2_ci_hi_fwd) = kc_stats['Control'].bootstrapCIofMeanMetric(metric, ref_positions=ref_pos, trim=opts.exp_trim, coverage_ceiling=opts.coverage_ceiling, strand='+')
                    if use_se:
#                        se = kc_stats[condition].bootstrapSEofMeanMetric(metric, ref_positions=ref_pos, trim=opts.exp_trim, coverage_ceiling=opts.coverage_ceiling, strand=strand)
                        # Standard error = stdev / sqrt(n), n taken untrimmed.
                        se = my_stats[condition][strand]['stdev'] / sqrt(len(kc_stats[condition]._getMetricDataForPos(ref_pos, metric, trim=0.0, coverage_ceiling=opts.coverage_ceiling, strand=strand)))
                        my_stats[condition][strand]['spread_lo'] = my_stats[condition][strand]['mean'] - se
                        my_stats[condition][strand]['spread_hi'] = my_stats[condition][strand]['mean'] + se
                    else:
                        my_stats[condition][strand]['spread_lo'] = my_stats[condition][strand]['mean'] - my_stats[condition][strand]['stdev']
                        my_stats[condition][strand]['spread_hi'] = my_stats[condition][strand]['mean'] + my_stats[condition][strand]['stdev']
                    my_stats[condition][strand]['coverage'] = len(kc_stats[condition]._getMetricDataForPos(ref_pos, metric, trim=0.0, coverage_ceiling=opts.coverage_ceiling, strand=strand))
                    my_report_row.extend([my_stats[condition][strand]['mean'],
                                          my_stats[condition][strand]['median'],
                                          my_stats[condition][strand]['stdev'],
                                          my_stats[condition][strand]['spread_lo'],
                                          my_stats[condition][strand]['spread_hi'],
                                          my_stats[condition][strand]['coverage']])
                    
            # Raw (and log-transformed) value arrays for the tests below.
            my_values = {'Modified': {'+': {}, '-': {}}, 'Control': {'+': {}, '-': {}}}
            my_logvalues = {'Modified': {'+': {}, '-': {}}, 'Control': {'+': {}, '-': {}}}
            for condition in 'Modified', 'Control':
                for strand in '+', '-':
                    my_values[condition][strand] = kc_stats[condition]._getMetricDataForPos(ref_pos, metric, trim=opts.exp_trim, coverage_ceiling=opts.coverage_ceiling, strand=strand)
                    # +0.001 guards against log(0).
                    my_logvalues[condition][strand] = log(my_values[condition][strand] + 0.001)
                        
            # Log-likelihood ratio, Modified vs. Control, per strand.
            llr_fwd = llr2samp(my_values['Modified']['+'], my_values['Control']['+'], min_samples=10)
            llr_rev = llr2samp(my_values['Modified']['-'], my_values['Control']['-'], min_samples=10)
            
#            tvalue_fwd, pvalue_fwd = ttest_ind(my_logvalues['Modified']['+'], my_logvalues['Control']['+'])
#            tvalue_rev, pvalue_rev = ttest_ind(my_logvalues['Modified']['-'], my_logvalues['Control']['-'])

            # Kolmogorov-Smirnov test on the log values; NaN when either
            # sample is empty for a strand.
            tvalue_fwd, pvalue_fwd, tvalue_rev, pvalue_rev = numpy.nan, numpy.nan, numpy.nan, numpy.nan
            if len(my_logvalues['Modified']['+']) > 0 and len(my_logvalues['Control']['+']) > 0:
                tvalue_fwd, pvalue_fwd = ks_2samp(my_logvalues['Modified']['+'], my_logvalues['Control']['+'])
                
#                bg_tvalues, bg_pvalues = [], []
#                for i in range(10):
#                    s = random.sample(my_logvalues['Control']['+'], len(my_logvalues['Control']['+']))
#                    rD, rp = ks_2samp(s[:len(s)/2], s[len(s)/2:])
#                    bg_tvalues.append(rD)
#                    bg_pvalues.append(rp)
#                
#                print pvalue_fwd, mean(bg_pvalues)
                
                
            if len(my_logvalues['Modified']['-']) > 0 and len(my_logvalues['Control']['-']) > 0:
                tvalue_rev, pvalue_rev = ks_2samp(my_logvalues['Modified']['-'], my_logvalues['Control']['-'])
            # Flag positions the user declared as known-modified.
            of_interest_fwd = 1 if ref_pos in opts.modified_positions_fwd else 0
            of_interest_rev = 1 if ref_pos in opts.modified_positions_rev else 0
                        
            my_report_row.extend([llr_fwd, llr_rev, pvalue_fwd, pvalue_rev, of_interest_fwd, of_interest_rev])
            
            # Buffer rows and flush in batches to limit writer overhead.
            row_buffer.append(_preprocess_row(my_report_row)) 
            if len(row_buffer) > 1000:
                csv_writer.writerows(row_buffer)
                row_buffer = []
            
            # Optional IPD-ratio histograms at positions of interest.
            if opts.plot_ipdr_histograms and metric == 'IPD' and of_interest_fwd:
                ctrl_mean = kc_stats['Control'].meanMetric(metric, ref_positions=ref_pos, trim=opts.exp_trim, coverage_ceiling=opts.coverage_ceiling, strand='+')
                # subread averaging and min subread req/multiple line plots for diff. subread counts goes here
                values = my_values['Modified']['+'] / ctrl_mean
                plot_ipdr_histogram(ref_infos['Modified'].fullName, ref_pos, '+', metric, values, opts)
            if opts.plot_ipdr_histograms and metric == 'IPD' and of_interest_rev:
                ctrl_mean = kc_stats['Control'].meanMetric(metric, ref_positions=ref_pos, trim=opts.exp_trim, coverage_ceiling=opts.coverage_ceiling, strand='-')
                values = my_values['Modified']['-'] / ctrl_mean
                plot_ipdr_histogram(ref_infos['Modified'].fullName, ref_pos, '-', metric, values, opts)
        
        # Flush any remaining buffered rows before closing the file.
        csv_writer.writerows(row_buffer)
        csv_fh.close()
    return worker_csv_out_filenames
Ejemplo n.º 4
0
    def run(self):
        """Top-level driver: load both cmp.h5 datasets, window the
        reference, fan comparison work out to _ComparePulseDataWorker,
        concatenate the per-window CSVs, and optionally generate reports
        and a zip archive of the output directory.
        """
        for condition in 'Modified', 'Control':
            cmph5_file, smrtpipe_job_name, reference_location = self._findCmpH5(self.input[condition])
            
            # Legacy PB_v0.1 files must be converted before use.
            cmp_fh = h5py.File(cmph5_file, 'r')
            version = str(cmp_fh.attrs["Version"]) if "Version" in cmp_fh.attrs else None
            cmp_fh.close()
            if version == "PB_v0.1":
                cmph5_file = self._convertCmpTo12(cmph5_file)
            
            self.cmp_filenames[condition] = cmph5_file
            self.smrtpipe_job_names[condition] = smrtpipe_job_name

            # Pick the first reference group (or the one matching the
            # user-requested reference id, when restricted).
            cmp_fh = cmph5.factory.create(self.cmp_filenames[condition])
            for ref_info in cmp_fh.refInfoIterator():
                if self.restrict_ref_id[condition] and ref_info.fullName != self.restrict_ref_id[condition]:
                    continue
                self.ref_infos[condition] = ref_info; break
            
            # Reference sequence and consensus calls come from the
            # 'Modified' dataset only.
            if condition == 'Modified':
                self.reference_sequence = self._getReferenceSequence(reference_location)
                ref_name = self.ref_infos[condition].fullName
                ref_group = cmp_fh.refGroupFromFullName(ref_name)
                variants_gff_file = os.path.join(os.path.dirname(self.cmp_filenames[condition]), "variants.gff")
                # Fall back to the gzipped variant of variants.gff.
                if not os.path.exists(variants_gff_file):
                    variants_gff_file += ".gz"
                
                if ref_group.hasConsensus and os.path.exists(variants_gff_file):
                    variants_fh = GffIO.GffReader(variants_gff_file)
                    mapper = ConsensusIO.ConsensusReferenceMapper(cmp_fh, variants_fh, ref_names=[ref_name], gap_value=-1)
                    self.consensus_calls = mapper.consensusInRefSpace(self.ref_infos[condition])
                else:
                    # Sentinel: any index into 'N' beyond 0 raises
                    # IndexError, which the worker maps to base 'N'.
                    self.consensus_calls = 'N'

            cmp_fh.close()
            
            if self.restrict_ref_id[condition] and condition not in self.ref_infos:
                raise StandardError('No data found matching reference name %s for condition %s' % (self.restrict_ref_id[condition], condition))

        # Default window end: full reference length, or the template
        # length when one was given.
        if self.opts.window_end is None:
            self.opts.window_end = self.ref_infos['Modified'].length
            if self.opts.template_length:
                self.opts.window_end = self.opts.template_length
        
        if self.opts.window_end < self.opts.window_start:
            raise StandardError('Window start exceeds window end') 

        # Expand user-supplied motifs into explicit modified positions
        # on both strands, using the reference (preferred) or consensus.
        if self.opts.motifs:
            my_sequence = self.reference_sequence if self.reference_sequence != None else self.consensus_calls
            if my_sequence != None:
                for motif in self.opts.motifs:
                    fwd_motif_pos = [ m.start() + self.opts.pos_in_motif for m in re.finditer(motif, my_sequence, re.IGNORECASE) ]
                    self.opts.modified_positions_fwd.extend(fwd_motif_pos)
                    # Reverse-strand hits: search the revcomp, then map
                    # match coordinates back into forward-strand space.
                    rev_motif_pos = [ len(my_sequence) - 1 - m.start() - self.opts.pos_in_motif for m in re.finditer(motif, revcomp(my_sequence), re.IGNORECASE) ]
                    self.opts.modified_positions_rev.extend(rev_motif_pos)
            else:
                sys.exit("Motif specified but no reference or consensus sequence found to find motifs in")
        
        # When zipping, stage output in a temp dir and remember the
        # requested path as the zipfile name.
        if self.opts.write_zipfile:
            output_file = self.opts.output_dir
            self.opts.output_dir = tempfile.mkdtemp()
        else:
            if not os.path.isdir(self.opts.output_dir): os.mkdir(self.opts.output_dir)
        
        # Split the window into fixed-stride jobs, one per worker call.
        stride = 40000
        jobs = []
        for window_start in range(self.opts.window_start, self.opts.window_end+1, stride):
            jobs.append([window_start, min(window_start+stride-1, self.opts.window_end),
                         self.cmp_filenames, self.ref_infos, self.reference_sequence, self.consensus_calls, self.opts])
        
        if self.opts.num_threads > 1:
            # maxtasksperchild=1 recycles workers to bound memory growth.
            p = Pool(self.opts.num_threads, maxtasksperchild=1)
            job_results = p.map_async(_ComparePulseDataWorker, jobs).get(99999999)
            try:
                p.close()
            except AttributeError:
                pass
        else:
            job_results = map(_ComparePulseDataWorker, jobs)
        
        # Group the per-window CSV paths by metric.
        worker_table_files = dict(zip(self.opts.metrics, ([] for i in self.opts.metrics)))
        for fileset in job_results:
            for metric, file in fileset.iteritems():
                worker_table_files[metric].append(file)

        # Concatenate worker CSVs per metric, writing the header once.
        table_files = {}
        for metric in self.opts.metrics:
#            table_files[metric] = tempfile.mkstemp(prefix=metric+"_", suffix='.csv.bz2')[1]
            table_files[metric] = os.path.join(self.opts.output_dir, metric+"_per_ref_pos.csv.bz2")
            with bz2.BZ2File(table_files[metric], 'w') as concat_csv_fh:
                # NOTE(review): this BZ2File handle is never closed --
                # minor leak; consider a with-block here too.
                header = bz2.BZ2File(worker_table_files[metric][0], 'r').readline()
                concat_csv_fh.write(header)
                for worker_csv in worker_table_files[metric]:
                    with bz2.BZ2File(worker_csv, 'r') as worker_csv_fh:
                        # Skip each worker's header line.
                        header = worker_csv_fh.readline()
                        for line in worker_csv_fh:
                            concat_csv_fh.write(line)
        
        # Optionally launch one report generator per metric and wait for
        # all of them, aborting on the first failure.
        if self.opts.make_report:
            child_process_handles = []
            for metric in self.opts.metrics:
                invoke_string = "makePulseKineticsReport.py '%s'" % table_files[metric]
                invoke_string += " --metric=%s" % metric
                invoke_string += " --title='%s (%s) vs. %s (%s)'" % (self.input['Modified'], self.smrtpipe_job_names['Modified'], self.input['Control'], self.smrtpipe_job_names['Control'])
                invoke_string += " --ref_name='%s'" % self.ref_infos['Modified'].fullName
                invoke_string += " --output='%s'" % self.opts.output_dir
                invoke_string += " --tempdir='%s'" % self.opts.tempdir
                if self.opts.min_coverage:
                    invoke_string += " --min_coverage=%s" % self.opts.min_coverage
                if self.opts.template_length: # proxy for LIMS template, which should be visualized with inverted strand sense
                    invoke_string += " --invert_strand_sense"
                if self.opts.save_plot_csv:
                    invoke_string += " --save_plot_csv"
                if self.opts.ylim is not None:
                    invoke_string += " --ymax=%f" % self.opts.ylim
                if self.opts.debug:
                    invoke_string += " --debug"
                logging.debug("Running %s" % invoke_string)
                ph = subprocess.Popen(invoke_string, shell=True)
                child_process_handles.append(ph)
            
            for ph in child_process_handles:
                if ph:
                    returncode = ph.wait()
                    if returncode: sys.exit(returncode)
        
        # Pack the staged output directory into the requested zipfile.
        if self.opts.write_zipfile:
            z = zipfile.ZipFile(output_file, 'w')
            for f in os.listdir(self.opts.output_dir): z.write(os.path.join(self.opts.output_dir, f), f)
            z.close()