def toAlignmentHit(self, querySequence, contig):
    """Build an AlignmentHit for this AMOS tile against *contig*.

    The tile message carries only coordinates, so the read sequence must
    be supplied by the caller.  The target sequence is not maintained; a
    run of "A" characters of the right length stands in for it.
    """
    aligned = self.toAlignedSequence(querySequence)
    hit = AlignmentHit()
    hit.query_id = aligned.seqName
    hit.target_id = "%s_contig" % contig['iid']
    clr = [int(v) for v in self['clr'].split(',')]
    hit.query_start = min(clr)
    hit.query_end = max(clr)
    hit.target_start = aligned.getAlignedStart()
    hit.target_end = aligned.getAlignedEnd()
    gapped_query = str(aligned)
    # Placeholder target sequence: the real contig bases are not tracked.
    placeholder_target = "A" * (hit.target_end - hit.target_start)
    if clr[0] >= clr[1]:
        # A reversed (or empty) clear range means the read hit the
        # reverse strand of the contig.
        hit.target_strand = "-"
        hit.alignedQuery = revcomp(str(gapped_query))
        hit.alignedTarget = revcomp(placeholder_target)
    else:
        hit.target_strand = "+"
        hit.alignedQuery = gapped_query
        hit.alignedTarget = placeholder_target
    hit.query_strand = "+"  # for HDF5, query strand is always positive
    return hit
def toAlignedSequence(self, sequence):
    """Build an AlignedSequence object for this AMOS tile message.

    The message stores only coordinates ('clr', 'off', 'gap', 'src'),
    so the read sequence itself must be supplied by the caller.
    """
    clr = [int(v) for v in self['clr'].split(',')]
    span_end = max(clr[0], clr[1])
    # A reversed clear range means the read aligns to the reverse
    # strand, so reverse-complement the supplied sequence first.
    if clr[1] < clr[0]:
        sequence = revcomp(sequence)
    aligned = AlignedSequence(sequence)
    offset = int(self['off'])
    aligned.setAlignedStart(offset)
    aligned.setAlignedEnd(offset + span_end)
    aligned.seqName = self['src']
    # Gap positions in the message are 1-based; convert to 0-based.
    gap_positions = []
    if self['gap'] != "":
        gap_positions = [int(g) - 1 for g in self['gap'].split(' ')]
    aligned.insertGaps(gap_positions, offset=offset)
    # Every inserted gap lengthens the aligned span by one column.
    aligned.setAlignedEnd(aligned.getAlignedEnd() + len(gap_positions))
    return aligned
def _ComparePulseDataWorker(args):
    """Compare pulse kinetics between Modified and Control over one window.

    For each metric in opts.metrics, collates per-position kinetic values
    from both cmp.h5 files for reference positions window_start..window_end,
    computes per-strand summary statistics plus Modified-vs-Control test
    statistics, and streams one CSV row per reference position to a
    bzip2-compressed temp file.

    args: a single tuple (window_start, window_end, cmp_filenames,
        ref_infos, reference_sequence, consensus_calls, opts), packed so
        this function can be dispatched via multiprocessing Pool.map.

    Returns: dict mapping metric name -> temp ".csv.bz2" filename.
    """
    window_start, window_end, cmp_filenames, ref_infos, reference_sequence, consensus_calls, opts = args
    logging.debug("_ComparePulseDataWorker started on %s:%d..%d in %s" % (ref_infos['Modified'].fullName, window_start, window_end, cmp_filenames['Modified']))
    # NOTE(review): presumably cleared so this pool worker may spawn its own
    # children (daemonic processes cannot) — confirm against the collator.
    current_process().daemon = False
    # One alignment-context filter per condition, restricted to that
    # condition's movies (and optionally a single strand).
    my_filters = {}
    for condition in 'Modified', 'Control':
        my_filters[condition] = StandardAlnContextFilter(min_alignment_length=opts.min_length, min_alignment_zscore=opts.min_z_score, anchor=opts.anchor, restrict_to_movies = opts.movies[condition], restrict_by_strand = opts.strand)
    # pulse_kinetics_values[condition][metric] -> per-position values.
    pulse_kinetics_values = {'Modified': {}, 'Control': {}}
    kc_stats = {}
    for metric in opts.metrics:
        # The collator reads both cmp.h5 files at once for this metric/window.
        collator = AlignedPulseKineticsCollator([cmp_filenames['Modified'], cmp_filenames['Control']], [ref_infos['Modified'], ref_infos['Control']], ref_start=window_start, ref_end=window_end, pulse_metric=metric, template_length=opts.template_length, aln_context_filters=[my_filters['Modified'], my_filters['Control']], num_threads=opts.num_threads, normalize_kinetics=opts.normalize_kinetics, target_coverage_ceiling=opts.target_coverage_ceiling)
        for condition in 'Modified', 'Control':
            pulse_kinetics_values[condition][metric] = collator.getMetricValues(cmp_filenames[condition], ref_infos[condition])
    # Statistics helpers wrap the collated values for mean/median/stdev queries.
    for condition in 'Modified', 'Control':
        kc_stats[condition] = KineticContextStatistics(pulse_kinetics_values[condition])
    worker_csv_out_filenames = {}
    for metric in opts.metrics:
        # One compressed CSV per metric; the PID keeps temp names unique
        # across concurrent workers.
        worker_csv_out_filenames[metric] = tempfile.mkstemp(prefix=metric+"_worker"+str(os.getpid())+"_", suffix='.csv.bz2')[1]
        csv_fh = bz2.BZ2File(worker_csv_out_filenames[metric], 'w')
        csv_writer = csv.writer(csv_fh)
        header = ['RefPosn', 'NucleotideFwd', 'NucleotideRev', 'RefNucleotideFwd', 'RefNucleotideRev', 'ModifiedMeanFwd', 'ModifiedMedianFwd', 'ModifiedStdevFwd', 'ModifiedSpreadLoFwd', 'ModifiedSpreadHiFwd', 'ModifiedCoverageFwd', 'ModifiedMeanRev', 'ModifiedMedianRev', 'ModifiedStdevRev', 'ModifiedSpreadLoRev',
                  'ModifiedSpreadHiRev', 'ModifiedCoverageRev', 'ControlMeanFwd', 'ControlMedianFwd', 'ControlStdevFwd', 'ControlSpreadLoFwd', 'ControlSpreadHiFwd', 'ControlCoverageFwd', 'ControlMeanRev', 'ControlMedianRev', 'ControlStdevRev', 'ControlSpreadLoRev', 'ControlSpreadHiRev', 'ControlCoverageRev', 'LLRFwd', 'LLRRev', 'PvalueFwd', 'PvalueRev', 'OfInterestFwd', 'OfInterestRev']
        csv_writer.writerow(header)
        row_buffer = []
        def _preprocess_row(row):
            # Round floats to 8 decimals for output, except the p-value
            # columns, which keep full precision (they can be very small).
            for i in range(len(row)):
                if isinstance(row[i], float) and header[i] != 'PvalueFwd' and header[i] != 'PvalueRev':
                    row[i] = "%.8f" % row[i]
            return row
        # use_se: spread columns are mean +/- standard error of the mean
        # rather than mean +/- stdev.
        use_se = True
        ctrl_posns = pulse_kinetics_values['Control'][metric].keys()
        for ref_pos in sorted(pulse_kinetics_values['Modified'][metric].keys()):
            # Only compare positions present in BOTH conditions.
            if ref_pos not in ctrl_posns:
                # TODO: collect stats on matching vs. no-match positions
                #logging.info("Reference position %d in modified dataset but not in control dataset" % ref_pos)
                continue
            # Consensus base at this position; consensus_calls may be the
            # single-character fallback 'N', in which case positions >= 1
            # raise IndexError and fall back to 'N' here.
            try:
                my_nt = consensus_calls[ref_pos].upper()
                my_nt_rc = revcomp(consensus_calls[ref_pos]).upper()
            except IndexError:
                my_nt, my_nt_rc = 'N', 'N'
            my_ref_nt, my_ref_nt_rc = 'N', 'N'
            if reference_sequence:
                my_ref_nt = reference_sequence[ref_pos].upper()
                my_ref_nt_rc = revcomp(reference_sequence[ref_pos]).upper()
            my_report_row = [str(ref_pos), my_nt, my_nt_rc, my_ref_nt, my_ref_nt_rc]
            # Per-condition, per-strand summary statistics.  The extend()
            # order (Modified +/-, then Control +/-) must match the header.
            my_stats = {'Modified': {'+': {}, '-': {}}, 'Control': {'+': {}, '-': {}}}
            for condition in 'Modified', 'Control':
                for strand in '+', '-':
                    my_stats[condition][strand]['mean'] = kc_stats[condition].meanMetric(metric, ref_positions=ref_pos, trim=opts.exp_trim, coverage_ceiling=opts.coverage_ceiling, strand=strand)
                    my_stats[condition][strand]['median'] = kc_stats[condition].medianMetric(metric, ref_positions=ref_pos, trim=opts.exp_trim, coverage_ceiling=opts.coverage_ceiling, strand=strand)
                    my_stats[condition][strand]['stdev'] = kc_stats[condition].stdevMetric(metric, ref_positions=ref_pos, trim=opts.exp_trim, coverage_ceiling=opts.coverage_ceiling, strand=strand)
                    #(m2_ci_lo_fwd, m2_ci_hi_fwd) = kc_stats['Control'].bootstrapCIofMeanMetric(metric, ref_positions=ref_pos, trim=opts.exp_trim, coverage_ceiling=opts.coverage_ceiling, strand='+')
                    if use_se:
                        # Standard error = stdev / sqrt(n), with n taken from
                        # the UNtrimmed (trim=0.0) data for this position.
                        # se = kc_stats[condition].bootstrapSEofMeanMetric(metric, ref_positions=ref_pos, trim=opts.exp_trim, coverage_ceiling=opts.coverage_ceiling, strand=strand)
                        se = my_stats[condition][strand]['stdev'] / sqrt(len(kc_stats[condition]._getMetricDataForPos(ref_pos, metric, trim=0.0, coverage_ceiling=opts.coverage_ceiling, strand=strand)))
                        my_stats[condition][strand]['spread_lo'] = my_stats[condition][strand]['mean'] - se
                        my_stats[condition][strand]['spread_hi'] = my_stats[condition][strand]['mean'] + se
                    else:
                        my_stats[condition][strand]['spread_lo'] = my_stats[condition][strand]['mean'] - my_stats[condition][strand]['stdev']
                        my_stats[condition][strand]['spread_hi'] = my_stats[condition][strand]['mean'] + my_stats[condition][strand]['stdev']
                    my_stats[condition][strand]['coverage'] = len(kc_stats[condition]._getMetricDataForPos(ref_pos, metric, trim=0.0, coverage_ceiling=opts.coverage_ceiling, strand=strand))
                    my_report_row.extend([my_stats[condition][strand]['mean'], my_stats[condition][strand]['median'], my_stats[condition][strand]['stdev'], my_stats[condition][strand]['spread_lo'], my_stats[condition][strand]['spread_hi'], my_stats[condition][strand]['coverage']])
            # Raw (trimmed) per-read values and their logs, used for the
            # LLR and KS tests below.  The +0.001 pseudocount avoids log(0);
            # NOTE(review): this arithmetic implies the values are numpy
            # arrays — confirm against _getMetricDataForPos.
            my_values = {'Modified': {'+': {}, '-': {}}, 'Control': {'+': {}, '-': {}}}
            my_logvalues = {'Modified': {'+': {}, '-': {}}, 'Control': {'+': {}, '-': {}}}
            for condition in 'Modified', 'Control':
                for strand in '+', '-':
                    my_values[condition][strand] = kc_stats[condition]._getMetricDataForPos(ref_pos, metric, trim=opts.exp_trim, coverage_ceiling=opts.coverage_ceiling, strand=strand)
                    my_logvalues[condition][strand] = log(my_values[condition][strand] + 0.001)
            # Log-likelihood ratio of Modified vs. Control, per strand.
            llr_fwd = llr2samp(my_values['Modified']['+'], my_values['Control']['+'],
                               min_samples=10)
            llr_rev = llr2samp(my_values['Modified']['-'], my_values['Control']['-'],
                               min_samples=10)
            # tvalue_fwd, pvalue_fwd = ttest_ind(my_logvalues['Modified']['+'], my_logvalues['Control']['+'])
            # tvalue_rev, pvalue_rev = ttest_ind(my_logvalues['Modified']['-'], my_logvalues['Control']['-'])
            # Kolmogorov-Smirnov test on log values, per strand; NaN when
            # either sample is empty.
            tvalue_fwd, pvalue_fwd, tvalue_rev, pvalue_rev = numpy.nan, numpy.nan, numpy.nan, numpy.nan
            if len(my_logvalues['Modified']['+']) > 0 and len(my_logvalues['Control']['+']) > 0:
                tvalue_fwd, pvalue_fwd = ks_2samp(my_logvalues['Modified']['+'], my_logvalues['Control']['+'])
                # bg_tvalues, bg_pvalues = [], []
                # for i in range(10):
                #     s = random.sample(my_logvalues['Control']['+'], len(my_logvalues['Control']['+']))
                #     rD, rp = ks_2samp(s[:len(s)/2], s[len(s)/2:])
                #     bg_tvalues.append(rD)
                #     bg_pvalues.append(rp)
                # # print pvalue_fwd, mean(bg_pvalues)
            if len(my_logvalues['Modified']['-']) > 0 and len(my_logvalues['Control']['-']) > 0:
                tvalue_rev, pvalue_rev = ks_2samp(my_logvalues['Modified']['-'], my_logvalues['Control']['-'])
            # Flag positions the user declared as (expected) modified sites.
            of_interest_fwd = 1 if ref_pos in opts.modified_positions_fwd else 0
            of_interest_rev = 1 if ref_pos in opts.modified_positions_rev else 0
            my_report_row.extend([llr_fwd, llr_rev, pvalue_fwd, pvalue_rev, of_interest_fwd, of_interest_rev])
            row_buffer.append(_preprocess_row(my_report_row))
            # Flush in batches to bound memory.
            if len(row_buffer) > 1000:
                csv_writer.writerows(row_buffer)
                row_buffer = []
            # Optional IPD-ratio histograms at positions of interest:
            # Modified per-read values normalized by the Control mean.
            if opts.plot_ipdr_histograms and metric == 'IPD' and of_interest_fwd:
                ctrl_mean = kc_stats['Control'].meanMetric(metric, ref_positions=ref_pos, trim=opts.exp_trim, coverage_ceiling=opts.coverage_ceiling, strand='+')
                # subread averaging and min subread req/multiple line plots
                # for diff. subread counts goes here
                values = my_values['Modified']['+'] / ctrl_mean
                plot_ipdr_histogram(ref_infos['Modified'].fullName, ref_pos, '+', metric, values, opts)
            if opts.plot_ipdr_histograms and metric == 'IPD' and of_interest_rev:
                ctrl_mean = kc_stats['Control'].meanMetric(metric, ref_positions=ref_pos, trim=opts.exp_trim, coverage_ceiling=opts.coverage_ceiling, strand='-')
                values = my_values['Modified']['-'] / ctrl_mean
                plot_ipdr_histogram(ref_infos['Modified'].fullName, ref_pos, '-', metric, values, opts)
        # Flush any remaining buffered rows for this metric.
        csv_writer.writerows(row_buffer)
        csv_fh.close()
    return worker_csv_out_filenames
def run(self):
    """Drive the full Modified-vs-Control pulse-kinetics comparison.

    Steps: locate each condition's cmp.h5 (converting the legacy
    PB_v0.1 format if needed), select the reference, load the reference
    and consensus sequences, resolve motif positions of interest, split
    the reference window into fixed-stride jobs for
    _ComparePulseDataWorker (parallel via multiprocessing when
    opts.num_threads > 1), concatenate the per-worker CSVs into one
    file per metric, then optionally spawn report scripts and zip the
    output directory.
    """
    for condition in 'Modified', 'Control':
        cmph5_file, smrtpipe_job_name, reference_location = self._findCmpH5(self.input[condition])
        # Peek at the file-format version attribute only, then close.
        cmp_fh = h5py.File(cmph5_file, 'r')
        version = str(cmp_fh.attrs["Version"]) if "Version" in cmp_fh.attrs else None
        cmp_fh.close()
        if version == "PB_v0.1":
            # Legacy format: convert to the 1.2 layout before reading.
            cmph5_file = self._convertCmpTo12(cmph5_file)
        self.cmp_filenames[condition] = cmph5_file
        self.smrtpipe_job_names[condition] = smrtpipe_job_name
        cmp_fh = cmph5.factory.create(self.cmp_filenames[condition])
        # Pick the first reference, or the one matching restrict_ref_id.
        for ref_info in cmp_fh.refInfoIterator():
            if self.restrict_ref_id[condition] and ref_info.fullName != self.restrict_ref_id[condition]:
                continue
            self.ref_infos[condition] = ref_info; break
        if condition == 'Modified':
            # Reference and consensus sequences are taken from the
            # Modified condition only.
            self.reference_sequence = self._getReferenceSequence(reference_location)
            ref_name = self.ref_infos[condition].fullName
            ref_group = cmp_fh.refGroupFromFullName(ref_name)
            # variants.gff may be gzipped; try the plain name first.
            variants_gff_file = os.path.join(os.path.dirname(self.cmp_filenames[condition]), "variants.gff")
            if not os.path.exists(variants_gff_file):
                variants_gff_file += ".gz"
            if ref_group.hasConsensus and os.path.exists(variants_gff_file):
                variants_fh = GffIO.GffReader(variants_gff_file)
                mapper = ConsensusIO.ConsensusReferenceMapper(cmp_fh, variants_fh, ref_names=[ref_name], gap_value=-1)
                self.consensus_calls = mapper.consensusInRefSpace(self.ref_infos[condition])
            else:
                # Fallback sentinel: the worker's IndexError handler turns
                # this into 'N' at every position past 0.
                self.consensus_calls = 'N'
        cmp_fh.close()
        if self.restrict_ref_id[condition] and condition not in self.ref_infos:
            raise StandardError('No data found matching reference name %s for condition %s' % (self.restrict_ref_id[condition], condition))
    # Default the window end to the full reference length; an explicit
    # template_length overrides it.
    if self.opts.window_end is None:
        self.opts.window_end = self.ref_infos['Modified'].length
    if self.opts.template_length:
        self.opts.window_end = self.opts.template_length
    if self.opts.window_end < self.opts.window_start:
        raise StandardError('Window start exceeds window end')
    if self.opts.motifs:
        # Search motifs in the reference if available, else the consensus.
        my_sequence = self.reference_sequence if self.reference_sequence != None else self.consensus_calls
        if my_sequence != None:
            for motif in self.opts.motifs:
                # Forward-strand hits: motif start plus the offset of the
                # modified base within the motif.
                fwd_motif_pos = [ m.start() + self.opts.pos_in_motif for m in re.finditer(motif, my_sequence, re.IGNORECASE) ]
                self.opts.modified_positions_fwd.extend(fwd_motif_pos)
                # Reverse-strand hits: search the reverse complement and
                # map coordinates back to forward-strand positions.
                rev_motif_pos = [ len(my_sequence) - 1 - m.start() - self.opts.pos_in_motif for m in re.finditer(motif, revcomp(my_sequence), re.IGNORECASE) ]
                self.opts.modified_positions_rev.extend(rev_motif_pos)
        else:
            sys.exit("Motif specified but no reference or consensus sequence found to find motifs in")
    if self.opts.write_zipfile:
        # Write results into a temp dir; the requested output path becomes
        # the zip file name (used at the end of this method).
        output_file = self.opts.output_dir
        self.opts.output_dir = tempfile.mkdtemp()
    else:
        if not os.path.isdir(self.opts.output_dir):
            os.mkdir(self.opts.output_dir)
    # Split the window into fixed-stride jobs; each job tuple matches the
    # unpacking at the top of _ComparePulseDataWorker.
    stride = 40000
    jobs = []
    for window_start in range(self.opts.window_start, self.opts.window_end+1, stride):
        jobs.append([window_start, min(window_start+stride-1, self.opts.window_end), self.cmp_filenames, self.ref_infos, self.reference_sequence, self.consensus_calls, self.opts])
    if self.opts.num_threads > 1:
        p = Pool(self.opts.num_threads, maxtasksperchild=1)
        # NOTE(review): .get() with a huge timeout rather than a bare
        # .get() — presumably so Ctrl-C interrupts the wait (a known
        # Python 2 multiprocessing workaround); confirm.
        job_results = p.map_async(_ComparePulseDataWorker, jobs).get(99999999)
        # Best-effort close; tolerated to fail on some Pool implementations.
        try:
            p.close()
        except AttributeError:
            pass
    else:
        job_results = map(_ComparePulseDataWorker, jobs)
    # Group worker CSV filenames by metric.
    worker_table_files = dict(zip(self.opts.metrics, ([] for i in self.opts.metrics)))
    for fileset in job_results:
        for metric, file in fileset.iteritems():
            worker_table_files[metric].append(file)
    table_files = {}
    for metric in self.opts.metrics:
        # table_files[metric] = tempfile.mkstemp(prefix=metric+"_", suffix='.csv.bz2')[1]
        table_files[metric] = os.path.join(self.opts.output_dir, metric+"_per_ref_pos.csv.bz2")
        with bz2.BZ2File(table_files[metric], 'w') as concat_csv_fh:
            # Copy the header once from the first worker file, then append
            # only the data lines (each file's header line is consumed and
            # discarded in the loop below).
            # NOTE(review): this extra handle on the first worker file is
            # never closed — harmless for reading, but worth fixing.
            header = bz2.BZ2File(worker_table_files[metric][0], 'r').readline()
            concat_csv_fh.write(header)
            for worker_csv in worker_table_files[metric]:
                with bz2.BZ2File(worker_csv, 'r') as worker_csv_fh:
                    header = worker_csv_fh.readline()
                    for line in worker_csv_fh:
                        concat_csv_fh.write(line)
    if self.opts.make_report:
        # Spawn one report generator per metric and wait for all of them.
        child_process_handles = []
        for metric in self.opts.metrics:
            invoke_string = "makePulseKineticsReport.py '%s'" % table_files[metric]
            invoke_string += " --metric=%s" % metric
            invoke_string += " --title='%s (%s) vs. %s (%s)'" % (self.input['Modified'], self.smrtpipe_job_names['Modified'], self.input['Control'], self.smrtpipe_job_names['Control'])
            invoke_string += " --ref_name='%s'" % self.ref_infos['Modified'].fullName
            invoke_string += " --output='%s'" % self.opts.output_dir
            invoke_string += " --tempdir='%s'" % self.opts.tempdir
            if self.opts.min_coverage:
                invoke_string += " --min_coverage=%s" % self.opts.min_coverage
            if self.opts.template_length:
                # proxy for LIMS template, which should be visualized with inverted strand sense
                invoke_string += " --invert_strand_sense"
            if self.opts.save_plot_csv:
                invoke_string += " --save_plot_csv"
            if self.opts.ylim is not None:
                invoke_string += " --ymax=%f" % self.opts.ylim
            if self.opts.debug:
                invoke_string += " --debug"
            logging.debug("Running %s" % invoke_string)
            ph = subprocess.Popen(invoke_string, shell=True)
            child_process_handles.append(ph)
        # Propagate the first non-zero exit code from any report child.
        for ph in child_process_handles:
            if ph:
                returncode = ph.wait()
                if returncode:
                    sys.exit(returncode)
    if self.opts.write_zipfile:
        # Bundle the temp output dir into the requested zip file.
        z = zipfile.ZipFile(output_file, 'w')
        for f in os.listdir(self.opts.output_dir):
            z.write(os.path.join(self.opts.output_dir, f), f)
        z.close()