def check_replica_correlation(self): "No usado, de momento" min_tags = 20 experiment_reader = utils.read_fetcher(self.current_experiment_path, self.experiment_format, cached=self.cached, logger=self.logger, use_samtools=self.use_samtools, access_sequential=self.access_sequential) replica_reader = utils.read_fetcher(self.current_replica_path, self.experiment_format, cached=self.cached, logger=self.logger, use_samtools=self.use_samtools, access_sequential=self.access_sequential) correlations_acum = 0 num_correlations = 0 for region_line in open(self.region_path): sline = region_line.split() region_experiment = self._region_from_sline(sline) region_replica = region_experiment.copy() tags_experiment = experiment_reader.get_overlaping_clusters(region_experiment, overlap=1) tags_replica = replica_reader.get_overlaping_clusters(region_experiment, overlap=1) count_experiment = len(tags_experiment) count_replica = len(tags_replica) correlations = [] if count_experiment+count_replica > min_tags: region_experiment.add_tags(tags_experiment, clusterize=True) region_replica.add_tags(tags_replica, clusterize=True) num_correlations += 1 correlation = utils.pearson(region_experiment.get_array(), region_replica.get_array()) correlations_acum += max(0, correlation) correlations.append(correlation) print correlations_acum/num_correlations try: if self.postscript: import matplotlib matplotlib.use("PS") from matplotlib.pyplot import plot, boxplot, show, legend, figure, xlabel, ylabel, subplot, axhline, axis except: __matplotlibwarn(self) return 0 print correlations boxplot(correlations) self._save_figure("check_replica")
def enrichment(self):
    """Run the region enrichment analysis.

    Reads the experiment/control (and optionally replica) files, computes
    normalized counts and A/M values per region via _calculate_MA, then
    optionally applies TMM normalization and/or full quantile
    normalization, re-running _calculate_MA on the adjusted counts.

    Returns the path of the final output counts file.
    """
    file_a_reader = file_b_reader = replica_reader = None
    # A replica is used either when an explicit replica file was given, or
    # when working from a counts file with the replica flag set.
    self.use_replica = (bool(self.replica_path) or (bool(self.counts_file) and self.use_replica_flag))
    self.logger.debug("Use replica: %s"%self.use_replica)
    if USE_MA not in self.operations:  # idiom fix: was "not USE_MA in"
        _calculate_total_lengths(self)
    if not self.counts_file:
        # only_counts=True: the readers are used for counting, not for full cluster data
        file_a_reader = utils.read_fetcher(self.current_experiment_path, self.experiment_format, cached=self.cached, logger=self.logger, use_samtools=self.use_samtools, access_sequential=self.access_sequential, only_counts=True)
        file_b_reader = utils.read_fetcher(self.current_control_path, self.experiment_format, cached=self.cached, logger=self.logger, use_samtools=self.use_samtools, access_sequential=self.access_sequential, only_counts=True)
        if self.use_replica:
            replica_reader = utils.read_fetcher(self.current_replica_path, self.experiment_format, cached=self.cached, logger=self.logger, use_samtools=self.use_samtools, access_sequential=self.access_sequential, only_counts=True)

        if self.sorted_region_path:
            self.logger.info('Using region file %s (%s)'%(self.region_path, self.region_format))
        else:
            calculate_region(self) #create region file semi automatically

        self.total_regions = sum(1 for line in open(self.sorted_region_path))

    self.logger.info("... analyzing regions, calculating normalized counts, A / M and replica or swap...")
    self.already_norm = False
    if self.use_MA:
        ma_path = self.counts_file
    else:
        ma_path = self.sorted_region_path

    out_path = _calculate_MA(self, ma_path, bool(self.counts_file), 1, 1, file_a_reader, file_b_reader, replica_reader)
    self.already_norm = True
    self.logger.debug("Already normalized: %s"%self.already_norm)
    if self.tmm_norm:
        if CHECK_REPLICAS in self.operations:
            self.experiment_values = []
            self.replica_values = []

        self.logger.info("TMM Normalizing...")
        tmm_factor = calc_tmm_factor(self, out_path, self.regions_analyzed_count, False)
        replica_tmm_factor = 1
        if self.use_replica:
            replica_tmm_factor = calc_tmm_factor(self, out_path, self.regions_analyzed_count, True)
        # move output file to old output and use it as input for the
        # recalculation with the TMM factors
        old_output = '%s/notnormalized_%s'%(self._current_directory(), os.path.basename(self.current_output_path))
        move(os.path.abspath(self.current_output_path), old_output)
        out_path = _calculate_MA(self, old_output, True, tmm_factor, replica_tmm_factor, True) #recalculate with the new factor, using the counts again

    if self.quant_norm:
        self.logger.info("Full quantile normalization...")
        signal_a = []
        signal_prime_1 = []
        enrich = []
        for line in open(out_path):
            sline = line.split()
            enrich_line = dict(zip(enrichment_keys, sline))
            enrich.append(enrich_line)
            signal_a.append(float(enrich_line['signal_a']))
            signal_prime_1.append(float(enrich_line['signal_prime_1']))

        #full quantile normalization: sort signal_a and assign its i-th
        #value to the region holding the i-th smallest signal_b
        signal_a.sort()
        enrich.sort(key=lambda x:float(x['signal_b']))
        quant_counts = open('%s/quantcounts_%s'%(self._current_directory(), os.path.basename(self.current_output_path)), 'w')
        for i in range(len(enrich)):
            enrich[i]['signal_b'] = signal_a[i]

        self.logger.info("Full quantile normalization replica...")
        #full quantile normalization of the replica, same scheme on the
        #signal_prime_1 / signal_prime_2 pair; lines are written here
        signal_prime_1.sort()
        enrich.sort(key=lambda x:float(x['signal_prime_2']))
        for i in range(len(enrich)):
            enrich[i]['signal_prime_2'] = signal_prime_1[i]
            quant_counts.write("%s\n"%"\t".join(str(enrich[i][key]) for key in enrichment_keys[:20])) #write the lines

        quant_counts.flush()
        out_path = _calculate_MA(self, quant_counts.name, True, 1, 1, True) #recalculate with the new factor, using the counts again
        self._manage_temp_file(quant_counts.name)

    self.logger.info("%s regions analyzed."%self.regions_analyzed_count)
    if NOWRITE not in self.operations:  # idiom fix: was "not NOWRITE in"
        self.logger.info("Enrichment result saved to %s"%self.current_output_path)

    if CHECK_REPLICAS in self.operations:
        check_replica(self)

    return out_path