Esempio n. 1
0
def check_replica_correlation(self, min_tags=20):
    """Not used, for the moment.

    For every line of the region file, fetch the tags of the experiment and
    of the replica that overlap the region and, when both together hold more
    than `min_tags` tags, compute the Pearson correlation between the two
    region arrays. The mean of the correlations (negative values counted as
    0) is printed, and a boxplot of all the correlations is saved through
    `self._save_figure("check_replica")`.

    min_tags -- combined experiment + replica tag count a region must
                exceed to be taken into account (default 20).

    Returns 0 when nothing could be plotted (no region passed the threshold,
    or matplotlib could not be set up); otherwise returns None.
    """
    experiment_reader = utils.read_fetcher(self.current_experiment_path, self.experiment_format, cached=self.cached, logger=self.logger, use_samtools=self.use_samtools, access_sequential=self.access_sequential)
    replica_reader = utils.read_fetcher(self.current_replica_path, self.experiment_format, cached=self.cached, logger=self.logger, use_samtools=self.use_samtools, access_sequential=self.access_sequential)
    correlations_acum = 0
    num_correlations = 0
    # Must live outside the loop: it collects one value per accepted region.
    correlations = []
    with open(self.region_path) as region_file:
        for region_line in region_file:
            sline = region_line.split()
            region_experiment = self._region_from_sline(sline)
            region_replica = region_experiment.copy()
            # Both readers are queried with the same coordinates.
            tags_experiment = experiment_reader.get_overlaping_clusters(region_experiment, overlap=1)
            tags_replica = replica_reader.get_overlaping_clusters(region_experiment, overlap=1)
            if len(tags_experiment) + len(tags_replica) > min_tags:
                region_experiment.add_tags(tags_experiment, clusterize=True)
                region_replica.add_tags(tags_replica, clusterize=True)
                num_correlations += 1
                correlation = utils.pearson(region_experiment.get_array(), region_replica.get_array())
                # Negative correlations do not lower the accumulated mean.
                correlations_acum += max(0, correlation)
                correlations.append(correlation)

    if not num_correlations:
        # Avoid dividing by zero and plotting an empty data set.
        self.logger.warning("No region with more than %s tags, replica correlation not calculated"%min_tags)
        return 0

    print(correlations_acum/num_correlations)
    try:
        if self.postscript:
            import matplotlib
            matplotlib.use("PS")
        from matplotlib.pyplot import boxplot
    except Exception:
        # Plotting is best-effort: warn and give up if matplotlib is unusable.
        __matplotlibwarn(self)
        return 0

    print(correlations)
    boxplot(correlations)
    self._save_figure("check_replica")
Esempio n. 2
0
def enrichment(self):
    """Run the enrichment analysis and return the path of the resulting file.

    Steps, as driven by the attributes of `self`:

    1. Decide whether a replica is used (`self.use_replica`).
    2. Unless a counts file was provided, open count-only readers for the
       experiment, the control and (optionally) the replica, make sure a
       region file exists and count its lines into `self.total_regions`.
    3. Call `_calculate_MA` to produce the first output file.
    4. If `self.tmm_norm` is set, compute the TMM factors, move the current
       output aside as `notnormalized_<name>` and recalculate from it.
    5. If `self.quant_norm` is set, write a `quantcounts_<name>` file with
       quantile-normalized signals and recalculate from it.
    6. If CHECK_REPLICAS is among the operations, call `check_replica`.

    Returns the path returned by the last `_calculate_MA` call.
    """
    file_a_reader = file_b_reader = replica_reader = None
    self.use_replica = (bool(self.replica_path) or (bool(self.counts_file) and self.use_replica_flag))
    self.logger.debug("Use replica: %s"%self.use_replica)
    if USE_MA not in self.operations:
        _calculate_total_lengths(self)
    if not self.counts_file:
        # Every reader is opened with the same options; only the path changes.
        reader_options = dict(cached=self.cached, logger=self.logger, use_samtools=self.use_samtools, access_sequential=self.access_sequential, only_counts=True)
        file_a_reader = utils.read_fetcher(self.current_experiment_path, self.experiment_format, **reader_options)
        file_b_reader = utils.read_fetcher(self.current_control_path, self.experiment_format, **reader_options)
        if self.use_replica:
            replica_reader = utils.read_fetcher(self.current_replica_path, self.experiment_format, **reader_options)

        if self.sorted_region_path:
            self.logger.info('Using region file %s (%s)'%(self.region_path, self.region_format))
        else:
            calculate_region(self) #create region file semi automatically

        with open(self.sorted_region_path) as region_file:
            self.total_regions = sum(1 for line in region_file)

    self.logger.info("... analyzing regions, calculating normalized counts, A / M and replica or swap...")
    self.already_norm = False
    if self.use_MA:
        ma_path = self.counts_file
    else:
        ma_path = self.sorted_region_path

    out_path = _calculate_MA(self, ma_path, bool(self.counts_file), 1, 1, file_a_reader, file_b_reader, replica_reader)
    self.already_norm = True
    self.logger.debug("Already normalized: %s"%self.already_norm)
    if self.tmm_norm:
        if CHECK_REPLICAS in self.operations:
            self.experiment_values = []
            self.replica_values = []

        self.logger.info("TMM Normalizing...")
        tmm_factor = calc_tmm_factor(self, out_path, self.regions_analyzed_count, False)
        replica_tmm_factor = 1
        if self.use_replica:
            replica_tmm_factor = calc_tmm_factor(self, out_path, self.regions_analyzed_count, True)
        #move output file to old output
        #use as input
        old_output = '%s/notnormalized_%s'%(self._current_directory(), os.path.basename(self.current_output_path))
        move(os.path.abspath(self.current_output_path), old_output)
        # NOTE(review): the trailing True lands in the positional slot where the
        # first call passes file_a_reader -- confirm against _calculate_MA.
        out_path = _calculate_MA(self, old_output, True, tmm_factor, replica_tmm_factor, True) #recalculate with the new factor, using the counts again

    if self.quant_norm:
        self.logger.info("Full quantile normalization...")
        signal_a = []
        signal_prime_1 = []
        enrich = []
        with open(out_path) as counts:
            for line in counts:
                enrich_line = dict(zip(enrichment_keys, line.split()))
                enrich.append(enrich_line)
                signal_a.append(float(enrich_line['signal_a']))
                signal_prime_1.append(float(enrich_line['signal_prime_1']))

        #full quantile normalization
        # The i-th smallest signal_b is replaced by the i-th smallest signal_a.
        signal_a.sort()
        enrich.sort(key=lambda x:float(x['signal_b']))
        for enrich_line, value in zip(enrich, signal_a):
            enrich_line['signal_b'] = value

        self.logger.info("Full quantile normalization replica...")
        #full quantile normalization of the replica
        signal_prime_1.sort()
        enrich.sort(key=lambda x:float(x['signal_prime_2']))
        quant_path = '%s/quantcounts_%s'%(self._current_directory(), os.path.basename(self.current_output_path))
        # Close the file before it is read back and handed to the temp manager.
        with open(quant_path, 'w') as quant_counts:
            for enrich_line, value in zip(enrich, signal_prime_1):
                enrich_line['signal_prime_2'] = value
                quant_counts.write("%s\n"%"\t".join(str(enrich_line[key]) for key in enrichment_keys[:20])) #write the lines

        # NOTE(review): same trailing True as in the TMM recalculation above.
        out_path = _calculate_MA(self, quant_path, True, 1, 1, True) #recalculate with the new factor, using the counts again
        self._manage_temp_file(quant_path)

    self.logger.info("%s regions analyzed."%self.regions_analyzed_count)
    if NOWRITE not in self.operations:
        self.logger.info("Enrichment result saved to %s"%self.current_output_path)

    if CHECK_REPLICAS in self.operations:
        check_replica(self)

    return out_path