Esempio n. 1
0
    def set_background(self, fname=None, genome=None, length=200, nseq=10000):
        """Set the background to use for FPR and z-score calculations.

        Background can be specified either as a genome name or as the
        name of a FASTA file. When neither is given, ``self.genome`` is
        used if it is set.

        Sets ``self.background`` and ``self.background_hash``.

        Parameters
        ----------
        fname : str, optional
            Name of FASTA file to use as background.

        genome : str, optional
            Name of genome to use to retrieve random sequences.

        length : int, optional
            Length of genomic sequences to retrieve. The default
            is 200.

        nseq : int, optional
            Number of genomic sequences to retrieve. The default
            is 10000.

        Raises
        ------
        ValueError
            If both `fname` and `genome` are given, or if neither is
            given and ``self.genome`` is not set.
        IOError
            If `fname` does not exist.
        """
        length = int(length)

        if genome and fname:
            raise ValueError("Need either genome or filename for background.")

        if fname:
            if not os.path.exists(fname):
                raise IOError(
                    "Background file {} does not exist!".format(fname))

            # A FASTA background is identified by the checksum of the file.
            self.background = Fasta(fname)
            self.background_hash = file_checksum(fname)
            return

        if not genome:
            if not self.genome:
                raise ValueError(
                    "Need either genome or filename for background.")
            genome = self.genome
            logger.info(
                "Using default background: genome {} with length {}".format(
                    genome, length))

        logger.info("Using background: genome {} with length {}".format(
            genome, length))

        # The key contains a literal backslash between genome and length.
        # It is spelled "\\" because "\{" is an invalid escape sequence;
        # the resulting string is unchanged.
        # NOTE(review): nseq is not part of the key, so a cached background
        # is reused even when a different nseq is requested.
        self.background_hash = "{}\\{}".format(genome, length)
        with Cache(CACHE_DIR) as cache:
            fa = cache.get(self.background_hash)
            if not fa:
                fa = RandomGenomicFasta(genome, length, nseq)
                cache.set(self.background_hash, fa)
        self.background = fa
Esempio n. 2
0
    def set_background(self,
                       fname=None,
                       genome=None,
                       size=200,
                       nseq=10000,
                       gc=False,
                       gc_bins=None):
        """Set the background to use for FPR and z-score calculations.

        Background can be specified either as a genome name or as the
        name of a FASTA file. When neither is given, ``self.genome`` is
        used if it is set. If ``self.background`` is already set, this
        method returns immediately without changing anything.

        Sets ``self.background`` and ``self.background_hash``, and
        ``self.gc_bins`` when GC bins are in effect.

        Parameters
        ----------
        fname : str, optional
            Name of FASTA file to use as background.

        genome : str, optional
            Name of genome to use to retrieve random sequences.

        size : int, optional
            Size of genomic sequences to retrieve. The default
            is 200.

        nseq : int, optional
            Number of genomic sequences to retrieve. The default
            is 10000.

        gc : bool, optional
            If true, the background is generated with `gc_bin_bedfile`
            using `gc_bins` instead of with `RandomGenomicFasta`.

        gc_bins : list of tuple, optional
            ``(low, high)`` GC bins, only used when `gc` is true. By
            default the bins are (0.0, 0.2), (0.8, 1) and bins of width
            0.05 in between.

        Raises
        ------
        ValueError
            If both `fname` and `genome` are given, or if neither is
            given and ``self.genome`` is not set.
        IOError
            If `fname` does not exist.
        """
        if self.background:
            return

        size = int(size)

        if genome and fname:
            raise ValueError("Need either genome or filename for background.")

        if fname:
            if not os.path.exists(fname):
                raise IOError(
                    "Background file {} does not exist!".format(fname))

            # A FASTA background is identified by the checksum of the file.
            self.background = Fasta(fname)
            self.background_hash = file_checksum(fname)
            return

        if not genome:
            if not self.genome:
                raise ValueError(
                    "Need either genome or filename for background.")
            genome = self.genome

        logger.debug("using background: genome {} with size {}".format(
            genome, size))

        # The key is built from gc_bins as passed in (before any default is
        # filled in below), so lookups and stores use the same key.
        # NOTE(review): nseq is not part of the key, so a cached background
        # is reused even when a different nseq is requested.
        self.background_hash = "d{}:{}:{}:{}".format(
            genome, size, gc, str(gc_bins))

        lock.acquire()
        try:
            with Cache(CACHE_DIR) as cache:
                cached = cache.get(self.background_hash)
                if cached:
                    # Cached entries hold the sequences and the bins that
                    # were used to generate them.
                    fa, gc_bins = cached
                else:
                    fa = None

                if not fa:
                    if gc:
                        if gc_bins is None:
                            gc_bins = [(0.0, 0.2), (0.8, 1)]
                            for b in np.arange(0.2, 0.799, 0.05):
                                gc_bins.append((b, b + 0.05))

                        # The temporary file is only needed until its
                        # contents have been converted by as_fasta().
                        with NamedTemporaryFile() as tmp:
                            logger.info("using {} sequences".format(nseq))
                            gc_bin_bedfile(tmp.name,
                                           genome,
                                           number=nseq,
                                           length=size,
                                           bins=gc_bins)
                            fa = as_fasta(tmp.name, genome=genome)
                    else:
                        fa = RandomGenomicFasta(genome, size, nseq)
                    cache.set(self.background_hash, (fa, gc_bins))
        finally:
            # Always release, otherwise a failure while generating the
            # background would leave the lock held forever.
            lock.release()

        self.background = fa
        if gc_bins:
            self.gc_bins = gc_bins
Esempio n. 3
0
    def set_threshold(self,
                      fpr=None,
                      threshold=None,
                      genome=None,
                      length=200,
                      filename=None):
        """Set motif scanning threshold based on background sequences.

        Either `threshold` is used directly, or per-motif thresholds are
        derived for the requested `fpr` from background sequences taken
        from `genome` or from `filename`. Derived thresholds are cached
        per motif, background and FPR.

        Sets ``self.threshold``; in FPR mode also sets
        ``self.threshold_str``.

        Parameters
        ----------
        fpr : float, optional
            Desired FPR, between 0.0 and 1.0.

        threshold : float or str, optional
            Desired motif threshold, expressed as the fraction of the
            difference between minimum and maximum score of the PWM.
            Should either be a float between 0.0 and 1.0 or a filename
            with thresholds as created by 'gimme threshold'.

        genome : str, optional
            Name of genome to retrieve random background sequences from.
            Only used together with `fpr`.

        length : int, optional
            Length of the random genomic sequences. The default is 200.

        filename : str, optional
            Name of FASTA file with background sequences. Only used
            together with `fpr`.

        Raises
        ------
        ValueError
            If both or neither of `fpr` and `threshold` are given, if both
            `genome` and `filename` are given, if `fpr` is out of range,
            if no motifs have been set, or if `fpr` is given without
            `genome` or `filename`.
        IOError
            If `filename` does not exist.
        """
        if threshold:
            if fpr:
                raise ValueError("Need either fpr or threshold.")
            if genome:
                sys.stderr.write(
                    "Parameter genome ignored when threshold is specified.\n"
                    "Did you want to use fpr?\n")
            if filename:
                sys.stderr.write(
                    "Parameter filename ignored when threshold is specified.\n"
                    "Did you want to use fpr?\n")

        if genome and filename:
            raise ValueError("Need either genome or filename.")

        # Without this check a missing fpr only surfaces later as a
        # TypeError while formatting the cache key.
        if threshold is None and fpr is None:
            raise ValueError("Need either fpr or threshold.")

        if fpr:
            fpr = float(fpr)
            if not (0.0 < fpr < 1.0):
                raise ValueError("Parameter fpr should be between 0 and 1")

        if not self.motifs:
            raise ValueError("please run set_motifs() first")

        with open(self.motifs) as f:
            motifs = read_motifs(f)

        if threshold is not None:
            self.threshold = parse_threshold_values(self.motifs, threshold)
            return

        if filename:
            if not os.path.exists(filename):
                raise IOError("File {} does not exist.".format(filename))

            bg_hash = file_checksum(filename)
            seqs = Fasta(filename).seqs
        elif genome:
            # Normalise once so the cache key and the generated sequences
            # use the same length.
            length = int(length)
            # The key contains a literal backslash; "\\" avoids the invalid
            # escape sequence "\{" without changing the resulting string.
            bg_hash = "{}\\{}".format(genome, length)
        else:
            raise ValueError("Need genome or filename")

        thresholds = {}

        def _store(motif, value):
            # A threshold equal to the maximum PWM score is stored as None.
            if np.isclose(value, motif.pwm_max_score()):
                thresholds[motif.id] = None
            else:
                thresholds[motif.id] = value

        def _cache_key(motif):
            return "{}|{}|{:.4f}".format(motif.hash(), bg_hash, fpr)

        with Cache(CACHE_DIR) as cache:
            # Motifs without a cached threshold still need to be scanned.
            scan_motifs = []
            for motif in motifs:
                cached = cache.get(_cache_key(motif))
                if cached is None:
                    scan_motifs.append(motif)
                else:
                    _store(motif, cached)

            if scan_motifs:
                if genome:
                    # Instantiated only for its side effects; presumably
                    # this checks that the genome is available -- TODO
                    # confirm.
                    Genome(genome)
                    sys.stderr.write(
                        "Determining threshold for fpr {} and length {} based on {}\n"
                        .format(fpr, int(length), genome))
                    fa = RandomGenomicFasta(genome, length, 10000)
                    seqs = fa.seqs
                else:
                    sys.stderr.write(
                        "Determining threshold for fpr {} based on {}\n".
                        format(fpr, filename))
                for motif, value in self._threshold_from_seqs(
                        scan_motifs, seqs, fpr):
                    cache.set(_cache_key(motif), value)
                    _store(motif, value)

        # Distinct local names are used above so that `threshold` here is
        # still the argument and not the score of the last motif handled.
        self.threshold_str = "{}_{}_{}_{}_{}".format(fpr, threshold, genome,
                                                     length, filename)
        self.threshold = thresholds