Beispiel #1
0
def check_threshold(outdir, genome, scoring="count"):
    """Return the motif scanning threshold file, creating it when missing.

    A threshold file is only used for the "count" scoring mode. For any
    other value of ``scoring`` nothing is created and ``None`` is returned.

    Parameters
    ----------
    outdir : str
        Directory that holds the threshold file and, when it has to be
        generated, the background FASTA file.

    genome : str
        Genome name. Used in the file names and to locate the genome
        index directory.

    scoring : str, optional
        Scoring mode. The default is "count".

    Returns
    -------
    threshold_file : str or None
        Path ``<outdir>/threshold.<genome>.txt`` when ``scoring`` is
        "count", otherwise ``None``.
    """
    # gimme_motifs config, to get defaults
    config = MotifConfig()

    if scoring != "count":
        return None

    # Motif scanning threshold
    threshold_file = os.path.join(outdir, "threshold.{}.txt".format(genome))
    if os.path.exists(threshold_file):
        # A file written by an earlier call is reused as-is.
        return threshold_file

    # Random sequences from genome
    index_dir = os.path.join(config.get_index_dir(), genome)
    bg_file = os.path.join(outdir, "background.{}.fa".format(genome))
    if not os.path.exists(bg_file):
        m = RandomGenomicFasta(index_dir, BG_LENGTH, BG_NUMBER)
        m.writefasta(bg_file)

    pwmfile = config.get_default_params().get("motif_db")
    pwmfile = os.path.join(config.get_motif_dir(), pwmfile)

    # Pass the arguments as a list and redirect stdout ourselves instead of
    # building a shell string: paths containing spaces or shell
    # metacharacters are then handed to the command verbatim.
    cmd = ["gimme", "threshold", pwmfile, bg_file, "{}".format(FDR)]
    with open(threshold_file, "w") as out:
        # NOTE: the exit status is not checked, so a failed run still
        # leaves a (possibly empty) threshold file behind.
        sp.call(cmd, stdout=out)
    return threshold_file
Beispiel #2
0
    def _create_background(self, bg_type, bedfile, fafile, outfile, organism="hg18", width=200, nr_times=10):
        """Create a background FASTA file of the requested type.

        Parameters
        ----------
        bg_type : str
            One of "random", "genomic", "gc", "promoter" or "user".

        bedfile : str
            Not used by this method.

        fafile : str
            Name of the input FASTA file. The number of sequences in it,
            multiplied by ``nr_times``, is the number of background
            sequences requested for all types except "user".

        outfile : str
            Name of the FASTA file to write.

        organism : str, optional
            Genome name, used to locate the index directory and the gene
            file. The default is "hg18".

        width : int, optional
            Sequence length passed on for the "genomic" and "promoter"
            backgrounds, and the length the median sequence length of a
            "user" background is compared to. The default is 200.

        nr_times : int, optional
            Multiplier applied to the number of input sequences.

        Returns
        -------
        int or None
            ``len()`` of the background that was written, or ``None`` when
            ``bg_type`` is not one of the recognised values.
        """
        fg = Fasta(fafile)

        if bg_type == "random":
            markov_order = int(self.markov_model)
            if markov_order >= 6:
                self.logger.warning("Are you sure about the Markov model? It seems too high!")
            else:
                ordinals = {1: "1st", 2: "2nd", 3: "3rd", 4: "4th", 5: "5th"}
                # Fall back to a generic suffix so that an order outside
                # 1-5 (e.g. 0) does not raise KeyError on a debug message.
                order = ordinals.get(markov_order, "%sth" % markov_order)
                self.logger.debug("Creating random background (%s order Markov)", order)

            m = MarkovFasta(fg, k=markov_order, n=nr_times * len(fg))
            m.writefasta(outfile)
            self.logger.debug("Random background: %s", outfile)
            # return the number of random sequences created
            return len(m)

        if bg_type == "genomic":
            self.logger.debug("Creating genomic background")
            index_dir = os.path.join(self.config.get_index_dir(), organism)
            f = RandomGenomicFasta(index_dir, width, nr_times * len(fg))
            f.writefasta(outfile)
            return len(f)

        if bg_type == "gc":
            self.logger.debug("Creating GC matched background")
            f = MatchedGcFasta(fafile, organism, nr_times * len(fg))
            f.writefasta(outfile)
            self.logger.debug("GC matched background: %s", outfile)
            return len(f)

        if bg_type == "promoter":
            gene_file = os.path.join(self.config.get_gene_dir(), "%s.bed" % organism)
            index_dir = os.path.join(self.config.get_index_dir(), organism)

            self.logger.info(
                    "Creating random promoter background (%s, using genes in %s)",
                    organism, gene_file)
            f = PromoterFasta(gene_file, index_dir, width, nr_times * len(fg))
            f.writefasta(outfile)
            self.logger.debug("Random promoter background: %s", outfile)
            return len(f)

        if bg_type == "user":
            bg_file = self.params["user_background"]
            if not os.path.exists(bg_file):
                self.logger.error(
                        "User-specified background file %s does not exist!",
                        bg_file)
                sys.exit(1)

            self.logger.info("Copying user-specified background file %s to %s.",
                    bg_file, outfile)
            fa = Fasta(bg_file)
            median_length = median([len(seq) for seq in fa.seqs])
            # Warn when the median length deviates more than 5% from width.
            if median_length < width * 0.95 or median_length > width * 1.05:
                self.logger.warning("The user-specified background file %s contains sequences with a median length of %s, while GimmeMotifs predicts motifs in sequences of length %s. This will influence the statistics! It is recommended to use background sequences of the same length.", bg_file, median_length, width)
            fa.writefasta(outfile)
            return len(fa)

        # Unrecognised bg_type: nothing is written.
        return None
Beispiel #3
0
    def set_background(self, fname=None, genome=None, length=200, nseq=10000):
        """Set the background to use for FPR and z-score calculations.

        Background can be specified either as a genome name or as the
        name of a FASTA file.

        Parameters
        ----------
        fname : str, optional
            Name of FASTA file to use as background.

        genome : str, optional
            Name of genome to use to retrieve random sequences. When
            neither ``fname`` nor ``genome`` is given, ``self.genome`` is
            used.

        length : int, optional
            Length of genomic sequences to retrieve. The default
            is 200.

        nseq : int, optional
            Number of genomic sequences to retrieve.

        Raises
        ------
        ValueError
            If both ``genome`` and ``fname`` are given, or if neither is
            given and ``self.genome`` is not set.

        IOError
            If ``fname`` does not exist.
        """
        length = int(length)

        if genome and fname:
            raise ValueError("Need either genome or filename for background.")

        if fname:
            if not os.path.exists(fname):
                raise IOError(
                    "Background file {} does not exist!".format(fname))

            self.background = Fasta(fname)
            self.background_hash = file_checksum(fname)
            return

        if not genome:
            if not self.genome:
                raise ValueError(
                    "Need either genome or filename for background.")
            genome = self.genome
            logger.info(
                "Using default background: genome {} with length {}".
                format(genome, length))

        logger.info("Using background: genome {} with length {}".format(
            genome, length))
        with Cache(CACHE_DIR) as cache:
            # The separator is a literal backslash. It is written as "\\"
            # because "\{" is an invalid escape sequence; the resulting
            # key is unchanged.
            # NOTE(review): nseq is not part of the key, so a cached
            # background is returned whatever nseq is requested.
            self.background_hash = "{}\\{}".format(genome, length)
            fa = cache.get(self.background_hash)
            if not fa:
                fa = RandomGenomicFasta(genome, length, nseq)
                cache.set(self.background_hash, fa)
        self.background = fa
Beispiel #4
0
def check_threshold(outdir, genome, scoring="count"):
    """Return the motif scanning threshold file, generating it if absent.

    Only the "count" scoring mode uses a threshold file. For every other
    ``scoring`` value the function creates nothing and returns ``None``.

    Parameters
    ----------
    outdir : str
        Directory where the threshold file and, if it needs to be
        generated, the background FASTA file are stored.

    genome : str
        Genome name. Part of both file names and used to locate the
        genome index directory.

    scoring : str, optional
        Scoring mode. The default is "count".

    Returns
    -------
    threshold_file : str or None
        Path ``<outdir>/threshold.<genome>.txt`` for "count" scoring,
        ``None`` otherwise.
    """
    # gimme_motifs config, to get defaults
    config = MotifConfig()

    threshold_file = None
    if scoring == "count":
        # Motif scanning threshold
        threshold_file = os.path.join(outdir,
                                      "threshold.{}.txt".format(genome))
        if not os.path.exists(threshold_file):
            # Random sequences from genome
            index_dir = os.path.join(config.get_index_dir(), genome)
            bg_file = os.path.join(outdir, "background.{}.fa".format(genome))
            if not os.path.exists(bg_file):
                m = RandomGenomicFasta(index_dir, BG_LENGTH, BG_NUMBER)
                m.writefasta(bg_file)

            pwmfile = config.get_default_params().get("motif_db")
            pwmfile = os.path.join(config.get_motif_dir(), pwmfile)

            # Run without a shell: an argument list plus explicit stdout
            # redirection keeps paths with spaces or shell metacharacters
            # intact instead of letting the shell split or interpret them.
            cmd = ["gimme", "threshold", pwmfile, bg_file, "{}".format(FDR)]
            with open(threshold_file, "w") as handle:
                # NOTE: the exit status is ignored, so a failed run still
                # leaves a (possibly empty) threshold file on disk.
                sp.call(cmd, stdout=handle)
    return threshold_file
Beispiel #5
0
    def _create_background(self,
                           bg_type,
                           bedfile,
                           fafile,
                           outfile,
                           organism="hg18",
                           width=200,
                           nr_times=10):
        """Write a background FASTA file of type ``bg_type`` to ``outfile``.

        Parameters
        ----------
        bg_type : str
            One of "random", "genomic", "gc", "promoter" or "user".

        bedfile : str
            Not used by this method.

        fafile : str
            Name of the input FASTA file. Except for the "user" type,
            ``nr_times`` times the number of sequences in this file is
            the number of background sequences requested.

        outfile : str
            Name of the FASTA file to write.

        organism : str, optional
            Genome name, used to locate the index directory and the gene
            file. The default is "hg18".

        width : int, optional
            Sequence length passed on for "genomic" and "promoter"
            backgrounds; a "user" background triggers a warning when its
            median sequence length differs more than 5% from it.

        nr_times : int, optional
            Multiplier applied to the number of input sequences.

        Returns
        -------
        int or None
            ``len()`` of the background that was written, or ``None`` for
            an unrecognised ``bg_type``.
        """
        fg = Fasta(fafile)
        if bg_type == "random":
            markov_order = int(self.markov_model)
            if markov_order >= 6:
                self.logger.warning(
                    "Are you sure about the Markov model? It seems too high!")
            else:
                ordinals = {
                    1: "1st",
                    2: "2nd",
                    3: "3rd",
                    4: "4th",
                    5: "5th",
                }
                # Orders outside 1-5 (e.g. 0) get a generic suffix rather
                # than raising KeyError while composing a debug message.
                order = ordinals.get(markov_order, "%sth" % markov_order)
                self.logger.debug(
                    "Creating random background (%s order Markov)", order)

            m = MarkovFasta(fg, k=markov_order, n=nr_times * len(fg))
            m.writefasta(outfile)
            self.logger.debug("Random background: %s", outfile)
            # return the number of random sequences created
            return len(m)
        elif bg_type == "genomic":
            self.logger.debug("Creating genomic background")
            index_dir = os.path.join(self.config.get_index_dir(), organism)
            f = RandomGenomicFasta(index_dir, width, nr_times * len(fg))
            f.writefasta(outfile)
            return len(f)
        elif bg_type == "gc":
            self.logger.debug("Creating GC matched background")

            f = MatchedGcFasta(fafile, organism, nr_times * len(fg))
            f.writefasta(outfile)
            self.logger.debug("GC matched background: %s", outfile)
            return len(f)
        elif bg_type == "promoter":
            gene_file = os.path.join(self.config.get_gene_dir(),
                                     "%s.bed" % organism)
            index_dir = os.path.join(self.config.get_index_dir(), organism)

            self.logger.info(
                "Creating random promoter background (%s, using genes in %s)",
                organism, gene_file)
            f = PromoterFasta(gene_file, index_dir, width, nr_times * len(fg))
            f.writefasta(outfile)
            self.logger.debug("Random promoter background: %s", outfile)
            return len(f)
        elif bg_type == "user":
            bg_file = self.params["user_background"]
            if not os.path.exists(bg_file):
                self.logger.error(
                    "User-specified background file %s does not exist!",
                    bg_file)
                sys.exit(1)
            else:
                self.logger.info(
                    "Copying user-specified background file %s to %s.",
                    bg_file, outfile)
                fa = Fasta(bg_file)
                median_length = median([len(seq) for seq in fa.seqs])
                if not (width * 0.95 <= median_length <= width * 1.05):
                    self.logger.warning(
                        "The user-specified background file %s contains sequences with a median length of %s, while GimmeMotifs predicts motifs in sequences of length %s. This will influence the statistics! It is recommended to use background sequences of the same length.",
                        bg_file, median_length, width)
                fa.writefasta(outfile)
                return len(fa)
Beispiel #6
0
def create_background(
    bg_type,
    fafile,
    outfile,
    genome="hg18",
    size=200,
    nr_times=10,
    custom_background=None,
):
    """Create background of a specific type.

    Parameters
    ----------
    bg_type : str
        Name of background type: "random", "genomic", "gc", "promoter"
        or "custom".

    fafile : str
        Name of input FASTA file.

    outfile : str
        Name of output FASTA file.

    genome : str, optional
        Genome name.

    size : int, optional
        Size of regions.

    nr_times : int, optional
        Generate this times as many background sequences as compared to
        input file.

    custom_background : str, optional
        Name of the FASTA file to use when ``bg_type`` is "custom".

    Returns
    -------
    nr_seqs  : int
        Number of sequences created.

    Raises
    ------
    ValueError
        If ``bg_type`` is not recognised, or if no gene file can be found
        for a "promoter" background.

    IOError
        If a "custom" background is requested but ``custom_background``
        is not given or does not exist.
    """
    size = int(size)
    config = MotifConfig()
    fg = Fasta(fafile)

    if bg_type in ["genomic", "gc"]:
        if not genome:
            logger.error("Need a genome to create background")
            sys.exit(1)

    if bg_type == "random":
        f = MarkovFasta(fg, k=1, n=nr_times * len(fg))
        logger.debug("Random background: %s", outfile)
    elif bg_type == "genomic":
        logger.debug("Creating genomic background")
        f = RandomGenomicFasta(genome, size, nr_times * len(fg))
    elif bg_type == "gc":
        logger.debug("Creating GC matched background")
        f = MatchedGcFasta(fafile, genome, nr_times * len(fg))
        logger.debug("GC matched background: %s", outfile)
    elif bg_type == "promoter":
        fname = Genome(genome).filename
        gene_file = fname.replace(".fa", ".annotation.bed.gz")
        gene_dir = config.get_gene_dir()
        # Fall back to <gene_dir>/<genome>.bed when the annotation next to
        # the genome FASTA is missing. The previous test (`not gene_file`)
        # checked a non-empty string, so the fallback was never used.
        if not os.path.exists(gene_file) and gene_dir:
            gene_file = os.path.join(gene_dir, "%s.bed" % genome)
        if not os.path.exists(gene_file):
            message = "Could not find a gene file for genome {}".format(genome)
            print(message)
            print("Did you use the --annotation flag for genomepy?")
            print(
                "Alternatively make sure there is a file called {}.bed in {}".
                format(genome, gene_dir))
            raise ValueError(message)

        logger.info(
            "Creating random promoter background (%s, using genes in %s)",
            genome,
            gene_file,
        )
        f = PromoterFasta(gene_file, genome, size, nr_times * len(fg))
        logger.debug("Random promoter background: %s", outfile)
    elif bg_type == "custom":
        bg_file = custom_background
        if not bg_file:
            raise IOError("Background file not specified!")

        if not os.path.exists(bg_file):
            # Exceptions do not interpolate extra arguments the way logging
            # calls do, so the message is formatted explicitly.
            raise IOError(
                "Custom background file %s does not exist!" % bg_file)

        logger.info("Copying custom background file %s to %s.", bg_file,
                    outfile)
        f = Fasta(bg_file)
        median_length = np.median([len(seq) for seq in f.seqs])
        # Warn when the median size deviates more than 5% from size.
        if median_length < (size * 0.95) or median_length > (size * 1.05):
            logger.warning(
                "The custom background file %s contains sequences with a "
                "median size of %s, while GimmeMotifs predicts motifs in sequences "
                "of size %s. This will influence the statistics! It is recommended "
                "to use background sequences of the same size.",
                bg_file,
                median_length,
                size,
            )
    else:
        # Without this branch an unknown type ended in an UnboundLocalError
        # on `f` below.
        raise ValueError("Unknown background type: {}".format(bg_type))

    f.writefasta(outfile)
    return len(f)
Beispiel #7
0
    def set_background(self,
                       fname=None,
                       genome=None,
                       size=200,
                       nseq=10000,
                       gc=False,
                       gc_bins=None):
        """Set the background to use for FPR and z-score calculations.

        Background can be specified either as a genome name or as the
        name of a FASTA file. If ``self.background`` is already set, the
        call returns immediately and changes nothing.

        Parameters
        ----------
        fname : str, optional
            Name of FASTA file to use as background.

        genome : str, optional
            Name of genome to use to retrieve random sequences. When
            neither ``fname`` nor ``genome`` is given, ``self.genome`` is
            used.

        size : int, optional
            Size of genomic sequences to retrieve. The default
            is 200.

        nseq : int, optional
            Number of genomic sequences to retrieve.

        gc : bool, optional
            If true, build the genomic background with ``gc_bin_bedfile``
            using ``gc_bins`` instead of ``RandomGenomicFasta``.

        gc_bins : list of (float, float), optional
            GC bins used when ``gc`` is true. When omitted, the bins are
            (0.0, 0.2), (0.8, 1) and bins of width 0.05 between 0.2 and
            0.8. Bins stored with a cached background replace this value.

        Raises
        ------
        ValueError
            If both ``genome`` and ``fname`` are given, or if neither is
            given and ``self.genome`` is not set.

        IOError
            If ``fname`` does not exist.
        """
        if self.background:
            return

        size = int(size)

        if genome and fname:
            raise ValueError("Need either genome or filename for background.")

        if fname:
            if not os.path.exists(fname):
                raise IOError(
                    "Background file {} does not exist!".format(fname))

            self.background = Fasta(fname)
            self.background_hash = file_checksum(fname)
            return

        if not genome:
            if self.genome:
                genome = self.genome
            else:
                raise ValueError(
                    "Need either genome or filename for background.")

        logger.debug("using background: genome {} with size {}".format(
            genome, size))
        lock.acquire()
        # Release the lock even when creating the background fails;
        # otherwise every later caller would block on acquire() forever.
        try:
            with Cache(CACHE_DIR) as cache:
                self.background_hash = "d{}:{}:{}:{}".format(
                    genome, int(size), gc, str(gc_bins))
                cached = cache.get(self.background_hash)
                if cached:
                    # The cache entry holds the sequences together with the
                    # bins they were generated with.
                    fa, gc_bins = cached
                else:
                    fa = None

                if not fa:
                    if gc:
                        if gc_bins is None:
                            gc_bins = [(0.0, 0.2), (0.8, 1)]
                            for b in np.arange(0.2, 0.799, 0.05):
                                gc_bins.append((b, b + 0.05))

                        with NamedTemporaryFile() as tmp:
                            logger.info("using {} sequences".format(nseq))
                            gc_bin_bedfile(tmp.name,
                                           genome,
                                           number=nseq,
                                           length=size,
                                           bins=gc_bins)
                            fa = as_fasta(tmp.name, genome=genome)
                    else:
                        fa = RandomGenomicFasta(genome, size, nseq)
                    cache.set(self.background_hash, (fa, gc_bins))
        finally:
            lock.release()

        self.background = fa
        if gc_bins:
            self.gc_bins = gc_bins
Beispiel #8
0
    def set_threshold(self,
                      fpr=None,
                      threshold=None,
                      genome=None,
                      length=200,
                      filename=None):
        """Set motif scanning threshold based on background sequences.

        Parameters
        ----------
        fpr : float, optional
            Desired FPR, between 0.0 and 1.0.

        threshold : float or str, optional
            Desired motif threshold, expressed as the fraction of the
            difference between minimum and maximum score of the PWM.
            Should either be a float between 0.0 and 1.0 or a filename
            with thresholds as created by 'gimme threshold'.

        genome : str, optional
            Name of the genome from which random sequences are retrieved
            when thresholds have to be determined for ``fpr``.

        length : int, optional
            Length of the random genomic sequences. The default is 200.

        filename : str, optional
            Name of a FASTA file with background sequences, used instead
            of ``genome``.

        Raises
        ------
        ValueError
            If both or neither of ``fpr`` and ``threshold`` are given, if
            both or neither of ``genome`` and ``filename`` are given (when
            ``threshold`` is not used), if ``fpr`` is outside (0, 1), or
            if ``self.motifs`` is not set.

        IOError
            If ``filename`` does not exist.
        """
        if threshold:
            if fpr:
                raise ValueError("Need either fpr or threshold.")
            if genome:
                sys.stderr.write(
                    "Parameter genome ignored when threshold is specified.\n"
                    "Did you want to use fpr?\n")
            if filename:
                sys.stderr.write(
                    "Parameter filename ignored when threshold is specified.\n"
                    "Did you want to use fpr?\n")

        if genome and filename:
            raise ValueError("Need either genome or filename.")

        if fpr:
            fpr = float(fpr)
            if not (0.0 < fpr < 1.0):
                raise ValueError("Parameter fpr should be between 0 and 1")

        if not self.motifs:
            raise ValueError("please run set_motifs() first")

        thresholds = {}
        with open(self.motifs) as f:
            motifs = read_motifs(f)

        if threshold is not None:
            self.threshold = parse_threshold_values(self.motifs, threshold)
            return

        if filename:
            if not os.path.exists(filename):
                raise IOError("File {} does not exist.".format(filename))

            bg_hash = file_checksum(filename)
            seqs = Fasta(filename).seqs
        elif genome:
            # The separator is a literal backslash. It is written as "\\"
            # because "\{" is an invalid escape sequence; the resulting
            # key is unchanged.
            bg_hash = "{}\\{}".format(genome, int(length))
        else:
            raise ValueError("Need genome or filename")

        if fpr is None:
            # Without this check the cache key below fails with an obscure
            # TypeError from "{:.4f}".format(None).
            raise ValueError("Need either fpr or threshold.")

        with Cache(CACHE_DIR) as cache:
            # Motifs without a cached threshold for this background and fpr.
            scan_motifs = []
            for motif in motifs:
                k = "{}|{}|{:.4f}".format(motif.hash(), bg_hash, fpr)

                # NOTE(review): `threshold` is deliberately reused here and
                # in the loop below; threshold_str at the end is built from
                # whatever value it holds last.
                threshold = cache.get(k)
                if threshold is None:
                    scan_motifs.append(motif)
                else:
                    # A threshold equal to the maximum PWM score is stored
                    # as None.
                    if np.isclose(threshold, motif.pwm_max_score()):
                        thresholds[motif.id] = None
                    else:
                        thresholds[motif.id] = threshold

            if len(scan_motifs) > 0:
                if genome:
                    # Return value unused; presumably this fails early when
                    # the genome is unavailable - TODO confirm.
                    Genome(genome)
                    sys.stderr.write(
                        "Determining threshold for fpr {} and length {} based on {}\n"
                        .format(fpr, int(length), genome))
                    fa = RandomGenomicFasta(genome, length, 10000)
                    seqs = fa.seqs
                else:
                    sys.stderr.write(
                        "Determining threshold for fpr {} based on {}\n".
                        format(fpr, filename))
                for motif, threshold in self._threshold_from_seqs(
                        scan_motifs, seqs, fpr):
                    k = "{}|{}|{:.4f}".format(motif.hash(), bg_hash, fpr)
                    cache.set(k, threshold)
                    if np.isclose(threshold, motif.pwm_max_score()):
                        thresholds[motif.id] = None
                    else:
                        thresholds[motif.id] = threshold

        self.threshold_str = "{}_{}_{}_{}_{}".format(fpr, threshold, genome,
                                                     length, filename)
        self.threshold = thresholds