コード例 #1
0
    def scan(self, seqs, nreport=100, scan_rc=True, normalize=False):
        """
        Scan a set of regions / sequences and yield the matches per sequence.

        Parameters
        ----------
        seqs : object
            Input handed to ``as_fasta`` together with ``self.genome``.
        nreport : int, optional
            Forwarded unchanged to ``self._scan_sequences``.
        scan_rc : bool, optional
            Forwarded unchanged to ``self._scan_sequences``.
        normalize : bool, optional
            When True, the first element of every match tuple is replaced by
            ``(value - mean) / std``, using the per-motif ``(mean, std)``
            pairs stored in ``self.meanstd``.

        Yields
        ------
        list
            One item per scanned sequence; each item holds one list of
            3-tuples per motif (same order as ``self.motif_ids``).
        """
        if not self.threshold:
            sys.stderr.write("Using default threshold of 0.95. "
                             "This is likely not optimal!\n")
            self.set_threshold(threshold=0.95)

        fasta = as_fasta(seqs, genome=self.genome)
        hits_per_seq = self._scan_sequences(fasta.seqs, nreport, scan_rc)

        # Raw mode: pass the scanner output through untouched.
        if not normalize:
            for motif_rows in hits_per_seq:
                yield motif_rows
            return

        # The mean/std table is filled on demand the first time it is needed.
        if len(self.meanstd) == 0:
            self.set_meanstd()

        means, stds = [], []
        for motif_id in self.motif_ids:
            stats = self.meanstd.get(motif_id)
            means.append(stats[0])
            stds.append(stats[1])

        for motif_rows in hits_per_seq:
            normalized = []
            for idx, matches in enumerate(motif_rows):
                normalized.append([
                    ((hit[0] - means[idx]) / stds[idx], hit[1], hit[2])
                    for hit in matches
                ])
            yield normalized
コード例 #2
0
def command_scan(inputfile, pwmfile, nreport=1, fpr=0.01, cutoff=None,
        bed=False, scan_rc=True, table=False, score_table=False, moods=False,
        pvalue=None, bgfile=None, genome=None, ncpus=None, normalize=False):
    """Scan ``inputfile`` with the motifs in ``pwmfile`` and yield output rows.

    A ``Scanner`` is configured with the motifs, the optional genome and
    background, and (unless a score table is requested) a threshold derived
    from ``fpr`` / ``cutoff``. The rows themselves are produced by one of
    three helpers, chosen in this order of precedence:

    * ``table``       -> ``scan_table``
    * ``score_table`` -> ``scan_score_table``
    * otherwise       -> ``scan_normal``

    Yields
    ------
    Whatever the selected helper yields, row by row.
    """
    motifs = read_motifs(pwmfile)
    fasta = as_fasta(inputfile, genome)

    # Set up the scanner
    scanner = Scanner(ncpus=ncpus)
    scanner.set_motifs(pwmfile)

    if genome:
        scanner.set_genome(genome=genome)

    if genome or bgfile:
        scanner.set_background(
            genome=genome, fname=bgfile, length=fasta.median_length())

    # A score table reports raw best scores, so no threshold is configured.
    if not score_table:
        scanner.set_threshold(fpr=fpr, threshold=cutoff)

    if table:
        rows = scan_table(scanner, inputfile, fasta, motifs, cutoff, bgfile,
                          nreport, scan_rc, pvalue, moods)
    elif score_table:
        rows = scan_score_table(scanner, fasta, motifs, scan_rc,
                                normalize=normalize)
    else:
        rows = scan_normal(scanner, inputfile, fasta, motifs, cutoff, bgfile,
                           nreport, scan_rc, pvalue, moods, bed,
                           normalize=normalize)

    for row in rows:
        yield row
コード例 #3
0
    def get_PWMScore(self, fin_regions_fa):
        """Score every motif against every region and write the result to a file.

        The region ids are taken from ``as_fasta(fin_regions_fa)`` (only the
        part before the first space is kept) and scanned in chunks of 10,000
        with ``Scanner.best_score(..., zscore=True, gc=True)``.

        Arguments:
            fin_regions_fa -- input handed to ``as_fasta`` together with
                ``self.genome``.

        Returns:
            str -- path of a tab-separated temporary file (not deleted
                automatically) indexed by ``motif`` with the columns
                ``enhancer``, ``zscore`` and ``zscoreRank``.
        """
        seqs = [s.split(" ")[0] for s in as_fasta(fin_regions_fa, genome=self.genome).ids]

        s = Scanner(ncpus=self.ncore)
        s.set_motifs(self.pfmfile)
        s.set_threshold(threshold=0.0)
        s.set_genome(self.genome)

        with open(self.pfmfile) as f:
            motifs = read_motifs(f)

        # Regions are processed 10k at a time.
        chunksize = 10000
        cols = ["enhancer", "zscore", "zscoreRank"]

        # The context manager guarantees the file is flushed and closed before
        # its name is handed back; delete=False keeps it on disk afterwards.
        with NamedTemporaryFile(mode="w", dir=mytmpdir(), delete=False) as pfmscorefile:
            with tqdm(total=len(seqs)) as pbar:
                for chunk in range(0, len(seqs), chunksize):
                    chunk_seqs = seqs[chunk : chunk + chunksize]
                    pfm_score = []
                    # GC-normalization is used for the motif scan because many
                    # sequences are GC-enriched.
                    it = s.best_score(chunk_seqs, zscore=True, gc=True)
                    for seq, scores in zip(chunk_seqs, it):
                        for motif, score in zip(motifs, scores):
                            pfm_score.append([motif.id, seq, score])
                        pbar.update(1)
                    pfm_score = pd.DataFrame(pfm_score, columns=["motif", "enhancer", "zscore"])
                    pfm_score = pfm_score.set_index("motif")

                    # Rank + min-max scaling mirrors the normalization used when
                    # the model was built. Note that ranks are computed within
                    # the current chunk only.
                    pfm_score["zscoreRank"] = minmax_scale(rankdata(pfm_score["zscore"]))

                    # Only the first chunk writes the column header.
                    pfm_score[cols].to_csv(pfmscorefile, sep="\t", header=(chunk == 0))

        return pfmscorefile.name
コード例 #4
0
    def scan(self, seqs, nreport=100, scan_rc=True):
        """
        Scan a set of regions / sequences and yield the result per sequence.

        ``seqs`` is converted with ``as_fasta`` (using ``self.genome``) and
        its sequences are passed to ``self._scan_sequences`` along with
        ``nreport`` and ``scan_rc``. If no threshold has been configured yet,
        a default of 0.95 is set and a warning is written to stderr.
        """
        threshold_missing = not self.threshold
        if threshold_missing:
            sys.stderr.write("Using default threshold of 0.95. "
                             "This is likely not optimal!\n")
            self.set_threshold(threshold=0.95)

        fasta = as_fasta(seqs, genome=self.genome)

        for hits in self._scan_sequences(fasta.seqs, nreport, scan_rc):
            yield hits
コード例 #5
0
ファイル: scanner.py プロジェクト: jimwry/gimmemotifs
    def scan(self, seqs, nreport=100, scan_rc=True, zscore=False, gc=False):
        """
        Scan a set of regions or sequences.

        Parameters
        ----------
        seqs : object
            Input handed to ``as_fasta`` together with ``self.genome``.
        nreport : int, optional
            Forwarded unchanged to ``self._scan_sequences``.
        scan_rc : bool, optional
            Forwarded unchanged to ``self._scan_sequences``.
        zscore : bool, optional
            When True, the first element of every match tuple is replaced by
            ``(value - mean) / std``, with mean and std looked up per motif
            and per sequence bin via ``self.get_motif_mean_std``.
        gc : bool, optional
            Passed to ``self.set_meanstd`` when the mean/std table has to be
            (re)computed; it also selects which size check decides whether
            the table is recomputed.

        Yields
        ------
        list
            One item per scanned sequence, holding one list of 3-tuples per
            motif (same order as ``self.motif_ids``).
        """

        if not self.threshold:
            logger.info("Using default threshold of 0.95. "
                        "This is likely not optimal!")
            self.set_threshold(threshold=0.95)

        seqs = as_fasta(seqs, genome=self.genome)

        it = self._scan_sequences(seqs.seqs, nreport, scan_rc)

        gc_seqs = None
        if zscore:
            if gc:
                if len(self.meanstd) <= 1:
                    self.set_meanstd(gc=gc)
            else:
                if len(self.meanstd) != 1:
                    self.set_meanstd(gc=gc)

            # Sequence bins are only needed to look up mean/std values, so
            # they are not computed for raw-score scans.
            gc_seqs = [self.get_seq_bin(seq) for seq in seqs.seqs]

        logger.debug("Scanning")

        if not zscore:
            for result in it:
                yield result
            return

        for result, gc_seq in zip(it, gc_seqs):
            zresult = []
            for i, mrow in enumerate(result):
                motif_id = self.motif_ids[i]
                try:
                    m_mean, m_std = self.get_motif_mean_std(gc_seq, motif_id)
                except Exception:
                    # Report through the logger rather than stdout, so that
                    # scan output written to stdout is not polluted.
                    logger.error(
                        "could not get mean/std for bin %s and motif %s; "
                        "available values: %s",
                        gc_seq, motif_id, self.meanstd)
                    raise
                mrow = [((x[0] - m_mean) / m_std, x[1], x[2])
                        for x in mrow]
                zresult.append(mrow)
            yield zresult
コード例 #6
0
    def scan(self, seqs, nreport=100, scan_rc=True, zscore=False, gc=False):
        """
        Scan a set of regions or sequences in batches of 10,000.

        ``seqs`` is converted with ``as_fasta`` (using ``self.genome``). Each
        batch of sequences is passed to ``self._scan_sequences`` together
        with ``nreport``, ``scan_rc`` and ``zscore``, and every result it
        produces is yielded in order.

        When ``zscore`` is True the mean/std table is (re)computed first via
        ``self.set_meanstd(gc=gc)`` if its size does not fit the requested
        mode: at most one entry while ``gc`` is True, or anything other than
        exactly one entry while ``gc`` is False.
        """
        fasta = as_fasta(seqs, genome=self.genome)

        if zscore:
            n_entries = len(self.meanstd)
            needs_refresh = (n_entries <= 1) if gc else (n_entries != 1)
            if needs_refresh:
                self.set_meanstd(gc=gc)

        chunk_len = 10000
        logger.debug("Scanning")
        for offset in range(0, len(fasta), chunk_len):
            batch = fasta.seqs[offset:offset + chunk_len]
            for hits in self._scan_sequences(batch, nreport, scan_rc,
                                             zscore=zscore):
                yield hits
コード例 #7
0
def create_background_file(outfile,
                           bg_type,
                           fmt="fasta",
                           size=None,
                           genome=None,
                           inputfile=None,
                           number=10000):
    """
    Create a background file for motif analysis.

    Invalid argument combinations are reported with ``print`` and terminate
    the process via ``sys.exit(1)``.

    Parameters
    ----------
    outfile : str
        Name of the output file.
    bg_type : str
        Type of background (gc, genomic, random or promoter).
    fmt : str, optional
        Either 'fasta' or 'bed' ('fa' and 'fsa' are accepted as aliases for
        'fasta').
    size : int, optional
        Size of the generated sequences, is determined from the inputfile if not
        given (median sequence length; genomic and promoter backgrounds only).
    genome : str, optional
        Genome to use. Required for FASTA output of gc, genomic and promoter
        backgrounds.
    inputfile : str, optional
        Input sequences. Required for the gc background, read for the random
        background, and used to derive ``size`` and ``number`` when those
        are not given.
    number : int, optional
        Number of background sequences. If None, the number of sequences in
        ``inputfile`` is used, or 10,000 when there is no input file.
    """
    fmt = fmt.lower()
    if fmt in ["fa", "fsa"]:
        fmt = "fasta"

    if bg_type not in BG_TYPES:
        print("The argument 'type' should be one of: %s" %
              (",".join(BG_TYPES)))
        sys.exit(1)

    if fmt == "bed" and bg_type == "random":
        print("Random background can only be generated in FASTA format!")
        sys.exit(1)

    if bg_type == "gc" and not inputfile:
        print("need a FASTA formatted input file for background gc")
        sys.exit(1)

    # GimmeMotifs configuration for file and directory locations
    config = MotifConfig()

    # Genome index location for creation of FASTA files
    if bg_type in ["gc", "genomic", "promoter"] and fmt == "fasta":
        if genome is None:
            print("Need a genome to create background file")
            sys.exit(1)
        # Instantiated only for its side effects; presumably this fails early
        # when the genome cannot be found -- TODO confirm.
        Genome(genome)

    if bg_type in ["promoter"]:
        # Gene definition: prefer the annotation next to the genome FASTA and
        # fall back to <gene_dir>/<genome>.bed when that file is absent.
        # (The fallback used to test the *string* for emptiness, which is
        # never true after str.replace, so it could never trigger.)
        fname = Genome(genome).filename
        gene_file = fname.replace(".fa", ".annotation.bed.gz")
        if not os.path.exists(gene_file):
            gene_file = os.path.join(config.get_gene_dir(),
                                     "{}.bed".format(genome))

        if not os.path.exists(gene_file):
            print("Could not find a gene file for genome {}".format(genome))
            print("Did you use the --annotation flag for genomepy?")
            print(
                "Alternatively make sure there is a file called {}.bed in {}".
                format(genome, config.get_gene_dir()))
            sys.exit(1)

    # Number of sequences
    if number is None:
        if inputfile:
            number = number_of_seqs_in_file(inputfile)
            logger.info("Using %s of background sequences based on input file",
                        number)
        else:
            number = 10000
            logger.info(
                "Number of background sequences not specified, using 10,000 sequences"
            )

    if bg_type == "random":
        f = Fasta(inputfile)
        m = MarkovFasta(f, n=number, k=1)
        m.writefasta(outfile)
    elif bg_type == "gc":
        if fmt == "fasta":
            m = MatchedGcFasta(inputfile, genome, number=number, size=size)
            m.writefasta(outfile)
        else:
            matched_gc_bedfile(outfile, inputfile, genome, number, size=size)
    else:
        if size is None:
            # np.median returns a float (x.5 for an even number of
            # sequences); sequence sizes have to be whole numbers.
            size = int(np.median(
                [len(seq) for seq in as_fasta(inputfile, genome=genome).seqs]))
        if bg_type == "promoter":
            if fmt == "fasta":
                m = PromoterFasta(gene_file, genome, size=size, n=number)
                m.writefasta(outfile)
            else:
                create_promoter_bedfile(outfile, gene_file, size, number)
        elif bg_type == "genomic":
            if fmt == "fasta":
                m = RandomGenomicFasta(genome, size, number)
                m.writefasta(outfile)
            else:
                create_random_genomic_bedfile(outfile, genome, size, number)
コード例 #8
0
def scan_to_table(
    input_table, genome, scoring, pfmfile=None, ncpus=None, zscore=True, gc=True
):
    """Scan regions in input table with motifs.

    Parameters
    ----------
    input_table : str
        Filename of input table. Can be either a text-separated tab file or a
        feather file. The regions are taken from the first column.

    genome : str
        Genome name. Can be either the name of a FASTA-formatted file or a
        genomepy genome name.

    scoring : str
        "count" or "score". Any value other than "count" produces scores.

    pfmfile : str, optional
        Specify a PFM file for scanning. Defaults to the "motif_db" entry of
        the configuration.

    ncpus : int, optional
        If defined this specifies the number of cores to use.

    zscore : bool, optional
        Passed to ``Scanner.best_score`` when scoring; not used for counts.

    gc : bool, optional
        Passed to ``Scanner.set_background`` and, when scoring, to
        ``Scanner.best_score``.

    Returns
    -------
    table : pandas.DataFrame
        DataFrame with motif ids as column names and regions as index. Values
        are either counts or scores depending on the 'scoring' parameter.

    Raises
    ------
    ValueError
        If no ``pfmfile`` is given and no default database is configured.
    """
    config = MotifConfig()

    if pfmfile is None:
        default_db = config.get_default_params().get("motif_db", None)
        if default_db is None:
            raise ValueError("no pfmfile given and no default database specified")
        pfmfile = os.path.join(config.get_motif_dir(), default_db)

    logger.info("reading table")
    if input_table.endswith("feather"):
        idx = pd.read_feather(input_table).iloc[:, 0].values
    else:
        idx = pd.read_table(input_table, index_col=0, comment="#").index

    regions = list(idx)

    # The background sequence size is the median length of (at most) 1000
    # randomly drawn regions.
    sample = regions
    if len(regions) >= 1000:
        sample = np.random.choice(regions, size=1000, replace=False)
    seq_lengths = [len(seq) for seq in as_fasta(sample, genome=genome).seqs]
    size = int(np.median(seq_lengths))

    scanner = Scanner(ncpus=ncpus)
    scanner.set_motifs(pfmfile)
    scanner.set_genome(genome)
    scanner.set_background(genome=genome, gc=gc, size=size)

    if scoring == "count":
        logger.info("setting threshold")
        scanner.set_threshold(fpr=FPR)
        logger.info("creating count table")
        scores = list(scanner.count(regions))
        logger.info("done")
    else:
        scanner.set_threshold(threshold=0.0)
        if zscore:
            detail = " (z-score" + (", GC%" if gc else "") + ")"
        else:
            detail = " (logodds)"
        logger.info("creating score table" + detail)
        scores = list(scanner.best_score(regions, zscore=zscore, gc=gc))
        logger.info("done")

    motif_names = [motif.id for motif in read_motifs(pfmfile)]
    logger.info("creating dataframe")
    return pd.DataFrame(scores, index=idx, columns=motif_names)
コード例 #9
0
ファイル: scanner.py プロジェクト: jimwry/gimmemotifs
    def set_background(self,
                       fname=None,
                       genome=None,
                       size=200,
                       nseq=10000,
                       gc=False,
                       gc_bins=None):
        """Set the background to use for FPR and z-score calculations.

        Background can be specified either as a genome name or as the
        name of a FASTA file. If a background has already been set, the
        call returns without changing anything.

        Parameters
        ----------
        fname : str, optional
            Name of FASTA file to use as background.

        genome : str, optional
            Name of genome to use to retrieve random sequences. Falls back
            to ``self.genome`` when neither ``fname`` nor ``genome`` is
            given.

        size : int, optional
            Size of genomic sequences to retrieve. The default
            is 200.

        nseq : int, optional
            Number of genomic sequences to retrieve.

        gc : bool, optional
            If True, the genomic background is generated per GC bin with
            ``gc_bin_bedfile`` instead of ``RandomGenomicFasta``.

        gc_bins : list of (float, float), optional
            GC bins to use when ``gc`` is True. Defaults to (0.0, 0.2),
            (0.8, 1) and 0.05-wide bins covering 0.2 up to 0.8. When a
            cached background is found, the bins stored with it are used.

        Raises
        ------
        ValueError
            If both ``genome`` and ``fname`` are given, or if no genome can
            be determined.
        IOError
            If ``fname`` does not exist.
        """
        if self.background:
            return

        size = int(size)

        if genome and fname:
            raise ValueError("Need either genome or filename for background.")

        if fname:
            if not os.path.exists(fname):
                raise IOError(
                    "Background file {} does not exist!".format(fname))

            self.background = Fasta(fname)
            self.background_hash = file_checksum(fname)
            return

        if not genome:
            if self.genome:
                genome = self.genome
            else:
                raise ValueError(
                    "Need either genome or filename for background.")

        logger.debug("using background: genome {} with size {}".format(
            genome, size))

        lock.acquire()
        try:
            # Everything that can raise happens inside the try block, so the
            # lock is released even when generating the background fails.
            with Cache(CACHE_DIR) as cache:
                self.background_hash = "d{}:{}:{}:{}".format(
                    genome, int(size), gc, str(gc_bins))
                c = cache.get(self.background_hash)
                if c:
                    fa, gc_bins = c
                else:
                    fa = None

                if not fa:
                    if gc:

                        if gc_bins is None:
                            gc_bins = [(0.0, 0.2), (0.8, 1)]
                            for b in np.arange(0.2, 0.799, 0.05):
                                gc_bins.append((b, b + 0.05))

                        with NamedTemporaryFile() as tmp:
                            logger.info("using {} sequences".format(nseq))
                            gc_bin_bedfile(tmp.name,
                                           genome,
                                           number=nseq,
                                           length=size,
                                           bins=gc_bins)
                            # Read the sequences while the temporary BED file
                            # still exists.
                            fa = as_fasta(tmp.name, genome=genome)
                    else:
                        fa = RandomGenomicFasta(genome, size, nseq)
                    cache.set(self.background_hash, (fa, gc_bins))
        finally:
            lock.release()

        self.background = fa
        if gc_bins:
            self.gc_bins = gc_bins
コード例 #10
0
def command_scan(inputfile, pwmfile, nreport=1, cutoff=0.9, bed=False,
        scan_rc=True, table=False, score_table=False, moods=False,
        pvalue=None, bgfile=None, genome=None):
    """Scan ``inputfile`` with the motifs in ``pwmfile`` and yield output lines.

    Output mode, in order of precedence:

    * ``table``       -- a header line followed by one line of tab-separated
      counts per sequence.
    * ``score_table`` -- a header line followed by one line of tab-separated
      best scores per sequence.
    * otherwise       -- one line per match, produced by ``format_line``
      (``bed`` is passed through to it).

    With ``moods`` set, ``scan_it_moods`` does the scanning for the table
    and per-match modes; the score table always uses ``Scanner.best_score``.

    Each scan iterator is created only in the branch that consumes it, so no
    scan is started and then thrown away.

    Yields
    ------
    str
        Header and tab-separated table lines in the table modes; otherwise
        whatever ``format_line`` returns for each match.
    """
    motifs = pwmfile_to_motifs(pwmfile)

    index_dir = None
    if genome is not None:
        index_dir = os.path.join(MotifConfig().get_index_dir(), genome)

    # initialize scanner
    s = Scanner()
    s.set_motifs(pwmfile)

    fa = as_fasta(inputfile, index_dir)

    if table:
        # header
        yield "\t{}".format("\t".join([m.id for m in motifs]))

        if moods:
            result_it = scan_it_moods(inputfile, motifs, cutoff, bgfile,
                                      nreport, scan_rc, pvalue, table)
            for seq_id, counts in result_it:
                yield "{}\t{}".format(seq_id, "\t".join([str(x) for x in counts]))
        else:
            # counts table
            result_it = s.count(fa, nreport, scan_rc, cutoff)
            for i, counts in enumerate(result_it):
                yield "{}\t{}".format(
                        fa.ids[i],
                        "\t".join([str(x) for x in counts])
                        )

    elif score_table:
        # get iterator
        result_it = s.best_score(fa, scan_rc)
        # header
        yield "\t{}".format("\t".join([m.id for m in motifs]))
        # score table
        for i, scores in enumerate(result_it):
            yield "{}\t{}".format(
                    fa.ids[i],
                    "\t".join([str(x) for x in scores])
                    )

    elif moods:
        result_it = scan_it_moods(inputfile, motifs, cutoff, bgfile,
                                  nreport, scan_rc, pvalue, table)
        for motif, d in result_it:
            for seq_id, matches in d.items():
                # NOTE: tuples from the MOODS path are ordered
                # (pos, score, strand), unlike the Scanner path below.
                for pos, score, strand in matches:
                    yield format_line(fa, seq_id, motif,
                                      score, pos, strand, bed=bed)

    else:
        result_it = s.scan(fa, nreport, scan_rc, cutoff)
        for i, result in enumerate(result_it):
            seq_id = fa.ids[i]
            for motif, matches in zip(motifs, result):
                for (score, pos, strand) in matches:
                    yield format_line(fa, seq_id, motif,
                                      score, pos, strand, bed=bed)