Ejemplo n.º 1
0
def pfmfile_location(infile):
    """Return the location of a motif file.

    Parameters
    ----------
    infile : str, other object or None
        Name or path of a motif file. ``None`` selects the ``motif_db``
        entry of the default parameters in the configuration. Objects that
        are not strings are returned unchanged.

    Returns
    -------
    The resolved file name, or ``infile`` itself if it is not a string.

    Raises
    ------
    ValueError
        If no file is given and no default database is configured, or if a
        file name cannot be found as given, in the configured motif
        directory, or there with a ``.pfm`` / ``.pwm`` extension appended.
    """
    config = MotifConfig()

    if infile is None:
        infile = config.get_default_params().get("motif_db", None)
    if infile is None:
        raise ValueError(
            "No motif file was given and no default "
            "database specified in the config file."
        )

    # Nothing to resolve for non-strings and for paths that already exist.
    if not isinstance(infile, six.string_types) or os.path.exists(infile):
        return infile

    # Try the motif directory: first the bare name, then known extensions.
    in_motif_dir = os.path.join(config.get_motif_dir(), infile)
    candidates = (in_motif_dir, in_motif_dir + ".pfm", in_motif_dir + ".pwm")
    for candidate in candidates:
        if os.path.exists(candidate):
            return candidate

    raise ValueError("Motif file {} not found".format(infile))
Ejemplo n.º 2
0
def create_roc_plots(pfmfile, fgfa, background, outdir, genome):
    """Make ROC plots for all motifs.

    For every combination of motif and background, ``get_roc_values`` is run
    in a worker pool and the result is plotted with ``roc_plot`` to
    ``<outdir>/images/<motif id>_roc.<background>.png``.

    Parameters
    ----------
    pfmfile : str
        Motif file, read with ``read_motifs(..., fmt="pwm", as_dict=True)``.
    fgfa : object
        Foreground input, passed unchanged to ``get_roc_values``.
    background : dict
        Maps a background name to the background file used for it.
    outdir : str
        Output directory; an ``images`` subdirectory is created if needed.
    genome : object
        Passed unchanged to ``get_roc_values``.

    Notes
    -----
    The process exits with status 1 as soon as a job reports an error.
    """
    motifs = read_motifs(pfmfile, fmt="pwm", as_dict=True)
    ncpus = int(MotifConfig().get_default_params()["ncpus"])

    imgdir = os.path.join(outdir, "images")
    if not os.path.exists(imgdir):
        os.mkdir(imgdir)
    roc_img_file = os.path.join(imgdir, "{}_roc.{}.png")

    pool = Pool(processes=ncpus)
    try:
        jobs = {}
        for bg, fname in background.items():
            for motif in motifs.values():
                k = "{}_{}".format(str(motif), bg)
                jobs[k] = pool.apply_async(get_roc_values,
                                           (motif, fgfa, fname, genome))

        for motif in motifs.values():
            for bg in background:
                k = "{}_{}".format(str(motif), bg)
                error, x, y = jobs[k].get()
                if error:
                    logger.error("Error in thread: %s", error)
                    logger.error("Motif: %s", motif)
                    sys.exit(1)
                roc_plot(roc_img_file.format(motif.id, bg), x, y)
    finally:
        # Always release the worker processes, also on the sys.exit() path.
        # On success every result has already been collected, so nothing is
        # lost by terminating.
        pool.terminate()
        pool.join()
Ejemplo n.º 3
0
    def __init__(self, scale=True, ncpus=None):
        """Predict motif activities using Support Vector Regression.

        Parameters
        ----------
        scale : boolean, optional, default True
            If ``True``, the motif scores will be scaled before the
            model is applied.

        ncpus : int, optional
            Number of threads. When omitted, the ``ncpus`` value from the
            configuration is used (2 if the configuration has none).

        Attributes
        ----------
        act_ : DataFrame, shape (n_motifs, n_clusters)
            SVR weights; ``None`` until set elsewhere.
        """
        if ncpus is None:
            default_params = MotifConfig().get_default_params()
            ncpus = int(default_params.get("ncpus", 2))

        # User-selectable settings
        self.scale = scale
        self.ncpus = ncpus

        # Result placeholder and its description
        self.act_ = None
        self.act_description = "activity values: SVR weights"

        # Static description of this predictor
        self.ptype = "regression"
        self.pref_table = "score"
        self.supported_tables = ["score", "count"]
Ejemplo n.º 4
0
def scan_to_table(input_table, genome, data_dir, scoring, pwmfile=None):
    """Scan the regions of a table with motifs and return a score table.

    Parameters
    ----------
    input_table : str
        Tab-separated file; its first column holds the regions to scan.
    genome : object
        Genome passed to ``check_threshold`` and ``Scanner.set_genome``.
    data_dir : str
        Directory passed to ``check_threshold``.
    scoring : str
        ``"count"`` produces motif counts (using the threshold from
        ``check_threshold`` as cutoff); any other value produces best scores.
    pwmfile : str, optional
        Motif file. Defaults to the ``motif_db`` entry of the configuration,
        located in the configured motif directory.

    Returns
    -------
    pandas.DataFrame
        One row per region, one column per motif id.

    Raises
    ------
    ValueError
        If no motif file is given and no default database is configured.
    """
    threshold = check_threshold(data_dir, genome, scoring)

    config = MotifConfig()

    if pwmfile is None:
        pwmfile = config.get_default_params().get("motif_db", None)
        if pwmfile is not None:
            pwmfile = os.path.join(config.get_motif_dir(), pwmfile)

    if pwmfile is None:
        raise ValueError("no pwmfile given and no default database specified")

    df = pd.read_table(input_table, index_col=0)
    regions = list(df.index)
    s = Scanner()
    s.set_motifs(pwmfile)
    s.set_genome(genome)

    if scoring == "count":
        scores = list(s.count(regions, cutoff=threshold))
    else:
        scores = list(s.best_score(regions))

    # Read the motif ids inside a context manager so the handle is closed.
    with open(pwmfile) as f:
        motif_names = [m.id for m in read_motifs(f)]
    return pd.DataFrame(scores, index=df.index, columns=motif_names)
Ejemplo n.º 5
0
def scan_it_moods(infile,
                  motifs,
                  cutoff,
                  bgfile,
                  nreport=1,
                  scan_rc=True,
                  pvalue=None,
                  count=False):
    """Scan a FASTA file with motifs using MOODS, in parallel chunks.

    This is a generator; no work is done before the first item is requested.

    Parameters
    ----------
    infile : str
        FASTA file with the sequences to scan.
    motifs : iterable
        Motif objects; their ``id`` and ``pwm`` attributes are used.
    cutoff : float or str
        Converted to float and passed to ``calc_threshold_moods``; only used
        when ``pvalue`` is None.
    bgfile : str
        FASTA file from which the background distribution is estimated.
    nreport : int, optional
        Passed unchanged to the scanning worker.
    scan_rc : bool, optional
        Passed unchanged to the scanning worker.
    pvalue : float or str, optional
        If given, thresholds are derived from this p-value instead of
        ``cutoff``.
    count : bool, optional
        If True ``scan_fa_with_motif_moods_count`` is used as worker,
        otherwise ``scan_fa_with_motif_moods``.

    Yields
    ------
    The individual items of every worker result, in submission order.
    """
    import shutil

    pseudocount = 1e-3
    bg = MOODS.tools.bg_from_sequence_dna("".join(Fasta(bgfile).seqs), 1)

    # The PFM files are only needed while MOODS parses them, so the temporary
    # directory is removed as soon as all matrices have been built.
    tmpdir = mkdtemp()
    try:
        matrices = []
        for motif in motifs:
            pfmname = os.path.join(tmpdir, "{}.pfm".format(motif.id))
            with open(pfmname, "w") as f:
                matrix = np.array(motif.pwm).transpose()
                for row in matrix:
                    f.write("{}\n".format(" ".join([str(x) for x in row])))

            matrices.append(
                MOODS.parsers.pfm_log_odds(pfmname, bg, pseudocount))
    finally:
        shutil.rmtree(tmpdir, ignore_errors=True)

    if pvalue is not None:
        thresholds = [
            MOODS.tools.threshold_from_p(m, bg, float(pvalue))
            for m in matrices
        ]
    else:
        thresholds = [calc_threshold_moods(m, float(cutoff)) for m in matrices]

    scanner = MOODS.scan.Scanner(7)
    scanner.set_motifs(matrices, bg, thresholds)

    config = MotifConfig()
    ncpus = int(config.get_default_params()["ncpus"])
    fa = Fasta(infile)
    chunk = 500
    if (len(fa) / chunk) < ncpus:
        # Integer division keeps the range() step an int on Python 3; the
        # lower bound of 1 prevents a zero step for very small inputs.
        chunk = max(1, len(fa) // (ncpus + 1))

    func = scan_fa_with_motif_moods
    if count:
        func = scan_fa_with_motif_moods_count

    # NOTE(review): the pool size is left at the multiprocessing default, as
    # in the original; ncpus is only used for the chunk size. Confirm intent.
    pool = mp.Pool()
    try:
        jobs = [
            pool.apply_async(
                func,
                (fa[i:i + chunk], motifs, matrices, bg, thresholds, nreport,
                 scan_rc),
            ) for i in range(0, len(fa), chunk)
        ]
        pool.close()

        for job in jobs:
            for ret in job.get():
                yield ret
    finally:
        # Runs on exhaustion, on error and when the generator is closed early.
        pool.terminate()
        pool.join()
Ejemplo n.º 6
0
class MotifProgram:
    """Base class for wrappers around external motif programs.

    Subclasses are expected to provide a ``name`` attribute and a
    ``_run_program(bin, fastafile, savedir, params)`` method; neither is
    defined here.
    """
    from gimmemotifs.config import MotifConfig
    # Shared by all instances; created when the class body is executed.
    config = MotifConfig()

    def __init__(self):
        pass

    def bin(self):
        """Return the configured executable for this program."""
        return self.config.bin(self.name)

    def dir(self):
        """Return the configured directory for this program."""
        return self.config.dir(self.name)

    def is_configured(self):
        """Return whether the configuration knows this program."""
        return self.config.is_configured(self.name)

    def is_installed(self):
        """Return whether the program is configured and its binary executable."""
        return self.is_configured() and os.access(self.bin(), os.X_OK)

    def run(self, fastafile, savedir, params=None):
        """Run the program and return the result of ``_run_program``.

        Parameters
        ----------
        fastafile : str
            Input passed to ``_run_program``.
        savedir : str
            Output location passed to ``_run_program``.
        params : dict, optional
            Program parameters; an empty dict is used when omitted.

        Returns
        -------
        Whatever ``_run_program`` returns, or ``([], "Killed", "Killed")``
        when the run is interrupted from the keyboard.

        Raises
        ------
        ValueError
            If the program is not configured or not installed.
        """
        # A fresh dict per call instead of a shared mutable default.
        if params is None:
            params = {}

        # Call-style raise is valid on both Python 2 and Python 3.
        if not self.is_configured():
            raise ValueError("%s is not configured" % self.name)

        if not self.is_installed():
            raise ValueError(
                "%s is not installed or not correctly configured" % self.name)

        try:
            return self._run_program(self.bin(), fastafile, savedir, params)
        except KeyboardInterrupt:
            return ([], "Killed", "Killed")
Ejemplo n.º 7
0
def get_genome(genomebuild, fastadir, indexdir=None):
    """Download a genome with its annotation and index it.

    Parameters
    ----------
    genomebuild : str
        Name of the genome build; also used as subdirectory name.
    fastadir : str
        Directory in which the genome directory is created.
    indexdir : str, optional
        Directory for the index. Defaults to the index directory from the
        configuration.

    Notes
    -----
    Exits with status 1 if the genome directory cannot be created.
    """
    config = MotifConfig()
    index_root = indexdir if indexdir else config.get_index_dir()

    genome_dir = os.path.join(fastadir, genomebuild)
    index_dir = os.path.join(index_root, genomebuild)

    # Make sure the genome directory exists and can be created by this user
    if not os.path.exists(genome_dir):
        try:
            os.mkdir(genome_dir)
        except OSError:
            sys.stderr.write(
                "Could not create genome dir {}\n".format(genome_dir))
            sys.exit(1)

    # Annotation first, then the genome FASTA itself
    gene_file = os.path.join(config.get_gene_dir(), "%s.bed" % genomebuild)
    download_annotation(genomebuild, gene_file)
    download_genome(genomebuild, genome_dir)

    sys.stderr.write("Creating index\n")
    GenomeIndex().create_index(genome_dir, index_dir)
    create_bedtools_fa(index_dir, genome_dir)
Ejemplo n.º 8
0
    def get_all_scores(self,
                       motifs,
                       dbmotifs,
                       match,
                       metric,
                       combine,
                       pval=False,
                       parallel=True,
                       trim=None,
                       ncpus=None):
        """Score every motif against every database motif.

        Parameters
        ----------
        motifs : list
            Query motifs.
        dbmotifs : list
            Motifs to compare against.
        match, metric, combine, pval
            Passed unchanged to ``_get_all_scores``.
        parallel : bool, optional
            Split ``dbmotifs`` in batches and score them in a process pool.
        trim : optional
            If truthy, every motif in both lists is trimmed with this value
            before scoring.
        ncpus : int, optional
            Pool size; defaults to ``ncpus`` from the configuration.

        Returns
        -------
        dict
            Nested dictionary ``scores[m1][m2]`` with the result for each
            pair, as produced by ``_get_all_scores``.
        """
        # Trimming happens in place, before any scoring
        if trim:
            for collection in (motifs, dbmotifs):
                for m in collection:
                    m.trim(trim)

        if not parallel:
            # Single call covering all database motifs
            return _get_all_scores(self, motifs, dbmotifs, match, metric,
                                   combine, pval)

        if ncpus is None:
            ncpus = int(MotifConfig().get_default_params()["ncpus"])

        pool = Pool(processes=ncpus, maxtasksperchild=1000)

        # One large batch per process keeps the parallel overhead low
        batch_len = max(1, len(dbmotifs) // ncpus)
        jobs = [
            pool.apply_async(_get_all_scores,
                             args=(self, motifs,
                                   dbmotifs[start:start + batch_len], match,
                                   metric, combine, pval))
            for start in range(0, len(dbmotifs), batch_len)
        ]
        pool.close()

        # Merge the per-batch results into one nested dictionary
        scores = {}
        for job in jobs:
            for m1, per_db in job.get().items():
                for m2, s in per_db.items():
                    scores.setdefault(m1, {})[m2] = s

        pool.join()
        return scores
Ejemplo n.º 9
0
    def __init__(self, scale=True, permute=False, ncpus=None):
        """Predict motif activities using lightning CDClassifier.

        Parameters
        ----------
        scale : boolean, optional, default True
            If ``True``, the motif scores will be scaled
            before classification.

        permute : boolean, optional, default False
            Stored as ``self.permute``; not used by the constructor itself.

        ncpus : int, optional
            Number of jobs for the grid search. Default is the number
            specified in the config (2 if the config has none).

        Attributes
        ----------
        act_ : DataFrame, shape (n_motifs, n_clusters)
            Fitted coefficients; ``None`` after construction.

        sig_ : DataFrame, shape (n_motifs,)
            ``None`` after construction. Per the original documentation:
            boolean values marking coefficients beyond the 1% level of
            random permutations.
        """
        if ncpus is None:
            cfg_params = MotifConfig().get_default_params()
            ncpus = int(cfg_params.get("ncpus", 2))

        self.scale = scale
        self.permute = permute
        self.kfolds = 10

        # NOTE: the classifier is created without an explicit random_state.
        self.cdc = CDClassifier()

        # Hyper-parameter grid explored by the cross-validated search
        alphas = [np.exp(-x) for x in np.arange(0, 10, 1 / 3.0)]
        self.parameters = {
            "penalty": ["l1/l2"],
            "loss": ["squared_hinge"],
            "multiclass": [True],
            "max_iter": [20],
            "alpha": alphas,
            "C": [0.001, 0.01, 0.1, 0.5, 1.0],
            "tol": [1e-3]
        }

        self.clf = GridSearchCV(
            self.cdc,
            self.parameters,
            cv=self.kfolds,
            n_jobs=ncpus,
        )

        # Results are not available until a model has been fitted elsewhere
        self.act_ = None
        self.sig_ = None
        self.act_description = ("activity values: coefficients from "
                                "fitted model")

        self.ptype = "classification"
        self.pref_table = "score"
        self.supported_tables = ["score", "count"]
Ejemplo n.º 10
0
def prepare_denovo_input_bed(inputfile, params, outdir):
    """Prepare a BED file for de novo motif prediction.

    All regions to same size; split in test and validation set;
    converted to FASTA.

    Parameters
    ----------
    inputfile : str
        BED file with input regions.

    params : dict
        Dictionary with parameters. The keys ``width``, ``abs_max``,
        ``fraction``, ``genome``, ``lwidth`` and ``use_strand`` are read.

    outdir : str
        Output directory to save files.
    """
    logger.info("preparing input (BED)")

    # Create BED file with regions of equal size
    width = int(params["width"])
    bedfile = os.path.join(outdir, "input.bed")
    write_equalwidth_bedfile(inputfile, width, bedfile)

    abs_max = int(params["abs_max"])
    fraction = float(params["fraction"])
    pred_bedfile = os.path.join(outdir, "prediction.bed")
    val_bedfile = os.path.join(outdir, "validation.bed")
    # Split input into prediction and validation set
    logger.debug(
        "Splitting %s into prediction set (%s) and validation set (%s)",
        bedfile, pred_bedfile, val_bedfile)
    divide_file(bedfile, pred_bedfile, val_bedfile, fraction, abs_max)

    genome = Genome(params["genome"])
    for infile in [pred_bedfile, val_bedfile]:
        # Swap only the file extension. A plain str.replace would also
        # rewrite any ".bed" that occurs in a directory name of outdir.
        fasta_file = os.path.splitext(infile)[0] + ".fa"
        genome.track2fasta(infile, fasta_file)

    # Create file for location plots: the validation regions, extended on
    # both sides so that their total width becomes lwidth.
    lwidth = int(params["lwidth"])
    extend = (lwidth - width) // 2

    genome.track2fasta(
        val_bedfile,
        os.path.join(outdir, "localization.fa"),
        extend_up=extend,
        extend_down=extend,
        stranded=params["use_strand"],
    )
Ejemplo n.º 11
0
    def __init__(self, ncpus=None):
        """Initialise configuration, worker pool and optional cache.

        Parameters
        ----------
        ncpus : int, optional
            Number of worker processes. Defaults to the ``ncpus`` value of
            the configuration. A pool is only created for more than one.
        """
        self.config = MotifConfig()
        self.threshold = None
        self.genome = None

        self.ncpus = (
            int(MotifConfig().get_default_params()["ncpus"])
            if ncpus is None else ncpus
        )

        if self.ncpus > 1:
            # Prefer a "spawn" context; fall back to a plain pool when the
            # multiprocessing module raises AttributeError for it.
            try:
                self.pool = mp.get_context('spawn').Pool(
                    processes=self.ncpus)
            except AttributeError:
                self.pool = mp.Pool(processes=self.ncpus)

        # Caching is off unless the configuration switches it on
        self.use_cache = False
        cache_wanted = self.config.get_default_params().get(
            "use_cache", False)
        if cache_wanted:
            self._init_cache()
Ejemplo n.º 12
0
    def __init__(self, scale=True, kfolds=4, alpha_stepsize=1.0, ncpus=None):
        """Predict motif activities using Lasso MultiTask regression.

        Parameters
        ----------
        scale : boolean, optional, default True
            If ``True``, the motif scores will be scaled
            before the model is applied.

        kfolds : integer, optional, default 4
            Number of folds used for the parameter search.

        alpha_stepsize : float, optional, default 1.0
            Step size of the exponent range (0 up to 10) from which the
            alpha grid ``exp(-x)`` is built.

        ncpus : int, optional
            Number of jobs for the grid search. Default is the number
            specified in the config (2 if the config has none).

        Attributes
        ----------
        act_ : DataFrame, shape (n_motifs, n_clusters)
            Fitted motif activities; ``None`` after construction.

        sig_ : DataFrame, shape (n_motifs,)
            ``None`` after construction. Per the original documentation:
            boolean values marking coefficients beyond the 1% level of
            random permutations.
        """
        if ncpus is None:
            ncpus = int(MotifConfig().get_default_params().get("ncpus", 2))

        self.scale = scale
        self.kfolds = kfolds
        self.ncpus = ncpus
        self.act_description = "activity values: coefficients from fitted model"

        # Results are not available until a model has been fitted elsewhere
        self.act_ = None
        self.sig_ = None

        # Cross-validated search over the Lasso regularisation strength
        alpha_grid = [np.exp(-x) for x in np.arange(0, 10, alpha_stepsize)]
        self.clf = GridSearchCV(
            MultiTaskLasso(),
            {"alpha": alpha_grid},
            cv=kfolds,
            n_jobs=self.ncpus,
            scoring="r2",
        )

        self.ptype = "regression"
        self.pref_table = "score"
        self.supported_tables = ["score", "count"]
Ejemplo n.º 13
0
def default_motifs():
    """Return list of Motif instances from default motif database.

    The database is the ``motif_db`` entry of the default parameters,
    located in the configured motif directory.

    Raises
    ------
    ValueError
        If the motif directory or the default motif database is not
        configured.
    """
    config = MotifConfig()
    motif_dir = config.get_motif_dir()
    # .get() so that a missing entry reaches the ValueError below instead of
    # failing earlier with a KeyError.
    motif_db = config.get_default_params().get("motif_db", None)

    if not motif_dir or not motif_db:
        raise ValueError("default motif database not configured")

    fname = os.path.join(motif_dir, motif_db)
    with open(fname) as f:
        motifs = read_motifs(f)

    return motifs
Ejemplo n.º 14
0
    def __init__(self, ncpus=None):
        """Initialise configuration, scanning state, pool and optional cache.

        Parameters
        ----------
        ncpus : int, optional
            Number of worker processes. Defaults to the ``ncpus`` value of
            the configuration. A pool is only created for more than one.
        """
        self.config = MotifConfig()

        # Scanning state, all unset after construction
        self.threshold = None
        self.genome = None
        self.background = None
        self.meanstd = {}
        self.gc_bins = [(0, 1)]

        self.ncpus = (
            int(MotifConfig().get_default_params()["ncpus"])
            if ncpus is None else ncpus
        )

        if self.ncpus > 1:
            # NOTE: a pool from mp.get_context('spawn') was tried here
            # before and is currently disabled; the default pool is used.
            self.pool = mp.Pool(processes=self.ncpus)

        # Caching is off unless the configuration switches it on
        self.use_cache = False
        cache_wanted = self.config.get_default_params().get(
            "use_cache", False)
        if cache_wanted:
            self._init_cache()
Ejemplo n.º 15
0
    def run(self):
        """Write the configuration file for the installed package.

        The build configuration is loaded, all data directories are pointed
        at the installation directory, tool paths are rewritten from the
        build tools directory to the final tools directory, and the result
        is written to ``<install_dir>/gimmemotifs/<CONFIG_NAME>``. An existing
        configuration file is first moved aside with a timestamp suffix.
        """
        from gimmemotifs.config import MotifConfig

        cfg = MotifConfig(use_config=self.build_cfg)

        data_dir = self.remove_nonsense(os.path.abspath(self.install_dir))
        dlog.info("data_dir: {}".format(data_dir))
        cfg.set_template_dir(os.path.join(data_dir, 'gimmemotifs/templates'))
        cfg.set_gene_dir(os.path.join(data_dir, 'gimmemotifs/genes'))
        cfg.set_score_dir(os.path.join(data_dir, 'gimmemotifs/score_dists'))
        cfg.set_motif_dir(os.path.join(data_dir,
                                       'gimmemotifs/motif_databases'))
        cfg.set_bg_dir(os.path.join(data_dir, 'gimmemotifs/bg'))
        cfg.set_tools_dir(os.path.join(data_dir, 'gimmemotifs/tools'))

        build_tools_dir = os.path.abspath(self.build_tools_dir)
        final_tools_dir = self.remove_nonsense(self.install_tools_dir)
        for program in MOTIF_CLASSES:
            # NOTE(review): eval() turns the class name into the class; this
            # is only acceptable while MOTIF_CLASSES is a trusted constant.
            m = eval(program)()
            if cfg.is_configured(m.name):
                # Local names avoid shadowing the builtins bin() and dir().
                prog_bin = cfg.bin(m.name).replace(build_tools_dir,
                                                   final_tools_dir)
                prog_dir = cfg.dir(m.name)
                if prog_dir:
                    prog_dir = prog_dir.replace(build_tools_dir,
                                                final_tools_dir)
                cfg.set_program(m.name, {"bin": prog_bin, "dir": prog_dir})

        seqlogo = cfg.get_seqlogo()
        seqlogo = seqlogo.replace(build_tools_dir, final_tools_dir)
        cfg.set_seqlogo(seqlogo)

        # Use a user-specific configfile if any other installation scheme is used
        #        if os.path.abspath(self.install_dir) == "/usr/share":
        config_file = os.path.join(self.install_dir,
                                   "gimmemotifs/%s" % CONFIG_NAME)
        self.outfiles = [config_file]

        if os.path.exists(config_file):
            # Keep the previous configuration under a timestamped name
            timestr = time.strftime("%Y%m%d-%H%M%S")
            old_config = "{}.{}".format(config_file, timestr)
            shutil.move(config_file, old_config)
            dlog.info("INFO: Configfile %s already existed!", config_file)
            dlog.info("INFO: This config has been saved as %s", old_config)

        dlog.info("writing configuration file %s" % config_file)
        # The context manager guarantees the file is flushed and closed.
        with open(config_file, "w") as f:
            cfg.write(f)
Ejemplo n.º 16
0
def scan_to_table(input_table,
                  genome,
                  data_dir,
                  scoring,
                  pwmfile=None,
                  ncpus=None):
    """Scan the regions of a table with motifs and return a score table.

    Parameters
    ----------
    input_table : str
        File with the regions in its first column. A name ending in
        ``feather`` is read with ``pd.read_feather``; anything else as a
        tab-separated table in which lines starting with ``#`` are comments.
    genome : object
        Genome passed to the scanner.
    data_dir : str
        Not used by this function.
    scoring : str
        ``"count"`` produces motif counts at a threshold set from ``FPR``;
        any other value produces best scores.
    pwmfile : str, optional
        Motif file. Defaults to the ``motif_db`` entry of the configuration,
        located in the configured motif directory.
    ncpus : int, optional
        Passed to ``Scanner``.

    Returns
    -------
    pandas.DataFrame
        One row per region, one column per motif id.

    Raises
    ------
    ValueError
        If no motif file is given and no default database is configured.
    """
    config = MotifConfig()

    if pwmfile is None:
        pwmfile = config.get_default_params().get("motif_db", None)
        if pwmfile is not None:
            pwmfile = os.path.join(config.get_motif_dir(), pwmfile)

    if pwmfile is None:
        raise ValueError("no pwmfile given and no default database specified")

    logger.info("reading table")
    if input_table.endswith("feather"):
        df = pd.read_feather(input_table)
        # Feather files have no index; the first column holds the regions
        idx = df.iloc[:, 0].values
    else:
        df = pd.read_table(input_table, index_col=0, comment="#")
        idx = df.index

    regions = list(idx)
    s = Scanner(ncpus=ncpus)
    s.set_motifs(pwmfile)
    s.set_genome(genome)

    if scoring == "count":
        logger.info("setting threshold")
        s.set_threshold(fpr=FPR, genome=genome)
        logger.info("creating count table")
        scores = list(s.count(regions))
        logger.info("done")
    else:
        s.set_threshold(threshold=0.0)
        logger.info("creating score table")
        scores = list(s.best_score(regions))
        logger.info("done")

    # Read the motif ids inside a context manager so the handle is closed.
    with open(pwmfile) as f:
        motif_names = [m.id for m in read_motifs(f)]
    logger.info("creating dataframe")
    return pd.DataFrame(scores, index=idx, columns=motif_names)
Ejemplo n.º 17
0
def maelstrom_html_report(outdir, infile, pwmfile=None, threshold=2):
    """Write an HTML report of a motif activity table.

    Rows in which no value reaches ``threshold`` in absolute terms are
    dropped. For the remaining motifs a logo image is written to
    ``<outdir>/logos`` and a sortable, colored table is written to
    ``<outdir>/gimme.maelstrom.report.html``.

    Parameters
    ----------
    outdir : str
        Output directory.
    infile : str
        Tab-separated table with motif ids in the first column.
    pwmfile : str, optional
        Motif file; the default motif database is used when omitted.
    threshold : float, optional
        Minimum absolute value a row needs in at least one column.
    """
    df = pd.read_table(infile, index_col=0)
    df = df[np.any(abs(df) >= threshold, 1)]
    # Symmetric value range for the color gradient
    vmax = max(abs(df.min().min()), df.max().max())
    vmin = -vmax

    if pwmfile:
        with open(pwmfile) as f:
            motifs = read_motifs(f)
    else:
        motifs = default_motifs()

    # Assigning None works whether Index.name is a plain attribute or a
    # property; deleting the attribute does not.
    df.index.name = None
    cols = df.columns
    motif2factors = {motif.id: ",".join(motif.factors) for motif in motifs}

    # Factor names: full text as tooltip, at most 30 characters displayed
    df["factors"] = [motif2factors.get(motif_id, "") for motif_id in df.index]
    is_long = df["factors"].str.len() > 30
    df["factors"] = '<div title="' + df["factors"] + '">' + df["factors"].str.slice(0,30)
    df.loc[is_long, "factors"] += '(...)'
    df['factors'] += '</div>'

    df["logo"] = ['<img src="logos/{}.png" height=40/>'.format(x) for x in list(df.index)]

    if not os.path.exists(outdir + "/logos"):
        os.makedirs(outdir + "/logos")
    for motif in motifs:
        if motif.id in df.index:
            motif.to_img(outdir + "/logos/{}.png".format(motif.id), fmt="PNG")

    # Read the static assets with context managers so the handles are closed
    template_dir = MotifConfig().get_template_dir()
    with open(os.path.join(template_dir, "sortable/sortable.min.js"), encoding="utf-8") as f:
        js = f.read()
    with open(os.path.join(template_dir, "sortable/sortable-theme-slick.css"), encoding="utf-8") as f:
        css = f.read()

    df = df[["factors", "logo"] + list(cols)]

    # Render before opening the output, so a rendering error does not leave
    # a half-written report behind.
    table_html = (
        df.style.apply(background_gradient, low=0.7, high=0.7, m=vmin, M=vmax, subset=cols)
        .set_precision(3)
        .set_table_attributes("data-sortable")
        .render()
        .replace("data-sortable", 'class="sortable-theme-slick" data-sortable')
    )

    with open(outdir + "/gimme.maelstrom.report.html", "w", encoding="utf-8") as f:
        f.write("<head>\n")
        f.write("<style>{}</style>\n".format(css))
        f.write("</head>\n")
        f.write("<body>\n")

        f.write(table_html)

        f.write("<script>{}</script>\n".format(js))
        f.write("</body>\n")
Ejemplo n.º 18
0
    def __init__(self, name=None):
        """Set up a named run: output directory, logging and file names.

        Parameters
        ----------
        name : str, optional
            Name of the run. When empty or omitted, a name is built from
            ``self.NAME`` and today's date in ``dd_mm_YYYY`` form.
        """
        self.config = MotifConfig()
        self.server = None

        run_name = name
        if not run_name:
            today = datetime.today().strftime("%d_%m_%Y")
            run_name = "%s_%s" % (self.NAME, today)
        self.name = run_name

        # Directory for all intermediate and output files
        self._setup_output_dir(run_name)

        # Logging comes next, so the start of the run can be reported
        self._setup_logging()
        self.logger.info("%s version %s", self.NAME, GM_VERSION)
        self.logger.info("output dir: %s", self.outdir)

        # Names of the intermediate and output files
        self._setup_filenames()
Ejemplo n.º 19
0
def _write_report(outdir, ids, tree, clusters):
    """Write the cluster report, the cluster key and the clustered motifs.

    Three files are created in ``outdir``: ``cluster_report.html`` (rendered
    from ``cluster_template.jinja.html``), ``cluster_key.txt`` and
    ``clustered_motifs.pwm``.

    Parameters
    ----------
    outdir : str
        Output directory.
    ids : list
        One entry per cluster; element 0 is written as the cluster id and
        element 2 is a list of dicts whose ``"alt"`` values are written as
        the member names.
    tree : object
        Provides ``get_clustered_motifs()``; only used when there is more
        than one cluster or the single cluster has more than one member.
    clusters : list
        ``(motif, members)`` pairs.
    """
    config = MotifConfig()
    env = jinja2.Environment(loader=jinja2.FileSystemLoader([config.get_template_dir()]))
    template = env.get_template("cluster_template.jinja.html")
    result = template.render(motifs=ids)

    with open(os.path.join(outdir, "cluster_report.html"), "w") as f:
        f.write(result)

    # Context managers close the files even when a write fails.
    with open(os.path.join(outdir, "cluster_key.txt"), "w") as f:
        for motif_id in ids:
            f.write("%s\t%s\n" % (motif_id[0], ",".join([x["alt"] for x in motif_id[2]])))

    with open(os.path.join(outdir, "clustered_motifs.pwm"), "w") as f:
        if len(clusters) == 1 and len(clusters[0][1]) == 1:
            # A single motif was given: write it as is, without using tree
            f.write("%s\n" % clusters[0][0].to_pwm())
        else:
            for motif in tree.get_clustered_motifs():
                f.write("%s\n" % motif.to_pwm())
Ejemplo n.º 20
0
    def __init__(self, scale=True, cv=3, ncpus=None):
        """Predict motif activities using lightning CDRegressor.

        Parameters
        ----------
        scale : boolean, optional, default True
            If ``True``, the motif scores will be scaled
            before the model is applied.

        cv : int, optional, default 3
            Cross-validation k-fold parameter, stored as ``self.kfolds``.

        ncpus : int, optional
            Number of threads. Default is the number specified in the
            config (2 if the config has none).

        Attributes
        ----------
        act_ : DataFrame, shape (n_motifs, n_clusters)
            Fitted coefficients; ``None`` after construction.
        """
        if ncpus is None:
            default_params = MotifConfig().get_default_params()
            ncpus = int(default_params.get("ncpus", 2))

        # User-selectable settings
        self.scale = scale
        self.kfolds = cv
        self.ncpus = ncpus

        # Result placeholder and its description
        self.act_ = None
        self.act_description = ("activity values: coefficients from "
                                "fitted model")

        # Static description of this predictor
        self.ptype = "regression"
        self.pref_table = "score"
        self.supported_tables = ["score", "count"]
Ejemplo n.º 21
0
    def __init__(self, ncpus=None):
        """Predict motif activities using a random forest classifier.

        Parameters
        ----------
        ncpus : int, optional
            Number of threads. Default is the number specified in the
            config (2 if the config has none).

        Attributes
        ----------
        act_ : DataFrame, shape (n_motifs, n_clusters)
            Feature importances from the model; ``None`` after construction.
        """
        if ncpus is None:
            default_params = MotifConfig().get_default_params()
            ncpus = int(default_params.get("ncpus", 2))
        self.ncpus = ncpus

        # Result placeholder and its description
        self.act_ = None
        self.act_description = ("activity values: feature importances "
                                "from fitted Random Forest model")

        # Static description of this predictor
        self.ptype = "classification"
        self.pref_table = "score"
        self.supported_tables = ["score", "count"]
Ejemplo n.º 22
0
def location(args):
    """
    Creates histogram of motif location.

    For every selected motif, ``motif_localization`` is run in a worker
    pool with ``<motif id>_histogram`` as output name.

    Parameters
    ----------
    args : argparse object
        Command line arguments. The attributes ``fastafile``, ``pwmfile``,
        ``width``, ``ids`` (comma-separated motif ids, optional) and
        ``cutoff`` are read.
    """
    fastafile = args.fastafile
    pwmfile = args.pwmfile

    lwidth = args.width
    if not lwidth:
        # No width given: use the length of the first sequence in the file
        lwidth = len(Fasta(fastafile).items()[0][1])

    motifs = pwmfile_to_motifs(pwmfile)
    if args.ids:
        ids = set(args.ids.split(","))
    else:
        ids = set(motif.id for motif in motifs)

    n_cpus = int(MotifConfig().get_default_params()["ncpus"])
    pool = Pool(processes=n_cpus, maxtasksperchild=1000)
    try:
        jobs = [
            pool.apply_async(
                motif_localization,
                (fastafile, motif, lwidth, "%s_histogram" % motif.id,
                 args.cutoff))
            for motif in motifs if motif.id in ids
        ]
        pool.close()

        # Wait for every job; get() re-raises errors from the workers
        for job in jobs:
            job.get()
    finally:
        # Release the worker processes, also when a job failed
        pool.terminate()
        pool.join()
Ejemplo n.º 23
0
class MotifProgram(object):
    """Base class for wrappers around external motif programs.

    Subclasses are expected to provide a ``name`` attribute and a
    ``_run_program(bin, fastafile, savedir, params)`` method; neither is
    defined here.
    """

    # Shared by all instances; created when the class body is executed.
    config = MotifConfig()
    # Optional override for the executable, takes precedence over the config.
    local_bin = None

    def __init__(self):
        pass

    def bin(self):
        """Return ``local_bin`` if set, else the configured executable."""
        if not self.local_bin:
            return self.config.bin(self.name)
        return self.local_bin

    def dir(self):
        """Return the configured directory for this program."""
        return self.config.dir(self.name)

    def is_configured(self):
        """Return whether the configuration knows this program."""
        return self.config.is_configured(self.name)

    def is_installed(self):
        """Return whether the program is configured and its binary executable."""
        configured = self.is_configured()
        if not configured:
            return configured
        return os.access(self.bin(), os.X_OK)

    def run(self, fastafile, savedir, params=None, tmp=None):
        """Run the program and return the result of ``_run_program``.

        A temporary directory named after the program is created (inside
        ``tmp`` if given) and stored as ``self.tmpdir`` before the run.

        Returns ``([], "Killed", "Killed")`` when the run is interrupted
        from the keyboard.

        Raises
        ------
        ValueError
            If the program is not configured or not installed.
        """
        if not self.is_configured():
            raise ValueError("%s is not configured" % self.name)

        if not self.is_installed():
            message = "%s is not installed or not correctly configured" % self.name
            raise ValueError(message)

        tmp_prefix = "{0}.".format(self.name)
        self.tmpdir = mkdtemp(prefix=tmp_prefix, dir=tmp)

        try:
            program = self.bin()
            return self._run_program(program, fastafile, savedir, params)
        except KeyboardInterrupt:
            return ([], "Killed", "Killed")
Ejemplo n.º 24
0
def check_threshold(outdir, genome, scoring="count"):
    """Return the motif scanning threshold file, creating it if needed.

    Parameters
    ----------
    outdir : str
        Directory in which the threshold file and the background FASTA file
        are stored.
    genome : str
        Genome name; used in the file names and to locate the genome index.
    scoring : str, optional
        Only ``"count"`` needs a threshold.

    Returns
    -------
    str or None
        ``<outdir>/threshold.<genome>.txt`` for ``scoring == "count"``,
        otherwise ``None``.

    Raises
    ------
    OSError
        If the threshold file cannot be opened or the ``gimme`` executable
        cannot be started.
    """
    # gimme_motifs config, to get defaults
    config = MotifConfig()

    if scoring != "count":
        return None

    # Motif scanning threshold
    threshold_file = os.path.join(outdir, "threshold.{}.txt".format(genome))
    if not os.path.exists(threshold_file):
        # Random sequences from genome
        index_dir = os.path.join(config.get_index_dir(), genome)
        bg_file = os.path.join(outdir, "background.{}.fa".format(genome))
        if not os.path.exists(bg_file):
            m = RandomGenomicFasta(index_dir, BG_LENGTH, BG_NUMBER)
            m.writefasta(bg_file)

        pwmfile = config.get_default_params().get("motif_db")
        pwmfile = os.path.join(config.get_motif_dir(), pwmfile)

        # An argument list with an explicit stdout redirect avoids the shell,
        # so paths with spaces or shell metacharacters are passed literally.
        cmd = ["gimme", "threshold", pwmfile, bg_file, "{}".format(FDR)]
        try:
            with open(threshold_file, "w") as out:
                sp.call(cmd, stdout=out)
        except OSError:
            # Do not leave an empty file behind; it would be taken for a
            # valid threshold file on the next call.
            if os.path.exists(threshold_file):
                os.remove(threshold_file)
            raise
    return threshold_file
Ejemplo n.º 25
0
def create_background_file(outfile,
                           bg_type,
                           fmt="fasta",
                           size=None,
                           genome=None,
                           inputfile=None,
                           number=10000):
    """
    Create a background file for motif analysis.

    Invalid argument combinations are reported on stdout and end the
    process with exit status 1.

    Parameters
    ----------
    outfile : str
        Name of the output file.
    bg_type : str
        Type of background (gc, genomic, random or promoter).
    fmt : str, optional
        Either 'fasta' or 'bed'. 'fa' and 'fsa' are accepted as 'fasta'.
    size : int, optional
        Size of the generated sequences, is determined from the inputfile if not
        given.
    genome : str, optional
        Genome name; required for gc, genomic and promoter backgrounds in
        FASTA format.
    inputfile : str, optional
        Input file; required for the gc background.
    number : int, optional
        Number of sequences. If None, the number of sequences in inputfile
        is used, or 10000 when there is no inputfile.
    """
    fmt = fmt.lower()
    if fmt in ["fa", "fsa"]:
        fmt = "fasta"

    if bg_type not in BG_TYPES:
        print("The argument 'type' should be one of: %s" %
              (",".join(BG_TYPES)))
        sys.exit(1)

    if fmt == "bed" and bg_type == "random":
        print("Random background can only be generated in FASTA format!")
        sys.exit(1)

    if bg_type == "gc" and not inputfile:
        print("need a FASTA formatted input file for background gc")
        sys.exit(1)

    # GimmeMotifs configuration for file and directory locations
    config = MotifConfig()

    # Genome index location for creation of FASTA files
    if bg_type in ["gc", "genomic", "promoter"] and fmt == "fasta":
        if genome is None:
            print("Need a genome to create background file")
            sys.exit(1)
        # The instance is discarded; presumably this only checks that the
        # genome can be loaded -- TODO confirm.
        Genome(genome)

    if bg_type in ["promoter"]:
        # Gene definition: the annotation next to the genome FASTA if it
        # exists, otherwise the BED file from the configured gene directory.
        fname = Genome(genome).filename
        gene_file = fname.replace(".fa", ".annotation.bed.gz")
        if not os.path.exists(gene_file):
            gene_file = os.path.join(config.get_gene_dir(),
                                     "{}.bed".format(genome))

        if not os.path.exists(gene_file):
            print("Could not find a gene file for genome {}".format(genome))
            print("Did you use the --annotation flag for genomepy?")
            print(
                "Alternatively make sure there is a file called {}.bed in {}".
                format(genome, config.get_gene_dir()))
            sys.exit(1)

    # Number of sequences
    if number is None:
        if inputfile:
            number = number_of_seqs_in_file(inputfile)
            logger.info("Using %s of background sequences based on input file",
                        number)
        else:
            number = 10000
            logger.info(
                "Number of background sequences not specified, using 10,000 sequences"
            )

    if bg_type == "random":
        f = Fasta(inputfile)
        m = MarkovFasta(f, n=number, k=1)
        m.writefasta(outfile)
    elif bg_type == "gc":
        if fmt == "fasta":
            m = MatchedGcFasta(inputfile, genome, number=number, size=size)
            m.writefasta(outfile)
        else:
            matched_gc_bedfile(outfile, inputfile, genome, number, size=size)
    else:
        if size is None:
            # Default size: median sequence length of the input
            size = np.median(
                [len(seq) for seq in as_fasta(inputfile, genome=genome).seqs])
        if bg_type == "promoter":
            if fmt == "fasta":
                m = PromoterFasta(gene_file, genome, size=size, n=number)
                m.writefasta(outfile)
            else:
                create_promoter_bedfile(outfile, gene_file, size, number)
        elif bg_type == "genomic":
            if fmt == "fasta":
                m = RandomGenomicFasta(genome, size, number)
                m.writefasta(outfile)
            else:
                create_random_genomic_bedfile(outfile, genome, size, number)
Ejemplo n.º 26
0
def cluster(args):
    """Cluster the motifs of a PWM file and write an HTML report.

    Parameters
    ----------
    args : object
        Parsed command-line arguments. The attributes read here are
        ``outdir`` (output directory, created when missing), ``inputfile``
        (PWM file with the motifs to cluster) and ``threshold`` (passed on
        to ``cluster_motifs``).

    Notes
    -----
    The following files are written to ``args.outdir``: one PNG per cluster
    motif and per member motif, ``cluster_report.html``,
    ``cluster_key.txt`` and ``clustered_motifs.pwm``.
    """
    outdir = os.path.abspath(args.outdir)
    if not os.path.exists(outdir):
        os.mkdir(outdir)

    trim_ic = 0.2
    motifs = pwmfile_to_motifs(args.inputfile)
    if len(motifs) == 1:
        # Nothing to cluster: the single motif is its own cluster.
        clusters = [[motifs[0], motifs]]
    else:
        tree = cluster_motifs(
            args.inputfile,
            "total",
            "wic",
            "mean",
            True,
            threshold=args.threshold,
            include_bg=True,
        )
        clusters = tree.getResult()

    ids = []
    mc = MotifComparer()

    sys.stderr.write("Creating images\n")
    for cluster_motif, members in clusters:
        cluster_motif.trim(trim_ic)
        cluster_img = "%s.png" % cluster_motif.id
        cluster_motif.to_img(os.path.join(outdir, cluster_img), format="PNG")
        ids.append([cluster_motif.id, {"src": cluster_img}, []])

        if len(members) > 1:
            scores = {}
            for motif in members:
                scores[motif] = mc.compare_motifs(
                    cluster_motif, motif, "total", "wic", "mean", pval=True
                )
            # Smallest match position over all members; used as the common
            # reference for the left padding of every member image.
            # (min() replaces the Python-2-only sorted(cmp=...) idiom.)
            add_pos = min(result[1] for result in scores.values())
            for motif in members:
                _score, pos, strand = scores[motif]
                add = pos - add_pos

                if strand not in [1, "+"]:
                    # Member matches on the other strand: draw its reverse
                    # complement, but keep the original id for the filename.
                    rc = motif.rc()
                    rc.id = motif.id
                    motif = rc
                motif.to_img(
                    os.path.join(outdir, "%s.png" % motif.id.replace(" ", "_")),
                    format="PNG",
                    add_left=add,
                )

        ids[-1][2] = [
            {
                "src": "%s.png" % motif.id.replace(" ", "_"),
                "alt": motif.id.replace(" ", "_"),
            }
            for motif in members
        ]

    config = MotifConfig()
    env = jinja2.Environment(
        loader=jinja2.FileSystemLoader([config.get_template_dir()])
    )
    template = env.get_template("cluster_template.jinja.html")
    result = template.render(motifs=ids)

    # The report is encoded explicitly, so the file must be opened in binary
    # mode; writing bytes to a text-mode handle fails on Python 3.
    with open(os.path.join(outdir, "cluster_report.html"), "wb") as f:
        f.write(result.encode("utf-8"))

    with open(os.path.join(outdir, "cluster_key.txt"), "w") as f:
        for entry in ids:
            f.write("%s\t%s\n" % (entry[0], ",".join([x["alt"] for x in entry[2]])))

    with open(os.path.join(outdir, "clustered_motifs.pwm"), "w") as f:
        if len(clusters) == 1 and len(clusters[0][1]) == 1:
            f.write("%s\n" % clusters[0][0].to_pwm())
        else:
            for motif in tree.get_clustered_motifs():
                f.write("%s\n" % motif.to_pwm())
Ejemplo n.º 27
0
def pp_predict_motifs(fastafile,
                      outfile,
                      analysis="small",
                      organism="hg18",
                      single=False,
                      background="",
                      tools=None,
                      job_server=None,
                      ncpus=8,
                      max_time=None,
                      stats_fg=None,
                      stats_bg=None):
    """Parallel prediction of motifs.

    Utility function for gimmemotifs.denovo.gimme_motifs. Probably better to
    use that, instead of this function directly.

    Parameters
    ----------
    fastafile : str
        Name of the FASTA input file passed to every tool.

    outfile : str
        Passed to ``PredictionResult`` as its output file.

    analysis : str, optional
        One of "xs", "small", "medium", "large" or "xl". Determines the
        range of motif widths that is tried for width-based tools. "xs" is
        submitted to the tools as "small", but keeps its own maximum width.

    organism : str, optional
        Forwarded to the tools in the parameter dictionary.

    single : bool, optional
        Forwarded to the tools in the parameter dictionary.

    background : str, optional
        Forwarded to the tools in the parameter dictionary.

    tools : dict, optional
        Maps tool names to a truthy value for every tool that should run.
        When empty or None, the "tools" entry of the default configuration
        is used and all tools listed there are enabled.

    job_server : optional
        Object with an ``apply_async`` method (used like a
        ``multiprocessing`` pool). Falls back to the module-level ``pool``.

    ncpus : int, optional
        Not used by this function.

    max_time : optional
        Not used by this function; all jobs are awaited without a time limit.

    stats_fg : optional
        Passed to ``PredictionResult`` as ``fg_file``.

    stats_bg : optional
        Passed to ``PredictionResult`` as ``background``.

    Returns
    -------
    result : PredictionResult
        The result object that collected the motifs of all finished jobs.
    """
    config = MotifConfig()

    if not tools:
        # get_default_params is a method; it has to be called before the
        # "tools" entry can be looked up.
        default_tools = config.get_default_params()["tools"]
        tools = {name: 1 for name in default_tools.split(",")}

    wmin = 5
    step = 1
    if analysis in ["large", "xl"]:
        step = 2
        wmin = 6

    analysis_max = {"xs": 5, "small": 8, "medium": 10, "large": 14, "xl": 20}
    # Looked up before "xs" is renamed, so "xs" keeps its own maximum width.
    wmax = analysis_max[analysis]

    if analysis == "xs":
        sys.stderr.write("Setting analysis xs to small\n")
        analysis = "small"

    if not job_server:
        job_server = pool

    jobs = {}

    result = PredictionResult(
        outfile,
        fg_file=stats_fg,
        background=stats_bg,
        job_server=job_server,
    )

    # Dynamically load all tools: instantiate every MotifProgram subclass
    # found in tool_classes, except the base class itself.
    toolio = [
        x[1]() for x in inspect.getmembers(
            tool_classes, lambda x: inspect.isclass(x) and issubclass(
                x, tool_classes.MotifProgram)) if x[0] != 'MotifProgram'
    ]

    # TODO:
    # Add warnings for running time: Weeder, GADEM

    # Parameters shared by all jobs.
    params = {
        'analysis': analysis,
        'background': background,
        "single": single,
        "organism": organism
    }

    # Tools that don't use a specified width usually take longer
    # ie. GADEM, XXmotif, MEME
    # Start these first.
    for t in [tool for tool in toolio if not tool.use_width]:
        if t.name in tools and tools[t.name]:
            logger.debug("Starting %s job", t.name)
            job_name = t.name
            jobs[job_name] = job_server.apply_async(
                _run_tool, (job_name, t, fastafile, params),
                callback=result.add_motifs)
        else:
            logger.debug("Skipping %s", t.name)

    # Width-based tools get one job per width in the selected range.
    for t in [tool for tool in toolio if tool.use_width]:
        if t.name in tools and tools[t.name]:
            for i in range(wmin, wmax + 1, step):
                logger.debug("Starting %s job, width %s", t.name, i)
                job_name = "%s_width_%s" % (t.name, i)
                my_params = params.copy()
                my_params['width'] = i
                jobs[job_name] = job_server.apply_async(
                    _run_tool, (job_name, t, fastafile, my_params),
                    callback=result.add_motifs)
        else:
            logger.debug("Skipping %s", t.name)

    logger.info("all jobs submitted")
    # Block until every job has finished. No time limit is applied here, so
    # max_time currently has no effect.
    for job in jobs.values():
        job.get()

    result.wait_for_stats()

    return result
Ejemplo n.º 28
0
class MotifProgram(object):
    """Motif program base class.

    The methods below read ``self.name`` and ``self.default_params`` and call
    ``self._run_program``; none of these are defined on this base class, so
    they are expected to be provided by subclasses.
    """

    # Shared configuration object, created once when the class is defined.
    config = MotifConfig()
    # When set, overrides the configured executable returned by bin().
    local_bin = None

    def _parse_params(self, params=None, needs_background=False):
        """
        Parse parameters.

        Combine default and user-defined parameters.

        Parameters
        ----------
        params : dict, optional
            User-defined parameters; these override the defaults.

        needs_background : bool, optional
            If True, a missing "background" parameter is an error.

        Returns
        -------
        prm : dict
            Copy of the default parameters updated with ``params``.

        Raises
        ------
        ValueError
            If ``needs_background`` is True and no background is specified.
        """
        prm = self.default_params.copy()
        if params is not None:
            prm.update(params)

        # Background file is essential!
        if "background" in prm:
            # Absolute path, just to be sure
            prm["background"] = os.path.abspath(prm["background"])
        elif needs_background:
            raise ValueError("Background file needed!")

        return prm

    def _read_and_label_motifs(self, outfile, stdout, stderr, fmt="meme"):
        """Read output motifs and label with program name.

        Parameters
        ----------
        outfile : str
            Name of the motif file produced by the tool.

        stdout : str
            Standard out collected so far.

        stderr : str
            Standard error collected so far.

        fmt : str, optional
            Format of ``outfile``, passed on to ``read_motifs``.

        Returns
        -------
        motifs : list
            Motifs with the program name prefixed to their id. Empty if
            ``outfile`` does not exist.

        stdout : str
            ``stdout``, with a notice appended if ``outfile`` is missing.

        stderr : str
            ``stderr``, with a notice appended if ``outfile`` is missing.
        """
        if not os.path.exists(outfile):
            stdout += "\nMotif file {0} not found!\n".format(outfile)
            stderr += "\nMotif file {0} not found!\n".format(outfile)
            return [], stdout, stderr

        # Honour the requested format instead of always parsing as MEME.
        motifs = read_motifs(outfile, fmt=fmt)
        for m in motifs:
            m.id = "{0}_{1}".format(self.name, m.id)
        return motifs, stdout, stderr

    def bin(self):
        """
        Get the command used to run the tool.

        Returns
        -------
        command : str
            The tool system command.
        """
        if self.local_bin:
            return self.local_bin
        else:
            return self.config.bin(self.name)

    def dir(self):
        """
        Get the installation directory of the tool.

        Returns
        -------
        dir : str
            The tool directory.
        """
        return self.config.dir(self.name)

    def is_configured(self):
        """
        Check if the tool is configured.

        Returns
        -------
        is_configured : bool
            True if the tool is configured.
        """
        return self.config.is_configured(self.name)

    def is_installed(self):
        """
        Check if the tool is installed.

        Returns
        -------
        is_installed : bool
            True if the tool is installed.
        """
        return self.is_configured() and os.access(self.bin(), os.X_OK)

    def run(self, fastafile, params=None, tmp=None):
        """
        Run the tool and predict motifs from a FASTA file.

        Parameters
        ----------
        fastafile : str
            Name of the FASTA input file.

        params : dict, optional
            Optional parameters. For some of the tools required parameters
            are passed using this dictionary.

        tmp : str, optional
            Directory to use for creation of temporary files.

        Returns
        -------
        motifs : list of Motif instances
            The predicted motifs.

        stdout : str
            Standard out of the tool.

        stderr : str
            Standard error of the tool.

        Raises
        ------
        ValueError
            If the tool is not configured, or not installed.
        """
        if not self.is_configured():
            raise ValueError("%s is not configured" % self.name)

        if not self.is_installed():
            raise ValueError(
                "%s is not installed or not correctly configured" % self.name)

        # Per-run scratch directory; stored on the instance (presumably for
        # use by _run_program in subclasses).
        self.tmpdir = mkdtemp(prefix="{0}.".format(self.name), dir=tmp)
        fastafile = os.path.abspath(fastafile)

        try:
            return self._run_program(self.bin(), fastafile, params)
        except KeyboardInterrupt:
            # An interrupt is reported as an empty result, not propagated.
            return ([], "Killed", "Killed")
Ejemplo n.º 29
0
def scan_to_table(
    input_table, genome, scoring, pfmfile=None, ncpus=None, zscore=True, gc=True
):
    """Scan the regions of an input table with motifs.

    Parameters
    ----------
    input_table : str
        Filename of the input table. A name ending in "feather" is read as a
        feather file and its first column supplies the regions; anything
        else is read as a delimited text table whose first column is the
        index.

    genome : str
        Genome name. Can be either the name of a FASTA-formatted file or a
        genomepy genome name.

    scoring : str
        "count" to create a table of motif counts; any other value creates
        a table of best scores.

    pfmfile : str, optional
        PFM file used for scanning. Defaults to the "motif_db" entry of the
        configuration, located in the configured motif directory.

    ncpus : int, optional
        If defined this specifies the number of cores to use.

    zscore : bool, optional
        Passed to the scanner when computing best scores.

    gc : bool, optional
        Passed to the scanner for the background and for best scores.

    Returns
    -------
    table : pandas.DataFrame
        DataFrame with motif ids as column names and regions as index.
        Values are either counts or scores depending on ``scoring``.

    Raises
    ------
    ValueError
        If no ``pfmfile`` is given and no default database is configured.
    """
    config = MotifConfig()

    if pfmfile is None:
        default_db = config.get_default_params().get("motif_db", None)
        if default_db is None:
            raise ValueError("no pfmfile given and no default database specified")
        pfmfile = os.path.join(config.get_motif_dir(), default_db)

    logger.info("reading table")
    if input_table.endswith("feather"):
        table = pd.read_feather(input_table)
        row_ids = table.iloc[:, 0].values
    else:
        table = pd.read_table(input_table, index_col=0, comment="#")
        row_ids = table.index

    regions = list(row_ids)
    # Estimate the region size from at most 1000 randomly chosen regions.
    check_regions = (
        np.random.choice(regions, size=1000, replace=False)
        if len(regions) >= 1000
        else regions
    )
    seq_lengths = [
        len(seq) for seq in as_fasta(check_regions, genome=genome).seqs
    ]
    size = int(np.median(seq_lengths))

    scanner = Scanner(ncpus=ncpus)
    scanner.set_motifs(pfmfile)
    scanner.set_genome(genome)
    scanner.set_background(genome=genome, gc=gc, size=size)

    if scoring == "count":
        logger.info("setting threshold")
        scanner.set_threshold(fpr=FPR)
        logger.info("creating count table")
        scores = list(scanner.count(regions))
        logger.info("done")
    else:
        scanner.set_threshold(threshold=0.0)
        if not zscore:
            detail = "logodds"
        elif gc:
            detail = "z-score, GC%"
        else:
            detail = "z-score"
        logger.info("creating score table (" + detail + ")")
        scores = list(scanner.best_score(regions, zscore=zscore, gc=gc))
        logger.info("done")

    motif_names = [motif.id for motif in read_motifs(pfmfile)]
    logger.info("creating dataframe")
    return pd.DataFrame(scores, index=row_ids, columns=motif_names)
Ejemplo n.º 30
0
def create_background(
    bg_type,
    fafile,
    outfile,
    genome="hg18",
    size=200,
    nr_times=10,
    custom_background=None,
):
    """Create background of a specific type.

    Parameters
    ----------
    bg_type : str
        Name of background type: "random", "genomic", "gc", "promoter" or
        "custom".

    fafile : str
        Name of input FASTA file.

    outfile : str
        Name of output FASTA file.

    genome : str, optional
        Genome name.

    size : int, optional
        Size of regions.

    nr_times : int, optional
        Generate this times as many background sequences as compared to
        input file.

    custom_background : str, optional
        Name of an existing FASTA file, required when ``bg_type`` is
        "custom". Its sequences are written to ``outfile``.

    Returns
    -------
    nr_seqs  : int
        Number of sequences created.

    Raises
    ------
    ValueError
        If ``bg_type`` is unknown, or if no gene file can be found for a
        "promoter" background.
    IOError
        If a "custom" background is requested but the file is not specified
        or does not exist.
    """
    size = int(size)
    config = MotifConfig()
    fg = Fasta(fafile)

    if bg_type in ["genomic", "gc"]:
        if not genome:
            logger.error("Need a genome to create background")
            sys.exit(1)

    if bg_type == "random":
        f = MarkovFasta(fg, k=1, n=nr_times * len(fg))
        logger.debug("Random background: %s", outfile)
    elif bg_type == "genomic":
        logger.debug("Creating genomic background")
        f = RandomGenomicFasta(genome, size, nr_times * len(fg))
    elif bg_type == "gc":
        logger.debug("Creating GC matched background")
        f = MatchedGcFasta(fafile, genome, nr_times * len(fg))
        logger.debug("GC matched background: %s", outfile)
    elif bg_type == "promoter":
        fname = Genome(genome).filename
        # Prefer the annotation next to the genome FASTA; fall back to
        # <genome>.bed in the configured gene directory when it is missing.
        gene_file = fname.replace(".fa", ".annotation.bed.gz")
        if not os.path.exists(gene_file):
            gene_file = os.path.join(config.get_gene_dir(), "%s.bed" % genome)
        if not os.path.exists(gene_file):
            msg = "Could not find a gene file for genome {}".format(genome)
            print(msg)
            print("Did you use the --annotation flag for genomepy?")
            print(
                "Alternatively make sure there is a file called {}.bed in {}".
                format(genome, config.get_gene_dir()))
            raise ValueError(msg)

        logger.info(
            "Creating random promoter background (%s, using genes in %s)",
            genome,
            gene_file,
        )
        f = PromoterFasta(gene_file, genome, size, nr_times * len(fg))
        logger.debug("Random promoter background: %s", outfile)
    elif bg_type == "custom":
        bg_file = custom_background
        if not bg_file:
            raise IOError("Background file not specified!")

        if not os.path.exists(bg_file):
            # Exceptions do not interpolate logging-style arguments, so the
            # message is formatted explicitly.
            raise IOError(
                "Custom background file %s does not exist!" % bg_file)

        logger.info("Copying custom background file %s to %s.", bg_file,
                    outfile)
        f = Fasta(bg_file)
        median_length = np.median([len(seq) for seq in f.seqs])
        # Warn when the median sequence length deviates more than 5% from
        # the requested size.
        if median_length < (size * 0.95) or median_length > (size * 1.05):
            logger.warning(
                "The custom background file %s contains sequences with a "
                "median size of %s, while GimmeMotifs predicts motifs in sequences "
                "of size %s. This will influence the statistics! It is recommended "
                "to use background sequences of the same size.",
                bg_file,
                median_length,
                size,
            )
    else:
        # Without this, an unknown type would only fail later with an
        # unbound local variable.
        raise ValueError("Unknown background type: {}".format(bg_type))

    f.writefasta(outfile)
    return len(f)