Example #1
0
def start_cluster(cluster_config):
    """Start the cluster described by ``cluster_config`` and prepare logging.

    A ``Cluster`` is built from ``cluster_config["cluster"]`` and started.
    The function then polls ``cluster.is_up()`` every ``cluster.delay``
    seconds. When the accumulated wait exceeds the configured ``timeout``
    (``DEFAULT_CLUSTER_TIMEOUT`` if unset) the cluster is stopped and the
    process exits with status -1.

    On success the module-level globals ``cluster``, ``client``, ``view``
    and ``direct_view`` are set. A shallow copy of ``cluster_config`` with
    ``engine_log`` set to True is stored under ``direct_view['config']``
    and ``setup_logging(config)`` is executed through the direct view.
    """
    global cluster, view, client, direct_view
    settings = cluster_config["cluster"]
    cluster = Cluster(**settings)
    logger.info("Starting the cluster with %d nodes." % (cluster.n))
    cluster.start()
    sleep(cluster.delay)

    # Poll until the cluster reports itself up; give up after the timeout.
    waited = 0
    while True:
        if cluster.is_up():
            break
        sleep(cluster.delay)
        waited += cluster.delay
        if waited > settings.get("timeout", DEFAULT_CLUSTER_TIMEOUT):
            logger.error("Cluster startup timed out.")
            cluster.stop()
            exit(-1)

    logger.info("Cluster up.")
    client = cluster.client()
    view = cluster.view()
    direct_view = cluster.direct_view()

    # Hand the engines a copy of the config flagged for engine-side logging.
    engine_config = cluster_config.copy()
    engine_config["engine_log"] = True
    direct_view['config'] = engine_config
    for statement in ('from bipy.log import setup_logging',
                      'setup_logging(config)'):
        direct_view.execute(statement)
Example #2
0
def _get_gtf(config):
    """Return the GTF path stored under ``config["annotation"]["file"]``.

    Logs an error and exits with status 1 when the entry is missing, empty
    or ``file_exists`` rejects it.
    """
    gtf = config["annotation"].get("file")
    if not (gtf and file_exists(gtf)):
        logger.error("genebody_coverage needs a GTF file passed to it.")
        exit(1)
    return gtf
Example #3
0
def start_cluster(cluster_config):
    """Bring up the cluster from ``cluster_config`` and configure engine logging.

    Creates a ``Cluster`` from ``cluster_config["cluster"]``, starts it and
    waits in steps of ``cluster.delay`` seconds until ``cluster.is_up()``
    is true. If the total wait exceeds the ``timeout`` entry of the cluster
    settings (``DEFAULT_CLUSTER_TIMEOUT`` when absent), the cluster is
    stopped and the process exits with status -1.

    Sets the module-level globals ``cluster``, ``client``, ``view`` and
    ``direct_view``. A shallow copy of ``cluster_config`` with
    ``engine_log`` set to True is stored under ``direct_view["config"]``
    before ``setup_logging(config)`` is executed through the direct view.
    """
    global cluster, view, client, direct_view
    cluster_settings = cluster_config["cluster"]
    cluster = Cluster(**cluster_settings)
    logger.info("Starting the cluster with %d nodes." % (cluster.n))
    cluster.start()
    sleep(cluster.delay)

    # Wait for the whole cluster, bailing out once the timeout is exceeded.
    elapsed = 0
    while True:
        if cluster.is_up():
            break
        sleep(cluster.delay)
        elapsed += cluster.delay
        limit = cluster_settings.get("timeout", DEFAULT_CLUSTER_TIMEOUT)
        if elapsed > limit:
            logger.error("Cluster startup timed out.")
            cluster.stop()
            exit(-1)

    logger.info("Cluster up.")
    client = cluster.client()
    view = cluster.view()
    direct_view = cluster.direct_view()

    # Engines get their own copy of the config with engine logging enabled.
    engine_config = cluster_config.copy()
    engine_config["engine_log"] = True
    direct_view["config"] = engine_config
    for statement in ("from bipy.log import setup_logging",
                      "setup_logging(config)"):
        direct_view.execute(statement)
def _make_current_files(curr_files):
    """Check every path in ``curr_files`` with ``file_exists``.

    Logs an error and exits with status 1 at the first path that fails the
    check; otherwise returns ``curr_files`` unchanged.
    """
    for path in curr_files:
        if file_exists(path):
            continue
        logger.error("%s does not exist or is size 0. Aborting."
                     % (path))
        exit(1)
    return curr_files
Example #5
0
def _make_current_files(curr_files):
    """Verify that each entry of ``curr_files`` passes ``file_exists``.

    The first entry that fails is reported through the logger and the
    process exits with status 1. When all entries pass, the input is
    returned as-is.
    """
    for candidate in curr_files:
        if file_exists(candidate):
            continue
        logger.error("%s does not exist or is size 0. Aborting." %
                     (candidate))
        exit(1)
    return curr_files
Example #6
0
 def _add_entry(d, v):
     """Append path ``v`` to the "PbN" or "Pb" list of ``d`` and return ``d``.

     The group is chosen from the part of the basename before the first
     ``delimiter``. "PbN" is tested before "Pb" because the latter is a
     substring of the former. If neither matches, an error is logged and
     the process exits with status -1.
     """
     prefix = os.path.basename(v).split(delimiter)[0]
     for label in ("PbN", "Pb"):
         if label in prefix:
             d[label] = d.get(label, []) + [v]
             return d
     logger.error("Error grouping by cell type")
     exit(-1)
     return d
Example #7
0
 def _add_entry(d, v):
     """Group file ``v`` under "PbN" or "Pb" in dict ``d``; return ``d``.

     The basename of ``v`` is split on ``delimiter`` and the first piece
     decides the group. "PbN" must be checked first since "Pb" would match
     it too. An unmatched name logs an error and exits with status -1.
     """
     name_head = os.path.basename(v).split(delimiter)[0]
     for cell_type in ("PbN", "Pb"):
         if cell_type in name_head:
             d[cell_type] = d.get(cell_type, []) + [v]
             return d
     logger.error("Error grouping by cell type")
     exit(-1)
     return d
Example #8
0
    def _cut_file(self, in_file):
        """
        run cutadapt on a single file

        Returns the path given by ``self.in2trimmed(in_file)``. If that file
        already exists, cutadapt is not run again.

        The quality format comes from ``self.quality_format`` or, when that
        is unset, from ``self._detect_fastq_format(in_file)``. "sanger" maps
        to a quality base of 33 and "illumina" to 64; any other value is
        logged as an error and the process exits with status 1.

        When the stage config's ``trim_polya`` is true (the default),
        adapters are trimmed into a temporary fastq in ``self.out_dir`` and
        the polyA sequence and its ``self._rc_adapters`` counterpart are
        trimmed in a second pass.
        """
        adapters = self._get_adapters(self.chemistry)
        out_file = self.in2trimmed(in_file)
        if file_exists(out_file):
            return out_file
        cutadapt = sh.Command(self.stage_config.get("program", "cutadapt"))

        quality_format = self.quality_format
        if not quality_format:
            quality_format = self._detect_fastq_format(in_file)
        if quality_format == "sanger":
            logger.info("Quality format detected as sanger.")
            quality_base = 33
        elif quality_format == "illumina":
            logger.info("Quality format set to illumina 1.5/1.3")
            quality_base = 64
        else:
            # Fill in the %s placeholder so the offending value is reported.
            logger.error("Quality format could not be detected. Quality "
                         "Detected or set as %s. It should be illumina "
                         "or sanger." % (quality_format))
            exit(1)

        # if we want to trim the polya tails we have to first remove
        # the adapters and then trim the tail
        if self.stage_config.get("trim_polya", True):
            # The context manager removes the intermediate file as soon as
            # the second pass is done, including when cutadapt raises.
            with tempfile.NamedTemporaryFile(suffix=".fastq",
                                             dir=self.out_dir) as temp_cut:
                # trim off adapters
                cutadapt(in_file,
                         self.options,
                         adapters,
                         quality_base=quality_base,
                         _out=temp_cut.name)
                with file_transaction(out_file) as temp_out:
                    polya = ADAPTERS.get("polya")
                    # trim off polya
                    cutadapt(temp_cut.name,
                             self.options,
                             "-a",
                             polya,
                             "-a",
                             self._rc_adapters(polya),
                             quality_base=quality_base,
                             _out=temp_out)
            return out_file
        else:
            # NOTE(review): quality_base is not passed on this path, unlike
            # the polyA path above -- confirm whether that is intentional.
            with file_transaction(out_file) as temp_out:
                cutadapt(in_file, self.options, adapters, _out=temp_out)
            return out_file
Example #9
0
def _fetch_chrom_sizes(config):
    """Return the path of the chromosome sizes file for the configured genome.

    The genome name is read from ``config["annotation"]["name"]`` and the
    file is placed at ``<_results_dir(config)>/<genome>.sizes``. An existing
    file is reused; otherwise ``fetchChromSizes`` is run to create it.

    Logs an error and exits with status 1 when the ``fetchChromSizes``
    program is unavailable, when ``annotation`` or its ``name`` entry is
    missing from the config, or when the file is still absent after the
    download attempt.
    """
    tool = "fetchChromSizes"

    if not program_exists(tool):
        logger.error("%s is not in the path or is not executable. Make sure "
                     "it is installed or go to "
                     "http://hgdownload.cse.ucsc.edu/admin/exe/"
                     "to download it." % (tool))
        exit(1)

    if "annotation" not in config:
        logger.error("'annotation' must be in the yaml file. See example "
                     " configuration files")
        exit(1)
    annotation = config["annotation"]
    if "name" not in annotation:
        logger.error("'name' must be in the yaml file under  "
                     " 'annotation'. See example configuration files.")
        exit(1)

    genome = annotation["name"]
    sizes_path = os.path.join(_results_dir(config), genome + ".sizes")
    if not file_exists(sizes_path):
        with file_transaction(sizes_path) as tx_sizes_path:
            sh.fetchChromSizes(genome, _out=tx_sizes_path)

        # The tool may finish without producing a usable file.
        if not file_exists(sizes_path):
            logger.error("chromosome size file does not exist. Check "
                         "'annotation': 'name' to make sure it is valid.")
            exit(1)
    return sizes_path
Example #10
0
def _validate_config(in_file, stage_config, config):
    """ validates that a set of assumptions about the config file
    needed to run the program are true

    Exits with status 1 when ``ref`` is missing from ``config``. The
    remaining checks (reference fasta at ``config["ref"] + ".fa"``,
    ``in_file``, ``config["gtf"]`` and ``stage_config["program"]``) only log
    an error when ``file_exists`` rejects the path.
    """
    if "ref" not in config:
        logger.error("ref: must appear in the config file")
        exit(1)
    # Report the path that was actually checked; the config is not known to
    # have a "ref_fasta" key, so formatting with it could raise KeyError.
    ref_fasta = config["ref"] + ".fa"
    if not file_exists(ref_fasta):
        logger.error("%s not found, aborting." % (ref_fasta))
    # NOTE(review): the messages below say "aborting" but nothing exits
    # here -- confirm whether these checks are meant to be fatal.
    if not file_exists(in_file):
        logger.error("%s not found, aborting." % (in_file))
    if not file_exists(config["gtf"]):
        logger.error("%s not found, aborting." % (config["gtf"]))
    if not file_exists(stage_config["program"]):
        logger.error("%s not found, aborting." % (stage_config["program"]))
Example #11
0
    def _cut_file(self, in_file):
        """
        run cutadapt on a single file

        Returns the path given by ``self.in2trimmed(in_file)``; when that
        file already exists, cutadapt is skipped.

        The quality format is ``self.quality_format`` or, if unset, the
        result of ``self._detect_fastq_format(in_file)``. "sanger" selects a
        quality base of 33 and "illumina" selects 64. Anything else is
        logged as an error and the process exits with status 1.

        With ``trim_polya`` enabled in the stage config (the default),
        adapters are first trimmed into a temporary fastq inside
        ``self.out_dir``, then the polyA sequence and its
        ``self._rc_adapters`` counterpart are trimmed in a second pass.
        """
        adapters = self._get_adapters(self.chemistry)
        out_file = self.in2trimmed(in_file)
        if file_exists(out_file):
            return out_file
        cutadapt = sh.Command(self.stage_config.get("program",
                                                    "cutadapt"))

        quality_format = self.quality_format
        if not quality_format:
            quality_format = self._detect_fastq_format(in_file)
        if quality_format == "sanger":
            logger.info("Quality format detected as sanger.")
            quality_base = 33
        elif quality_format == "illumina":
            logger.info("Quality format set to illumina 1.5/1.3")
            quality_base = 64
        else:
            # Supply the value for %s so the log names the bad format.
            logger.error("Quality format could not be detected. Quality "
                         "Detected or set as %s. It should be illumina "
                         "or sanger." % (quality_format))
            exit(1)

        # if we want to trim the polya tails we have to first remove
        # the adapters and then trim the tail
        if self.stage_config.get("trim_polya", True):
            # Using the temp file as a context manager guarantees the
            # intermediate fastq is removed even if cutadapt fails.
            with tempfile.NamedTemporaryFile(suffix=".fastq",
                                             dir=self.out_dir) as temp_cut:
                # trim off adapters
                cutadapt(in_file, self.options, adapters,
                         quality_base=quality_base,
                         _out=temp_cut.name)
                with file_transaction(out_file) as temp_out:
                    polya = ADAPTERS.get("polya")
                    # trim off polya
                    cutadapt(temp_cut.name, self.options, "-a",
                             polya, "-a", self._rc_adapters(polya),
                             quality_base=quality_base,
                             _out=temp_out)
            return out_file
        else:
            # NOTE(review): quality_base is not forwarded on this path,
            # unlike the polyA path -- confirm whether that is intended.
            with file_transaction(out_file) as temp_out:
                cutadapt(in_file, self.options, adapters,
                         _out=temp_out)
            return out_file
Example #12
0
def run_with_config(input_file, config, stage, out_file=None):
    """Run ``stage`` on ``input_file`` using settings from ``config``.

    Options are read from ``config["stage"][stage]["options"]`` (empty list
    when absent). When ``out_file`` is not given, it is placed in
    ``<config["dir"]["results"]>/<stage>`` with the name produced by
    ``_get_outfilename(input_file)``. The output directory is created with
    ``safe_makedir`` before running.

    Logs an error and exits with status 1 when ``annotation`` is missing
    from ``config``. Returns whatever ``run`` returns.
    """
    stage_config = config["stage"][stage]
    options = stage_config.get("options", [])

    if out_file is None:
        out_dir = os.path.join(config["dir"].get("results", None), stage)
        out_file = os.path.join(out_dir, _get_outfilename(input_file))
    else:
        # A caller-supplied out_file previously left out_dir unbound and
        # raised NameError below; use the file's own directory instead.
        out_dir = os.path.dirname(out_file)

    # A bare filename has no directory component, so nothing to create.
    if out_dir:
        safe_makedir(out_dir)
    if "annotation" not in config:
        logger.error("annotation must appear in the config file, see example "
                     "configuration files.")
        exit(1)
    ref = prepare_ref_file(config["annotation"], config)
    out_file = run(input_file, ref, options, out_file)
    return out_file
Example #13
0
def setup_pipeline(config):
    """
    creates output directories
    and performs some minor validation on the configuration file

    Exits with status -1 when ``dir`` is missing from ``config``. Otherwise
    the config is passed through ``_setup_config``, every value of
    ``config["dir"]`` is handed to ``safe_makedir``, the config is written
    out with ``_write_config`` and the processed config is returned.
    """
    # make initial directories
    if "dir" not in config:
        logger.error("'dir' must be in config file, see example "
                     " configurations.")
        exit(-1)
    config = _setup_config(config)
    # Explicit loop: map() is lazy on Python 3, so using it only for side
    # effects would silently create no directories.
    for directory in config["dir"].values():
        safe_makedir(directory)

    _write_config(config)

    return config
Example #14
0
def setup_pipeline(config):
    """
    creates output directories
    and performs some minor validation on the configuration file

    When ``dir`` is absent from ``config`` an error is logged and the
    process exits with status -1. Otherwise ``_setup_config`` processes the
    config, ``safe_makedir`` is called on each value of ``config["dir"]``,
    ``_write_config`` writes the result and the processed config is
    returned.
    """
    # make initial directories
    if "dir" not in config:
        logger.error("'dir' must be in config file, see example "
                     " configurations.")
        exit(-1)
    config = _setup_config(config)
    # Iterate explicitly: on Python 3 map() returns a lazy iterator, so
    # calling it purely for side effects would never run safe_makedir.
    for dir_path in config["dir"].values():
        safe_makedir(dir_path)

    _write_config(config)

    return config
Example #15
0
def run_with_config(input_file, config, stage, out_file=None):
    """Run ``stage`` on ``input_file`` with options taken from ``config``.

    ``config["stage"][stage]["options"]`` supplies the options (empty list
    when absent). Without an explicit ``out_file`` the result goes to
    ``<config["dir"]["results"]>/<stage>/<_get_outfilename(input_file)>``.
    The output directory is created with ``safe_makedir`` first.

    An error is logged and the process exits with status 1 when
    ``annotation`` is not in ``config``. The return value is that of
    ``run``.
    """
    stage_config = config["stage"][stage]
    options = stage_config.get("options", [])

    if out_file is None:
        out_dir = os.path.join(config["dir"].get("results", None), stage)
        out_file = os.path.join(out_dir, _get_outfilename(input_file))
    else:
        # Previously out_dir was unbound on this path and safe_makedir
        # raised NameError; derive it from the supplied out_file.
        out_dir = os.path.dirname(out_file)

    # Skip directory creation when out_file has no directory component.
    if out_dir:
        safe_makedir(out_dir)
    if "annotation" not in config:
        logger.error("annotation must appear in the config file, see example "
                     "configuration files.")
        exit(1)
    ref = prepare_ref_file(config["annotation"], config)
    out_file = run(input_file, ref, options, out_file)
    return out_file
Example #16
0
def annotate_table_with_biomart(in_file,
                                join_column,
                                filter_type,
                                organism,
                                out_file=None):
    """
    Annotate the table in in_file using the biomaRt R package.

    join_column is the column of in_file the lookups are performed on,
    filter_type describes the type of join_column (see the getBM
    documentation in R for details) and organism is the english name of
    the organism; it must be a key of ORG_TO_ENSEMBL, otherwise an error
    is logged and the process exits with status 1.

    The result is written to out_file, which defaults to
    append_stem(in_file, "annotated"). If out_file already exists it is
    returned without running R.

    example:
    annotate_table_with_biomart(in_file, "id", "ensembl_gene_id",
                                "human")

    """

    if organism not in ORG_TO_ENSEMBL:
        logger.error("organism not supported")
        exit(1)

    logger.info("Annotating %s." % (organism))
    out_file = out_file or append_stem(in_file, "annotated")
    if os.path.exists(out_file):
        return out_file

    # expose the python-side values to the R session under fixed names
    r_session = robjects.r
    ensembl_names = ORG_TO_ENSEMBL[organism]
    for r_name, value in (('join_column', join_column),
                          ('in_file', in_file),
                          ('out_file', out_file)):
        r_session.assign(r_name, value)
    r_session.assign('ensembl_gene', ensembl_names["gene_ensembl"])
    r_session.assign('gene_symbol', ensembl_names["gene_symbol"])
    r_session.assign('filter_type', filter_type)

    # use biomaRt to annotate the data file
    r_session('''
    library(biomaRt)
    ensembl = useMart("ensembl", dataset = ensembl_gene)
    d = read.table(in_file, header=TRUE)
    a = getBM(attributes=c(filter_type,
                gene_symbol, "description"),
                filters=c(filter_type), values=d[,join_column],
                mart=ensembl)
    m = merge(d, a, by.x=join_column, by.y=filter_type)
    write.table(m, out_file, quote=FALSE, row.names=FALSE, sep="\t")
    ''')

    return out_file
Example #17
0
def annotate_table_with_biomart(in_file, join_column,
                                filter_type, organism, out_file=None):
    """
    Add biomaRt annotation columns to the table stored in in_file.

    join_column names the column of in_file used for the lookups,
    filter_type describes the type of join_column (see the getBM
    documentation in R for details) and organism is the english name of
    the organism. An organism that is not a key of ORG_TO_ENSEMBL is
    logged as an error and the process exits with status 1.

    Output goes to out_file, defaulting to
    append_stem(in_file, "annotated"); an existing out_file is returned
    as-is without invoking R.

    example:
    annotate_table_with_biomart(in_file, "id", "ensembl_gene_id",
                                "human")

    """

    if organism not in ORG_TO_ENSEMBL:
        logger.error("organism not supported")
        exit(1)

    logger.info("Annotating %s." % (organism))
    out_file = out_file or append_stem(in_file, "annotated")
    if os.path.exists(out_file):
        return out_file

    # make the python-side values visible to the R script below
    r_env = robjects.r
    org_entry = ORG_TO_ENSEMBL[organism]
    for r_name, value in (('join_column', join_column),
                          ('in_file', in_file),
                          ('out_file', out_file)):
        r_env.assign(r_name, value)
    r_env.assign('ensembl_gene', org_entry["gene_ensembl"])
    r_env.assign('gene_symbol', org_entry["gene_symbol"])
    r_env.assign('filter_type', filter_type)

    # use biomaRt to annotate the data file
    r_env('''
    library(biomaRt)
    ensembl = useMart("ensembl", dataset = ensembl_gene)
    d = read.table(in_file, header=TRUE)
    a = getBM(attributes=c(filter_type,
                gene_symbol, "description"),
                filters=c(filter_type), values=d[,join_column],
                mart=ensembl)
    m = merge(d, a, by.x=join_column, by.y=filter_type)
    write.table(m, out_file, quote=FALSE, row.names=FALSE, sep="\t")
    ''')

    return out_file
Example #18
0
def wig2bigwig(wiggle_file, chrom_size_file, out_file):
    """
    convert wiggle file to bigwig file using the UCSC tool

    Exits with status 1 when ``wigToBigWig`` is not available. If
    ``out_file`` already exists the conversion is skipped. Returns
    ``out_file``.
    """
    tool_name = "wigToBigWig"
    if not program_exists(tool_name):
        logger.error("%s is not in the path or is not executable. Make sure "
                     "it is installed or go to "
                     "http://hgdownload.cse.ucsc.edu/admin/exe/"
                     "to download it." % (tool_name))
        exit(1)

    if not file_exists(out_file):
        convert = sh.Command(which(tool_name))
        with file_transaction(out_file) as tx_path:
            convert(wiggle_file, chrom_size_file, tx_path)
    return out_file
Example #19
0
    def scan(self, plugin_dir=None):
        """Import every ``.py`` file found in the plugin directories.

        ``PluginDirectory`` is always listed. When ``plugin_dir`` is given
        it is added to ``sys.path`` and listed as well. Each module name
        (file name up to the first dot) is imported with
        ``import_(modname, PackagePrefix)`` and the module is passed to
        ``self.scan_module``. A module that fails to import is logged, its
        traceback printed, and skipped.
        """
        files = os.listdir(PluginDirectory)
        if plugin_dir:
            # Avoid piling up duplicate entries when scan() is called again.
            if plugin_dir not in sys.path:
                sys.path.append(plugin_dir)
            files += os.listdir(plugin_dir)

        mods = [fn.split('.')[0] for fn in files if fn.endswith('.py')]
        # build the map of plugins
        for modname in mods:
            try:
                mod = import_(modname, PackagePrefix)
            except Exception:
                # Catch Exception rather than everything so SystemExit and
                # KeyboardInterrupt still propagate.
                logger.error("Error loading plugin: %s" % modname)
                traceback.print_exc()
                continue
            self.scan_module(mod)
Example #20
0
    def scan(self, plugin_dir=None):
        """Find and import plugin modules, handing each to ``scan_module``.

        Files are listed from ``PluginDirectory`` and, when given, from
        ``plugin_dir`` (which is also added to ``sys.path``). Every file
        ending in ``.py`` is imported by the part of its name before the
        first dot via ``import_(modname, PackagePrefix)``. Import failures
        are logged with a printed traceback and the module is skipped.
        """
        files = os.listdir(PluginDirectory)
        if plugin_dir:
            # Repeated scans should not append the same path again.
            if plugin_dir not in sys.path:
                sys.path.append(plugin_dir)
            files += os.listdir(plugin_dir)

        mods = [fn.split('.')[0] for fn in files if fn.endswith('.py')]
        # build the map of plugins
        for modname in mods:
            try:
                mod = import_(modname, PackagePrefix)
            except Exception:
                # Narrower than a bare except: interpreter-exit signals
                # such as KeyboardInterrupt are no longer swallowed.
                logger.error("Error loading plugin: %s" % modname)
                traceback.print_exc()
                continue
            self.scan_module(mod)
Example #21
0
def mappable_function(x):
    """Log one error and one info message, then return ``x`` to the 10th power."""
    logger.error("This is an error.")
    logger.info("This is info.")
    result = pow(x, 10)
    return result
Example #22
0
def mappable_function(x):
    """Emit an error-level and an info-level log line; return ``x`` raised to 10."""
    logger.error("This is an error.")
    logger.info("This is info.")
    tenth_power = pow(x, 10)
    return tenth_power