Example #1
 def _start_message(self, in_file, **kwargs):
     if kwargs:
         logger.info("Starting %s on %s with arguments %s." % (self.stage,
                                                               in_file,
                                                               kwargs))
     else:
         logger.info("Starting %s on %s." % (self.stage, in_file))
Example #2
def start_cluster(cluster_config):
    global cluster, view, client, direct_view
    cluster = Cluster(**cluster_config["cluster"])
    logger.info("Starting the cluster with %d nodes." % (cluster.n))
    cluster.start()
    sleep(cluster.delay)

    # only continue when the cluster is completely up
    slept = 0
    while not cluster.is_up():
        sleep(cluster.delay)
        slept = slept + cluster.delay
        if slept > cluster_config["cluster"].get("timeout", DEFAULT_CLUSTER_TIMEOUT):
            logger.error("Cluster startup timed out.")
            cluster.stop()
            exit(-1)
    # only continue if at least one engine is up

    logger.info("Cluster up.")
    client = cluster.client()
    view = cluster.view()
    direct_view = cluster.direct_view()
    engine_config = cluster_config.copy()
    engine_config["engine_log"] = True
    direct_view["config"] = engine_config
    direct_view.execute("from bipy.log import setup_logging")
    direct_view.execute("setup_logging(config)")
Example #4
def hard_clip(in_file, bases=8, right_side=True, quality_format="sanger", out_file=None):
    """
    hard clip a fastq file by removing N bases from each read
    bases is the number of bases to clip
    right_side is True to trim from the right side, False to trim from
    the left

    example: hard_clip(fastq_file, bases=4, right_side=False)

    """
    if right_side:
        logger.info("Hard clipping %d bases from the right side of "
                    "reads in %s." % (bases, in_file))
    else:
        logger.info("Hard clipping %d bases from the left side of "
                    "reads in %s." % (bases, in_file))

    quality_type = QUALITY_TYPE_HARD_TRIM[quality_format]
    if not out_file:
        out_file = append_stem(in_file, "clip")
    if file_exists(out_file):
        return out_file
    in_iterator = SeqIO.parse(in_file, quality_type)

    out_iterator = (_trim_read(record, bases, right_side) for
                    record in in_iterator)
    with file_transaction(out_file) as tmp_out_file:
        with open(tmp_out_file, "w") as out_handle:
            SeqIO.write(out_iterator, out_handle, quality_type)
    return out_file
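
For reference, a call matching the corrected docstring might look like the following; the file name is hypothetical.

# hypothetical call: clip 4 bases from the left side of each read
clipped = hard_clip("sample.fastq", bases=4, right_side=False)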
Example #5
def genebody_coverage2(in_file, config, out_prefix=None):
    """
    used to check the 5'/3' bias across transcripts, takes a bam file,
    converts it to bigwig and then uses that
    """
    PROGRAM = "geneBody_coverage2.py"
    if not program_exists(PROGRAM):
        logger.info("%s is not in the path or is not executable." % (PROGRAM))
        exit(1)

    in_bigwig = bam2bigwig(in_file, config)
    prefix = "coverage"
    out_dir = os.path.join(os.path.dirname(in_bigwig), os.pardir, "coverage")
    safe_makedir(out_dir)
    out_prefix = os.path.join(out_dir, "wiggle")
    #out_prefix = _get_out_prefix(in_bigwig, config, out_prefix, prefix)
    coverage_plot_file = out_prefix + ".geneBodyCoverage.pdf"
    if file_exists(coverage_plot_file):
        return coverage_plot_file

    gtf = _get_gtf(config)
    bed = _gtf2bed(gtf)
    coverage_run = sh.Command(which(PROGRAM))
    coverage_run(i=in_bigwig, r=bed, o=out_prefix, t="pdf")
    return coverage_plot_file
Example #6
def filter_reads_by_length(fq1, fq2, min_length=30):
    """
    removes reads which are empty a pair of fastq files

    """

    logger.info("Removing reads in %s and %s that "
                "are less than %d bases." % (fq1, fq2, min_length))
    # just pick the first one if it can be multiple types
    quality_type = QUALITY_TYPE[DetectFastqFormat.run(fq1)[0]]
    fq1_out = append_stem(fq1, "fixed")
    fq2_out = append_stem(fq2, "fixed")
    fq1_single = append_stem(fq1, "singles")
    fq2_single = append_stem(fq2, "singles")
    if all(map(file_exists, [fq1_out, fq2_out, fq1_single, fq2_single])):
        return [fq1_out, fq2_out]

    fq1_in = SeqIO.parse(fq1, quality_type)
    fq2_in = SeqIO.parse(fq2, quality_type)

    with open(fq1_out, 'w') as fq1_out_handle, \
         open(fq2_out, 'w') as fq2_out_handle, \
         open(fq1_single, 'w') as fq1_single_handle, \
         open(fq2_single, 'w') as fq2_single_handle:
        for fq1_record, fq2_record in izip(fq1_in, fq2_in):
            if len(fq1_record.seq) >= min_length and len(fq2_record.seq) >= min_length:
                fq1_out_handle.write(fq1_record.format(quality_type))
                fq2_out_handle.write(fq2_record.format(quality_type))
            else:
                if len(fq1_record.seq) >= min_length:
                    fq1_single_handle.write(fq1_record.format(quality_type))
                if len(fq2_record.seq) >= min_length:
                    fq2_single_handle.write(fq2_record.format(quality_type))

    return [fq1_out, fq2_out]
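
A sketch of how this might be invoked on a hypothetical read pair; the returned "fixed" file names come from append_stem as shown above.

# hypothetical paired-end call
fq1_fixed, fq2_fixed = filter_reads_by_length("sample_1.fastq",
                                              "sample_2.fastq",
                                              min_length=30)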
Example #7
def _run_fastqc(curr_files, config):
    logger.info("Running fastqc on %s" % (str(curr_files)))
    nfiles = len(curr_files)
    fastqc_config = config["stage"]["fastqc"]
    out_files = view.map(fastqc.run, curr_files,
                         [fastqc_config] * nfiles,
                         [config] * nfiles)
    return out_files
Example #8
    def _run_se(self, in_file):
        # cut polyA tails and adapters off
        logger.info("Running cutadapt in single end mode on %s." % (in_file))
        trimmed_file = self._cut_file(in_file)
        out_file = self._get_lf_file(trimmed_file)
        if file_exists(out_file):
            return out_file
        fastq.filter_single_reads_by_length(trimmed_file, self.length_cutoff)

        return out_file
Example #9
    def _run_pe(self, in_files):
        logger.info("Running cutadapt in paired end mode on %s." % (in_files))
        trimmed_files = map(self._cut_file, in_files)
        out_files = map(self._get_lf_file, trimmed_files)
        if all(map(file_exists, out_files)):
            return out_files
        fastq.filter_reads_by_length(trimmed_files[0], trimmed_files[1],
                                     self.length_cutoff)

        return out_files
Example #11
def main(config_file):
    with open(config_file) as in_handle:
        config = yaml.load(in_handle)

    # make the needed directories
    map(safe_makedir, config["dir"].values())

    # specific for thesis pipeline
    in_dir = config["dir"]["data"]
    id_file = config["id_file"]
    curr_files = input_files_from_dir(in_dir, id_file)
    logger.info("Running pipeline on %s." % (curr_files))

    for stage in config["run"]:
        if stage == "fastqc":
            logger.info("Running fastqc on %s." % (curr_files))
            stage_runner = fastqc.FastQC(config)
            view.map(stage_runner, curr_files, block=False)

        if stage == "cutadapt":
            logger.info("Running cutadapt on %s." % (curr_files))
            stage_runner = trim.Cutadapt(config)
            curr_files = view.map(stage_runner, curr_files)

        if stage == "bowtie":
            logger.info("Running bowtie on %s." % (curr_files))
            bowtie = Bowtie(config)
            curr_files = view.map(bowtie, curr_files)
            mapped = view.map(sam.only_mapped, curr_files)
            unmapped = view.map(sam.only_unmapped, curr_files)
            curr_files = mapped
            bam_files = view.map(sam.sam2bam, mapped)
            bam_sorted = view.map(sam.bamsort, bam_files)
            view.map(sam.bamindex, bam_sorted)


        if stage == "coverage":
            logger.info("Calculating RNASeq metrics on %s." % (curr_files))
            nrun = len(curr_files)
            ref = prepare_ref_file(config["stage"][stage]["ref"], config)
            ribo = config["stage"][stage]["ribo"]
            picard = BroadRunner(config["program"]["picard"])
            out_dir = os.path.join(config["dir"]["results"], stage)
            safe_makedir(out_dir)
            out_files = [replace_suffix(os.path.basename(x),
                                        "metrics") for x in curr_files]
            out_files = [os.path.join(out_dir, x) for x in out_files]
            out_files = view.map(picardrun.picard_rnaseq_metrics,
                                 [picard] * nrun,
                                 curr_files,
                                 [ref] * nrun,
                                 [ribo] * nrun,
                                 out_files)

    stop_cluster()
Example #13
    def _cut_file(self, in_file):
        """
        run cutadapt on a single file

        """
        adapters = self._get_adapters(self.chemistry)
        out_file = self.in2trimmed(in_file)
        if file_exists(out_file):
            return out_file
        cutadapt = sh.Command(self.stage_config.get("program", "cutadapt"))

        quality_format = self.quality_format
        if not quality_format:
            quality_format = self._detect_fastq_format(in_file)
        if quality_format == "sanger":
            logger.info("Quality format detected as sanger.")
            quality_base = 33
        elif quality_format == "illumina":
            logger.info("Quality format set to illumina 1.5/1.3")
            quality_base = 64
        else:
            logger.error("Quality format could not be detected. Quality "
                         "Detected or set as %s. It should be illumina "
                         "or sanger.")
            exit(1)

        # if we want to trim the polya tails we have to first remove
        # the adapters and then trim the tail
        if self.stage_config.get("trim_polya", True):
            temp_cut = tempfile.NamedTemporaryFile(suffix=".fastq",
                                                   dir=self.out_dir)
            # trim off adapters
            cutadapt(in_file,
                     self.options,
                     adapters,
                     quality_base=quality_base,
                     _out=temp_cut.name)
            with file_transaction(out_file) as temp_out:
                polya = ADAPTERS.get("polya")
                # trim off polya
                cutadapt(temp_cut.name,
                         self.options,
                         "-a",
                         polya,
                         "-a",
                         self._rc_adapters(polya),
                         quality_base=quality_base,
                         _out=temp_out)
            return out_file
        else:
            with file_transaction(out_file) as temp_out:
                cutadapt(in_file, self.options, adapters, _out=temp_out)
            return out_file
Example #15
def test_cluster():
    with open(CONFIG_FILE) as in_handle:
        config = yaml.load(in_handle)
    setup_logging(config)
    from bipy.log import logger
    start_cluster(config)

    from bipy.cluster import view
    logger.info("Serial result")
    serial_result = map(mappable_function, range(32))
    logger.info("Parallel result")
    parallel_result = view.map(mappable_function, range(32))
    assert (serial_result == parallel_result)
Example #16
 def __init__(self, config):
     self.config = config
     self.plugins = {}
     #self.scan(get_in(config, "dir", "plugins"))
     plugin_dir = get_in(config, ("dir", "plugins"))
     if plugin_dir:
         logger.info("Scanning %s for plugins." % plugin_dir)
         plugins = types.ModuleType("plugins")
         plugins.__path__ = [plugin_dir]
         sys.modules["plugins"] = plugins
         self.scan(plugin_dir)
     else:
         self.scan()
Example #19
def _run_trim(curr_files, config):
    logger.info("Trimming poor quality ends from %s" % (str(curr_files)))
    nfiles = len(curr_files)
    min_length = str(config["stage"]["trim"].get("min_length", 20))
    pair = str(config["stage"]["trim"].get("pair", "se"))
    platform = str(config["stage"]["trim"].get("platform", "sanger"))
    out_dir = os.path.join(config["dir"]["results"], "trimmed")
    safe_makedir(out_dir)
    out_files = [append_stem(os.path.basename(x), "trim") for x in curr_files]
    out_files = [os.path.join(out_dir, x) for x in out_files]
    out_files = view.map(sickle.run, curr_files, [pair] * nfiles,
                         [platform] * nfiles, [min_length] * nfiles, out_files)
    return out_files
Example #20
 def is_up(self):
     """ returns True if the cluster is completely up and false otherwise """
     try:
         up = len(self.client().ids)
     except IOError:
         logger.info("Waiting for the controller to come up.")
         return False
     else:
         not_up = self.n - up
         if not_up > 0:
             logger.info("Waiting for %d engines to come up." % (not_up))
             return False
         else:
             return True
Example #23
def run_with_config(input_file, config, control_file=None, stage=None):

    if stage is None:
        stage = "macs"

    if stage not in config["stage"]:
        logger.info("Cannot find the the stage %s in the config." % (stage))

    stage_config = config["stage"][stage]
    options = stage_config.get("options", [])
    out_dir = os.path.join(config["dir"].get("results", None), stage)
    safe_makedir(out_dir)
    out_files = run(input_file, options, control_file, out_dir)
    print out_files
    return out_files
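
The config fragment this function reads might look roughly like the sketch below; the option values and file name are illustrative, not taken from the project.

# hypothetical config fragment consumed by run_with_config above
config = {"stage": {"macs": {"options": ["--nomodel"]}},
          "dir": {"results": "results"}}
out_files = run_with_config("input.bam", config, control_file=None)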
Example #24
def _download_encode(input_file, config):
    """ grab the encode files they listed in their file """
    NAME_FIELD = 0
    if not os.path.exists(input_file):
        logger.info("Error %s does not exist, aborting." % (input_file))
        exit(-1)

    with open(input_file) as in_handle:
        reader = csv.reader(in_handle, delimiter="\t")
        files = [x[NAME_FIELD] for x in reader]
    logger.info("Downloading %s." % (files))
    data_dir = config["dir"].get("data", "data")
    out_files = view.map(_download_ref, files, [data_dir] * len(files))

    return out_files
Example #27
def annotate_table_with_biomart(in_file,
                                join_column,
                                filter_type,
                                organism,
                                out_file=None):
    """
    join_column is the column to perform the lookups on,
    filter_type describes the type of the join_column (see the getBM
    documentation in R for details) and organism is the English name of
    the organism

    example:
    annotate_table_with_biomart(in_file, "id", "ensembl_gene_id",
                                "human")

    """

    if organism not in ORG_TO_ENSEMBL:
        logger.error("organism not supported")
        exit(1)

    logger.info("Annotating %s." % (organism))
    if not out_file:
        out_file = append_stem(in_file, "annotated")
    if os.path.exists(out_file):
        return out_file
    # use biomaRt to annotate the data file
    r = robjects.r
    r.assign('join_column', join_column)
    r.assign('in_file', in_file)
    r.assign('out_file', out_file)
    r.assign('ensembl_gene', ORG_TO_ENSEMBL[organism]["gene_ensembl"])
    r.assign('gene_symbol', ORG_TO_ENSEMBL[organism]["gene_symbol"])
    r.assign('filter_type', filter_type)
    r('''
    library(biomaRt)
    ensembl = useMart("ensembl", dataset = ensembl_gene)
    d = read.table(in_file, header=TRUE)
    a = getBM(attributes=c(filter_type,
                gene_symbol, "description"),
                filters=c(filter_type), values=d[,join_column],
                mart=ensembl)
    m = merge(d, a, by.x=join_column, by.y=filter_type)
    write.table(m, out_file, quote=FALSE, row.names=FALSE, sep="\t")
    ''')

    return out_file
Example #29
def filter_single_reads_by_length(in_file, min_length=30):
    """
    removes reads from a fastq file which are below a min_length in bases

    """
    logger.info("Removing reads in %s thare are less than %d bases."
                % (in_file, min_length))
    quality_type = QUALITY_TYPE[DetectFastqFormat.run(in_file)[0]]
    out_file = append_stem(in_file, "fixed")
    if file_exists(out_file):
        return out_file
    in_iterator = SeqIO.parse(in_file, quality_type)
    out_iterator = (record for record in in_iterator if
                    len(record.seq) >= min_length)
    with file_transaction(out_file) as tmp_out_file:
        with open(tmp_out_file, "w") as out_handle:
            SeqIO.write(out_iterator, out_handle, quality_type)
    return out_file
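
A one-line usage sketch with a hypothetical file name:

# writes and returns the "fixed" file produced via append_stem
fixed = filter_single_reads_by_length("sample.fastq", min_length=30)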
Example #31
def junction_annotation(in_file, config, out_prefix=None):
    """
    compile novel/known information about splice junctions
    """
    PROGRAM = "junction_annotation.py"
    if not program_exists(PROGRAM):
        logger.info("%s is not in the path or is not executable." % (PROGRAM))
        exit(1)

    prefix = "junction"
    out_prefix = _get_out_prefix(in_file, config, out_prefix, prefix)
    junction_file = out_prefix + ".splice_junction.pdf"
    if file_exists(junction_file):
        return junction_file
    junction_run = sh.Command(which(PROGRAM))
    gtf = _get_gtf(config)
    bed = _gtf2bed(gtf)
    junction_run(i=in_file, o=out_prefix, r=bed)
    return junction_file
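
The RSeQC wrappers in these examples all follow the same shape: check that the program is on the path, build an out_prefix, then shell out through sh.Command. A hypothetical call, assuming config carries the annotation GTF that _get_gtf reads:

# hypothetical call; config is assumed to provide the annotation GTF
junction_pdf = junction_annotation("sample.bam", config)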
Example #32
def RPKM_count(in_file, config, out_prefix=None):
    """
    produce RPKM
    """
    PROGRAM = "RPKM_count.py"
    if not program_exists(PROGRAM):
        logger.info("%s is not in the path or is not executable." % (PROGRAM))
        exit(1)

    prefix = "RPKM_count"
    out_prefix = _get_out_prefix(in_file, config, out_prefix, prefix)
    rpkm_count_file = out_prefix + "_read_count.xls"
    gtf = _get_gtf(config)
    bed = _gtf2bed(gtf)
    if file_exists(rpkm_count_file):
        return rpkm_count_file
    RPKM_count_run = sh.Command(which(PROGRAM))
    RPKM_count_run(i=in_file, r=bed, o=out_prefix)
    return rpkm_count_file
Example #33
def genebody_coverage(in_file, config, out_prefix=None):
    """
    used to check the 5'/3' bias across transcripts
    """
    PROGRAM = "geneBody_coverage.py"
    if not program_exists(PROGRAM):
        logger.info("%s is not in the path or is not executable." % (PROGRAM))
        exit(1)

    prefix = "coverage"
    out_prefix = _get_out_prefix(in_file, config, out_prefix, prefix)
    coverage_plot_file = out_prefix + ".geneBodyCoverage.pdf"
    if file_exists(coverage_plot_file):
        return coverage_plot_file

    gtf = _get_gtf(config)
    bed = _gtf2bed(gtf)
    coverage_run = sh.Command(which(PROGRAM))
    coverage_run(i=in_file, r=bed, o=out_prefix)
    return coverage_plot_file
Example #34
def bam_stat(in_file, config, out_prefix=None):
    """
    dump read mapping statistics from a SAM or BAM file to out_file
    """
    PROGRAM = "bam_stat.py"
    if not program_exists(PROGRAM):
        logger.info("%s is not in the path or is not executable." % (PROGRAM))
        exit(1)

    prefix = "bam_stat"
    out_prefix = _get_out_prefix(in_file, config, out_prefix, prefix)
    out_file = out_prefix + ".txt"
    if file_exists(out_file):
        return out_file

    bam_stat_run = sh.Command(which(PROGRAM))
    with file_transaction(out_file) as tx_out_file:
        bam_stat_run(i=in_file, _err=tx_out_file)

    return out_file
Example #35
def RPKM_saturation(in_file, config, out_prefix=None):
    """
    estimate the precision of RPKM calculation by resampling
    """
    PROGRAM = "RPKM_saturation.py"
    if not program_exists(PROGRAM):
        logger.info("%s is not in the path or is not executable." % (PROGRAM))
        exit(1)

    prefix = "RPKM_saturation"
    out_prefix = _get_out_prefix(in_file, config, out_prefix, prefix)
    rpkm_saturation_file = out_prefix + ".saturation.pdf"
    gtf = _get_gtf(config)
    bed = _gtf2bed(gtf)

    if file_exists(rpkm_saturation_file):
        return rpkm_saturation_file

    RPKM_saturation_run = sh.Command(which(PROGRAM))
    RPKM_saturation_run(i=in_file, r=bed, o=out_prefix)
    return rpkm_saturation_file
Example #36
def junction_saturation(in_file, config, out_prefix=None):
    """
    check if splicing is deep enough to perform alternative splicing
    analysis
    """
    PROGRAM = "junction_saturation.py"
    if not program_exists(PROGRAM):
        logger.info("%s is not in the path or is not executable." % (PROGRAM))
        exit(1)

    prefix = "saturation"
    out_prefix = _get_out_prefix(in_file, config, out_prefix, prefix)
    saturation_file = out_prefix + ".junctionSaturation_plot.pdf"
    if file_exists(saturation_file):
        return saturation_file

    saturation_run = sh.Command(which(PROGRAM))
    gtf = _get_gtf(config)
    bed = _gtf2bed(gtf)
    saturation_run(i=in_file, o=out_prefix, r=bed)
    return saturation_file
Example #37
def run(in_file, ref, blastn_config, config):
    logger.info("Preparing the reference file for %s." % (ref.get("name")))
    ref_file = prepare_ref_file(ref, config)
    logger.info("Preparing the blast database for %s." % (ref.get("name")))
    blast_db = prepare_blast_db(ref_file, "nucl")
    logger.info("Blasting %s against %s." % (in_file, ref.get("name")))

    results_dir = build_results_dir(blastn_config, config)
    utils.safe_makedir(results_dir)

    out_file = os.path.join(
        results_dir,
        replace_suffix(os.path.basename(in_file),
                       ref.get("name") + "hits.tsv"))
    tmp_out = out_file + ".tmp"

    blast_results = blast_search(in_file, blast_db, tmp_out)
    #logger.info("Filtering results for at least %f percent of the "
    #            "sequences covered." %(0.5*100))
    #filtered_results = filter_results_by_length(blast_results, 0.5)
    #logger.info("Filtered output file here: %s" %(filtered_results))
    with open(blast_results) as in_handle:
        reader = csv.reader(in_handle, delimiter="\t")
        with open(out_file, "w") as out_handle:
            writer = csv.writer(out_handle, delimiter="\t")
            writer.writerow(HEADER_FIELDS.split(" "))
            for line in reader:
                writer.writerow(line)

    return out_file
Example #39
def clipping_profile(in_file, config, out_prefix=None):
    """
    estimate the clipping profile of the reads
    """
    PROGRAM = "clipping_profile.py"
    if not program_exists(PROGRAM):
        logger.info("%s is not in the path or is not executable." % (PROGRAM))
        exit(1)

    prefix = "clipping"
    out_prefix = _get_out_prefix(in_file, config, out_prefix, prefix)
    clip_plot_file = out_prefix + ".clipping_profile.pdf"
    print clip_plot_file
    if file_exists(clip_plot_file):
        return clip_plot_file

    clip_run = sh.Command(which(PROGRAM))
    clip_run(i=in_file, o=out_prefix)
    # hack to get around the fact that clipping_profile saves the file in
    # the script execution directory
    #sh.mv("clipping_profile.pdf", clip_plot_file)

    return clip_plot_file
Example #40
def bam2bigwig(in_file, config, out_prefix=None):
    """
    assumes the library preparation was not strand specific for now
    """
    PROGRAM = "bam2wig.py"
    if not program_exists(PROGRAM):
        logger.info("%s is not in the path or is not executable." % (PROGRAM))
        exit(1)

    prefix = "bigwig"
    chrom_size_file = config["annotation"].get("chrom_size_file", None)
    out_prefix = _get_out_prefix(in_file, config, out_prefix, prefix)
    if not chrom_size_file:
        chrom_size_file = _fetch_chrom_sizes(config)
    wiggle_file = out_prefix + ".wig"

    if not file_exists(wiggle_file):
        bam2wig = sh.Command(which(PROGRAM))
        bam2wig(i=in_file, s=chrom_size_file, o=out_prefix)

    bigwig_file = out_prefix + ".bw"

    return wig2bigwig(wiggle_file, chrom_size_file, bigwig_file)
Example #41
 def __call__(self, in_file):
     self._start_message(in_file)
     if isinstance(in_file, basestring):
         logger.info("Detected %s as non-paired." % in_file)
         out_file = run_with_config(in_file, None, self.ref,
                                    self.stage, self.config)
     elif is_pair(in_file):
         logger.info("Detected %s as a pair." % in_file)
         out_file = run_with_config(in_file[0], in_file[1],
                                    self.ref, self.stage, self.config)
     else:
         logger.info("Detected %s as non-paired." % in_file)
         out_file = run_with_config(in_file[0], None, self.ref,
                                    self.stage, self.config)
     self._end_message(in_file)
     return out_file
Example #42
 def _end_message(self, in_file):
     logger.info("%s complete on %s." % (self.stage, in_file))
Example #43
 def _start_message(self, in_file):
     logger.info("Starting %s on %s." % (self.stage, in_file))
Example #44
def main(config_file):
    with open(config_file) as in_handle:
        config = yaml.load(in_handle)

    # make the needed directories
    map(safe_makedir, config["dir"].values())

    # specific for thesis pipeline
    input_dirs = config["input_dirs"]

    results_dir = config["dir"].get("results", "results")
    input_files = _find_input_files(config)
    conditions = _group_input_by_condition(input_files)
    logger.info("Input_files: %s" % (input_files))
    logger.info("Condition groups %s" %(conditions))
    htseq_outdict = {}

    for condition, curr_files in conditions.items():
        condition_dir = os.path.join(results_dir, condition)
        safe_makedir(condition_dir)
        config["dir"]["results"] = condition_dir

        for stage in config["run"]:
            if stage == "fastqc":
                logger.info("Running fastqc on %s." % (curr_files))
                stage_runner = FastQC(config)
                view.map(stage_runner, curr_files)

            if stage == "cutadapt":
                logger.info("Running cutadapt on %s." % (curr_files))
                stage_runner = Cutadapt(config)
                curr_files = view.map(stage_runner, curr_files)

            if stage == "tophat":
                logger.info("Running tophat on %s." % (curr_files))
                stage_runner = Tophat(config)
                tophat_outputs = view.map(stage_runner, curr_files)
                bamfiles = view.map(sam.sam2bam, tophat_outputs)
                bamsort = view.map(sam.bamsort, bamfiles)
                view.map(sam.bamindex, bamsort)
                final_bamfiles = bamsort
                curr_files = tophat_outputs

            if stage == "htseq-count":
                _emit_stage_message(stage, curr_files)
                htseq_config = _get_stage_config(config, stage)
                htseq_args = zip(*product(curr_files, [config], [stage]))
                htseq_outputs = view.map(htseq_count.run_with_config,
                                         *htseq_args)
                htseq_outdict[condition] = htseq_outputs

            if stage == "coverage":
                logger.info("Calculating RNASeq metrics on %s." % (curr_files))
                nrun = len(curr_files)
                ref = prepare_ref_file(config["stage"][stage]["ref"], config)
                ribo = config["stage"][stage]["ribo"]
                picard = BroadRunner(config["program"]["picard"])
                out_dir = os.path.join(results_dir, stage)
                safe_makedir(out_dir)
                out_files = [replace_suffix(os.path.basename(x),
                                            "metrics") for x in curr_files]
                out_files = [os.path.join(out_dir, x) for x in out_files]
                out_files = view.map(picardrun.picard_rnaseq_metrics,
                                     [picard] * nrun,
                                     curr_files,
                                     [ref] * nrun,
                                     [ribo] * nrun,
                                     out_files)

            if stage == "rseqc":
                _emit_stage_message(stage, curr_files)
                rseqc_config = _get_stage_config(config, stage)
                rseq_args = zip(*product(curr_files, [config]))
                view.map(rseqc.bam_stat, *rseq_args)
                view.map(rseqc.genebody_coverage, *rseq_args)
                view.map(rseqc.junction_annotation, *rseq_args)
                view.map(rseqc.junction_saturation, *rseq_args)
                RPKM_args = zip(*product(final_bamfiles, [config]))
                RPKM_count_out = view.map(rseqc.RPKM_count, *RPKM_args)
                RPKM_count_fixed = view.map(rseqc.fix_RPKM_count_file,
                                            RPKM_count_out)
                """
                                annotate_args = zip(*product(RPKM_count_fixed,
                                             ["gene_id"],
                                             ["ensembl_gene_id"],
                                             ["human"]))
                view.map(annotate.annotate_table_with_biomart,
                         *annotate_args)
                         """
                view.map(rseqc.RPKM_saturation, *rseq_args)
                curr_files = tophat_outputs

    # combine htseq-count files and run deseq on them
    conditions, htseq_files = dict_to_vectors(htseq_outdict)
    deseq_config = _get_stage_config(config, "deseq")
    cell_types = _group_input_by_cell_type(htseq_files)
    for cell_type, files in cell_types.items():
        for comparison in deseq_config["comparisons"]:
            comparison_name = "_vs_".join(comparison)
            deseq_dir = os.path.join(results_dir, "deseq", cell_type,
                                     comparison_name)
            safe_makedir(deseq_dir)
            out_file = os.path.join(deseq_dir, comparison_name + ".counts.txt")
            files_by_condition = _group_input_by_condition(files)
            _emit_stage_message("deseq", files_by_condition)
            c, f = dict_to_vectors(files_by_condition)
            combined_out = htseq_count.combine_counts(f,
                                                      None,
                                                      out_file)
            deseq_out = os.path.join(deseq_dir, comparison_name)
            logger.info("Running deseq on %s with conditions %s "
                        "and writing ot %s" % (combined_out,
                                               conditions,
                                               deseq_out))
            deseq_out = view.map(deseq.run, [combined_out], [c], [deseq_out])
            annotate.annotate_table_with_biomart(deseq_out[0],
                                                 "id",
                                                 "ensembl_gene_id",
                                                 "human")

    # end gracefully
    stop_cluster()
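
The zip(*product(curr_files, [config], [stage])) idiom used by several stages above fans each file out against constant arguments so the result can be splatted into view.map. A small standalone illustration with hypothetical values (Python 2 zip returns a list):

from itertools import product
files = ["a.sam", "b.sam"]
args = zip(*product(files, ["config"], ["htseq-count"]))
# args == [("a.sam", "b.sam"), ("config", "config"),
#          ("htseq-count", "htseq-count")]
# view.map(htseq_count.run_with_config, *args) then receives one
# list per positional argument, exactly as in the stages above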
Example #45
def main(config_file):
    """ this assumes that we are keeping the same order of the files
    throughout """
    with open(config_file) as in_handle:
        config = yaml.load(in_handle)

    # make the needed directories
    map(safe_makedir, config["dir"].values())

    input_dict = config["input"]
    curr_files = _make_current_files(input_dict.keys())
    input_meta = input_dict.values()

    for stage in config["run"]:
        if stage == "fastqc":
            _emit_stage_message(stage, curr_files)
            fastqc_config = _get_stage_config(config, stage)
            fastqc_args = zip(*product(curr_files, [fastqc_config],
                                       [config]))
            view.map(fastqc.run, *fastqc_args)

        if stage == "cutadapt":
            _emit_stage_message(stage, curr_files)
            cutadapt_config = _get_stage_config(config, stage)
            cutadapt_args = zip(*product(curr_files, [cutadapt_config],
                                         [config]))
            cutadapt_outputs = view.map(cutadapt_tool.run, *cutadapt_args)
            curr_files = _make_current_files(cutadapt_outputs)

        if stage == "tophat":
            _emit_stage_message(stage, curr_files)
            tophat_config = _get_stage_config(config, stage)
            tophat_args = zip(*product(curr_files, [None], [config["ref"]],
                                       ["tophat"], [config]))
            tophat_outputs = view.map(tophat.run_with_config, *tophat_args)
            bamfiles = view.map(sam.sam2bam, tophat_outputs)
            bamsort = view.map(sam.bamsort, bamfiles)
            view.map(sam.bamindex, bamsort)
            final_bamfiles = bamsort
            curr_files = tophat_outputs


        if stage == "htseq-count":
            _emit_stage_message(stage, curr_files)
            htseq_config = _get_stage_config(config, stage)
            htseq_args = zip(*product(curr_files, [config], [stage]))
            htseq_outputs = view.map(htseq_count.run_with_config,
                                     *htseq_args)
            combined_out = os.path.join(config["dir"]["results"], stage,
                                        "all_combined.counts")
            combined_out = htseq_count.combine_counts(htseq_outputs, None,
                                                      out_file=combined_out)

        if stage == "rseqc":
            _emit_stage_message(stage, curr_files)
            rseqc_config = _get_stage_config(config, stage)
            rseq_args = zip(*product(curr_files, [config]))
            view.map(rseqc.bam_stat, *rseq_args)
            view.map(rseqc.genebody_coverage, *rseq_args)
            view.map(rseqc.junction_annotation, *rseq_args)
            view.map(rseqc.junction_saturation, *rseq_args)
            RPKM_args = zip(*product(final_bamfiles, [config]))
            RPKM_count_out = view.map(rseqc.RPKM_count, *RPKM_args)
            RPKM_count_fixed = view.map(rseqc.fix_RPKM_count_file,
                                        RPKM_count_out)
            annotate_args = zip(*product(RPKM_count_fixed,
                                         ["gene_id"],
                                         ["ensembl_transcript_id"],
                                         ["mouse"]))
            view.map(annotate.annotate_table_with_biomart,
                     *annotate_args)
            view.map(rseqc.RPKM_saturation, *RPKM_args)

        if stage == "coverage":
            logger.info("Calculating RNASeq metrics on %s." % (curr_files))
            nrun = len(curr_files)
            ref = prepare_ref_file(config["stage"][stage]["ref"], config)
            ribo = config["stage"][stage]["ribo"]
            picard = BroadRunner(config["program"]["picard"])
            out_dir = os.path.join(config["dir"]["results"], stage)
            safe_makedir(out_dir)
            out_files = [replace_suffix(os.path.basename(x),
                                        "metrics") for x in curr_files]
            out_files = [os.path.join(out_dir, x) for x in out_files]
            out_files = view.map(picardrun.picard_rnaseq_metrics,
                                 [picard] * nrun,
                                 curr_files,
                                 [ref] * nrun,
                                 [ribo] * nrun,
                                 out_files)

        if stage == "deseq":
            _emit_stage_message(stage, curr_files)
            deseq_config = _get_stage_config(config, stage)
            out_dir = os.path.join(config["dir"]["results"], stage)
            safe_makedir(out_dir)
            for test in deseq_config["tests"]:
                indexes = [_find_file_index_for_test(input_meta, condition)
                           for condition in test]
                files = [htseq_outputs[x] for x in indexes]
                conditions = [input_meta[x]["condition"] for x in indexes]
                combined_out = os.path.join(out_dir,
                                            "_".join(conditions) +
                                            "_combined.counts")
                logger.info("Combining %s to %s." % (files, combined_out))
                count_file = htseq_count.combine_counts(files, None,
                                                        out_file=combined_out)
                out_file = os.path.join(out_dir, "_".join(conditions) +
                                        "_deseq.txt")
                logger.info("Running deseq on %s with conditions %s "
                            "and writing to %s" % (count_file,
                                                   conditions,
                                                   out_file))
                view.map(deseq.run, [count_file], [conditions], [out_file])
                #deseq.run(count_file, conditions, out_file=out_file)

    # end gracefully
    stop_cluster()
Example #46
def mappable_function(x):
    logger.error("This is an error.")
    logger.info("This is info.")
    return x ** 10
Example #47
def main(config_file):
    with open(config_file) as in_handle:
        config = yaml.load(in_handle)

    # make the needed directories
    map(safe_makedir, config["dir"].values())

    # specific for project
    input_dir = config["input_dir"]
    logger.info("Loading files from %s" % (input_dir))
    input_files = list(locate("*.fq", input_dir))
    input_files += list(locate("*.fastq", input_dir))
    logger.info("Input files: %s" % (input_files))

    results_dir = config["dir"]["results"]
    safe_makedir(results_dir)

    if config.get("test_pipeline", False):
        logger.info("Running a test pipeline on a subset of the reads.")
        results_dir = os.path.join(results_dir, "test_pipeline")
        config["dir"]["results"] = results_dir
        safe_makedir(results_dir)
        curr_files = map(make_test, input_files, [config] * len(input_files))
        logger.info("Converted %s to %s. " % (input_files, curr_files))
    else:
        curr_files = input_files
        logger.info("Running RNASeq alignment pipeline on %s." % (curr_files))

    for stage in config["run"]:
        if stage == "fastqc":
            logger.info("Running fastqc on %s." % (curr_files))
            stage_runner = FastQC(config)
            view.map(stage_runner, curr_files)

        if stage == "cutadapt":
            curr_files = combine_pairs(curr_files)
            logger.info("Running cutadapt on %s." % (curr_files))
            stage_runner = Cutadapt(config)
            curr_files = view.map(stage_runner, curr_files)
            logger.info("Output of cutadapt: %s." % (curr_files))

        if stage == "bowtie":
            logger.info("Running Bowtie on %s." % (curr_files))
            bowtie = Bowtie(config)
            bowtie_outputs = view.map(bowtie, curr_files)
            bamfiles = view.map(sam.sam2bam, bowtie_outputs)
            bamsort = view.map(sam.bamsort, bamfiles)
            view.map(sam.bamindex, bamsort)
            # keep the sorted bams and alignments for the later stages
            final_bamfiles = bamsort
            curr_files = bowtie_outputs

        if stage == "htseq-count":
            logger.info("Running htseq-count on %s." % (curr_files))
            htseq_args = zip(*product(curr_files, [config], [stage]))
            htseq_outputs = view.map(htseq_count.run_with_config,
                                     *htseq_args)
            htseq_count.combine_counts(htseq_outputs)

        if stage == "rnaseq_metrics":
            logger.info("Calculating RNASeq metrics on %s." % (curr_files))
            coverage = RNASeqMetrics(config)
            view.map(coverage, curr_files)

        if stage == "rseqc":
            logger.info("Running rseqc on %s." % (curr_files))
            #rseq_args = zip(*product(curr_files, [config]))
            rseq_args = zip(*product(final_bamfiles, [config]))
            view.map(rseqc.bam_stat, *rseq_args)
            view.map(rseqc.genebody_coverage, *rseq_args)
            view.map(rseqc.junction_annotation, *rseq_args)
            view.map(rseqc.junction_saturation, *rseq_args)
            RPKM_args = zip(*product(final_bamfiles, [config]))
            RPKM_count_out = view.map(rseqc.RPKM_count, *RPKM_args)
            RPKM_count_fixed = view.map(rseqc.fix_RPKM_count_file,
                                        RPKM_count_out)
            """
                            annotate_args = zip(*product(RPKM_count_fixed,
                                         ["gene_id"],
                                         ["ensembl_gene_id"],
                                         ["human"]))
            view.map(annotate.annotate_table_with_biomart,
                     *annotate_args)
                     """
            view.map(rseqc.RPKM_saturation, *rseq_args)
            curr_files = bowtie_outputs

    # end gracefully
    stop_cluster()
Example #48
def main(config_file):
    with open(config_file) as in_handle:
        config = yaml.load(in_handle)

    # make the needed directories
    map(safe_makedir, config["dir"].values())

    # specific for project
    input_dir = config["dir"]["data"]
    logger.info("Loading files from %s" % (input_dir))
    input_files = list(locate("*.fq", input_dir))
    input_files += list(locate("*.fastq", input_dir))
    logger.info("Input files: %s" % (input_files))

    results_dir = config["dir"]["results"]
    safe_makedir(results_dir)

    # make the stage repository
    repository = StageRepository(config)
    logger.info("Stages found: %s" % (repository.plugins))

    if config.get("test_pipeline", False):
        logger.info("Running a test pipeline on a subset of the reads.")
        results_dir = os.path.join(results_dir, "test_pipeline")
        config["dir"]["results"] = results_dir
        safe_makedir(results_dir)
        curr_files = map(make_test, input_files, [config] * len(input_files))
        logger.info("Converted %s to %s. " % (input_files, curr_files))
    else:
        curr_files = input_files
        logger.info("Running RNASeq alignment pipeline on %s." % (curr_files))

    for stage in config["run"]:
        if stage == "fastqc":
            logger.info("Running fastqc on %s." % (curr_files))
            stage_runner = FastQC(config)
            view.map(stage_runner, curr_files)

        if stage == "cutadapt":
            curr_files = combine_pairs(curr_files)
            logger.info("Running cutadapt on %s." % (curr_files))
            stage_runner = Cutadapt(config)
            curr_files = view.map(stage_runner, curr_files)

        if stage == "tophat":
            logger.info("Running Tophat on %s." % (curr_files))
            #tophat = repository["tophat"](config)
            tophat = Tophat(config)
            tophat_outputs = view.map(tophat, curr_files)
            sortsam = view.map(sam.coordinate_sort_sam, tophat_outputs,
                               [config] * len(tophat_outputs))
            bamfiles = view.map(sam.sam2bam, sortsam)
            bamsort = view.map(sam.bamsort, bamfiles)
            view.map(sam.bamindex, bamsort)
            final_bamfiles = bamsort
            curr_files = tophat_outputs

        if stage == "disambiguate":
            logger.info("Disambiguating %s." % (curr_files))
            disambiguate = repository[stage](config)
            view.map(disambiguate, curr_files)

        if stage == "htseq-count":
            logger.info("Running htseq-count on %s." % (bamfiles))
            name_sorted = view.map(sam.bam_name_sort, bamfiles)
            curr_files = view.map(sam.bam2sam, name_sorted)
            htseq_args = zip(*product(curr_files, [config], [stage]))
            htseq_outputs = view.map(htseq_count.run_with_config,
                                     *htseq_args)
            htseq_count.combine_counts(htseq_outputs)

        if stage == "rnaseq_metrics":
            logger.info("Calculating RNASeq metrics on %s." % (curr_files))
            #coverage = repository[stage](config)
            coverage = RNASeqMetrics(config)
            view.map(coverage, curr_files)

        if stage == "rseqc":
            logger.info("Running rseqc on %s." % (curr_files))
            #rseq_args = zip(*product(curr_files, [config]))
            rseq_args = zip(*product(final_bamfiles, [config]))
            view.map(rseqc.bam_stat, *rseq_args)
            down_args = zip(*product(final_bamfiles, [40000000]))
            down_bam = view.map(sam.downsample_bam, *down_args)
            view.map(rseqc.genebody_coverage, down_bam,
                     [config] * len(down_bam))
            view.map(rseqc.junction_annotation, *rseq_args)
            view.map(rseqc.junction_saturation, *rseq_args)
            RPKM_args = zip(*product(final_bamfiles, [config]))
            RPKM_count_out = view.map(rseqc.RPKM_count, *RPKM_args)
            RPKM_count_fixed = view.map(rseqc.fix_RPKM_count_file,
                                        RPKM_count_out)
            """
                            annotate_args = zip(*product(RPKM_count_fixed,
                                         ["gene_id"],
                                         ["ensembl_gene_id"],
                                         ["human"]))
            view.map(annotate.annotate_table_with_biomart,
                     *annotate_args)
                     """
            view.map(rseqc.RPKM_saturation, *rseq_args)
            curr_files = tophat_outputs

    # end gracefully
    stop_cluster()
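# A minimal sketch of the stage-runner pattern the pipeline above relies on:
# FastQC(config), Cutadapt(config) and Tophat(config) are callables fanned
# out over files with view.map. ExampleStage below is a hypothetical name,
# not project code.
import logging
logger = logging.getLogger(__name__)

class ExampleStage(object):
    def __init__(self, config):
        self.config = config
        self.stage = "example"

    def __call__(self, in_file):
        logger.info("Starting %s on %s." % (self.stage, in_file))
        # the per-file work would go here; return the output filename
        return in_file

# usage: curr_files = view.map(ExampleStage(config), curr_files)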
Example #49
0
def _emit_stage_message(stage, curr_files):
    logger.info("Running %s on %s" % (stage, curr_files))
Example #50
0
def main(config_file):

    with open(config_file) as in_handle:
        config = yaml.safe_load(in_handle)
    setup_logging(config)
    start_cluster(config)

    # after the cluster is up, import the view to interact with it
    from bipy.cluster import view
    input_files = config["input"]
    results_dir = config["dir"]["results"]

    # make the needed directories
    map(safe_makedir, config["dir"].values())

    curr_files = input_files

    ## qc steps
    for stage in config["run"]:
        if stage == "fastqc":
            # run the basic fastqc
            logger.info("Running %s on %s" % (stage, str(curr_files)))
            fastqc_config = config["stage"][stage]
            fastqc_outputs = view.map(fastqc.run, curr_files,
                                      [fastqc_config] * len(curr_files),
                                      [config] * len(curr_files))
            # this does nothing for now, not implemented yet
            summary_file = _combine_fastqc(fastqc_outputs)

        if stage == "trim":
            logger.info("Trimming poor quality ends "
                        " from %s" % (str(curr_files)))
            nlen = len(curr_files)
            min_length = str(config["stage"][stage].get("min_length", 20))

            # trim low quality ends of reads
            # do this dirty for now
            out_dir = os.path.join(results_dir, "trimmed")
            safe_makedir(out_dir)
            out_files = [append_stem(os.path.basename(x), "trim") for
                         x in curr_files]
            out_files = [os.path.join(out_dir, x) for x in out_files]
            # XXX remove the magic number of 10, the minimum read
            # length to keep
            out_files = view.map(sickle.run, curr_files,
                                 ["se"] * nlen,
                                 ["sanger"] * nlen,
                                 [min_length] * nlen,
                                 out_files)
            curr_files = out_files

        if stage == "tagdust":
            input_files = curr_files
            # remove tags matching the other miRNA tested
            logger.info("Running %s on %s." % (stage, input_files))
            tagdust_config = config["stage"][stage]
            tagdust_outputs = view.map(tagdust.run, input_files,
                                       [tagdust_config] * len(input_files),
                                       [config] * len(input_files))
            curr_files = [x[0] for x in tagdust_outputs]

        if stage == "filter_length":
            # filter out reads below or above a certain length
            filter_config = config["stage"][stage]
            min_length = filter_config.get("min_length", 0)
            max_length = filter_config.get("max_length", MAX_READ_LENGTH)

            # length predicate
            def length_filter(x):
                return min_length < len(x.seq) < max_length
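
            # hedged note (not in the source): the comparison is strict on
            # both ends, so reads of exactly min_length or max_length bases
            # are dropped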

            # filter the input reads based on length
            # parallelizing this doesn't seem to work
            # ipython can't accept closures as an argument to view.map()
            """
            filtered_fastq = view.map(filter_seqio, tagdust_outputs,
                                      [lf] * len(tagdust_outputs),
                                      ["filt"] * len(tagdust_outputs),
                                      ["fastq"] * len(tagdust_outputs))"""
            out_files = [append_stem(os.path.basename(input_file[0]),
                         "filt") for input_file in tagdust_outputs]
            out_dir = os.path.join(config["dir"]["results"],
                                   "length_filtered")
            safe_makedir(out_dir)
            out_files = [os.path.join(out_dir, x) for x in out_files]

            filtered_fastq = [filter_seqio(x[0], length_filter, y, "fastq")
                              for x, y in zip(tagdust_outputs, out_files)]

            curr_files = filtered_fastq

        if stage == "count_ends":
            logger.info("Compiling nucleotide counts at 3' and 5' ends.")
            # count the nucleotide at the end of each read
            def count_ends(x, y):
                """ keeps a running count of an arbitrary set of keys
                during the reduce step """
                x[y] = x.get(y, 0) + 1
                return x

            def get_3prime_end(x):
                return str(x.seq[-1])

            def get_5prime_end(x):
                return str(x.seq[0])

            def output_counts(end_function, count_file):
                # if the count_file already exists, skip
                outdir = os.path.join(config["dir"]["results"], stage)
                safe_makedir(outdir)
                count_file = os.path.join(outdir, count_file)
                if os.path.exists(count_file):
                    return count_file
                # outputs a tab file of the counts at the end
                # of the fastq files
                counts = [reduce(count_ends,
                                 apply_seqio(x, end_function, kind="fastq"),
                                 {}) for x in curr_files]
                df = pd.DataFrame(counts,
                                  index=map(_short_name, curr_files))
                df = df.astype(float)
                total = df.sum(axis=1)
                df = df.div(total, axis=0)
                df["total"] = total
                df.to_csv(count_file, sep="\t")

            output_counts(get_3prime_end, "3prime_counts.tsv")
            output_counts(get_5prime_end, "5prime_counts.tsv")
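
            # worked example (hedged, not in the source): folding end letters
            # with count_ends tallies them, e.g.
            #   reduce(count_ends, ["A", "C", "A", "G"], {})
            #   -> {"A": 2, "C": 1, "G": 1}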

        if stage == "tophat":
            tophat_config = config["stage"][stage]
            logger.info("Running tophat on %s" % (str(curr_files)))
            nlen = len(curr_files)
            pair_file = None
            ref_file = tophat_config["annotation"]
            out_base = os.path.join(results_dir, "mirna")
            align_dir = os.path.join(results_dir, "tophat")
            tophat_files = view.map(tophat.align,
                                    curr_files,
                                    [pair_file] * nlen,
                                    [ref_file] * nlen,
                                    [out_base] * nlen,
                                    [align_dir] * nlen,
                                    [config] * nlen)
            curr_files = tophat_files

        if stage == "novoalign":
            logger.info("Running novoalign on %s" % (str(curr_files)))
            # align
            ref = config["genome"]["file"]
            novoalign_config = config["stage"][stage]
            aligned_outputs = view.map(novoalign.run, curr_files,
                                       [ref] * len(curr_files),
                                       [novoalign_config] * len(curr_files),
                                       [config] * len(curr_files))
            # convert sam to bam, sort and index
            picard = BroadRunner(config["program"]["picard"], None, {})
            bamfiles = view.map(picardrun.picard_formatconverter,
                                [picard] * len(aligned_outputs),
                                aligned_outputs)
            sorted_bf = view.map(picardrun.picard_sort,
                                 [picard] * len(bamfiles),
                                 bamfiles)
            view.map(picardrun.picard_index, [picard] * len(sorted_bf),
                     sorted_bf)
            # these files are the new starting point for the downstream
            # analyses, so copy them over into the data dir and set
            # them to read only
            #data_dir = os.path.join(config["dir"]["data"], stage)
            #safe_makedir(data_dir)
            #view.map(shutil.copy, sorted_bf, [data_dir] * len(sorted_bf))
            #new_files = [os.path.join(data_dir, x) for x in
            #             map(os.path.basename, sorted_bf)]
            #[os.chmod(x, stat.S_IREAD | stat.S_IRGRP) for x in new_files]
            # index the bam files for later use
            #view.map(picardrun.picard_index, [picard] * len(new_files),
            #         new_files)

            curr_files = sorted_bf

        if stage == "new_coverage":
            logger.info("Calculating RNASeq metrics on %s." % (curr_files))
            nrun = len(curr_files)
            ref = blastn.prepare_ref_file(config["stage"][stage]["ref"],
                                          config)
            ribo = config["stage"][stage]["ribo"]
            picard = BroadRunner(config["program"]["picard"], None, {})
            out_dir = os.path.join(results_dir, "new_coverage")
            safe_makedir(out_dir)
            out_files = [replace_suffix(os.path.basename(x),
                                        "metrics") for x in curr_files]
            out_files = [os.path.join(out_dir, x) for x in out_files]
            out_files = view.map(picardrun.picard_rnaseq_metrics,
                                 [picard] * nrun,
                                 curr_files,
                                 [ref] * nrun,
                                 [ribo] * nrun,
                                 out_files)
            curr_files = out_files

        if stage == "coverage":
            gtf = blastn.prepare_ref_file(config["annotation"], config)
            logger.info("Calculating coverage of features in %s for %s"
                        % (gtf, str(sorted_bf)))
            out_files = [replace_suffix(x, "counts.bed") for
                         x in sorted_bf]
            out_dir = os.path.join(results_dir, stage)
            safe_makedir(out_dir)
            logger.info(out_files)
            out_files = [os.path.join(out_dir,
                                      os.path.basename(x)) for x in out_files]
            logger.info(out_files)
            view.map(bedtools.count_overlaps, sorted_bf,
                     [gtf] * len(sorted_bf),
                     out_files)

        if stage == "htseq-count":
            nfiles = len(curr_files)
            htseq_config = _get_stage_config(config, stage)
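            # note: aligned_outputs is produced by the novoalign stage above,
            # so this stage assumes novoalign appears earlier in config["run"]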
            htseq_outputs = view.map(htseq_count.run_with_config,
                                     aligned_outputs,
                                     [config] * nfiles,
                                     [stage] * nfiles)
            column_names = _get_short_names(input_files)
            logger.info("Column names: %s" % (column_names))
            out_file = os.path.join(config["dir"]["results"], stage,
                                    "combined.counts")
            combined_out = htseq_count.combine_counts(htseq_outputs,
                                                      column_names,
                                                      out_file)
        if stage == "bedtools_intersect":
            bedfiles = config["stage"]["bedtools_intersect"].get("bed", None)
            out_dir = os.path.join(results_dir, stage)
            safe_makedir(out_dir)
            for bedfile in bedfiles:
                bedbase, bedext = os.path.splitext(bedfile)
                out_files = [remove_suffix(x) for x in sorted_bf]
                out_files = [os.path.join(out_dir, os.path.basename(x)) for x in
                             out_files]
                out_files = ["_vs_".join([x, os.path.basename(bedbase)])
                             for x in out_files]
                out_files = [".".join([x, "bam"]) for x in out_files]
                test_out = map(bedtools.intersectbam2bed, sorted_bf,
                               [bedfile] * len(sorted_bf),
                               [False] * len(sorted_bf),
                               out_files)
                count_files = [replace_suffix(x, "stats") for x in
                               out_files]
                map(write_ratios, sorted_bf, out_files, count_files)

        if stage == "piranha":
            piranha_runner = piranha.PiranhaStage(config)
            out_files = view.map(piranha_runner, curr_files)

    stop_cluster()
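# The stages above repeatedly broadcast constants to view.map by repeating
# them to the length of the file list ([config] * nlen). A minimal
# illustration with the builtin map (hedged sketch; the names are made up):
def run(in_file, cfg):
    return (in_file, cfg["quality_format"])

files = ["a.fastq", "b.fastq"]
config = {"quality_format": "sanger"}
results = map(run, files, [config] * len(files))
# -> [("a.fastq", "sanger"), ("b.fastq", "sanger")] (a list under Python 2)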
Example #51
0
def _emit_stage_message(stage, curr_files):
    logger.info("Running %s on %s" % (stage, curr_files))
Example #52
0
 def _start_message(self, in_file):
     logger.info("Starting %s on %s." % (self.stage, in_file))
Example #53
0
 def _end_message(self, in_file):
     logger.info("%s complete on %s." % (self.stage, in_file))
Example #54
0
def main(config_file):
    # load yaml config file
    with open(config_file) as in_handle:
        config = yaml.safe_load(in_handle)

    # setup logging
    setup_logging(config)
    from bipy.log import logger
    # start cluster
    start_cluster(config)
    from bipy.cluster import view

    found = sh.find(config["dir"]["data"], "-name", "Variations")
    var_dirs = [str(x).strip() for x in found]
    logger.info("Var_dirs: %s" % (var_dirs))
    in_dirs = map(os.path.dirname, var_dirs)
    logger.info("in_dirs: %s" % (in_dirs))
    # XXX for testing only load 3
    #curr_files = in_dirs[0:5]
    curr_files = in_dirs


    # run the illumina fixer
    logger.info("Running illumina fixer on %s." % (curr_files))
    illf_class = STAGE_LOOKUP.get("illumina_fixer")
    illf = illf_class(config)
    curr_files = view.map(illf, curr_files)

    # sort the vcf files
    def sort_vcf(in_file):
        from bipy.utils import append_stem
        from bcbio.distributed.transaction import file_transaction
        from bcbio.utils import file_exists
        import sh

        out_file = append_stem(in_file, "sorted")
        if file_exists(out_file):
            return out_file
        with file_transaction(out_file) as tmp_out_file:
            sh.vcf_sort(in_file, _out=tmp_out_file)
        return out_file
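
    # note (hedged): the imports live inside sort_vcf rather than at module
    # level, presumably so the function is self-contained when it is
    # serialized and shipped to the cluster engines via view.map below; the
    # file_exists check also makes reruns idempotent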


    # combine
    out_file = os.path.join(config["dir"].get("results", "results"),
                            "geminiloader",
                            "all_combined.vcf")
    logger.info("Combining files %s into %s." % (curr_files, out_file))
    if file_exists(out_file):
        curr_files = [out_file]
    else:
        curr_files = [genotype.combine_variant_files(curr_files, out_file,
                                                     config["ref"]["fasta"],
                                                     config)]

    # break the VCF files up by chromosome for speed
    logger.info("Breaking up %s by chromosome." % (curr_files))
    breakvcf_class = STAGE_LOOKUP.get("breakvcf")
    breakvcf = breakvcf_class(config)
    curr_files = view.map(breakvcf, curr_files)

    # run VEP on the separate files in parallel
    logger.info("Running VEP on %s." % (curr_files))
    vep_class = STAGE_LOOKUP.get("vep")
    vep = vep_class(config)
    curr_files = view.map(vep, list(flatten(curr_files)))

    curr_files = filter(file_exists, curr_files)

    # load the files into gemini; don't run this step in parallel

    # sort the vcf files
    logger.info("Sorting %s." % (curr_files))
    curr_files = view.map(sort_vcf, curr_files)
    # don't run the rest of this in parallel, so take the cluster down
    stop_cluster()

    out_file = os.path.join(config["dir"].get("results", "results"),
                            "geminiloader",
                            "all_combined.vep.vcf")
    logger.info("Combining files %s into %s." % (curr_files, out_file))
    if file_exists(out_file):
        curr_files = [out_file]
    else:
        curr_files = [genotype.combine_variant_files(curr_files, out_file,
                                                     config["ref"]["fasta"],
                                                     config)]


    logger.info("Loading %s into gemini." % (curr_files))
    gemini_class = STAGE_LOOKUP.get("geminiloader")
    geminiloader = gemini_class(config)
    curr_files = map(geminiloader, curr_files)
    logger.info("Run complete.")
Example #55
0
def main(config_file):
    with open(config_file) as in_handle:
        config = yaml.safe_load(in_handle)

    # make the needed directories
    map(safe_makedir, config["dir"].values())

    # specific for project
    input_dir = config["dir"]["data"]
    logger.info("Loading files from %s" % (input_dir))
    input_files = list(locate("*.fq", input_dir))
    input_files += list(locate("*.fastq", input_dir))
    logger.info("Input files: %s" % (input_files))

    results_dir = config["dir"]["results"]
    safe_makedir(results_dir)

    # make the stage repository
    repository = StageRepository(config)
    logger.info("Stages found: %s" % (repository.plugins))

    if config.get("test_pipeline", False):
        logger.info("Running a test pipeline on a subset of the reads.")
        results_dir = os.path.join(results_dir, "test_pipeline")
        config["dir"]["results"] = results_dir
        safe_makedir(results_dir)
        curr_files = map(make_test, input_files, [config] * len(input_files))
        logger.info("Converted %s to %s. " % (input_files, curr_files))
    else:
        curr_files = input_files
        logger.info("Running RNASeq alignment pipeline on %s." % (curr_files))

    for stage in config["run"]:
        if stage == "fastqc":
            logger.info("Running fastqc on %s." % (curr_files))
            stage_runner = FastQC(config)
            view.map(stage_runner, curr_files)

        if stage == "cutadapt":
            curr_files = combine_pairs(curr_files)
            logger.info("Running cutadapt on %s." % (curr_files))
            stage_runner = Cutadapt(config)
            curr_files = view.map(stage_runner, curr_files)

        if stage == "tophat":
            logger.info("Running Tophat on %s." % (curr_files))
            #tophat = repository["tophat"](config)
            tophat = Tophat(config)
            tophat_outputs = view.map(tophat, curr_files)
            bamfiles = view.map(sam.sam2bam, tophat_outputs)
            bamsort = view.map(sam.bamsort, bamfiles)
            view.map(sam.bamindex, bamsort)
            final_bamfiles = bamsort
            curr_files = tophat_outputs

        if stage == "disambiguate":
            logger.info("Disambiguating %s." % (curr_files))
            disambiguate = repository[stage](config)
            view.map(disambiguate, curr_files)

        if stage == "htseq-count":
            logger.info("Running htseq-count on %s." % (bamfiles))
            name_sorted = view.map(sam.bam_name_sort, bamfiles)
            curr_files = view.map(sam.bam2sam, name_sorted)
            htseq_args = zip(*product(curr_files, [config], [stage]))
            htseq_outputs = view.map(htseq_count.run_with_config, *htseq_args)
            htseq_count.combine_counts(htseq_outputs)

        if stage == "rnaseq_metrics":
            logger.info("Calculating RNASeq metrics on %s." % (curr_files))
            #coverage = repository[stage](config)
            coverage = RNASeqMetrics(config)
            view.map(coverage, curr_files)

        if stage == "rseqc":
            logger.info("Running rseqc on %s." % (curr_files))
            #rseq_args = zip(*product(curr_files, [config]))
            rseq_args = zip(*product(final_bamfiles, [config]))
            view.map(rseqc.bam_stat, *rseq_args)
            view.map(rseqc.genebody_coverage, *rseq_args)
            view.map(rseqc.junction_annotation, *rseq_args)
            view.map(rseqc.junction_saturation, *rseq_args)
            RPKM_args = zip(*product(final_bamfiles, [config]))
            RPKM_count_out = view.map(rseqc.RPKM_count, *RPKM_args)
            RPKM_count_fixed = view.map(rseqc.fix_RPKM_count_file,
                                        RPKM_count_out)
            """
                            annotate_args = zip(*product(RPKM_count_fixed,
                                         ["gene_id"],
                                         ["ensembl_gene_id"],
                                         ["human"]))
            view.map(annotate.annotate_table_with_biomart,
                     *annotate_args)
                     """
            view.map(rseqc.RPKM_saturation, *rseq_args)
            curr_files = tophat_outputs

    # end gracefully
    stop_cluster()
Example #56
0
 def __call__(self, in_file, MAX_RECORDS=1000000):
     logger.info("Detecting format of %s" % (in_file))
     quality = self.run(in_file, MAX_RECORDS)
     logger.info("Detected quality format of %s in %s." % (quality, in_file))
     # reuse the already-computed result instead of re-running detection
     return quality
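
# hedged usage sketch (the class name is assumed, not from the source):
#   detector = QualityFormatDetector(config)
#   fmt = detector("sample_1.fastq")   # e.g. "sanger"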