Example #1
def main(argv=None):

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "--regex-filename",
        dest="regex_filename",
        type="string",
        help="extract column name from filename via regular expression "
        "[%default]")

    parser.add_option("--filter",
                      dest="filters",
                      type="choice",
                      action="append",
                      choices=("PASS", "SNP"),
                      help="apply filters to VCFs when reading "
                      "[%default]")

    parser.set_defaults(
        regex_filename=None,
        filters=[],
    )

    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    if len(args) < 2:
        raise ValueError("requiring at least 2 input filenames")

    dfs = []
    for filename in args:
        if options.regex_filename:
            try:
                name = re.search(options.regex_filename, filename).groups()[0]
            except AttributeError:
                raise ValueError(
                    "regular expression '{}' does not match {}".format(
                        options.regex_filename, filename))
        else:
            name = IOTools.snip(os.path.basename(filename), ".vcf.gz")

        E.debug("reading data from {}".format(filename))
        df = read_vcf_positions_into_dataframe(filename,
                                               filters=options.filters)
        df[name] = 1
        dfs.append(df)

    ndata = len(dfs)
    merged_df = dfs[0]
    for df in dfs[1:]:
        merged_df = merged_df.merge(df, how="outer")
    merged_df = merged_df.fillna(0)
    ddf = merged_df.drop(["chrom", "pos"], axis=1)
    set_counts = ddf.groupby(by=list(ddf.columns)).size()
    set_counts = set_counts.reset_index()
    set_counts.columns = list(set_counts.columns[:-1]) + ["counts"]

    set_counts.to_csv(options.stdout, sep="\t", index=False)
    E.stop()
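
A minimal sketch of the merge-and-count step above, using plain pandas with two toy position tables in place of read_vcf_positions_into_dataframe (the sample column names are made up):

import pandas as pd

# toy per-file position tables; a 1 marks presence of the position in that file
df_a = pd.DataFrame({"chrom": ["chr1", "chr1"], "pos": [100, 200], "sample_a": 1})
df_b = pd.DataFrame({"chrom": ["chr1", "chr2"], "pos": [100, 300], "sample_b": 1})

# outer merge on the shared position columns; absent positions become 0
merged = df_a.merge(df_b, how="outer").fillna(0)

# drop the position columns and count how often each membership pattern occurs
memberships = merged.drop(["chrom", "pos"], axis=1)
set_counts = memberships.groupby(by=list(memberships.columns)).size().reset_index()
set_counts.columns = list(set_counts.columns[:-1]) + ["counts"]
print(set_counts.to_csv(sep="\t", index=False))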
Example #2
def fastqscreen_filename2track(fn):
    """extract track name from fastqc filename.

    Because we deal with both paired end (track.fastq.1_fastqc
    and single end data (track_fastqc), this is a bit cumbersome.
    """
    return re.sub(".fastq.", "-",
                  IOTools.snip(os.path.basename(fn), "_screen.txt"))
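
A rough illustration of the transform, with a stand-in for IOTools.snip (assumed here to simply strip a known suffix) and hypothetical filenames:

import os
import re

def snip(filename, suffix):
    # minimal stand-in for IOTools.snip: strip a known suffix
    assert filename.endswith(suffix)
    return filename[:-len(suffix)]

def fastqscreen_filename2track(fn):
    return re.sub(".fastq.", "-", snip(os.path.basename(fn), "_screen.txt"))

print(fastqscreen_filename2track("qc/sample1.fastq.1_screen.txt"))  # sample1-1
print(fastqscreen_filename2track("qc/sample1_screen.txt"))          # sample1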
Example #3
def summarizeFastqScreen(infiles, outfiles):
    all_files = []
    for infile in infiles:
        all_files.extend(glob.glob(IOTools.snip(infile, "screen") + "*_screen.txt"))
    if len(all_files) == 0:
        E.warn("no fastqcscreen results to concatenate")
        for x in outfiles:
            IOTools.touch_file(x)
        return
    df_summary, df_details = PipelineReadqc.read_fastq_screen(
        all_files)
    df_summary.to_csv(outfiles[0], sep="\t", index=True)
    df_details.to_csv(outfiles[1], sep="\t", index=True)
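
For context, a small sketch of how the glob pattern is built from the sentinel filename; the filenames are hypothetical and snip is again a minimal stand-in for IOTools.snip:

import glob

def snip(filename, suffix):
    # minimal stand-in for IOTools.snip: strip a known suffix
    return filename[:-len(suffix)] if filename.endswith(suffix) else filename

# hypothetical sentinel file produced by the pipeline
infile = "qc/sample1.fastqscreen"
pattern = snip(infile, "screen") + "*_screen.txt"
print(pattern)                 # qc/sample1.fastq*_screen.txt
print(glob.glob(pattern))      # e.g. ['qc/sample1.fastq.1_screen.txt', 'qc/sample1.fastq.2_screen.txt']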
Example #4
def buildFastQCSummaryStatus(infiles, outfile, datadir):
    '''collect fastqc status results from multiple runs into a single table.

    Arguments
    ---------
    infiles : list
        List of filenames with fastqc output (logging information). The
        track name is derived from these.
    outfile : string
        Output filename in :term:`tsv` format.
    datadir : string
        Location of actual Fastqc output to be parsed.
    '''

    outf = IOTools.open_file(outfile, "w")
    names = set()
    results = []
    for infile in infiles:
        base_track = IOTools.snip(os.path.basename(infile), ".fastqc")
        filename = os.path.join(datadir, base_track + "*_fastqc",
                                "fastqc_data.txt")
        # there can be missing sections
        for fn in glob.glob(filename):
            stats = collections.defaultdict(str)
            for name, status, header, data in FastqcSectionIterator(
                    IOTools.open_file(fn)):
                stats[name] = status
            track = fastqc_filename2track(fn)
            results.append((track, fn, stats))
            names.update(list(stats.keys()))

    names = sorted(names)
    outf.write("track\tfilename\t%s\n" % "\t".join(names))
    for track, fn, stats in results:
        outf.write("%s\t%s\t%s\n" %
                   (track, os.path.dirname(fn), "\t".join(stats[x]
                                                          for x in names)))
    outf.close()
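
A compact sketch of the table assembly at the end of the function: take the sorted union of section names across all files and emit one row per file, with missing sections left empty (the tracks and section names below are invented):

import collections

# hypothetical per-file results: (track, filename, {section name: status})
results = [
    ("track1", "track1_fastqc/fastqc_data.txt",
     {"Basic Statistics": "pass", "Per base sequence quality": "warn"}),
    ("track2", "track2_fastqc/fastqc_data.txt",
     {"Basic Statistics": "pass", "Overrepresented sequences": "fail"}),
]

# sorted union of section names gives a stable header across all files
names = sorted(set(name for _, _, stats in results for name in stats))
print("track\tfilename\t" + "\t".join(names))
for track, fn, stats in results:
    # defaultdict(str) mirrors the original: missing sections become empty fields
    stats = collections.defaultdict(str, stats)
    print("%s\t%s\t%s" % (track, fn, "\t".join(stats[x] for x in names)))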
Example #5
def runDE(design_file,
          counts_file,
          outfile,
          outdir,
          method="deseq",
          spike_file=None):
    '''run DESeq, DESeq2 or EdgeR through :mod:`scripts/runExpression.py`

    The job is split into smaller sections. The order of the input
    data is randomized to avoid biases due to chromosome order and to
    break up local correlations.

    At the end, a q-value is computed from all results.

    Arguments
    ---------
    design_file : string
        Filename with experimental design
    counts_file : string
        :term:`tsv` formatted file with counts per windows
    outfile : string
       Output filename in :term:`tsv` format.
    outdir : string
       Directory for additional output files.
    method : string
       Method to use. See :mod:`scripts/runExpression.py`.
    spike_file : string
       Filename with spike-in data to add before processing.
    '''

    if spike_file is None:
        statement = "zcat %(counts_file)s"
    else:
        statement = '''cgat combine_tables
        --missing-value=0
        --cat=filename
        --log=%(outfile)s.log
        %(counts_file)s %(spike_file)s
        | cgat csv_cut
        --remove filename
        --log=%(outfile)s.log
        '''

    prefix = IOTools.snip(os.path.basename(outfile))
    E.info(prefix)

    # --bashrc=%(pipeline_scriptsdir)s/bashrc.cgat

    # the post-processing strips away the warning,
    # renames the qvalue column to old_qvalue
    # and adds a new qvalue column after recomputing
    # over all windows.
    statement += '''
    | cgat randomize_lines --keep-header=1
    | python -m CGATCore.Pipeline.farm
    --method=multiprocessing
    --cluster-options="-l mem_free=16G"
    --cluster-queue=%(cluster_queue)s
    --cluster-num-jobs=%(cluster_num_jobs)i
    --cluster-priority=%(cluster_priority)i
    --cluster-queue-manager=%(cluster_queue_manager)s
    --cluster-memory-resource=%(cluster_memory_resource)s
    --cluster-memory-default=%(cluster_memory_default)s
    --input-header
    --output-header
    --split-at-lines=200000
    --log=%(outfile)s.log
    --output-filename-pattern=%(outdir)s/%%s
    --subdirs
    --output-regex-header="^test_id"
    "cgat runExpression
              --method=%(method)s
              --tags-tsv-file=%%STDIN%%
              --design-tsv-file=%(design_file)s
              --output-filename-pattern=%%DIR%%%(prefix)s_
              --deseq-fit-type=%(deseq_fit_type)s
              --deseq-dispersion-method=%(deseq_dispersion_method)s
              --deseq-sharing-mode=%(deseq_sharing_mode)s
              --edger-dispersion=%(edger_dispersion)f
              --deseq2-design-formula=%(deseq2_model)s
              --deseq2-contrasts=%(deseq2_contrasts)s
              --filter-min-counts-per-row=%(tags_filter_min_counts_per_row)i
              --filter-min-counts-per-sample=%(tags_filter_min_counts_per_sample)i
              --filter-percentile-rowsums=%(tags_filter_percentile_rowsums)i
              --log=%(outfile)s.log
              --fdr=%(edger_fdr)f
              --deseq2-plot=0 "
    | perl -p -e "s/qvalue/old_qvalue/"
    | cgat table2table
    --log=%(outfile)s.log
    --method=fdr
    --column=pvalue
    --fdr-method=BH
    --fdr-add-column=qvalue
    | gzip
    > %(outfile)s '''
    E.info(statement)

    P.run(statement)
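
The post-processing comment above says the upstream qvalue column is renamed to old_qvalue and a fresh q-value is recomputed over all windows (the cgat table2table --method=fdr --fdr-method=BH step). A self-contained sketch of that Benjamini-Hochberg recomputation on toy p-values:

import numpy as np
import pandas as pd

def bh_qvalues(pvalues):
    # Benjamini-Hochberg: q_(i) = min over j >= i of p_(j) * n / j on sorted p-values
    p = np.asarray(pvalues, dtype=float)
    n = len(p)
    order = np.argsort(p)
    scaled = p[order] * n / np.arange(1, n + 1)
    # enforce monotonicity from the largest p-value downwards
    q_sorted = np.minimum.accumulate(scaled[::-1])[::-1]
    q = np.empty(n)
    q[order] = np.clip(q_sorted, 0, 1)
    return q

windows = pd.DataFrame({"test_id": ["w1", "w2", "w3", "w4"],
                        "pvalue": [0.01, 0.04, 0.03, 0.20]})
windows["qvalue"] = bh_qvalues(windows["pvalue"])
print(windows)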
Example #6
def peek_parameters(workingdir,
                    pipeline,
                    on_error_raise=None,
                    prefix=None,
                    update_interface=False,
                    restrict_interface=False):
    '''peek configuration parameters from external pipeline.

    As the parameter dictionary is built at runtime, this method
    executes the pipeline in workingdir, dumping its configuration
    values and reading them into a dictionary.

    If either `pipeline` or `workingdir` are not found, an error is
    raised. This behaviour can be changed by setting `on_error_raise`
    to False. In that case, an empty dictionary is returned.

    Arguments
    ---------
    workingdir : string
       Working directory. This is the directory that the pipeline
       was executed in.
    pipeline : string
       Name of the pipeline script. The pipeline is assumed to live
       in the same directory as the current pipeline.
    on_error_raise : Bool
       If set to a boolean, an error will be raised (or not) if there
       is an error during parameter peeking, for example if
       `workingdir` can not be found. If `on_error_raise` is None, it
       will be set to the default, which is to raise an exception
       unless the calling script is imported or the option
       ``--is-test`` has been passed at the command line.
    prefix : string
       Add a prefix to all parameters. This is useful if the parameters
       are added to the configuration dictionary of the calling pipeline.
    update_interface : bool
       If True, this method will prefix any options in the
       ``[interface]`` section with `workingdir`. This allows
       transparent access to files in the external pipeline.
    restrict_interface : bool
       If True, only interface parameters will be imported.

    Returns
    -------
    config : dict
        Dictionary of configuration values.

    '''
    caller_locals = get_caller_locals()

    # check if we should raise errors
    if on_error_raise is None:
        on_error_raise = not isTest() and \
            "__name__" in caller_locals and \
            caller_locals["__name__"] == "__main__"

    # patch - if --help or -h in command line arguments,
    # do not peek as there might be no config file.
    if "--help" in sys.argv or "-h" in sys.argv:
        return {}

    # Attempt to locate directory with pipeline source code. This is a
    # patch as pipelines might be called within the repository
    # directory or from an installed location
    dirname = PARAMS["pipelinedir"]

    # called without a directory, use current directory
    if dirname == "":
        dirname = os.path.abspath(".")
    else:
        # if not exists, assume we want version located
        # in directory of calling script.
        if not os.path.exists(dirname):
            # directory is path of calling script
            dirname = os.path.dirname(caller_locals['__file__'])

    pipeline = os.path.join(dirname, pipeline)
    if not os.path.exists(pipeline):
        if on_error_raise:
            raise ValueError("can't find pipeline at %s" % (pipeline))
        else:
            return {}

    if workingdir == "":
        workingdir = os.path.abspath(".")

    # patch for the "config" target - use default
    # pipeline directory if directory is not specified
    # working dir is set to "?!"
    if ("config" in sys.argv or "check" in sys.argv
            or "clone" in sys.argv and workingdir == "?!"):
        workingdir = os.path.join(PARAMS.get("pipelinedir"),
                                  IOTools.snip(pipeline, ".py"))

    if not os.path.exists(workingdir):
        if on_error_raise:
            raise ValueError("can't find working dir %s" % workingdir)
        else:
            return {}

    statement = "python %s -f -v 0 dump" % pipeline
    process = subprocess.Popen(statement,
                               cwd=workingdir,
                               shell=True,
                               stdin=subprocess.PIPE,
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE)

    # process.stdin.close()
    stdout, stderr = process.communicate()
    if process.returncode != 0:
        raise OSError(
            ("Child was terminated by signal %i: \n"
             "Statement: %s\n"
             "The stderr was: \n%s\n"
             "Stdout: %s") % (-process.returncode, statement, stderr, stdout))

    # subprocess only accepts encoding argument in py >= 3.6 so
    # decode here.
    stdout = stdout.decode("utf-8").splitlines()
    # remove any log messages
    stdout = [x for x in stdout if x.startswith("{")]
    if len(stdout) > 1:
        raise ValueError("received multiple configurations")
    dump = json.loads(stdout[0])

    # update interface
    if update_interface:
        for key, value in list(dump.items()):
            if key.startswith("interface"):
                dump[key] = os.path.join(workingdir, value)

    # keep only interface if so required
    if restrict_interface:
        dump = dict([(k, v) for k, v in dump.items()
                     if k.startswith("interface")])

    # prefix all parameters
    if prefix is not None:
        dump = dict([("%s%s" % (prefix, x), y) for x, y in list(dump.items())])

    return dump
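
The tail of peek_parameters applies three small dictionary transformations to the dumped configuration; a minimal sketch with a made-up dump showing update_interface, restrict_interface, and a prefix of "external_":

import os

dump = {"interface_summary": "summary.tsv",
        "interface_counts": "counts.tsv.gz",
        "genome": "hg38"}
workingdir = "/data/external_pipeline"

# update_interface: make interface files addressable from the calling pipeline
for key, value in list(dump.items()):
    if key.startswith("interface"):
        dump[key] = os.path.join(workingdir, value)

# restrict_interface: keep only the interface entries
dump = {k: v for k, v in dump.items() if k.startswith("interface")}

# prefix: namespace the imported parameters
dump = {"external_%s" % k: v for k, v in dump.items()}
print(dump)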
Example #7
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: cgat_script_template.py 2871 2010-03-03 10:20:44Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option(
        "-a",
        "--first-gtf-file",
        dest="gtf_a",
        type="string",
        help="supply a gtf file - will compress uncompressed files")
    parser.add_option(
        "-b",
        "--second-gtf-file",
        dest="gtf_b",
        type="string",
        help="supply a second gtf file - will compress uncompressed files")
    parser.add_option("-s",
                      "--scripts-dir",
                      dest="scripts_dir",
                      type="string",
                      help="supply a location for accessory scripts")
    parser.add_option("--no-venn",
                      dest="no_venn",
                      action="store_true",
                      help="set if no venn is to be drawn")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    gtf_files = [options.gtf_a, options.gtf_b]

    merged_files = []
    prefices = []
    E.info("merging gtf files")
    for gtf in gtf_files:
        if gtf.endswith(".gtf.gz"):
            outfile = IOTools.snip(gtf, ".gtf.gz") + ".merged.gtf.gz"
            prefices.append(IOTools.snip(gtf, ".gtf.gz"))
            merged_files.append(outfile)
            statement = '''zcat %s | python %s/gtf2gtf.py --method=merge-transcripts --log=%s.log | gzip > %s''' % (
                gtf, options.scripts_dir, outfile, outfile)
            P.execute(statement)
        elif gtf.endswith(".gtf"):
            outfile = IOTools.snip(gtf, ".gtf") + ".merged.gtf.gz"
            prefices.append(IOTools.snip(gtf, ".gtf"))
            merged_files.append(outfile)
            statement = '''cat %s | python %s/gtf2gtf.py --method=merge-transcripts --log=%s.log | gzip  > %s''' % (
                gtf, options.scripts_dir, outfile, outfile)
            P.execute(statement)
        else:
            raise ValueError("cannot perform merge on %s: is not a gtf file" %
                             gtf)

    for prefix in prefices:
        if options.gtf_a.find(prefix) != -1:
            gtf_a = prefix + ".merged.gtf.gz"
            prefix_a = prefix
        elif options.gtf_b.find(prefix) != -1:
            gtf_b = prefix + ".merged.gtf.gz"
            prefix_b = prefix

    E.info("intersecting gtf files")
    # intersect the resulting merged files

    scriptsdir = options.scripts_dir
    intersection_out = "_vs_".join([prefix_a, prefix_b
                                    ]) + ".intersection.gtf.gz"
    statement = '''intersectBed -a %(gtf_a)s -b %(gtf_b)s -s -wa
                 | python %(scriptsdir)s/gtf2gtf.py --method=merge-transcripts --log=log | gzip > %(intersection_out)s'''
    P.run()

    if not options.no_venn:
        E.info("producing venn diagram for %s vs %s..." %
               (options.gtf_a, options.gtf_b))
        # produce the venn diagram
        intersection_file = intersection_out
        gtf_a_merged = gtf_a
        gtf_b_merged = gtf_b

        # create dictionary key
        gtf_pair = (gtf_a_merged, gtf_b_merged)

        # containers for counts
        count_gtf_merged_a = 0
        count_gtf_merged_b = 0
        count_intersection = 0

        # create GTF iterator objects
        gtf_iterator_a = GTF.iterator(IOTools.open_file(gtf_pair[0]))
        gtf_iterator_b = GTF.iterator(IOTools.open_file(gtf_pair[1]))
        gtf_iterator_intersection = GTF.iterator(
            IOTools.open_file(intersection_file))

        # do the counts for each file
        E.info("counting entries in %s" % gtf_a)
        for entry in gtf_iterator_a:
            count_gtf_merged_a += 1
        print("counts for gtf-a: ", count_gtf_merged_a)

        E.info("counting entries in %s" % gtf_b)
        for entry in gtf_iterator_b:
            count_gtf_merged_b += 1
        print("counts for gtf-b: ", count_gtf_merged_b)

        E.info("counting entries in %s" % intersection_file)
        for entry in gtf_iterator_intersection:
            count_intersection += 1
        print("counts for intersection: ", count_intersection)

        # This is the important bit: use an arbitrary list of labels to
        # represent the entries of gtf-b, let the first count_intersection
        # labels of gtf-a represent the overlapping section, and pad gtf-a
        # with random labels to make up the remaining, non-overlapping set.

        result = {}
        E.info("assembling count lists")
        result[gtf_pair] = {
            "gtf-b": list(map(str, range(count_gtf_merged_b))),
            "gtf-a": (list(map(str, range(count_intersection))) +
                      [str(random.random())
                       for i in range(count_intersection, count_gtf_merged_a)])
        }

        R_source = os.path.join(os.path.abspath(options.scripts_dir),
                                "venn_diagram.R")
        R.source(R_source)

        prefix_a = prefix_a.replace(".", "_").replace("-", "_")
        prefix_b = prefix_b.replace(".", "_").replace("-", "_")

        R('''prefix.a <- "%s"''' % prefix_a)
        R('''prefix.b <- "%s"''' % prefix_b)
        E.info("drawing venn diagram to %s" %
               (prefix_a + "_vs_" + prefix_b + ".overlap.png"))

        R["venn.diagram2"](R.list(A=result[gtf_pair]["gtf-a"],
                                  B=result[gtf_pair]["gtf-b"]),
                           prefix_a + "_vs_" + prefix_b + ".overlap.png", **{
                               'cat.cex':
                               1.5,
                               'main.fontfamily':
                               "Arial",
                               'cat.pos':
                               FloatVector((0, 0)),
                               'cat.fontfamily':
                               "Arial",
                               'main.cex':
                               1.8,
                               'height':
                               1000,
                               'width':
                               1000,
                               'cex':
                               2,
                               'fontfamily':
                               "Arial",
                               'lwd':
                               R.c(1, 1),
                               'fill':
                               R.c(R.rgb(0, 0, 0.5, 0.5),
                                   R.rgb(0.5, 0, 0, 0.5)),
                               'category.names':
                               R.c(prefix_a, prefix_b),
                               'margin':
                               R.c(0.1, 0.1, 0.1, 0.1)
                           })

    # write footer and output benchmark information.
    E.stop()
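
The Venn bookkeeping above encodes counts as label lists so that the overlap between the two lists equals the intersection count; a small pure-Python sketch of that idea with toy counts (unique padding labels are used here instead of random numbers):

# toy counts standing in for the three GTF iterations above
count_gtf_merged_a = 120
count_gtf_merged_b = 80
count_intersection = 50

# gtf-b gets one label per entry; gtf-a reuses the first count_intersection
# labels (the shared part) and is padded with labels of its own for the rest
gtf_b_labels = [str(i) for i in range(count_gtf_merged_b)]
gtf_a_labels = ([str(i) for i in range(count_intersection)] +
                ["a_%i" % i for i in range(count_intersection, count_gtf_merged_a)])

assert len(gtf_a_labels) == count_gtf_merged_a
assert len(gtf_b_labels) == count_gtf_merged_b
assert len(set(gtf_a_labels) & set(gtf_b_labels)) == count_intersection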
Example #8
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-b",
                      "--bam-file",
                      dest="bam_file",
                      type="string",
                      help="supply input bam file name")

    parser.add_option("-g",
                      "--gtf-file",
                      dest="gtf_file",
                      type="string",
                      help="supply input gtf file name")

    parser.add_option("-o",
                      "--outfile",
                      dest="outfile",
                      type="string",
                      help="supply output file name")

    parser.add_option(
        "-G",
        "--reference-gtf-file",
        dest="reference_gtf",
        type="string",
        help="supply reference gtf for context of reads not contributing "
        "to transcripts")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    ######################################################
    ######################################################
    # for all alignments
    ######################################################
    ######################################################

    # open outfile and prepare headers
    outf = IOTools.open_file(options.outfile, "w")
    outf.write("\t".join([
        "total alignments", "aligments in transcripts",
        "percent alignments in transcripts", "total spliced alignments",
        "spliced alignments in transcripts",
        "percent spliced alignments in transcripts"
    ]) + "\n")

    # calculate coverage over transcript file - NB split reads contribute twice to the transcript
    # use BedTool object
    pybedbamfile = pybedtools.BedTool(options.bam_file)

    # count alignments
    E.info("counting total number of alignments and spliced alignments")
    total_alignments = 0
    spliced_alignments = 0

    for alignment in pybedbamfile:
        cigar = alignment[5]
        if cigar.find("N") != -1:  # N signifies split read
            total_alignments += 1
            spliced_alignments += 1
        else:
            total_alignments += 1

    # merge the gtf file to avoid double counting of exons in different
    # transcripts - converts to a bed file
    gtffile = pybedtools.BedTool(options.gtf_file).merge()

    E.info("computing coverage of aligments in %s over intervals in %s" %
           (options.bam_file, options.gtf_file))
    cover = pybedbamfile.coverage(gtffile)

    # make sure that the exons aren't being counted twice - shouldn't be
    # because of merge
    E.info("counting reads contributing to transcripts")
    c = 0
    for entry in cover:
        coverage = int(entry[3])
        if coverage > 0:
            c += coverage

    # sum the coverage across exons from all transcripts
    coverage_in_transcripts = c

    ######################################################
    ######################################################
    # for spliced alignments
    ######################################################
    ######################################################

    # count total number of spliced alignments
    # requires that the CIGAR string 'N' is present

    # uses pysam to write out a bam file of the spliced reads only
    allreads = pysam.AlignmentFile(options.bam_file)
    spliced_bamname = IOTools.snip(options.bam_file,
                                   ".bam") + "_spliced_reads.bam"

    # open file for outputting spliced alignments
    splicedreads = pysam.AlignmentFile(spliced_bamname,
                                       "wb",
                                       template=allreads)

    # cigar string in pysam for spliced alignment is (3, int)
    spliced = collections.defaultdict(list)
    for read in allreads:
        for cigar_tag in read.cigar:
            if cigar_tag[0] == 3:
                spliced[read].append(cigar_tag)

    # write out spliced alignments
    for read in list(spliced.keys()):
        splicedreads.write(read)
    splicedreads.close()
    allreads.close()

    # index splice reads bam file
    pysam.sort(spliced_bamname, P.snip(spliced_bamname, ".bam"))
    pysam.index(spliced_bamname)

    # read in the spliced reads as a BedTool object
    splicedbam = pybedtools.BedTool(spliced_bamname)

    # perform coverage of spliced reads over intervals - will be twice
    # as many as there should be due to counting both exons
    # overlapping
    spliced_coverage = splicedbam.coverage(gtffile)

    # avoid double counting exons
    E.info("counting spliced reads contributing to transcripts")
    spliced_exons = {}
    c = 0
    for entry in spliced_coverage:
        coverage = int(entry[3])
        if coverage > 0:
            c += coverage

    spliced_coverage_in_transcripts = c

    # NOTE: the counting of spliced alignments is not accurate

    spliced_coverage_in_transcripts = float(
        spliced_coverage_in_transcripts) / 2

    ###########################
    # write out the results
    ###########################

    outf.write(str(int(total_alignments)) + "\t")
    # remove half of the coverage assigned to spliced reads
    coverage_in_transcripts = (coverage_in_transcripts) - (
        spliced_coverage_in_transcripts)
    outf.write(
        str(
            int(coverage_in_transcripts) -
            int(spliced_coverage_in_transcripts)) + "\t")
    outf.write(
        str(int((coverage_in_transcripts / total_alignments) * 100)) + "\t")

    # write out spliced counts
    outf.write(str(int(spliced_alignments)) + "\t")
    outf.write(str(int(spliced_coverage_in_transcripts)) + "\t")
    outf.write(
        str(int((spliced_coverage_in_transcripts / spliced_alignments) * 100)))

    outf.close()

    ############################
    # contextualise those that
    # don't fall in transcripts
    ############################

    if options.reference_gtf:
        context_summary = IOTools.open_file(
            IOTools.snip(options.bam_file, ".bam") + ".excluded.context", "w")
        context_summary.write("\t".join(["Feature", "number"]) + "\n")

        # write out the read info as well
        context_file = IOTools.open_file(
            IOTools.snip(options.bam_file, ".bam") + ".excluded", "w")

        context_dict = collections.defaultdict(int)
        # intersect bam - write non-overlapping with transcripts - intersect
        # with reference - write out
        context = pybedbamfile.intersect(gtffile, v=True, bed=True).intersect(
            pybedtools.BedTool(options.reference_gtf), wb=True)
        for entry in context:
            feature = entry[8]
            context_dict[feature] += 1
            context_file.write("\t".join([e for e in entry]) + "\n")

        for feature, value in context_dict.items():
            context_summary.write("\t".join([feature, str(value)]) + "\n")

        context_file.close()
        context_summary.close()

    # write footer and output benchmark information.
    E.stop()
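
A minimal pysam sketch of the splice detection used above: a read is treated as spliced if its CIGAR contains an 'N' (reference skip, operation code 3). The filename is a placeholder, and cigartuples is used in place of the older read.cigar attribute:

import pysam

total_alignments = 0
spliced_alignments = 0
with pysam.AlignmentFile("example.bam", "rb") as bam:
    for read in bam:
        total_alignments += 1
        cigartuples = read.cigartuples or []   # None for unmapped reads
        if any(op == 3 for op, length in cigartuples):  # 3 == BAM_CREF_SKIP ('N')
            spliced_alignments += 1
print("total=%i spliced=%i" % (total_alignments, spliced_alignments))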
Example #9
def run_report(clean=True,
               with_pipeline_status=True,
               pipeline_status_format="svg"):
    '''run CGATreport.

    This will also run ruffus to create an svg image of the pipeline
    status unless *with_pipeline_status* is set to False. The image
    will be saved into the export directory.

    '''

    params = P.get_params()

    if with_pipeline_status:
        targetdir = params["exportdir"]
        if not os.path.exists(targetdir):
            os.mkdir(targetdir)

        ruffus.pipeline_printout_graph(
            os.path.join(
                targetdir,
                "pipeline.%s" % pipeline_status_format),
            pipeline_status_format,
            ["full"],
            checksum_level=params["ruffus_checksums_level"]
        )

    dirname, basename = os.path.split(P.get_caller().__file__)

    report_engine = params.get("report_engine", "cgatreport")
    assert report_engine in ('sphinxreport', 'cgatreport')

    docdir = os.path.join(dirname, "pipeline_docs", IOTools.snip(basename, ".py"))
    themedir = os.path.join(dirname, "pipeline_docs", "themes")
    relpath = os.path.relpath(docdir)
    trackerdir = os.path.join(docdir, "trackers")

    # use a fake X display in order to avoid windows popping up
    # from R plots.
    xvfb_command = IOTools.which("xvfb-run")

    # permit multiple servers using -d option
    if xvfb_command:
        xvfb_command += " -d "
    else:
        xvfb_command = ""

    # if there is no DISPLAY variable set, xvfb runs, but
    # exits with error when killing process. Thus, ignore return
    # value.
    # print os.getenv("DISPLAY"), "command=", xvfb_command
    if not os.getenv("DISPLAY"):
        erase_return = "|| true"
    else:
        erase_return = ""

    if os.path.exists("conf.py"):
        conf_dir = os.path.abspath(".")
    else:
        conf_dir = os.path.join(os.path.dirname(__file__), "configuration")

    # in the current version, xvfb always returns with an error, thus
    # ignore these.
    erase_return = "|| true"

    if clean:
        clean = "rm -rf report _cache _static;"
    else:
        clean = ""

    # with sphinx >1.3.1 the PYTHONPATH needs to be set explicitly as
    # the virtual environment seems to be stripped. It is thus set to
    # the contents of the current sys.path
    syspath = ":".join(sys.path)

    statement = '''
    %(clean)s
    (export SPHINX_DOCSDIR=%(docdir)s;
    export SPHINX_THEMEDIR=%(themedir)s;
    export PYTHONPATH=%(syspath)s;
    %(xvfb_command)s
    %(report_engine)s-build
    --num-jobs=%(report_threads)s
    sphinx-build
    -b html
    -d %(report_doctrees)s
    -c %(conf_dir)s
    -j %(report_threads)s
    %(docdir)s %(report_html)s
    >& report.log %(erase_return)s )
    '''

    P.run(statement)

    E.info('the report is available at %s' % os.path.abspath(
        os.path.join(params['report_html'], "contents.html")))
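
A small sketch of the xvfb-run fallback used above, with shutil.which standing in for IOTools.which and an illustrative report command:

import shutil

# use a virtual X display when xvfb-run is available so R plots do not pop up windows
xvfb_command = shutil.which("xvfb-run")
# -d permits multiple xvfb servers; otherwise run the command directly
xvfb_command = xvfb_command + " -d " if xvfb_command else ""

# the report invocation here is only illustrative
statement = xvfb_command + "cgatreport-build --num-jobs=4 sphinx-build -b html . report"
print(statement)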