Example 1
 def setUp(self):
     print("Setting up...")
     # Create a fixture
     self.pp = pypiper.PipelineManager(name="sample_pipeline",
                                       outfolder="pipeline_output/",
                                       multi=False)
     self.pp2 = pypiper.PipelineManager(name="sample_pipeline2",
                                        outfolder="pipeline_output/",
                                        multi=True)
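The only difference between the two managers in this fixture is the multi flag. As far as I recall from pypiper's documentation, multi=True simply disables the tee of output to the pipeline's log file, which is what lets several PipelineManager instances (or an interactive session) coexist in one process; treat that as an assumption and check the current docs if the exact behavior matters.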
Example 2
 def setUp(self):
     """ Start each test case with two pipeline managers. """
     print("Setting up...")
     # Create a fixture
     self.pp = pypiper.PipelineManager("sample_pipeline",
                                       outfolder=self.OUTFOLDER,
                                       multi=True)
     self.pp2 = pypiper.PipelineManager("sample_pipeline2",
                                        outfolder=self.OUTFOLDER,
                                        multi=True)
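The excerpt stops at setUp; a matching tearDown is presumably defined elsewhere in the test class. A minimal sketch (assuming a standard unittest.TestCase, that OUTFOLDER is the class attribute used above, and that the module imports shutil) might look like this:

 def tearDown(self):
     """ Stop both pipeline managers and clean the test output. """
     print("Tearing down...")
     self.pp.stop_pipeline()
     self.pp2.stop_pipeline()
     # Remove the fixture's output folder so each test starts clean.
     shutil.rmtree(self.OUTFOLDER, ignore_errors=True)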
Example 3
def main():
    args = parse_arguments()
    outfolder = os.path.abspath(os.path.join(args.output_parent, "summary"))

    pm = pypiper.PipelineManager(name="PEPATAC_collator",
                                 outfolder=outfolder,
                                 args=args,
                                 version=__version__)

    cmd = "Rscript {R_file} {config_file} {output_dir} {results_subdir}".format(
        R_file=tool_path("PEPATAC_summarizer.R"),
        config_file=args.config_file,
        output_dir=args.output_parent,
        results_subdir=args.results)
    if args.new_start:
        cmd += " --new-start"
    if args.skip_consensus:
        cmd += " --skip-consensus"
    if args.skip_table:
        cmd += " --skip-table"

    complexity_file = os.path.join(
        outfolder, "{name}_libComplexity.pdf".format(name=args.name))
    consensus_peaks_file = os.path.join(
        outfolder, "{name}_*_consensusPeaks.narrowPea".format(name=args.name))
    peak_coverage_file = os.path.join(
        outfolder, "{name}_peaks_coverage.tsv".format(name=args.name))

    pm.run(cmd, [complexity_file, consensus_peaks_file, peak_coverage_file])
    pm.stop_pipeline()
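A note on the pm.run call above: pypiper accepts a list of target files, and (as far as I recall) the command is skipped only when every listed target already exists, so removing any one of the three summary outputs should cause the Rscript to be rerun on the next invocation.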
Example 4
def main():
    # Parse command-line arguments
    parser = ArgumentParser(
        prog="hic-pipeline",
        description="Hi-C pipeline."
    )
    parser = arg_parser(parser)
    parser = pypiper.add_pypiper_args(parser, groups=["ngs", "looper", "resource", "pypiper"])
    args = parser.parse_args()

    # Read in yaml configs
    series = pd.Series(yaml.safe_load(open(args.sample_config, "r")))

    # looper 0.6/0.7 compatibility:
    if "protocol" in series.index:
        key = "protocol"
    elif "library" in series.index:
        key = "library"
    else:
        raise KeyError(
            "Sample does not contain either a 'protocol' or 'library' attribute!")

    # Create Sample object
    if series[key] != "HiChIP":
        sample = HiCSample(series)
    else:
        sample = HiChIPSample(series)

    # Check if merged
    if len(sample.data_path.split(" ")) > 1:
        sample.merged = True
    else:
        sample.merged = False
    sample.prj = AttributeDict(sample.prj)
    sample.paths = AttributeDict(sample.paths.__dict__)

    # Check read type if not provided
    if not hasattr(sample, "ngs_inputs"):
        sample.ngs_inputs = [sample.data_source]
    if not hasattr(sample, "read_type"):
        sample.set_read_type()

    # Shorthand for read_type
    if sample.read_type == "paired":
        sample.paired = True
    else:
        sample.paired = False

    # Set file paths
    sample.set_file_paths()
    # sample.make_sample_dirs()  # should be fixed to check if values of paths are strings and paths indeed

    # Start Pypiper object
    # Best practice is to name the pipeline with the name of the script;
    # or put the name in the pipeline interface.
    pipe_manager = pypiper.PipelineManager(name="hic", outfolder=sample.paths.sample_root, args=args)
    pipe_manager.config.tools.scripts_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), "tools")

    # Start main function
    process(sample, pipe_manager, args)
Example 5
def main():
    # Parse command-line arguments
    parser = ArgumentParser(prog="atacseq-pipeline",
                            description="ATAC-seq pipeline.")
    parser = arg_parser(parser)
    parser = pypiper.add_pypiper_args(
        parser, groups=["ngs", "looper", "resource", "pypiper"])
    args = parser.parse_args()
    if args.sample_config is None or args.output_parent is None:
        parser.print_help()
        return 1

    # Read in yaml configs
    series = pd.Series(yaml.safe_load(open(args.sample_config, "r")))
    series["sample_root"] = args.output_parent
    print(series)
    # Create Sample object
    if series["protocol"] != "DNase-seq":
        sample = ATACseqSample(series)
    else:
        sample = DNaseSample(series)

    print(sample)
    # Check if merged
    if isinstance(sample.data_source, list) and len(sample.data_source) > 1:
        sample.merged = True
    else:
        sample.merged = False
    sample.paths = AttributeDict(sample.__dict__)

    # Check read type if not provided
    if not hasattr(sample, "ngs_inputs"):
        sample.ngs_inputs = [sample.data_source]
    if not hasattr(sample, "read_type"):
        sample.set_read_type()

    # Shorthand for read_type
    if sample.read_type == "paired":
        sample.paired = True
    else:
        sample.paired = False

    # Set file paths
    sample.set_file_paths()

    # Start Pypiper object
    # Best practice is to name the pipeline with the name of the script;
    # or put the name in the pipeline interface.
    pipe_manager = pypiper.PipelineManager(name="atacseq",
                                           outfolder=sample.sample_root,
                                           args=args)
    pipe_manager.config.tools.scripts_dir = pjoin(
        os.path.dirname(os.path.realpath(__file__)), "tools")

    # Start main function
    process(sample, pipe_manager, args)
Example 6
def main():
    # Parse command-line arguments
    parser = ArgumentParser(prog="chipseq-pipeline",
                            description="ChIP-seq pipeline.")
    parser = arg_parser(parser)
    parser = pypiper.add_pypiper_args(
        parser, groups=["ngs", "looper", "resource", "pypiper"])
    args = parser.parse_args()
    if args.sample_config is None:
        parser.print_help()
        return 1

    # Read in yaml configs
    series = pd.Series(yaml.safe_load(open(args.sample_config, "r")))
    # Create Sample object
    if series["protocol"] == "ChIPmentation":
        sample = ChIPmentation(series)
    else:
        sample = ChIPseqSample(series)

    # Check if merged
    if len(sample.data_source.split(" ")) > 1:
        sample.merged = True
    else:
        sample.merged = False
    sample.prj = AttributeDict(sample.prj)
    sample.paths = AttributeDict(sample.paths.__dict__)

    # Check read type if not provided
    if not hasattr(sample, "ngs_inputs"):
        sample.ngs_inputs = [sample.data_source]
    if not hasattr(sample, "read_type"):
        sample.set_read_type()

    # Shorthand for read_type
    if sample.read_type == "paired":
        sample.paired = True
    else:
        sample.paired = False

    # Set file paths
    sample.set_file_paths(sample.prj)

    # Start Pypiper object
    # Best practice is to name the pipeline with the name of the script;
    # or put the name in the pipeline interface.
    pipe_manager = pypiper.PipelineManager(name="chipseq",
                                           outfolder=sample.paths.sample_root,
                                           args=args)

    # Start main function
    process(sample, pipe_manager, args)
Example 7
def main():
    # Parse command-line arguments
    parser = ArgumentParser(prog="starrseq-pipeline",
                            description="STARR-seq pipeline.")
    parser = arg_parser(parser)
    parser = pypiper.add_pypiper_args(
        parser, groups=["ngs", "looper", "resource", "pypiper"])
    args = parser.parse_args()

    # Read in yaml configs
    sample = STARRSeqSample(pd.Series(yaml.safe_load(
        open(args.sample_config, "r"))))

    # Check if merged
    if len(sample.data_path.split(" ")) > 1:
        sample.merged = True
    else:
        sample.merged = False
    sample.prj = AttributeDict(sample.prj)
    sample.paths = AttributeDict(sample.paths.__dict__)

    # Check read type if not provided
    if not hasattr(sample, "ngs_inputs"):
        sample.ngs_inputs = [sample.data_source]
    if not hasattr(sample, "read_type"):
        sample.set_read_type()

    # Shorthand for read_type
    if sample.read_type == "paired":
        sample.paired = True
    else:
        sample.paired = False

    # Set file paths
    sample.set_file_paths()
    # sample.make_sample_dirs()  # should be fixed to check if values of paths are strings and paths indeed

    # Start Pypiper object
    # Best practice is to name the pipeline with the name of the script;
    # or put the name in the pipeline interface.
    pipe_manager = pypiper.PipelineManager(name="starrseq",
                                           outfolder=sample.paths.sample_root,
                                           args=args)
    pipe_manager.config.tools.scripts_dir = os.path.join(
        os.path.dirname(os.path.realpath(__file__)), "tools")

    # Start main function
    process(sample, pipe_manager, args)
Example 8
def main():
    # Parse command-line arguments
    parser = ArgumentParser(prog="starrseq-pipeline",
                            description="STARR-seq pipeline.")
    parser = arg_parser(parser)
    parser = pypiper.add_pypiper_args(parser, all_args=True)
    args = parser.parse_args()
    if args.sample_config is None:
        parser.print_help()
        return 1

    # Read in yaml config and create Sample object
    sample = STARRseqSample(pd.Series(yaml.safe_load(
        open(args.sample_config, "r"))))

    # Check if merged
    if len(sample.data_source.split(" ")) > 1:
        sample.merged = True
    else:
        sample.merged = False
    sample.prj = AttributeDict(sample.prj)
    sample.paths = AttributeDict(sample.paths.__dict__)

    # Shorthand for read_type
    if sample.read_type == "paired":
        sample.paired = True
    else:
        sample.paired = False

    # Set file paths
    sample.set_file_paths()
    sample.make_sample_dirs()

    # Start Pypiper object
    # Best practice is to name the pipeline with the name of the script;
    # or put the name in the pipeline interface.
    pipe_manager = pypiper.PipelineManager(name="starrseq",
                                           outfolder=sample.paths.sample_root,
                                           args=args)

    # Start main function
    process(sample, pipe_manager, args)
Example 9
def main():
    """Run the script."""
    cmdl = sys.argv[1:]
    args = _parse_cmdl(cmdl)
    global _LOGGER
    _LOGGER = logmuse.logger_via_cli(args)
    delete_sra = False  # initialize to False
    # Name the pipeline run after the first element to convert.
    # Maybe we should just have a separate pipeline for each file?

    if args.sample_name:
        run_name = "_".join(uniqify(args.sample_name))
    else:
        primary_srr_acc = os.path.splitext(os.path.basename(args.srr[0]))[0]
        run_name = primary_srr_acc

    if args.output_parent:
        outfolder = os.path.join(args.output_parent, run_name)
    else:
        outfolder = os.path.join(args.srafolder, "sra_convert_pipeline",
                                 run_name)

    _LOGGER.info("Using outfolder: {}".format(outfolder))
    nfiles = len(args.srr)
    failed_files = []

    pm = pypiper.PipelineManager(name="sra_convert",
                                 outfolder=outfolder,
                                 args=args)

    for i in range(nfiles):
        srr_acc = os.path.splitext(os.path.basename(args.srr[i]))[0]
        pm.info("Processing {} of {} files: {}".format(str(i + 1), str(nfiles),
                                                       srr_acc))

        bamfile = os.path.join(args.bamfolder, srr_acc + ".bam")
        fq_prefix = os.path.join(args.fqfolder, srr_acc)

        if args.mode == "convert":
            infile = args.srr[i]
            if not os.path.isfile(infile):
                pm.warning("Couldn't find sra file at: {}.".format(infile))
                failed_files.append(args.srr[i])
            if args.format == "fastq":
                # fastq-dump --split-files will produce *_1.fastq and *_2.fastq
                # for paired-end data, and only *_1.fastq for single-end data.
                outfile = "{fq_prefix}_1.fastq.gz".format(fq_prefix=fq_prefix)
                cmd = "fastq-dump {data_source} --split-files --gzip -O {outfolder}".format(
                    data_source=infile, outfolder=args.fqfolder, nofail=True)
            elif args.format == "bam":
                outfile = os.path.join(args.bamfolder, args.srr[i] + ".bam")
                cmd = "sam-dump -u {data_source} | samtools view -bS - > {outfile}".format(
                    data_source=infile, outfile=outfile, nofail=True)
            else:
                raise KeyError("Unknown format: {}".format(args.format))

            target = outfile
            ret = pm.run(cmd, target=target)
            if ret == 0:
                pm.info("Already completed files: {}".format(failed_files))
                try:
                    failed_files.remove(infile)
                except ValueError:
                    pass

        elif args.mode == "delete_bam":
            pm.timestamp("Cleaning bam file: {}".format(bamfile))
            pm.clean_add(bamfile)
        elif args.mode == "delete_fq":
            pm.timestamp("Cleaning fastq file(s): {}*".format(fq_prefix))
            fq_prefix = os.path.join(args.fqfolder, srr_acc)
            pm.clean_add("{fq_prefix}.fastq.gz".format(fq_prefix=fq_prefix))
            pm.clean_add(
                "{fq_prefix}_[0-9].fastq.gz".format(fq_prefix=fq_prefix))
        elif args.mode == "delete_sra":
            delete_sra = True
            # if specifically requested to delete sra files

        if not args.keep_sra and os.path.isfile(outfile):
            # Only delete if the output file was created...
            # we can't trust the sra toolkit return codes because they
            # can return 0 even if the command didn't complete, causing us to
            # delete the sra file when we have no other copy of that data.
            delete_sra = True

        if delete_sra:
            pm.timestamp("Cleaning sra file: {}".format(infile))
            pm.clean_add(infile)

    if len(failed_files) > 0:
        pm.fail_pipeline(
            Exception("Unable to locate the following files: {}".format(
                ",".join(failed_files))))

    pm.stop_pipeline()
Example 10
#!/usr/bin/env python

import pypiper
outfolder = "hello_pypiper_results"  # Choose a folder for your results
pm = pypiper.PipelineManager(name="hello_pypiper", outfolder=outfolder)

pm.timestamp("Hello!")
target_file = "hello_pypiper_results/output.txt"
cmd = "echo 'Hello, Pypiper!' > " + target_file
pm.run(cmd, target_file)

pm.stop_pipeline()
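Because pm.run is keyed on the target file, rerunning this script skips the echo step once output.txt exists. Purely as an illustration of extending the same hello-world pattern with a second step (the word-count file below is hypothetical, and these lines would go before the pm.stop_pipeline() call):

pm.timestamp("Counting words")
count_file = "hello_pypiper_results/word_count.txt"  # hypothetical second target
cmd = "wc -w " + target_file + " > " + count_file
pm.run(cmd, count_file)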
Example 11
def process(sample, pipeline_config, args):
    """
    """
    print("Start processing Drop-seq sample %s." % sample.sample_name)

    for path in ["sample_root"] + sample.paths.__dict__.keys():
        try:
            exists = os.path.exists(sample.paths[path])
        except TypeError:
            continue
        if not exists:
            try:
                os.mkdir(sample.paths[path])
            except OSError("Cannot create '%s' path: %s" %
                           (path, sample.paths[path])):
                raise

    # Start Pypiper object
    pipe = pypiper.PipelineManager("dropseq",
                                   sample.paths.sample_root,
                                   args=args)

    # Set up a few handy shorthand variables
    dropseq_root = pipe.config.tools.dropseq_tools_root
    output_dir = sample.paths.sample_root

    # Merge Bam files if more than one technical replicate
    if len(sample.data_path.split(" ")) > 1:
        pipe.timestamp("## Merging bam files from replicates")
        cmd = merge_bam_files(
            inputBams=sample.data_path.split(
                " "),  # this is a list of sample paths
            outputBam=os.path.join(output_dir, "unaligned_merged.bam"),
            args=args,
            pipe=pipe,
            tmpdir=output_dir)
        pipe.run(cmd, os.path.join(output_dir, "unaligned_merged.bam"))
        pipe.clean_add(os.path.join(output_dir, "unaligned_merged.bam"),
                       manual=True)

        input_file = os.path.join(output_dir, "unaligned_merged.bam")
    else:
        input_file = sample.data_path

    # Copy the input file if it is not writable
    # (the first step requires the file to be writable which is silly)
    if not os.access(input_file, os.W_OK):
        pipe.timestamp("## Copying input file to output directory")
        cmd = "cp {} {}".format(input_file,
                                os.path.join(output_dir, "input_file.bam"))
        pipe.run(cmd, os.path.join(output_dir, "input_file.bam"))
        cmd = "chmod 664 {}".format(os.path.join(output_dir, "input_file.bam"))
        pipe.run(cmd, os.path.join(output_dir, "input_file.bam_chmod"))
        pipe.clean_add(os.path.join(output_dir, "input_file.bam"),
                       manual=False)
        input_file = os.path.join(output_dir, "input_file.bam")

    os.environ['TMP_DIR'] = output_dir

    if args.debug:
        report_flagstat(pipe,
                        os.path.join(output_dir, input_file),
                        prefix="input_file")

    # Stage 1: pre-alignment tag and trim
    # Tag with cell barcode
    pipe.timestamp("## Tagging BAM file with cell barcode")
    cmd = os.path.join(dropseq_root, "TagBamWithReadSequenceExtended")
    cmd += " TMP_DIR=" + output_dir
    cmd += " SUMMARY=" + os.path.join(
        output_dir, "unaligned_tagged_Cellular.bam_summary.txt")
    cmd += " BASE_RANGE={}".format(pipe.config.parameters.cell_barcode_bases)
    cmd += " BASE_QUALITY={}".format(pipe.config.parameters.min_base_quality)
    cmd += " BARCODED_READ=1 DISCARD_READ=false TAG_NAME=XC NUM_BASES_BELOW_QUALITY={}".format(
        pipe.config.parameters.min_bases_below_quality)
    cmd += " INPUT=" + input_file
    cmd += " OUTPUT=" + os.path.join(output_dir, "unaligned_tagged_Cell.bam")
    pipe.run(cmd, os.path.join(output_dir, "unaligned_tagged_Cell.bam"))
    pipe.clean_add(os.path.join(output_dir, "unaligned_tagged_Cell.bam"),
                   manual=True)

    # Tag with molecule barcode
    pipe.timestamp("## Tagging BAM file with molecule barcode (UMI)")
    cmd = os.path.join(dropseq_root, "TagBamWithReadSequenceExtended")
    cmd += " TMP_DIR=" + output_dir
    cmd += " SUMMARY=" + os.path.join(
        output_dir, "unaligned_tagged_Molecular.bam_summary.txt")
    cmd += " BASE_RANGE={}".format(pipe.config.parameters.umi_barcode_bases)
    cmd += " BASE_QUALITY={}".format(pipe.config.parameters.min_base_quality)
    cmd += " BARCODED_READ=1 DISCARD_READ=true TAG_NAME=XM NUM_BASES_BELOW_QUALITY={}".format(
        pipe.config.parameters.min_bases_below_quality)
    cmd += " INPUT=" + os.path.join(output_dir, "unaligned_tagged_Cell.bam")
    cmd += " OUTPUT=" + os.path.join(output_dir,
                                     "unaligned_tagged_CellMolecular.bam")
    pipe.run(cmd, os.path.join(output_dir,
                               "unaligned_tagged_CellMolecular.bam"))
    pipe.clean_add(os.path.join(output_dir,
                                "unaligned_tagged_CellMolecular.bam"),
                   manual=True)

    # Filter bam
    pipe.timestamp("## Filtering BAM file")
    cmd = os.path.join(dropseq_root, "FilterBAM")
    cmd += " TAG_REJECT=XQ"
    cmd += " INPUT=" + os.path.join(output_dir,
                                    "unaligned_tagged_CellMolecular.bam")
    cmd += " OUTPUT=" + os.path.join(output_dir,
                                     "unaligned_tagged_filtered.bam")
    pipe.run(cmd, os.path.join(output_dir, "unaligned_tagged_filtered.bam"))
    pipe.clean_add(os.path.join(output_dir, "unaligned_tagged_filtered.bam"),
                   manual=True)

    if args.debug:
        report_flagstat(pipe,
                        os.path.join(output_dir,
                                     "unaligned_tagged_filtered.bam"),
                        prefix="FilterBAM")

    # Trim starting sequence
    pipe.timestamp("## Triming starting sequence")
    cmd = os.path.join(dropseq_root, "TrimStartingSequence")
    cmd += " SEQUENCE={}".format(pipe.config.parameters.trim_sequence)
    cmd += " MISMATCHES=0 NUM_BASES={}".format(
        pipe.config.parameters.trim_sequence_length)
    cmd += " OUTPUT_SUMMARY=" + os.path.join(output_dir,
                                             "adapter_trimming_report.txt")
    cmd += " INPUT=" + os.path.join(output_dir,
                                    "unaligned_tagged_filtered.bam")
    cmd += " OUTPUT=" + os.path.join(output_dir,
                                     "unaligned_tagged_trimmed_smart.bam")
    pipe.run(cmd, os.path.join(output_dir,
                               "unaligned_tagged_trimmed_smart.bam"))
    pipe.clean_add(os.path.join(output_dir,
                                "unaligned_tagged_trimmed_smart.bam"),
                   manual=True)

    if args.debug:
        report_flagstat(pipe,
                        os.path.join(output_dir,
                                     "unaligned_tagged_trimmed_smart.bam"),
                        prefix="TrimStartingSequence")

    # Trim polyA tail
    pipe.timestamp("## Trimming polyA tail")
    cmd = os.path.join(dropseq_root, "PolyATrimmer")
    cmd += " MISMATCHES=0 NUM_BASES={}".format(
        pipe.config.parameters.polya_size)
    cmd += " OUTPUT_SUMMARY=" + os.path.join(output_dir,
                                             "polyA_trimming_report.txt")
    cmd += " INPUT=" + os.path.join(output_dir,
                                    "unaligned_tagged_trimmed_smart.bam")
    cmd += " OUTPUT=" + os.path.join(output_dir,
                                     "unaligned_mc_tagged_polyA_filtered.bam")
    pipe.run(
        cmd, os.path.join(output_dir,
                          "unaligned_mc_tagged_polyA_filtered.bam"))
    pipe.clean_add(os.path.join(output_dir,
                                "unaligned_mc_tagged_polyA_filtered.bam"),
                   manual=True)

    if args.debug:
        report_flagstat(pipe,
                        os.path.join(output_dir,
                                     "unaligned_mc_tagged_polyA_filtered.bam"),
                        prefix="PolyATrimmer")

    # Stage 2: alignment
    # Convert to fastq
    pipe.timestamp("## Converting to Fastq")
    cmd = "java -Xmx{}g -jar {} SamToFastq".format(
        int(args.mem) // 1000, pipe.config.tools.piccard_jar)
    cmd += " INPUT=" + os.path.join(output_dir,
                                    "unaligned_mc_tagged_polyA_filtered.bam")
    cmd += " FASTQ=" + os.path.join(
        output_dir, "unaligned_mc_tagged_polyA_filtered.fastq")
    pipe.run(
        cmd,
        os.path.join(output_dir, "unaligned_mc_tagged_polyA_filtered.fastq"))
    pipe.clean_add(os.path.join(output_dir,
                                "unaligned_mc_tagged_polyA_filtered.fastq"),
                   manual=True)

    # Align reads
    pipe.timestamp("## Aligning reads with STAR")
    cmd = pipe.config.tools.star
    cmd += " --genomeDir {}".format(
        getattr(pipe.config.resources.star_index, sample.genome))
    cmd += " --runThreadN {}".format(args.cores)
    cmd += " --outFileNamePrefix " + os.path.join(output_dir, "star.")
    cmd += " --readFilesIn " + os.path.join(
        output_dir, "unaligned_mc_tagged_polyA_filtered.fastq")
    pipe.run(cmd, os.path.join(output_dir, "star.Aligned.out.sam"))
    pipe.clean_add(os.path.join(output_dir, "star.Aligned.out.sam"),
                   manual=True)

    if args.debug:
        report_star_log(pipe,
                        os.path.join(output_dir, "star.Log.final.out"),
                        prefix="STAR")

    # Stage 3: sort aligned reads (STAR does not necessarily emit reads in the same order as the input)
    pipe.timestamp("## Sorting aligned BAM file")
    cmd = "java -Dsamjdk.buffer_size=131072 -XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10 -Xmx{}g".format(
        int(args.mem) // 1000)
    cmd += " -jar {} SortSam".format(pipe.config.tools.piccard_jar)
    cmd += " INPUT=" + os.path.join(output_dir, "star.Aligned.out.sam")
    cmd += " OUTPUT=" + os.path.join(output_dir, "aligned.sorted.bam")
    cmd += " SORT_ORDER=queryname"
    cmd += " TMP_DIR=" + output_dir
    pipe.run(cmd, os.path.join(output_dir, "aligned.sorted.bam"))
    pipe.clean_add(os.path.join(output_dir, "aligned.sorted.bam"), manual=True)

    # Stage 4: merge and tag aligned reads
    # Merge
    pipe.timestamp("## Merging aligned with unaligned reads")
    cmd = "java -Djava.io.tmpdir={} -Xmx{}g -jar {} MergeBamAlignment".format(
        output_dir,
        int(args.mem) // 1000, pipe.config.tools.piccard_jar)
    cmd += " REFERENCE_SEQUENCE={}".format(
        getattr(pipe.config.resources.genome, sample.genome))
    cmd += " UNMAPPED_BAM=" + os.path.join(
        output_dir, "unaligned_mc_tagged_polyA_filtered.bam")
    cmd += " ALIGNED_BAM=" + os.path.join(output_dir, "aligned.sorted.bam")
    cmd += " INCLUDE_SECONDARY_ALIGNMENTS=false"
    cmd += " ALIGNED_READS_ONLY=false"
    cmd += " PAIRED_RUN=false"
    cmd += " OUTPUT=" + os.path.join(output_dir, "merged.bam")
    pipe.run(cmd, os.path.join(output_dir, "merged.bam"))
    pipe.clean_add(os.path.join(output_dir, "merged.bam"), manual=True)

    if args.debug:
        report_flagstat(pipe,
                        os.path.join(output_dir, "merged.bam"),
                        prefix="MergeBamAlignment")

    # Tag reads with exon
    pipe.timestamp("## Tagging reads with exon")
    cmd = os.path.join(dropseq_root, "TagReadWithGeneExon")
    cmd += " OUTPUT=" + os.path.join(output_dir, "star_gene_exon_tagged.bam")
    cmd += " ANNOTATIONS_FILE={}".format(
        getattr(pipe.config.resources.refflat, sample.genome))
    cmd += " TAG=GE CREATE_INDEX=true"
    cmd += " INPUT=" + os.path.join(output_dir, "merged.bam")
    pipe.run(cmd, os.path.join(output_dir, "star_gene_exon_tagged.bam"))

    if args.debug:
        report_flagstat(pipe,
                        os.path.join(output_dir, "star_gene_exon_tagged.bam"),
                        prefix="TagReadWithGeneExon")

    # QC time!

    if pipe.config.parameters.repair_barcodes:
        # Detect and fix bead synthesis errors
        pipe.timestamp("## Reporting and fixing bead synthesis errors")
        cmd = os.path.join(dropseq_root, "DetectBeadSynthesisErrors")
        cmd += " INPUT=" + os.path.join(output_dir,
                                        "star_gene_exon_tagged.bam")
        cmd += " OUTPUT=" + os.path.join(output_dir,
                                         "star_gene_exon_tagged.clean.bam")
        cmd += " OUTPUT_STATS=" + os.path.join(output_dir,
                                               "synthesis_statistics.txt")
        cmd += " SUMMARY=" + os.path.join(output_dir,
                                          "synthesis_statistics.summary.txt")
        cmd += " NUM_BARCODES={}".format(
            pipe.config.parameters.number_seq_error_barcodes_check)
        cmd += " PRIMER_SEQUENCE={}".format(
            pipe.config.parameters.bead_primer_sequence)
        cmd += " EDIT_DISTANCE={}".format(
            pipe.config.parameters.distance_to_bead_primer_seq)
        cmd += " MAX_NUM_ERRORS={}".format(
            pipe.config.parameters.max_number_barcode_bases_to_repair)
        cmd += " TMP_DIR=" + output_dir
        pipe.run(cmd,
                 os.path.join(output_dir, "star_gene_exon_tagged.clean.bam"))

        if args.debug:
            report_bead_synthesis(pipe,
                                  os.path.join(
                                      output_dir,
                                      "synthesis_statistics.summary.txt"),
                                  prefix="DetectBeadSynthesisErrors")

        bam_file = os.path.join(output_dir, "star_gene_exon_tagged.clean.bam")
    else:
        bam_file = os.path.join(output_dir, "star_gene_exon_tagged.bam")

    # Distribution of read quality
    # cell barcode
    pipe.timestamp("## Read quality in cell barcodes")
    cmd = os.path.join(dropseq_root, "GatherReadQualityMetrics")
    cmd += " INPUT=" + bam_file
    cmd += " OUTPUT=" + os.path.join(output_dir,
                                     "quality_distribution.cell_barcode.txt")
    cmd += " TAG=XC"
    pipe.run(cmd,
             os.path.join(output_dir, "quality_distribution.cell_barcode.txt"))
    # UMI
    pipe.timestamp("## Read quality in molecule barcodes")
    cmd = os.path.join(dropseq_root, "GatherReadQualityMetrics")
    cmd += " INPUT=" + bam_file
    cmd += " OUTPUT=" + os.path.join(output_dir,
                                     "quality_distribution.mol_barcode.txt")
    cmd += " TAG=XM"
    pipe.run(cmd,
             os.path.join(output_dir, "quality_distribution.mol_barcode.txt"))

    # Distribution of bases in reads
    # cell barcode
    pipe.timestamp("## Distribution of bases in cell barcodes")
    cmd = os.path.join(dropseq_root, "BaseDistributionAtReadPosition")
    cmd += " INPUT=" + bam_file
    cmd += " OUTPUT=" + os.path.join(output_dir,
                                     "base_distribution.cell_barcode.txt")
    cmd += " TAG=XC"
    pipe.run(cmd, os.path.join(output_dir,
                               "base_distribution.cell_barcode.txt"))
    # UMI
    pipe.timestamp("## Distribution of bases in molecule barcodes (UMI)")
    cmd = os.path.join(dropseq_root, "BaseDistributionAtReadPosition")
    cmd += " INPUT=" + bam_file
    cmd += " OUTPUT=" + os.path.join(output_dir,
                                     "base_distribution.mol_barcode.txt")
    cmd += " TAG=XM"
    pipe.run(cmd, os.path.join(output_dir,
                               "base_distribution.mol_barcode.txt"))

    # Expression time!

    # Reads per cell summary
    pipe.timestamp("## Reporting summary of reads per cell")
    cmd = os.path.join(dropseq_root, "BAMTagHistogram")
    cmd += " INPUT=" + bam_file
    cmd += " OUTPUT=" + os.path.join(output_dir, "cell_readcounts.txt")
    cmd += " FILTER_PCR_DUPLICATES=true"
    cmd += " TAG=XC"
    pipe.run(cmd, os.path.join(output_dir, "cell_readcounts.txt"))

    if args.debug:
        report_flagstat(pipe, bam_file, prefix="BAMTagHistogram")

    # Perform digital gene expression analysis selecting all cells that have at least minGenes genes covered
    for n_genes in pipe.config.parameters.min_genes_per_cell:

        pipe.timestamp(
            "## Perform digital gene expression analysis for cells with at least {} genes covered"
            .format(n_genes))
        cmd = os.path.join(dropseq_root, "DigitalExpression")
        cmd += " -m {}g".format(int(args.mem) / 1000)
        cmd += " TMP_DIR=" + output_dir
        cmd += " INPUT=" + bam_file
        cmd += " OUTPUT=" + os.path.join(
            output_dir, "digital_expression.{}genes.tsv".format(n_genes))
        cmd += " SUMMARY=" + os.path.join(
            output_dir,
            "digital_expression.summary.{}genes.tsv".format(n_genes))
        cmd += " MIN_NUM_GENES_PER_CELL={}".format(n_genes)
        pipe.run(
            cmd,
            os.path.join(output_dir,
                         "digital_expression.{}genes.tsv".format(n_genes)),
            nofail=True)

        if args.debug:
            if os.path.exists(
                    os.path.join(
                        output_dir,
                        "digital_expression.{}genes.tsv".format(n_genes))):
                try:
                    print(
                        "Reporting digital expression for cells with at least {} genes covered"
                        .format(n_genes))
                    report_digital_expression(
                        pipe,
                        os.path.join(
                            output_dir,
                            "digital_expression.{}genes.tsv".format(n_genes)),
                        prefix="DigitalExpression_{}genes".format(n_genes))
                except IOError:
                    print(
                        "Digital expression for cells with at least {} genes covered could not be open."
                        .format(n_genes))

    # Report how often the same UMI is found per cell per gene --> estimate of PCR duplicates
    for n_genes in pipe.config.parameters.min_genes_per_cell:
        pipe.timestamp(
            "## Report UMI count per cell per gene for cells with at least {} genes covered"
            .format(n_genes))
        cmd = os.path.join(dropseq_root,
                           "GatherMolecularBarcodeDistributionByGene")
        cmd += " -m {}g".format(int(args.mem) / 1000)
        cmd += " TMP_DIR=" + output_dir
        cmd += " INPUT=" + bam_file
        cmd += " OUTPUT=" + os.path.join(
            output_dir, "cell_umi_barcodes.{}genes.tsv".format(n_genes))
        cmd += " MIN_NUM_GENES_PER_CELL={}".format(n_genes)
        pipe.run(
            cmd,
            os.path.join(output_dir,
                         "cell_umi_barcodes.{}genes.tsv".format(n_genes)))

    print("Finished processing sample %s." % sample.sample_name)
    pipe.stop_pipeline()
Example 12
                    default="0",
                    dest='stopN',
                    type=int,
                    help='Run the first N command lines')
args = parser.parse_args()
count_steps = 0
# ATAC-seq is always paired-end sequencing
if args.single_or_paired == "paired":
    args.paired_end = True
else:
    args.paired_end = False

# Initialize
pm = pypiper.PipelineManager(name="scATAC_mtSMC",
                             outfolder=os.path.abspath(
                                 os.path.join(args.output_parent, "sc_output",
                                              args.sample_name)),
                             args=args)
# Convenience alias
tools = pm.config.tools
param = pm.config.parameters
res = pm.config.resources
# Set up reference resource according to genome prefix.
res.ref_genome_fasta = pm.config.resources.ref_pref
res.ref_chrMT_fasta = pm.config.resources.chrM
output = os.path.join(args.output_parent, "sc_output")
output = os.path.join(output, args.sample_name + "/")
param.outfolder = output

################################################################################
print("Local input file: " + args.input[0])
Example 13
def main():
    # Parse command-line arguments
    parser = ArgumentParser(prog="chipseq-pipeline",
                            description="ChIP-seq pipeline.")
    parser = arg_parser(parser)
    parser = pypiper.add_pypiper_args(parser, groups=["all"])
    args = parser.parse_args()
    if args.sample_config is None:
        parser.print_help()
        return 1

    # Read in yaml configs
    series = pd.Series(yaml.safe_load(open(args.sample_config, "r")))

    # looper 0.6/0.7 compatibility:
    if "protocol" in series.index:
        key = "protocol"
    elif "library" in series.index:
        key = "library"
    else:
        raise KeyError(
            "Sample does not contain either a 'protocol' or 'library' attribute!"
        )

    # Create Sample object
    if series[key] != "ChIPmentation":
        sample = ChIPseqSample(series)
    else:
        sample = ChIPmentation(series)

    # Check if merged
    if len(sample.data_path.split(" ")) > 1:
        sample.merged = True
    else:
        sample.merged = False
    sample.prj = AttributeDict(sample.prj)
    sample.paths = AttributeDict(sample.paths.__dict__)

    # Check read type if not provided
    if not hasattr(sample, "ngs_inputs"):
        sample.ngs_inputs = [sample.data_source]
    if not hasattr(sample, "read_type"):
        sample.set_read_type()

    # Shorthand for read_type
    if sample.read_type == "paired":
        sample.paired = True
    else:
        sample.paired = False

    # Set file paths
    sample.set_file_paths()
    # sample.make_sample_dirs()  # should be fixed to check if values of paths are strings and paths indeed

    # Start Pypiper object
    # Best practice is to name the pipeline with the name of the script;
    # or put the name in the pipeline interface.
    pipe_manager = pypiper.PipelineManager(name="chipseq",
                                           outfolder=sample.paths.sample_root,
                                           args=args)

    # Start main function
    if not args.only_peaks:
        pipe_manager = process(sample, pipe_manager, args)
    else:
        print("Skipped processing sample '{}'.".format(sample.name))

    # If sample does not have "ctrl" attribute, finish processing it.
    if not hasattr(sample, "compare_sample"):
        pipe_manager.stop_pipeline()
        print("Finished processing sample '{}'.".format(sample.name))
        return

    # The pipeline will now wait for the comparison sample file to be completed
    pipe_manager._wait_for_file(
        sample.filtered.replace(sample.name, sample.compare_sample))

    # Start peak calling function
    call_peaks(sample, pipe_manager, args)
Example 14
    def _build_asset(
        genome,
        asset_key,
        tag,
        build_pkg,
        genome_outfolder,
        specific_args,
        specific_params,
        alias,
        **kwargs,
    ):
        """
        Builds assets with pypiper and updates a genome config file.

        This function actually runs the build commands in a given build package,
        and then updates the refgenie config file.

        :param str genome: The assembly key; e.g. 'mm10'.
        :param str asset_key: The unique asset identifier; e.g. 'bowtie2_index'
        :param dict build_pkg: A dict (see examples) specifying lists
            of required input_assets, commands to run, and outputs to register as
            assets.
        """

        log_outfolder = os.path.abspath(
            os.path.join(genome_outfolder, asset_key, tag, BUILD_STATS_DIR))
        _LOGGER.info("Saving outputs to:\n- content: {}\n- logs: {}".format(
            genome_outfolder, log_outfolder))
        if args.docker:
            # Set up some docker stuff
            if args.volumes:
                # args.volumes is assumed to be a list of extra volume paths
                volumes = list(args.volumes) + [genome_outfolder]
            else:
                volumes = genome_outfolder

        if not _writeable(genome_outfolder):
            _LOGGER.error(
                "Insufficient permissions to write to output folder: {}".
                format(genome_outfolder))
            return

        pm = pypiper.PipelineManager(name="refgenie",
                                     outfolder=log_outfolder,
                                     args=args)
        tk = pypiper.NGSTk(pm=pm)
        if args.docker:
            pm.get_container(build_pkg[CONT], volumes)
        _LOGGER.debug("Asset build package: " + str(build_pkg))
        # create a bundle list to simplify calls below
        gat = [genome, asset_key, tag]
        # collect variables required to populate the command templates
        asset_vars = get_asset_vars(
            genome,
            asset_key,
            tag,
            genome_outfolder,
            specific_args,
            specific_params,
            **kwargs,
        )
        # populate command templates
        # prior to populating, remove any seek_key parts from the keys, since these are not supported by format method
        command_list_populated = [
            x.format(**{k.split(".")[0]: v
                        for k, v in asset_vars.items()})
            for x in build_pkg[CMD_LST]
        ]
        # create output directory
        tk.make_dir(asset_vars["asset_outfolder"])

        target = os.path.join(log_outfolder,
                              TEMPLATE_TARGET.format(genome, asset_key, tag))
        # add target command
        command_list_populated.append("touch {target}".format(target=target))
        _LOGGER.debug("Command populated: '{}'".format(
            " ".join(command_list_populated)))
        try:
            # run build command
            signal.signal(signal.SIGINT, _handle_sigint(gat))
            pm.run(command_list_populated, target, container=pm.container)
        except pypiper.exceptions.SubprocessError:
            _LOGGER.error("asset '{}' build failed".format(asset_key))
            return False
        else:
            # save build recipe to the JSON-formatted file
            recipe_file_name = TEMPLATE_RECIPE_JSON.format(asset_key, tag)
            with open(os.path.join(log_outfolder, recipe_file_name),
                      "w") as outfile:
                json.dump(build_pkg, outfile)
            # since the assets are always built to a standard dir structure, we
            # can just stitch a path together for asset digest calculation
            asset_dir = os.path.join(rgc.data_dir, *gat)
            if not os.path.exists(asset_dir):
                raise OSError("Could not compute asset digest. Path does not "
                              "exist: {}".format(asset_dir))
            digest = get_dir_digest(asset_dir)
            _LOGGER.info("Asset digest: {}".format(digest))
            # add updates to config file
            with rgc as r:
                if asset_key == "fasta":
                    r.update_genomes(genome,
                                     data={CFG_ALIASES_KEY: [alias]},
                                     force_digest=genome)
                r.update_assets(
                    *gat[0:2],
                    data={CFG_ASSET_DESC_KEY: build_pkg[DESC]},
                    force_digest=genome,
                )
                r.update_tags(
                    *gat,
                    force_digest=genome,
                    data={
                        CFG_ASSET_PATH_KEY: asset_key,
                        CFG_ASSET_CHECKSUM_KEY: digest,
                    },
                )
                r.update_seek_keys(
                    *gat,
                    force_digest=genome,
                    keys={
                        k: v.format(**asset_vars)
                        for k, v in build_pkg[ASSETS].items()
                    },
                )
                r.set_default_pointer(*gat, force_digest=genome)
        pm.stop_pipeline()
        return True
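A note on the pattern above: a final "touch {target}" is appended to the populated command list, and that flag file under the log folder is what pm.run receives as its target. Since the commands run in order and only the last one creates the flag, the asset is recorded as complete only if every preceding build command succeeded; a build interrupted earlier will not satisfy the target on the next run.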
Example 15
                  "--log",
                  dest="trinitylog",
                  help="log file",
                  action="store",
                  type="string")
parser.add_option("-s",
                  "--sample",
                  dest="sam",
                  help="trinity sample file",
                  action="store",
                  type="string")

(options, args) = parser.parse_args()

if os.path.exists(options.outfolder):
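    # NOTE: os.removedirs only removes empty directories and raises OSError on a
    # non-empty one; clearing a populated results folder would need shutil.rmtree.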
    os.removedirs(options.outfolder)

pm = pypiper.PipelineManager(name=options.pipename,
                             outfolder=options.outfolder)

pm.timestamp("start assembly using trinity!")

command = "Trinity --seqType fq --max_memory {} --output {} --samples_file {} --CPU {}".format(
    options.mem, options.outfolder, options.sam, options.cpu)

target_file = options.trinitylog

pm.run(command, target_file)

pm.stop_pipeline()
Example 16
def safe_echo(var):
	""" Returns an environment variable if it exists, or an empty string if not"""
	return os.getenv(var, "")


#def main(cmdl):

if __name__ == "__main__":
	#main(sys.argv[1:])
	cmdl = sys.argv[1:]
	args = _parse_cmdl(cmdl)

	key=args.srr[0]
	pm = pypiper.PipelineManager(	name="sra_convert",
									outfolder=args.srafolder,
									args=args)

	nfiles = len(args.srr)
	for i in range(nfiles):
		print("Processing " + str(i+1) + " of " + str(nfiles))
		infile = args.srr[i]
		srr_acc = os.path.splitext(os.path.basename(args.srr[i]))[0]
		outfile = os.path.join(args.bamfolder, srr_acc + ".bam")
		if (not os.path.isfile(infile)):
			infile = os.path.join(args.srafolder, args.srr[i] + ".sra")
			outfile = os.path.join(args.bamfolder, args.srr[i] + ".bam")
		if (not os.path.isfile(infile)):
			continue

			
Example 17
def refgenie_build(rgc, args):
    """
    Runs the refgenie build recipe.
    
    :param refgenconf.RefGenConf rgc: genome configuration instance
    :param argparse.Namespace args: parsed command-line options/arguments
    """

    # Build specific args

    specific_args = {k: getattr(args, k) for k in BUILD_SPECIFIC_ARGS}

    if args.genome:
        genome = args.genome
    else:
        # This can probably be eliminated now that building is flexible
        genome = os.path.basename(args.input)
        # eliminate extensions to get canonical genome name.
        for strike in [
                ".fasta.gz$", ".fa.gz$", ".fasta$", ".fa$", ".gz$", ".2bit$"
        ]:
            genome = re.sub(strike, "", genome)

    _LOGGER.info("Using genome name: {}".format(genome))

    if not hasattr(args, "outfolder") or not args.outfolder:
        # Default to genome_folder
        _LOGGER.debug("No outfolder provided, using genome config.")
        args.outfolder = rgc.genome_folder

    outfolder = os.path.abspath(os.path.join(args.outfolder, genome))
    if not _writeable(outfolder):
        _LOGGER.error(
            "Insufficient permissions to write to output folder: {}".format(
                outfolder))
        return

    _LOGGER.info("Output to: {} {} {}".format(genome, args.outfolder,
                                              outfolder))
    _LOGGER.debug("Default config file: {}".format(default_config_file()))

    if args.config_file and not os.path.isfile(args.config_file):
        _LOGGER.debug("Config file path isn't a file: {}".format(
            args.config_file))
        args.config_file = default_config_file()

    def path_data(root, c):
        return {"path": os.path.relpath(root, c.genome_folder)}

    def build_asset(genome, asset_key, asset_build_package, outfolder,
                    specific_args):
        """
        Builds assets with pypiper and updates a genome config file.

        This function actually runs the build commands in a given build package,
        and then updates the refgenie config file.

        :param str genome: The assembly key; e.g. 'mm10'.
        :param str asset_key: The unique asset identifier; e.g. 'bowtie2_index'
        :param dict asset_build_package: A dict (see examples) specifying lists
            of required inputs, commands to run, and outputs to register as
            assets.
        """
        _LOGGER.debug("Asset build package: " + str(asset_build_package))
        asset_vars = get_asset_vars(genome, asset_key, outfolder, specific_args)

        print(
            str([
                x.format(**asset_vars)
                for x in asset_build_package["command_list"]
            ]))

        tk.make_dir(asset_outfolder)
        target = os.path.join(asset_outfolder, "build_complete.flag")
        command_list_populated = [
            x.format(**asset_vars) for x in asset_build_package["command_list"]
        ]

        touch_target = "touch {target}".format(target=target)
        command_list_populated.append(touch_target)

        _LOGGER.debug("Command list populated: " + str(command_list_populated))

        pm.run(command_list_populated, target, container=pm.container)
        # Add index information to rgc
        for asset_key, relative_path in asset_build_package["assets"].items():
            rgc.update_genomes(genome, asset_key,
                               {"path": relative_path.format(**asset_vars)})

        # Write the updated refgenie genome configuration
        rgc.write()

    pm = pypiper.PipelineManager(name="refgenie",
                                 outfolder=outfolder,
                                 args=args)
    tk = pypiper.NGSTk(pm=pm)
    tools = pm.config.tools  # Convenience alias
    index = pm.config.index
    param = pm.config.param

    container = None
    if args.docker:
        # Set up some docker stuff
        if args.volumes:
            volumes = list(args.volumes) + [outfolder]
        else:
            volumes = outfolder
        pm.get_container("nsheff/refgenie", volumes)

    for asset_key in args.asset:
        if asset_key in asset_build_packages.keys():
            asset_build_package = asset_build_packages[asset_key]
            _LOGGER.debug(specific_args)
            required_inputs = ", ".join(asset_build_package["required_inputs"])
            _LOGGER.info("Inputs required to build '{}': {}".format(
                asset_key, required_inputs))
            for required_input in asset_build_package["required_inputs"]:
                if not specific_args[required_input]:
                    raise ValueError(
                        "Argument '{}' is required to build asset '{}', but not provided"
                        .format(required_input, asset_key))

            for required_asset in asset_build_package["required_assets"]:
                try:
                    if not rgc.get_asset(args.genome, required_asset):
                        raise ValueError(
                            "Asset '{}' is required to build asset '{}', but not provided"
                            .format(required_asset, asset_key))
                except refgenconf.exceptions.MissingGenomeError:
                    raise ValueError(
                        "Asset '{}' is required to build asset '{}', but not provided"
                        .format(required_asset, asset_key))
            build_asset(args.genome, asset_key, asset_build_package, outfolder,
                        specific_args)
        else:
            _LOGGER.warn(
                "Recipe does not exist for asset '{}'".format(asset_key))

    # if False:
    #     # pm.make_sure_path_exists(outfolder)
    #     conversions = {}
    #     conversions[".2bit"] = "twoBitToFa {INPUT} {OUTPUT}"
    #     conversions[".gz"] = tk.ziptool + " -cd {INPUT} > {OUTPUT}"

    #     # Copy fasta file to genome folder structure
    #     local_raw_fasta = genome + ".fa"
    #     raw_fasta = os.path.join(outfolder, local_raw_fasta)

    #     input_fasta, cmd = copy_or_download_file(args.fasta, outfolder)
    #     pm.run(cmd, input_fasta)

    #     cmd = convert_file(input_fasta, raw_fasta, conversions)
    #     if cmd:
    #         pm.run(cmd, raw_fasta, container=pm.container)

    # # Copy annotation file (if any) to folder structure
    # if args.gtf:
    #     annotation_file_unzipped = os.path.join(outfolder, genome + ".gtf")
    #     annotation_file, cmd = copy_or_download_file(args.gtf, outfolder)
    #     pm.run(cmd, annotation_file)

    #     cmd = convert_file(annotation_file, annotation_file_unzipped, conversions)
    #     pm.run(cmd, annotation_file_unzipped)

    # #   cmd = "cp " + args.gtf + " " + annotation_file
    # #   cmd2 = tk.ziptool + " -d " + annotation_file
    # #   pm.run([cmd, cmd2], annotation_file_unzipped)

    # else:
    #     _LOGGER.debug("* No GTF gene annotations provided. Skipping this step.")

    # # Bowtie indexes
    # if index.bowtie2:
    #     asset_key = "indexed_bowtie2"
    #     folder = os.path.join(outfolder, asset_key)
    #     tk.make_dir(folder)
    #     target = os.path.join(folder, "completed.flag")
    #     cmd1 = "ln -sf ../" + local_raw_fasta + " " + folder
    #     cmd2 = tools.bowtie2build + " " + raw_fasta + " " + os.path.join(folder, genome)
    #     cmd3 = "touch " + target
    #     pm.run([cmd1, cmd2, cmd3], target, container=pm.container)
    #     # Add index information to rgc
    #     rgc.update_genomes(genome, asset_key, path_data(folder, rgc))

    #     # Write the updated refgenie genome configuration
    #     rgc.write()

    # # Bismark index - bowtie2
    # if index.bismark_bt2:
    #     asset_key = "indexed_bismark_bt2"
    #     folder = os.path.join(outfolder, asset_key)
    #     tk.make_dir(folder)
    #     target = os.path.join(folder, "completed.flag")
    #     cmd1 = "ln -sf ../" + local_raw_fasta + " " + folder
    #     cmd2 = tools.bismark_genome_preparation + " --bowtie2 " + folder
    #     cmd3 = "touch " + target
    #     pm.run([cmd1, cmd2, cmd3], target, container=pm.container)
    #     rgc.update_genomes(genome, asset_key, path_data(folder, rgc))
    #     rgc.write()

    # # Bismark index - bowtie1
    # if index.bismark_bt1:
    #     asset_key = "indexed_bismark_bt1"
    #     folder = os.path.join(outfolder, asset_key)
    #     tk.make_dir(folder)
    #     target = os.path.join(folder, "completed.flag")
    #     cmd1 = "ln -sf ../" + local_raw_fasta + " " + folder
    #     cmd2 = tools.bismark_genome_preparation + " " + folder
    #     cmd3 = "touch " + target
    #     pm.run([cmd1, cmd2, cmd3], target, container=pm.container)
    #     rgc.update_genomes(genome, asset_key, path_data(folder, rgc))
    #     rgc.write()

    # # Epilog meth calling
    # if index.epilog:
    #     asset_key = "indexed_epilog"
    #     folder = os.path.join(outfolder, asset_key)
    #     tk.make_dir(folder)
    #     target = os.path.join(folder, "completed.flag")
    #     cmd1 = "ln -sf ../" + local_raw_fasta + " " + folder
    #     cmd2 = tools.epilog_indexer + " -i " + raw_fasta
    #     cmd2 += " -o " + os.path.join(folder, genome + "_" + param.epilog.context + ".tsv")
    #     cmd2 += " -s " + param.epilog.context  # context
    #     cmd2 += " -t"
    #     cmd3 = "touch " + target
    #     pm.run([cmd1, cmd2, cmd3], target, container=pm.container)
    #     rgc.update_genomes(genome, asset_key, path_data(folder, rgc))
    #     rgc.write()

    # if index.hisat2:
    #     asset_key = "indexed_hisat2"
    #     folder = os.path.join(outfolder, asset_key)
    #     tk.make_dir(folder)
    #     target = os.path.join(folder, "completed.flag")
    #     cmd1 = "ln -sf ../" + local_raw_fasta + " " + folder
    #     cmd2 = tools.hisat2build + " " + raw_fasta + " " + os.path.join(folder, genome)
    #     cmd3 = "touch " + target
    #     pm.run([cmd1, cmd2, cmd3], target, container=pm.container)
    #     rgc.update_genomes(genome, asset_key, path_data(folder, rgc))
    #     rgc.write()

    # # Kallisto should index transcriptome
    # # So it doesn't make sense to run these at the same time as the others.
    # if index.kallisto:
    #     asset_key = "indexed_kallisto"
    #     folder = os.path.join(outfolder, asset_key)
    #     tk.make_dir(folder)
    #     target = os.path.join(folder, "completed.flag")
    #     cmd2 = tools.kallisto + " index -i " + os.path.join(folder, genome + "_kallisto_index.idx")
    #     cmd2 += " " + raw_fasta
    #     cmd3 = "touch " + target
    #     pm.run([cmd2, cmd3], target, container=pm.container)
    #     rgc.update_genomes(genome, asset_key, path_data(folder, rgc))
    #     rgc.write()

    pm.stop_pipeline()
Example 18
if not args.input or not args.output_parent:
    parser.print_help()
    raise SystemExit

outfolder = os.path.abspath(os.path.join(args.output_parent, args.sample_name))

input_file = args.input[0]
output_plot = os.path.join(outfolder, "line_length_distr_plot.png")
hist_plotter = "plotHist.R"
hist_plotter_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                                 hist_plotter)
sleep_val = 60 * float(args.sleep) if args.sleep is not None else 0

distr_output = os.path.join(outfolder, "line_lengths_distribution.txt")
pm = pypiper.PipelineManager(name="caravel_demo",
                             outfolder=outfolder,
                             args=args)

pm.timestamp("### File size calculation: ")
file_size_cmd = "wc -c {} | awk '{{print $1}}'".format(input_file)
size_kb = int(pm.checkprint(file_size_cmd, shell=True)) / 1000
pm.report_result("File size", size_kb)

pm.timestamp("### Lines number calculation: ")
num_lines_cmd = "wc -l {input} | sed -E 's/^[[:space:]]+//' | cut -f1 -d' '".format(
    input=input_file)
num_lines = pm.checkprint(num_lines_cmd, shell=True)
pm.report_result("Number of lines", num_lines)

pm.timestamp("### Saving CSV with lines count and file sizes")
outfile = os.path.join(outfolder, args.sample_name + '_results.csv')
Example 19
### enforce complete user input ###
###################################

if not args.sample_config:
    parser.print_help()
    raise SystemExit()

##################
### Initialize ###
##################

outfolder = os.path.abspath(os.path.join(args.output_parent, args.sample_name))

# Start Pypiper object
# Best practice is to name the pipeline with the name of the script or put the name in the pipeline interface.
pm = pypiper.PipelineManager(name='PROseq', outfolder=outfolder, args=args)

# create NGSTk object
tk = pypiper.NGSTk(pm=pm)

#####################################
### merge input BAMs if necessary ###
#####################################

if len(args.input_file) > 1:
    pm.timestamp("Merging BAM files from replicates: ")

    sample_merged = True

    raw_folder = os.path.join(outfolder, 'merged_raw')
    if not os.path.exists(raw_folder):
Esempio n. 20
0
                                  groups=["pypiper", "common", "looper", "ngs"])

args = parser.parse_args()

bbc = bbconf.BedBaseConf(filepath=bbconf.get_bedbase_cfg(args.bedbase_config))

bed_digest = md5(open(args.bedfile, 'rb').read()).hexdigest()
bedfile_name = os.path.split(args.bedfile)[1]
# need to split twice since the bed file name carries two extensions
fileid = os.path.splitext(os.path.splitext(bedfile_name)[0])[0]
outfolder = os.path.abspath(os.path.join(
    bbc[CFG_PATH_KEY][CFG_BEDSTAT_OUTPUT_KEY], bed_digest))
json_file_path = os.path.abspath(os.path.join(outfolder, fileid + ".json"))

if not args.just_db_commit:
    pm = pypiper.PipelineManager(name="bedstat-pipeline", outfolder=outfolder,
                                 args=args)
    rscript_path = os.path.join(os.path.dirname(
        os.path.dirname(os.path.abspath(__file__))), "tools", "regionstat.R")
    assert os.path.exists(rscript_path), \
        FileNotFoundError("'{}' script not found".format(rscript_path))
    cmd_vars = dict(rscript=rscript_path, bed=args.bedfile, id=fileid,
                    matrix=args.open_signal_matrix, out=outfolder,
                    genome=args.genome_assembly, digest=bed_digest)
    command = "Rscript {rscript} --bedfile={bed} --fileId={id} " \
              "--openSignalMatrix={matrix} --outputfolder={out} " \
              "--genome={genome} --digest={digest}".format(**cmd_vars)
    pm.run(cmd=command, target=json_file_path)
    pm.stop_pipeline()

# now get the resulting json file and load it into Elasticsearch
# if the file exists, of course
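# A hedged sketch of that follow-up step (not part of the original snippet):
# read back the JSON produced by regionstat.R with the standard library. How
# the parsed statistics are then pushed into the search backend depends on the
# bbconf API in use, so that part is intentionally left out here.
import json  # normally imported at the top of the script

if os.path.exists(json_file_path):
    with open(json_file_path) as stats_file:
        bed_stats = json.load(stats_file)
    # bed_stats now maps metric names to values; the exact keys are whatever
    # regionstat.R chose to emit.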
Esempio n. 21
0
def run_pipeline():
    # A good practice is to make an output folder for each sample, housed under
    # the parent output folder, like this:
    outfolder = os.path.abspath(
        os.path.join(args.output_parent, args.sample_name))

    # Create a PipelineManager object and start the pipeline
    pm = pypiper.PipelineManager(name="logmuse-test",
                                 outfolder=outfolder,
                                 args=args)
    pm.info("Getting started!")
    # NGSTk is a "toolkit" that comes with pypiper, providing some functions
    # for dealing with genome sequence data. You can read more about toolkits in the
    # documentation

    files = [str(x) + ".tmp" for x in range(1, 20)]

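    # clean=True registers each target on pypiper's auto-cleanup list, so the
    # temporary .tmp files are removed at the end of the run by default.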
    pm.run("touch " + " ".join(files), target=files, clean=True)

    # Create a ngstk object
    ngstk = pypiper.NGSTk(pm=pm)

    raw_folder = os.path.join(outfolder, "raw/")
    fastq_folder = os.path.join(outfolder, "fastq/")

    # Merge/Link sample input and Fastq conversion
    # These commands merge (if multiple) or link (if single) input files,
    # then convert (if necessary, for bam, fastq, or gz format) files to fastq.

    # We'll start with a timestamp that will provide a division for this section
    # in the log file
    pm.timestamp("### Merge/link and fastq conversion: ")

    # Now we'll rely on 2 NGSTk functions that can handle inputs of various types
    # and convert these to fastq files.

    local_input_files = ngstk.merge_or_link([args.input, args.input2],
                                            raw_folder, args.sample_name)

    cmd, out_fastq_pre, unaligned_fastq = ngstk.input_to_fastq(
        local_input_files, args.sample_name, args.paired_end, fastq_folder)

    # Now we'll use another NGSTk function to grab the file size from the input files
    pm.report_result("File_mb", ngstk.get_file_size(local_input_files))

    # And then count the raw reads, averaged across the input files

    n_input_files = len(list(filter(bool, local_input_files)))

    raw_reads = sum([
        int(ngstk.count_reads(input_file, args.paired_end))
        for input_file in local_input_files
    ]) / n_input_files

    # Finally, we use the report_result() function to print the output and
    # log the key-value pair in the standard stats.tsv file
    pm.report_result("Raw_reads", str(raw_reads))

    # Cleanup
    pm.stop_pipeline()
parser.add_argument('-y', '--sample_yaml', dest='config_file', help='yaml config file with sample attributes')
parser.add_argument('-dp', '--data_path', dest='data_path', help='path to sequencing data file')
parser.add_argument('-n', '--sample_name', dest='sample_name', help='name of the sample')
parser.add_argument('-r', dest='results_folder', help='path to folder to store results in')
parser.add_argument('-fc', dest='flowcell', help='the flow cell id')

args = parser.parse_args()

##################
### Initialize ###
##################

outfolder = os.path.abspath(os.path.join(args.results_folder, args.sample_name))


pm = pypiper.PipelineManager(name='VirSeq', outfolder=outfolder, args=args)  # initialize pipeline manager instance

################
### To Fastq ###
################

pm.timestamp('### BAM to FASTQ: ')


trimAdapPair_1_fq = args.sample_name + '_npa_1.fastq.gz'
pathAdapTrimPair_1_fq = os.path.join(outfolder, trimAdapPair_1_fq)
trimAdapPair_2_fq = args.sample_name + '_npa_2.fastq.gz'
pathAdapTrimPair_2_fq = os.path.join(outfolder, trimAdapPair_2_fq)

if os.path.isfile(pathAdapTrimPair_1_fq) and os.path.isfile(pathAdapTrimPair_2_fq):
	pm.timestamp('### FASTQ files already exist!')
Esempio n. 23
0
#!/usr/bin/python2.7
"""Getting Started: A simple sample pipeline built using pypiper."""

# This is a runnable example. You can run it to see what the output
# looks like.

# First, make sure you can import the pypiper package

import os
import pypiper

# Create a PipelineManager instance (don't forget to name it!)
# This starts the pipeline.

pm = pypiper.PipelineManager(name="BASIC", outfolder="pipeline_output/")

# Now just build shell command strings, and use the run function
# to execute them in order. run needs 2 things: a command, and the
# target file you are creating.

# First, generate some random data

# specify target file:
tgt = "pipeline_output/test.out"

# build the command
cmd = "shuf -i 1-500000000 -n 10000000 > " + tgt

# and run with run().
pm.run(cmd, target=tgt)
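# The snippet stops here; a short, hedged continuation (not part of the
# original) would typically chain a second command that consumes the first
# target, then close the pipeline so the run is flagged as completed:
tgt2 = "pipeline_output/test_sorted.out"
cmd2 = "sort -n " + tgt + " > " + tgt2
pm.run(cmd2, target=tgt2)
pm.stop_pipeline()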
Esempio n. 24
0
def main(cmdl):

    args = _parse_args(cmdl)

    # Merging
    ################################################################################
    # If 2 input files are given, then these are to be merged.
    # Must be done here to initialize the sample name correctly
    if len(args.input) > 1:
        if args.sample_name == "default":
            args.sample_name = "merged"
    else:
        if args.sample_name == "default":
            # Default sample name is derived from the input file
            args.sample_name = os.path.splitext(os.path.basename(
                args.input[0]))[0]

    # Create a PipelineManager object and start the pipeline
    outfolder = os.path.abspath(
        os.path.join(args.output_parent, args.sample_name))
    pm = pypiper.PipelineManager(name="WGBS",
                                 outfolder=outfolder,
                                 args=args,
                                 version=__version__)

    # Set up a few additional paths not in the config file
    pm.config.tools.scripts_dir = os.path.join(
        os.path.dirname(os.path.realpath(__file__)), "tools")
    pm.config.resources.ref_genome_fasta = os.path.join(
        pm.config.resources.genomes, args.genome_assembly,
        args.genome_assembly + ".fa")
    pm.config.resources.chrom_sizes = os.path.join(
        pm.config.resources.genomes, args.genome_assembly,
        args.genome_assembly + ".chromSizes")
    pm.config.resources.genomes_split = os.path.join(
        pm.config.resources.resources, "genomes_split")
    try:
        pm.config.resources.bismark_spikein_genome = os.path.join(
            pm.config.resources.genomes, pm.config.resources.spikein_genome,
            "indexed_bismark_bt1")
    except Exception:
        pm.config.resources.bismark_spikein_genome = None

    pm.config.resources.bismark_indexed_genome = os.path.join(
        pm.config.resources.genomes, args.genome_assembly,
        "indexed_bismark_bt2")

    # Epilog indexes
    pm.config.resources.methpositions = os.path.join(
        pm.config.resources.genomes, args.genome_assembly, "indexed_epilog",
        args.genome_assembly + "_cg.tsv.gz")

    if pm.config.resources.bismark_spikein_genome:
        pm.config.resources.spikein_methpositions = os.path.join(
            pm.config.resources.genomes, pm.config.resources.spikein_genome,
            "indexed_epilog",
            pm.config.resources.spikein_genome + "_index.tsv.gz")

    pm.config.parameters.pipeline_outfolder = outfolder

    print(pm.config)
    tools = pm.config.tools  # Convenience alias
    param = pm.config.parameters
    resources = pm.config.resources

    # Create a ngstk object
    ngstk = pypiper.NGSTk(pm=pm)

    raw_folder = os.path.join(param.pipeline_outfolder, "raw/")
    fastq_folder = os.path.join(param.pipeline_outfolder, "fastq/")

    # Merge/Link sample input and Fastq conversion
    # These commands merge (if multiple) or link (if single) input files,
    # then convert (if necessary, for bam, fastq, or gz format) files to fastq.
    ################################################################################
    pm.timestamp("### Merge/link and fastq conversion: ")

    local_input_files = ngstk.merge_or_link([args.input, args.input2],
                                            raw_folder, args.sample_name)
    cmd, out_fastq_pre, unaligned_fastq = ngstk.input_to_fastq(
        local_input_files, args.sample_name, args.paired_end, fastq_folder)
    pm.run(cmd,
           unaligned_fastq,
           follow=ngstk.check_fastq(local_input_files, unaligned_fastq,
                                    args.paired_end))
    pm.clean_add(out_fastq_pre + "*.fastq", conditional=True)

    pm.report_result("File_mb", ngstk.get_file_size(local_input_files))
    pm.report_result("Read_type", args.single_or_paired)
    pm.report_result("Genome", args.genome_assembly)

    # Adapter trimming
    ################################################################################
    pm.timestamp("### Adapter trimming: ")

    # We need to detect the quality encoding type of the fastq.
    if isinstance(unaligned_fastq, list):
        example_fq = unaligned_fastq[0]
    else:
        example_fq = unaligned_fastq

    cmd = tools.python + " -u " + os.path.join(
        tools.scripts_dir, "detect_quality_code.py") + " -f " + example_fq
    encoding_string = pm.checkprint(cmd)
    if encoding_string.find("phred33") != -1:
        encoding = "phred33"
    elif encoding_string.find("phred64") != -1:
        encoding = "phred64"
    else:
        raise Exception("Unknown quality encoding type: " + encoding_string)

    trimmed_fastq = out_fastq_pre + "_R1_trimmed.fq"
    trimmed_fastq_R2 = out_fastq_pre + "_R2_trimmed.fq"

    cmd = tools.java + " -Xmx" + str(pm.mem) + " -jar " + tools.trimmomatic
    if args.paired_end:
        cmd += " PE"
    else:
        cmd += " SE"
    cmd += " -" + encoding
    cmd += " -threads " + str(pm.cores) + " "
    #cmd += " -trimlog " + os.path.join(fastq_folder, "trimlog.log") + " "
    if args.paired_end:
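        # Trimmomatic PE expects: input_R1 input_R2 paired_out_R1
        # unpaired_out_R1 paired_out_R2 unpaired_out_R2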
        cmd += out_fastq_pre + "_R1.fastq "
        cmd += out_fastq_pre + "_R2.fastq "
        cmd += out_fastq_pre + "_R1_trimmed.fq "
        cmd += out_fastq_pre + "_R1_unpaired.fq "
        cmd += out_fastq_pre + "_R2_trimmed.fq "
        cmd += out_fastq_pre + "_R2_unpaired.fq "
    else:
        cmd += out_fastq_pre + "_R1.fastq "
        cmd += out_fastq_pre + "_R1_trimmed.fq "
    cmd += " " + param.trimmomatic.trimsteps
    cmd += " ILLUMINACLIP:" + resources.adapter_file + param.trimmomatic.illuminaclip

    pm.run(cmd,
           trimmed_fastq,
           follow=ngstk.check_trim(trimmed_fastq,
                                   args.paired_end,
                                   trimmed_fastq_R2,
                                   fastqc_folder=os.path.join(
                                       param.pipeline_outfolder, "fastqc/")))

    pm.clean_add(os.path.join(fastq_folder, "*.fastq"), conditional=True)
    pm.clean_add(os.path.join(fastq_folder, "*.fq"), conditional=True)
    pm.clean_add(os.path.join(fastq_folder, "*.log"), conditional=True)
    pm.clean_add(fastq_folder, conditional=True)

    # WGBS alignment with bismark.
    ################################################################################
    pm.timestamp("### Bismark alignment: ")
    # Bismark will start multiple instances of bowtie, so we have to split
    # the allotted cores among the instances. Otherwise we will use 2x or 4x
    # the number of cores that we are supposed to. It will start 2 threads in
    # normal mode, and 4 in --non-directional mode.

    if param.bismark.nondirectional:
        bismark_bowtie_threads = 4
    else:
        bismark_bowtie_threads = 2

    bismark_cores = int(pm.cores) // bismark_bowtie_threads

    if int(pm.cores) % bismark_bowtie_threads != 0:
        print("inefficient core request; make divisible by " +
              str(bismark_bowtie_threads))

    bismark_folder = os.path.join(param.pipeline_outfolder,
                                  "bismark_" + args.genome_assembly)
    ngstk.make_sure_path_exists(bismark_folder)
    bismark_temp = os.path.join(bismark_folder, "bismark_temp")
    ngstk.make_sure_path_exists(bismark_temp)

    if args.paired_end:
        out_bismark = os.path.join(bismark_folder,
                                   args.sample_name + "_pe.bam")
    else:
        out_bismark = os.path.join(bismark_folder, args.sample_name + ".bam")

    cmd = tools.bismark + " " + resources.bismark_indexed_genome + " "
    if args.paired_end:
        cmd += " --1 " + out_fastq_pre + "_R1_trimmed.fq"
        cmd += " --2 " + out_fastq_pre + "_R2_trimmed.fq"
    else:
        cmd += out_fastq_pre + "_R1_trimmed.fq"
    cmd += " --bam --unmapped"
    # Bowtie may be specified in raw form to indicate presence on path.
    if tools.bowtie2 != "bowtie2":
        cmd += " --path_to_bowtie " + tools.bowtie2
    cmd += " --bowtie2"
    cmd += " --temp_dir " + bismark_temp
    cmd += " --output_dir " + bismark_folder
    if args.paired_end:
        cmd += " --minins 0"
        cmd += " --maxins " + str(param.bismark.maxins)
    cmd += " -p " + str(bismark_cores)  # Number of processors
    cmd += " --basename=" + args.sample_name

    # By default, BS-seq libraries are directional, but this can be turned off
    # in bismark for non-directional protocols
    if param.bismark.nondirectional:
        cmd += " --non_directional"

    def check_bismark():
        ar = ngstk.count_mapped_reads(out_bismark, args.paired_end)
        pm.report_result("Aligned_reads", ar)
        rr = float(pm.get_stat("Raw_reads"))
        tr = float(pm.get_stat("Trimmed_reads"))
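        # Alignment_rate is aligned reads over trimmed reads;
        # Total_efficiency is aligned reads over raw input reads.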
        pm.report_result("Alignment_rate",
                         round(float(ar) * 100 / float(tr), 2))
        pm.report_result("Total_efficiency",
                         round(float(ar) * 100 / float(rr), 2))

        mr = ngstk.count_multimapping_reads(out_bismark, args.paired_end)
        pm.report_result("Multimap_reads", mr)
        pm.report_result("Multimap_rate",
                         round(float(mr) * 100 / float(tr), 2))

    pm.run(cmd, out_bismark, follow=check_bismark)

    # Secondary single mode:
    # align unmapped in single end mode?
    if args.paired_end and args.single2:
        pm.timestamp("### Bismark secondary single-end alignment: ")
        out_bismark_se = []
        for read_n in ["1", "2"]:  # Align each read in single end mode
            read_string = "R" + str(read_n)
            bismark2_folder = os.path.join(bismark_folder,
                                           "se" + str(read_string))
            ngstk.make_sure_path_exists(bismark2_folder)
            bismark2_temp = os.path.join(bismark2_folder, "bismark2_temp")
            ngstk.make_sure_path_exists(bismark2_temp)
            out_bismark2 = os.path.join(
                bismark2_folder, args.sample_name + read_string + ".bam")

            unmapped_reads_pre = os.path.join(bismark_folder, args.sample_name)

            cmd = tools.bismark + " " + resources.bismark_indexed_genome + " "
            cmd += unmapped_reads_pre + "_unmapped_reads_" + str(
                read_n) + ".fq"
            cmd += " --bam --unmapped"
            # Bowtie may be specified in raw form to indicate presence on path.
            if tools.bowtie2 != "bowtie2":
                cmd += " --path_to_bowtie " + tools.bowtie2
            cmd += " --bowtie2"
            cmd += " --temp_dir " + bismark2_temp
            cmd += " --output_dir " + bismark2_folder
            cmd += " --basename=" + args.sample_name + read_string
            cmd += " -p " + str(bismark_cores)
            if param.bismark.nondirectional:
                cmd += " --non_directional"

            pm.run(cmd, out_bismark2)
            out_bismark_se.append(out_bismark2)

        # Now merge, sort, and analyze the single-end data
        merged_bismark = args.sample_name + "_SEmerged.bam"
        output_merge = os.path.join(bismark_folder, merged_bismark)
        cmd = ngstk.merge_bams(out_bismark_se,
                               output_merge,
                               in_sorted="FALSE",
                               tmp_dir=resources.tmp_dir)

        pm.run(cmd, output_merge)
        # Sort by read name
        sorted_bismark = args.sample_name + "_SEsorted.bam"
        output_sort = os.path.join(bismark_folder, sorted_bismark)

        cmd = tools.samtools + " sort -n -o " + output_sort + " " + output_merge
        pm.run(cmd, output_sort)

        cmd = tools.python + " -u " + os.path.join(tools.scripts_dir,
                                                   "rematch_pairs.py")
        cmd += " -i " + output_sort

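    # This step produces no obvious output file to use as a target, so a
    # lock_name is supplied instead, letting pypiper still lock and log it.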
        pm.run(cmd, lock_name="rematch")

    pm.timestamp("### PCR duplicate removal: ")
    # Bismark's deduplication forces output naming, how annoying.
    #out_dedup = os.path.join(bismark_folder, args.sample_name + "_pe.deduplicated.bam")
    out_dedup = re.sub(r'.bam$', '.deduplicated.bam', out_bismark)
    cmd, out_dedup = get_dedup_bismark_cmd(paired=args.paired_end,
                                           infile=out_bismark,
                                           prog=tools.deduplicate_bismark)
    with FolderContext(bismark_folder):
        pm.run(cmd,
               out_dedup,
               follow=lambda: pm.report_result(
                   "Deduplicated_reads",
                   ngstk.count_reads(out_dedup, args.paired_end)))
    if not os.path.isfile(out_dedup):
        pm.fail_pipeline(
            IOError("Missing deduplication target: {}".format(out_dedup)))

    pm.timestamp("### Aligned read filtering: ")

    # convert bam file into sam file and sort again to
    # compensate for a sorting issue of "deduplicate_bismark"
    sam_temp = os.path.join(bismark_folder, "sam_temp")
    ngstk.make_sure_path_exists(sam_temp)
    out_sam = os.path.join(bismark_folder,
                           args.sample_name + ".aln.deduplicated.sam")
    #Is this an old version of samtools?
    #cmd = tools.samtools + " sort -n -o " + out_dedup + " " + out_dedup.replace(".bam", "_sorted") + " | " + tools.samtools + " view -h - >" + out_sam
    #cmd = tools.samtools + " sort -n " + out_dedup + " " + " | " + tools.samtools + " view -h - >" + out_sam
    cmd = tools.samtools + " sort -n " + out_dedup + " -o " + out_sam
    pm.run(cmd, out_sam, shell=True)

    #sorted file same size as presorted?
    #pm.report_result("Filtered_reads", ngstk.count_reads(out_sam_filter, args.paired_end)) = ngstk.count_reads(out_sam, args.paired_end)
    #if sorted_reads != deduplicated_reads:
    #	raise Exception("Sorted size doesn't match deduplicated size.")

    out_sam_filter = os.path.join(bismark_folder,
                                  args.sample_name + ".aln.dedup.filt.sam")

    headerLines = subprocess.check_output(tools.samtools + " view -SH " +
                                          out_sam + "| wc -l",
                                          shell=True).strip()
    cmd = tools.python + " " + os.path.join(
        tools.scripts_dir, "bisulfiteReadFiltering_forRNA.py")
    cmd += " --infile=" + out_sam
    cmd += " --outfile=" + out_sam_filter
    cmd += " --skipHeaderLines=" + headerLines
    cmd += " --genome=" + args.genome_assembly
    cmd += " --genomeDir=" + resources.genomes
    cmd += " --minNonCpgSites=3"
    cmd += " --minConversionRate=0.9"
    if args.paired_end:
        cmd = cmd + " --pairedEnd"

    pm.run(cmd,
           out_sam_filter,
           follow=lambda: pm.report_result(
               "Filtered_reads",
               ngstk.count_reads(out_sam_filter, args.paired_end)))

    # Clean up all intermediates
    pm.clean_add(out_bismark)  # initial mapped bam file
    pm.clean_add(os.path.join(bismark_folder, "*.fastq"))
    pm.clean_add(os.path.join(bismark_folder, "*.fq"))
    pm.clean_add(out_dedup)  # deduplicated bam file
    pm.clean_add(out_sam)  # dedup conversion to sam
    pm.clean_add(out_sam_filter)  # after filtering

    # Epilog analysis
    ################################################################################

    # Create the program specification, in scope both for ordinary and spike-in.
    if args.epilog:
        try:
            epilog_prog_spec = ProgSpec(jar=tools.epilog,
                                        memory=pm.mem,
                                        cores=pm.cores)
        except MissingEpilogError as e:
            print("ERROR: {} --  skipping epilog".format(str(e)))
            epilog_prog_spec = None
    else:
        epilog_prog_spec = None

    if epilog_prog_spec:

        # Sort and index the deduplicated alignments.
        out_dedup_sorted = re.sub(r'.bam$', "_sort.bam", out_dedup)
        cmd2 = tools.samtools + " sort -@ " + str(
            pm.cores) + " -o " + out_dedup_sorted + " " + out_dedup
        cmd3 = tools.samtools + " index " + out_dedup_sorted
        pm.run([cmd2, cmd3], out_dedup_sorted + ".bai")

        # Separate output subfolder for epilog
        epilog_output_dir = os.path.join(param.pipeline_outfolder,
                                         "epilog_" + args.genome_assembly)
        ngstk.make_sure_path_exists(epilog_output_dir)

        pm.timestamp("### Epilog Methcalling: ")
        run_main_epi_pipe(pm,
                          epiconf=param.epilog,
                          prog_spec=epilog_prog_spec,
                          readsfile=out_dedup_sorted,
                          sitesfile=resources.methpositions,
                          outdir=epilog_output_dir,
                          rrbs_fill=0)
        pm.timestamp("### COMPLETE: epilog")

    # Methylation extractor
    ################################################################################
    # REMARK NS:
    # Bismark methylation extractor produces various outputs, but unfortunately none
    # are great. The default "coverage" (.bismark.cov) file is thus:
    # chr	start	stop	meth	methylated	unmethylated
    # chr17	4890653	4890653	100	1	0
    # chr17	5334751	5334751	100	1	0
    # This output lacks strand information, so you don't know if the coordinate is
    # pointing to a C or G on the + strand unless you look it up in the reference genome.
    # The "cytosine_report" file has all the info, but includes an entry for every
    # CpG, covered or not:
    # chr17	3000204	+	0	0	CG	CGT
    # chr17	3000205	-	0	0	CG	CGA
    # chr17	4890653	-	1	0	CG	CGA
    # Solution: Use the cytosine_report file, and filter out any uncovered positions.

    pm.timestamp("### Methylation calling (bismark extractor): ")

    extract_dir = os.path.join(bismark_folder, "extractor")
    ngstk.make_sure_path_exists(extract_dir)
    out_extractor = os.path.join(
        extract_dir,
        re.sub(r'.sam$', '.bismark.cov', os.path.basename(out_sam_filter)))
    out_cpg_report = re.sub(r'.bismark.cov$', '.CpG_report.txt.gz',
                            out_extractor)

    cmd = tools.bismark_methylation_extractor
    if args.paired_end:
        cmd += " --paired-end --no_overlap"
    else:
        cmd += " --single-end"
    cmd += " --report"
    cmd += " --bedGraph"
    cmd += " --merge_non_CpG"
    cmd += " --cytosine_report"
    cmd += " --genome_folder " + resources.bismark_indexed_genome
    cmd += " --gzip"
    cmd += " --output " + extract_dir
    cmd += " " + out_sam_filter

    pm.run(cmd, out_cpg_report)

    # TODO: expose these boolean flags as command-line options to the pipeline
    keep_bismark_report = True
    keep_non_standard_chromosomes = False
    adjust_minus_strand = True

    # prepare outputs:
    out_cpg_report_filt = re.sub(r'.CpG_report.txt.gz$',
                                 '.CpG_report_filt.txt', out_cpg_report)
    out_cpg_report_filt_cov = re.sub(r'.CpG_report.txt.gz$',
                                     '.CpG_report_filt.cov', out_cpg_report)

    # remove uncovered regions:
    # Update to Bismark version 17 now gzips this output.
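    # In the cytosine report, columns 4 and 5 hold the methylated and
    # unmethylated counts; keep only positions with at least one observation.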
    cmd = ngstk.ziptool + " -c -d"
    cmd += " " + out_cpg_report
    cmd += " | awk '{ if ($4+$5 > 0) print; }'"
    cmd += " > " + out_cpg_report_filt
    pm.run(cmd, out_cpg_report_filt, shell=True)

    # convert the bismark report to the simpler coverage format and adjust the coordinates
    # of CpGs on the reverse strand while doing so (by subtracting 1 from the start):
    if os.path.getsize(out_cpg_report_filt) == 0:
        print("Methylation report () is empty -- skipping conversion".format(
            out_cpg_report_filt))
    else:
        cmd = tools.Rscript + " " + os.path.join(
            tools.scripts_dir, "convertBismarkReport.R"
        )  # disable coverage filter, because we have already used `awk` to achieve this result
        cmd += " --formats=cov,min"
        cmd += " --noCovFilter"
        if keep_non_standard_chromosomes:
            cmd += " --noChromFilter"
        if not adjust_minus_strand:
            cmd += " --noAdjustMinusStrand"
        cmd += " -i " + out_cpg_report_filt
        pm.run(cmd, out_cpg_report_filt_cov, nofail=True)

    # tidy up:
    if not keep_bismark_report:
        pm.clean_add(out_cpg_report_filt)

    # Make bigwig
    ################################################################################
    pm.timestamp("### Make bigwig: ")

    bedGraph = re.sub(".bismark.cov$", ".bedGraph", out_extractor)
    sort_bedGraph = re.sub(".bedGraph$", ".sort.bedGraph", bedGraph)
    out_bigwig = re.sub(".bedGraph$", ".bw", bedGraph)
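    # bedGraphToBigWig needs its input sorted by chromosome and start position
    # (LC_COLLATE=C gives the byte-wise ordering it expects), hence the sort.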
    cmd1 = ngstk.ziptool + " -c -d"
    cmd1 += " " + bedGraph
    cmd1 += " | sed '1d' " + " | LC_COLLATE=C sort -k1,1 -k2,2n - " + " > " + sort_bedGraph
    cmd2 = tools.bedGraphToBigWig + " " + sort_bedGraph + " " + resources.chrom_sizes
    cmd2 += " " + out_bigwig

    pm.run([cmd1, cmd2], out_bigwig)

    # Spike-in alignment
    ################################################################################
    # currently using bowtie1 instead of bowtie2
    if resources.bismark_spikein_genome:
        pm.timestamp("### Bismark spike-in alignment: ")
        spikein_folder = os.path.join(param.pipeline_outfolder,
                                      "bismark_spikein")
        ngstk.make_sure_path_exists(spikein_folder)
        spikein_temp = os.path.join(spikein_folder, "bismark_temp")
        ngstk.make_sure_path_exists(spikein_temp)
        out_spikein_base = args.sample_name + ".spikein.aln"

        #out_spikein = spikein_folder + args.sample_name + "_R1_trimmed.fastq_unmapped_reads_1.fq_bismark_pe.bam"

        unmapped_reads_pre = os.path.join(bismark_folder, args.sample_name)
        if args.paired_end:
            out_spikein = os.path.join(spikein_folder,
                                       out_spikein_base + "_pe.bam")
        else:
            out_spikein = os.path.join(spikein_folder,
                                       out_spikein_base + ".bam")
        cmd = tools.bismark + " " + resources.bismark_spikein_genome + " "
        if args.paired_end:
            cmd += " --1 " + unmapped_reads_pre + "_unmapped_reads_1.fq"
            cmd += " --2 " + unmapped_reads_pre + "_unmapped_reads_2.fq"
        else:
            cmd += unmapped_reads_pre + "_unmapped_reads.fq"
        cmd += " --bam --unmapped"
        # Bowtie may be specified in raw form to indicate presence on path.
        if tools.bowtie1 != "bowtie":
            cmd += " --path_to_bowtie " + tools.bowtie1
        #cmd += " --bowtie2"
        cmd += " --temp_dir " + spikein_temp
        cmd += " --output_dir " + spikein_folder
        if args.paired_end:
            cmd += " --minins 0"
            cmd += " --maxins " + str(param.bismark.maxins)
        cmd += " --basename=" + out_spikein_base
        if param.bismark.nondirectional:
            cmd += " --non_directional"

        pm.run(cmd, out_spikein, nofail=True)
        # Clean up the unmapped file which is copied from the parent
        # bismark folder to here:
        pm.clean_add(os.path.join(spikein_folder, "*.fq"), conditional=False)
        pm.clean_add(spikein_temp)

        pm.timestamp("### PCR duplicate removal (Spike-in): ")
        # Bismark's deduplication forces output naming, how annoying.
        #out_spikein_dedup = spikein_folder + args.sample_name + ".spikein.aln.deduplicated.bam"
        cmd, out_spikein_dedup = get_dedup_bismark_cmd(
            paired=args.paired_end,
            infile=out_spikein,
            prog=tools.deduplicate_bismark)
        out_spikein_sorted = re.sub(r'.deduplicated.bam$',
                                    '.deduplicated.sorted.bam',
                                    out_spikein_dedup)
        cmd2 = tools.samtools + " sort " + out_spikein_dedup + " -o " + out_spikein_sorted
        cmd3 = tools.samtools + " index " + out_spikein_sorted
        cmd4 = "rm " + out_spikein_dedup
        pm.run([cmd, cmd2, cmd3, cmd4],
               out_spikein_sorted + ".bai",
               nofail=True)

        # Spike-in methylation calling
        ################################################################################
        pm.timestamp("### Methylation calling (testxmz) Spike-in: ")
        spike_chroms = ngstk.get_chrs_from_bam(out_spikein_sorted)

        for chrom in spike_chroms:
            cmd1 = tools.python + " -u " + os.path.join(
                tools.scripts_dir, "testxmz.py")
            cmd1 += " " + out_spikein_sorted + " " + chrom
            cmd1 += " >> " + pm.pipeline_stats_file
            pm.callprint(cmd1, nofail=True)

        # spike in conversion efficiency calculation with epilog
        if epilog_prog_spec:
            ngstk.make_sure_path_exists(spikein_folder)
            pm.timestamp("### Spike-in Epilog Methcalling: ")
            spikein_epiconf = copy.deepcopy(param.epilog)
            spikein_epiconf.context = "C"
            spikein_epiconf.no_epi_stats = True  # Always skip stats for spike-in.
            try:
                run_main_epi_pipe(pm,
                                  epiconf=spikein_epiconf,
                                  prog_spec=epilog_prog_spec,
                                  readsfile=out_spikein_sorted,
                                  sitesfile=resources.spikein_methpositions,
                                  outdir=spikein_folder,
                                  rrbs_fill=0)
            except Exception as e:
                print("WARNING -- Could not run epilog -- {}".format(e))
        """
		epilog_spike_outfile=os.path.join(
				spikein_folder, args.sample_name + "_epilog.bed")
		epilog_spike_summary_file=os.path.join(
				spikein_folder, args.sample_name + "_epilog_summary.bed")
		
		cmd = tools.epilog
		cmd += " call"
		cmd += " --infile=" + out_spikein_sorted  # absolute path to the bsmap aligned bam
		cmd += " --positions=" + resources.spikein_methpositions
		cmd += " --outfile=" + epilog_spike_outfile
		cmd += " --summary=" + epilog_spike_summary_file
		cmd += " --cores=" + str(pm.cores)
		cmd += " --qual-threshold=30"
		cmd += " --read-length-threshold=30"
		cmd += " --wgbs"    # No RRBS "fill-in"
		
		pm.run(cmd, epilog_spike_outfile, nofail=True)
		
		# Now parse some results for pypiper result reporting.
	
		for chrom in spike_chroms:
			cmd = tools.python + " -u " + os.path.join(tools.scripts_dir, "tsv_parser.py")
			cmd += " -i " + os.path.join(spikein_folder, epilog_spike_summary_file)
			cmd += " -r context=C chr=" + chrom
	
			cmd_total = cmd + " -c " + "total"
			x = pm.checkprint(cmd_total, shell=True)
			pm.report_result(chrom+'_count_EL', x)
			cmd_rate = cmd + " -c " + "rate"
			x = pm.checkprint(cmd_rate, shell=True)
			pm.report_result(chrom+'_meth_EL', x)
		"""

    # Final sorting and indexing
    ################################################################################
    # create sorted and indexed BAM files for visualization and analysis
    pm.timestamp("### Final sorting and indexing: ")

    #out_header = bismark_folder + args.sample_name + ".reheader.bam"
    out_final = os.path.join(bismark_folder, args.sample_name + ".final.bam")
    # temp_folder = os.path.join(bismark_folder, "tmp")

    # # Sort
    # cmd = tools.java + " -Xmx" + str(pm.mem)
    # # This sort can run out of temp space on big jobs; this puts the temp to a
    # # local spot.
    # cmd += " -Djava.io.tmpdir=" + str(temp_folder)
    # cmd += " -jar " + tools.picard + " SortSam"
    # cmd += " I=" + out_sam_filter
    # cmd += " O=" + out_final
    # cmd += " SORT_ORDER=coordinate"
    # cmd += " VALIDATION_STRINGENCY=SILENT"
    # cmd += " CREATE_INDEX=true"
    # pm.run(cmd, out_final, lock_name="final_sorting")

    cmd = tools.samtools + " sort -@ " + str(
        pm.cores) + " " + out_sam_filter + " -o " + out_final
    cmd2 = tools.samtools + " index " + out_final
    pm.run([cmd, cmd2], out_final + ".bai")

    # Cleanup
    ################################################################################
    # remove temporary folders
    pm.clean_add(bismark_temp)
    pm.clean_add(sam_temp)
    pm.stop_pipeline()
Esempio n. 25
0
                    help='Target wigsum for track normalisation')

args = parser.parse_args()

if args.single_or_paired == "paired":
    args.paired_end = True
else:
    args.paired_end = False

if not args.input:
    parser.print_help()
    raise SystemExit

# Initialize
outfolder = os.path.abspath(os.path.join(args.output_parent, args.sample_name))
pm = pypiper.PipelineManager(name="rnaESAT", outfolder=outfolder, args=args)

# Tools
pm.config.tools.scripts_dir = os.path.join(
    os.path.dirname(os.path.realpath(__file__)), "tools")

# Resources
pm.config.resources.ref_genome = os.path.join(pm.config.resources.genomes,
                                              args.genome_assembly)
pm.config.resources.ref_genome_fasta = os.path.join(
    pm.config.resources.genomes, args.genome_assembly,
    args.genome_assembly + ".fa")
pm.config.resources.chrom_sizes = os.path.join(
    pm.config.resources.genomes, args.genome_assembly,
    args.genome_assembly + ".chromSizes")
pm.config.resources.bowtie_indexed_genome = os.path.join(
Esempio n. 26
0
                    action='store_true',
                    default=False,
                    dest="rmdup",
                    help="bam files already have duplicates removed")
parser.add_argument('-narrowpeak',
                    "--narrowpeak-input",
                    action='store_true',
                    default=False,
                    dest="narrowpeak",
                    help="starting with narrowpeak files")
args = parser.parse_args()

# Initialize
outfolder = os.path.abspath(os.path.join(args.output_parent, args.sample_name))
pm = pypiper.PipelineManager(name="FindNormPeaks",
                             outfolder=outfolder,
                             args=args)
ngstk = pypiper.NGSTk(pm=pm)

# Convenience alias
tools = pm.config.tools
param = pm.config.parameters
res = pm.config.resources

# Set up reference resource according to genome prefix.
gfolder = os.path.join(res.genomes, args.genome_assembly)

output = outfolder
param.outfolder = outfolder

################################################################################
Esempio n. 27
0
    help='ERCC mix. If False no ERCC analysis will be performed.')

args = parser.parse_args()

if args.single_or_paired == "paired":
    args.paired_end = True
else:
    args.paired_end = False

if not args.input:
    parser.print_help()
    raise SystemExit

# Initialize
outfolder = os.path.abspath(os.path.join(args.output_parent, args.sample_name))
pm = pypiper.PipelineManager(name="rnaNucSeq", outfolder=outfolder, args=args)

# Tools
# pm.config.tools.scripts_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), "tools")

# Resources
# pm.config.resources.ref_genome = os.path.join(pm.config.resources.genomes, args.genome_assembly)
# pm.config.resources.ref_genome_fasta = os.path.join(pm.config.resources.genomes, args.genome_assembly, args.genome_assembly + ".fa")
# pm.config.resources.chrom_sizes = os.path.join(pm.config.resources.genomes, args.genome_assembly, args.genome_assembly + ".chromSizes")

# Output
pm.config.parameters.pipeline_outfolder = outfolder

ngstk = pypiper.NGSTk(pm=pm)

tools = pm.config.tools
#' Part of the looper setup. We add two additional arguments to the parser, one is the sample id of
#' the currently processed sample and the second is the path to the bam file containing the mapped
#' reads (preferably with bsmap). These two arguments are passed through
#' config/pipeline_interface.yaml to map column names in the sample annotation sheet to the name of
#' the argument here.

parser = argparse.ArgumentParser(description="Pipeline")
parser.add_argument("--sample_id", "-o", help="id of sample to be analyzed")
parser.add_argument("--bam_name",
                    help="path to bam file of sample to be analyzed")
parser = pypiper.add_pypiper_args(parser, groups=["pypiper", "looper"])
args = parser.parse_args()

manager = pypiper.PipelineManager(name="HETEROGENEITY",
                                  outfolder=args.output_parent,
                                  args=args)

pipe_folder = os.path.dirname(sys.argv[0]) + "/"

#####################################################################################################
#' PART I: Preprocessing
#####################################################################################################
if not os.path.exists(args.output_parent + "/" + args.sample_id):
    os.makedirs(args.output_parent + "/" + args.sample_id)

sample_folder = args.output_parent + "/" + args.sample_id + "/"

#' Use script to convert bsmap style bam file to bismark style
bismark_bam = sample_folder + "bismark_bam.bam"
cmd = " ".join([
Esempio n. 29
0
    raise SystemExit

if args.single_or_paired == "paired":
    args.paired_end = True
else:
    args.paired_end = False

# args for `output_parent` and `sample_name` were added by the standard 
# `add_pypiper_args` function. 
# A good practice is to make an output folder for each sample, housed under
# the parent output folder, like this:
outfolder = os.path.abspath(os.path.join(args.output_parent, args.sample_name))

# Create a PipelineManager object and start the pipeline
pm = pypiper.PipelineManager(name="count",
                             outfolder=outfolder, 
                             args=args)

# NGSTk is a "toolkit" that comes with pypiper, providing some functions
# for dealing with genome sequence data. You can read more about toolkits in the
# documentation

# Create a ngstk object
ngstk = pypiper.NGSTk(pm=pm)

raw_folder = os.path.join(outfolder, "raw/")
fastq_folder = os.path.join(outfolder, "fastq/")

# Merge/Link sample input and Fastq conversion
# These commands merge (if multiple) or link (if single) input files,
# then convert (if necessary, for bam, fastq, or gz format) files to fastq.
Esempio n. 30
0
# Must be done here to initialize the sample name correctly
# This is now deprecated (there is no default sample name implemented)
#merge = False
#if len(args.input) > 1:
#	merge = True
#	if args.sample_name == "default":
#		args.sample_name = "merged"
#else:
#	if args.sample_name == "default":
#		# Default sample name is derived from the input file
#		args.sample_name = os.path.splitext(os.path.basename(args.input[0]))[0]

# Create a PipelineManager object and start the pipeline
pm = pypiper.PipelineManager(name="RRBS",
                             outfolder=os.path.abspath(
                                 os.path.join(args.output_parent,
                                              args.sample_name)),
                             args=args)

# Set up a few additional paths not in the config file
pm.config.tools.scripts_dir = os.path.join(
    os.path.dirname(os.path.realpath(__file__)), "tools")
pm.config.resources.ref_genome_fasta = os.path.join(
    pm.config.resources.genomes, args.genome_assembly,
    args.genome_assembly + ".fa")
pm.config.resources.chrom_sizes = os.path.join(
    pm.config.resources.genomes, args.genome_assembly,
    args.genome_assembly + ".chromSizes")
pm.config.resources.genomes_split = os.path.join(pm.config.resources.resources,
                                                 "genomes_split")
pm.config.resources.bismark_spikein_genome = os.path.join(