Example #1
0
def find_samples(path, sample=None, pattern = "-bcbb-config.yaml$", only_failed=False, **kw):
    """Find bcbb config files in a path.

    :param path: path to search in
    :param sample: a specific sample, or a file consisting of -bcbb-config.yaml files
    :param pattern: pattern to search for
    :param only_failed: if True, keep only samples whose status is "FAIL"
    :param kw: optional 'exclude_dirs'/'include_dirs' passed to filtered_walk

    :returns: list of absolute file names
    """
    def bcbb_yaml_filter(f):
        return re.search(pattern, f) is not None
    flist = []
    if sample:
        if os.path.exists(sample):
            # 'sample' is a file listing config files or sample directories
            with open(sample) as fh:
                samplelist = fh.readlines()
            flist = [x.rstrip() for x in samplelist if re.search(pattern, x)]
            if len(flist) == 0:
                # Entries were directories; search them for config files
                flist = [os.path.join(path, x.rstrip()) for x in samplelist if len(x) > 1]
                # Make sure there actually is a config file in path
                flist = list(chain.from_iterable([filtered_walk(x, bcbb_yaml_filter, exclude_dirs=kw.get("exclude_dirs", None), include_dirs=kw.get("include_dirs", None)) for x in flist]))
            if len(flist) == 0:
                return flist
        else:
            # 'sample' is a sample name: narrow the search pattern to it
            pattern = "{}{}".format(sample, pattern)
    if not flist:
        flist = filtered_walk(path, bcbb_yaml_filter, exclude_dirs=kw.get("exclude_dirs", None), include_dirs=kw.get("include_dirs", None))
    if only_failed:
        # Compute each status exactly once (previously the dict was built
        # and then discarded while _sample_status was called a second time)
        status = {x: _sample_status(x) for x in flist}
        flist = [x for x in flist if status[x] == "FAIL"]
    if len(flist) == 0 and sample:
        LOG.info("No such sample {}".format(sample))
    return [os.path.abspath(f) for f in flist]
Example #2
0
    def purge_alignments(self):
        """Cleanup sam and bam files. In some cases, sam files
        persist. If the corresponding bam file exists, replace the sam
        file contents with a message that the file has been removed to
        save space.
        """
        pattern = ".sam$"
        # NOTE: purge_filter closes over 'pattern'; rebinding 'pattern'
        # below (to ".bam$") deliberately changes what later walks match.
        def purge_filter(f):
            if not pattern:
                return
            return re.search(pattern, f) != None

        flist = filtered_walk(os.path.join(self._meta.root_path, self._meta.path_id), purge_filter)
        if len(flist) == 0:
            self.app.log.info("No sam files found")
            return
        # Confirm with the user, previewing at most the first 10 file names
        if len(flist) > 0 and not query_yes_no("Going to remove/cleanup {} sam files ({}...). Are you sure you want to continue?".format(len(flist), ",".join([os.path.basename(x) for x in flist[0:10]])), force=self.pargs.force):
            return
        for f in flist:
            self.app.log.info("Purging sam file {}".format(f))
            self.app.cmd.safe_unlink(f)
            # NOTE(review): str.replace substitutes every ".sam" occurrence in
            # the path, not just the extension — confirm paths never contain
            # ".sam" elsewhere
            if os.path.exists(f.replace(".sam", ".bam")):
                self.app.cmd.write(f, "File removed to save disk space: SAM converted to BAM")

        ## Find bam files in alignments subfolders
        pattern = ".bam$"
        flist = filtered_walk(os.path.join(self._meta.root_path, self._meta.path_id), purge_filter, include_dirs=["alignments"])
        for f in flist:
            # Replacement candidates: a "-sort.bam" sibling, or the same file
            # name one directory level up
            f_tgt = [f.replace(".bam", "-sort.bam"), os.path.join(os.path.dirname(os.path.dirname(f)),os.path.basename(f) )]
            for tgt in f_tgt:
                if os.path.exists(tgt):
                    self.app.log.info("Purging bam file {}".format(f))
                    self.app.cmd.safe_unlink(f)
                    self.app.cmd.write(f, "File removed to save disk space: Moved to {}".format(os.path.abspath(tgt)))
Example #3
0
 def remove_finished(self):
     """Remove files and directories from sample dirs marked as finished."""
     if not self._check_pargs(["project"]):
         return
     # Keep every file in the walk; selection happens per sample directory
     def filter_fn(f):
         return True
     project_dir = os.path.join(self._meta.root_path, self._meta.path_id)
     for sample in os.listdir(project_dir):
         sample_path = os.path.join(self._meta.root_path, self._meta.path_id, sample)
         if not os.path.isdir(sample_path):
             continue
         if not os.path.exists(os.path.join(sample_path, FINISHED_FILE)):
             self.app.log.info("Sample {} not finished; skipping".format(sample))
             continue
         files = filtered_walk(sample_path, filter_fn)
         dirs = filtered_walk(sample_path, filter_fn, get_dirs=True)
         if os.path.exists(os.path.join(sample_path, REMOVED_FILE)):
             self.app.log.info("Sample {} already removed; skipping".format(sample))
             continue
         if len(files) > 0 and not query_yes_no("Will remove directory {} containing {} files; continue?".format(sample, len(files)), force=self.pargs.force):
             continue
         self.app.log.info("Removing {} files from {}".format(len(files), sample_path))
         marker = os.path.join(sample_path, FINISHED_FILE)
         for fname in files:
             if fname == marker:
                 continue
             self.app.cmd.safe_unlink(fname)
         self.app.log.info("Removing {} directories from {}".format(len(dirs), sample_path))
         # Reverse-sorted paths remove subdirectories before their parents
         for dname in sorted(dirs, reverse=True):
             self.app.cmd.safe_rmdir(dname)
         if not self.pargs.dry_run:
             # Leave a timestamped marker recording the removal
             with open(os.path.join(sample_path, REMOVED_FILE), "w") as fh:
                 fh.write(utc_time())
Example #4
0
def find_samples(path, sample=None, pattern = "-bcbb-config.yaml$", only_failed=False, **kw):
    """Find bcbb config files in a path.

    :param path: path to search in
    :param sample: a specific sample, or a file consisting of -bcbb-config.yaml files
    :param pattern: pattern to search for
    :param only_failed: restrict the result to samples with status "FAIL"
    :param kw: optional 'exclude_dirs'/'include_dirs' for filtered_walk

    :returns: list of absolute file names
    """
    def bcbb_yaml_filter(f):
        return re.search(pattern, f) is not None
    flist = []
    if sample:
        if os.path.exists(sample):
            # Sample argument is a file listing configs or sample directories
            with open(sample) as fh:
                samplelist = fh.readlines()
            flist = [x.rstrip() for x in samplelist if re.search(pattern, x)]
            if len(flist) == 0:
                # Treat the entries as directories relative to 'path'
                flist = [os.path.join(path, x.rstrip()) for x in samplelist if len(x) > 1]
                # Make sure there actually is a config file in path
                flist = list(chain.from_iterable([filtered_walk(x, bcbb_yaml_filter, exclude_dirs=kw.get("exclude_dirs", None), include_dirs=kw.get("include_dirs", None)) for x in flist]))
            if len(flist) == 0:
                return flist
        else:
            # Sample argument is a name: prefix the search pattern with it
            pattern = "{}{}".format(sample, pattern)
    if not flist:
        flist = filtered_walk(path, bcbb_yaml_filter, exclude_dirs=kw.get("exclude_dirs", None), include_dirs=kw.get("include_dirs", None))
    if only_failed:
        # Evaluate each sample status once and reuse it for filtering
        # (the original recomputed _sample_status per file)
        status = {x: _sample_status(x) for x in flist}
        flist = [x for x in flist if status[x] == "FAIL"]
    if len(flist) == 0 and sample:
        LOG.info("No such sample {}".format(sample))
    return [os.path.abspath(f) for f in flist]
Example #5
0
def remove_files(f, **kw):
    """Remove old analysis files from the working directory of *f*.

    Files matching one of the keep patterns (configuration, command,
    fastq and numbered log files) are preserved.

    :param f: file whose containing directory is cleaned
    :param kw: must supply 'force' and 'dry_run' flags
    """
    ## Patterns of files that must be kept
    keep_files = [
        "-post_process.yaml$", "-post_process.yaml.bak$", "-bcbb-config.yaml$",
        "-bcbb-config.yaml.bak$", "-bcbb-command.txt$",
        "-bcbb-command.txt.bak$", "_[0-9]+.fastq$", "_[0-9]+.fastq.gz$",
        "_[0-9]+_fastq.txt.gz$", "_[0-9]+_fastq.txt$", "^[0-9][0-9]_.*.txt$",
        "JOBID", "PID"
    ]
    pattern = "|".join(keep_files)

    def remove_filter_fn(fn):
        # Remove only files matching none of the keep patterns
        return re.search(pattern, fn) is None

    workdir = os.path.dirname(f)
    # Renamed locals: the originals shadowed this function's own name
    files_to_remove = filtered_walk(workdir, remove_filter_fn)
    dirs_to_remove = filtered_walk(workdir, remove_filter_fn, get_dirs=True)
    if len(files_to_remove) == 0:
        # Nothing to do (replaces a dead 'pass' branch)
        return
    if query_yes_no(
            "Going to remove {} files and {} directories... Are you sure you want to continue?"
            .format(len(files_to_remove), len(dirs_to_remove)),
            force=kw['force']):
        for x in files_to_remove:
            dry_unlink(x, dry_run=kw['dry_run'])
        ## Sort directories longest-first so we don't accidentally try to remove a non-empty dir
        for x in sorted(dirs_to_remove, key=len, reverse=True):
            dry_rmdir(x, dry_run=kw['dry_run'])
Example #6
0
 def test_filtered_walk_get_dirs(self):
     """Perform a filtered walk of data dir, getting dirs"""
     common = dict(filter_fn=self.filter_fn,
                   include_dirs=["nophix"],
                   exclude_dirs=["fastqc"])
     # With get_dirs=True no directories should match
     dlist = filtered_walk("data", get_dirs=True, **common)
     self.assertEqual(set(dlist), set([]))
     # With get_dirs=False the single included file should match
     flist = filtered_walk("data", get_dirs=False, **common)
     self.assertEqual(set(flist), set(['data/nophix/file1.txt']))
Example #7
0
 def hs_metrics(self):
     """Run picard CalculateHsMetrics on matching bam files."""
     if not self._check_pargs(["project", "region_file"]):
         return
     if not self.pargs.bait_file:
         # Default the bait file to the region file
         self.pargs.bait_file = self.pargs.region_file
     self.log.info("hs_metrics: This is a temporary solution for calculating hs metrics for samples using picard tools")
     pattern = "{}.bam$".format(self.pargs.hs_file_type)
     def filter_fn(f):
         return re.search(pattern, f) is not None
     ### FIX ME: this isn't caught by _process_args
     path =  self.pargs.flowcell if self.pargs.flowcell else self.pargs.project
     flist = filtered_walk(os.path.join(self.config.get("production", "root"), path), filter_fn=filter_fn, exclude_dirs=['nophix', 'alignments', 'fastqc', 'fastq_screen'])
     if self.pargs.input_file:
         # An explicit input file overrides the walk result
         flist = [os.path.abspath(self.pargs.input_file)]
     if not query_yes_no("Going to run hs_metrics on {} files. Are you sure you want to continue?".format(len(flist)), force=self.pargs.force):
         return
     for f in flist:
         self.log.info("running CalculateHsMetrics on {}".format(f))
         ### Command line arguments have to be passed as separate list items;
         ### see http://stackoverflow.com/questions/9795249/issues-with-wrapping-java-program-with-pythons-subprocess-module
         cl = ["java",
               "-{}".format(self.pargs.java_opts),
               "-jar", "{}/CalculateHsMetrics.jar".format(os.getenv("PICARD_HOME")),
               "INPUT={}".format(f),
               "TARGET_INTERVALS={}".format(os.path.abspath(self.pargs.region_file)),
               "BAIT_INTERVALS={}".format(os.path.abspath(self.pargs.bait_file)),
               "OUTPUT={}".format(f.replace(".bam", ".hs_metrics")),
               "VALIDATION_STRINGENCY=SILENT"]
         out = self.app.cmd.command(cl)
         if out:
             self.app._output_data["stdout"].write(out.rstrip())
Example #8
0
    def best_practice(self):
        """Deliver best-practice analysis files to an uppmax project INBOX.

        Transfers yaml/metrics files and, unless disabled via no_bam /
        no_vcf, bam and vcf files for every sample configuration found
        under the controller's path. Optionally reports the total size.
        """
        if not self._check_pargs(["project", "uppmax_project"]):
            return
        project_path = os.path.normpath(
            os.path.join("/proj", self.pargs.uppmax_project))
        if not os.path.exists(project_path):
            self.log.warn("No such project {}; skipping".format(
                self.pargs.uppmax_project))
            return
        # Destination: explicit outdir, else statusdb name, else project name
        if self.pargs.outdir:
            outpath = os.path.join(project_path, "INBOX", self.pargs.outdir)
        else:
            outpath = os.path.join(
                project_path, "INBOX", self.pargs.statusdb_project_name
            ) if self.pargs.statusdb_project_name else os.path.join(
                project_path, "INBOX", self.pargs.project)
        if not query_yes_no(
                "Going to deliver data to {}; continue?".format(outpath)):
            return
        if not os.path.exists(outpath):
            self.app.cmd.safe_makedir(outpath)
        basedir = os.path.abspath(
            os.path.join(self._meta.root_path, self._meta.path_id))
        flist = find_samples(basedir, **vars(self.pargs))
        if not len(flist) > 0:
            self.log.info("No samples/sample configuration files found")
            return

        # Setup pattern of deliverable files BEFORE defining the filter
        # that closes over it (previously defined afterwards, relying on
        # late binding); also dropped an unused 'kw = vars(self.pargs)'
        plist = [".*.yaml$", ".*.metrics$"]
        if not self.pargs.no_bam:
            plist.append(".*-{}.bam$".format(self.pargs.bam_file_type))
            plist.append(".*-{}.bam.bai$".format(self.pargs.bam_file_type))
        if not self.pargs.no_vcf:
            plist.append(".*.vcf$")
            plist.append(".*.vcf.gz$")
            plist.append(".*.tbi$")
            plist.append(".*.tsv$")
        pattern = "|".join(plist)

        def filter_fn(f):
            return re.search(pattern, f) is not None

        size = 0
        for f in flist:
            path = os.path.dirname(f)
            sources = filtered_walk(path,
                                    filter_fn=filter_fn,
                                    exclude_dirs=BCBIO_EXCLUDE_DIRS)
            # Destination paths mirror the source tree under outpath
            targets = [src.replace(basedir, outpath) for src in sources]
            self._transfer_files(sources, targets)
            if self.pargs.size:
                statinfo = [os.stat(src).st_size for src in sources]
                size = size + sum(statinfo)
        if self.pargs.size:
            self.app._output_data['stderr'].write(
                "\n********************************\nEstimated delivery size: {:.1f}G\n********************************"
                .format(size / 1e9))
def get_file_copy_list(proj_base_dir, dest_proj_path, fcid, deliver_all_fcs, deliver_nophix, skip_list):
    """Build the list of fastq files to copy for delivery.

    :param proj_base_dir: project directory to walk for fastq files
    :param dest_proj_path: destination project directory
    :param fcid: flowcell id to restrict the walk to when deliver_all_fcs is False
    :param deliver_all_fcs: if True, deliver from all flowcells
    :param deliver_nophix: if True, deliver only files under "nophix" dirs
    :param skip_list: directory names excluded from the walk

    :returns: list of [source_file, destination_run_path, destination_file_name]
    """
    to_copy = []
    for fqfile in filtered_walk(
        proj_base_dir, is_fastq, include_dirs=[fcid] if not deliver_all_fcs else None, exclude_dirs=skip_list
    ):

        # Get the run_name and sample_name from the path
        # assumes layout <sample_name>/<run_name>/... under proj_base_dir,
        # with run_name of the form <date>_<fc_id> — TODO confirm
        sample_name, run_name, _ = os.path.relpath(fqfile, proj_base_dir).split(os.sep, 2)
        date, fc_id = run_name.split("_")

        # Skip if we deliver from nophix and the parent dir is not nophix (or vice versa)
        pdir = os.path.basename(os.path.dirname(fqfile))
        if deliver_nophix and pdir != "nophix":
            continue
        if not deliver_nophix and pdir != run_name:
            continue

        # Skip if a compressed version of the current file exists
        if os.path.exists("{:s}.gz".format(fqfile)):
            print (
                "WARNING: Both compressed and non-compressed versions of {:s} exists! "
                "Is compression/decompression in progress? Will deliver compressed version "
                "but you should make sure that the delivered files are complete!".format(fqfile)
            )
            continue

        print ("DEBUG: source_delivery_path = {:s}".format(os.path.dirname(fqfile)))

        fname = os.path.basename(fqfile)
        print (fname)

        # Destination keeps the <sample>/<run> layout; the file itself is
        # renamed by create_final_name
        dest_run_path = os.path.join(dest_proj_path, sample_name, run_name)
        dest_file_name = create_final_name(fname, date, fc_id, sample_name)
        to_copy.append([fqfile, dest_run_path, dest_file_name])
    return to_copy
Example #10
0
 def test_filtered_walk_include_exclude(self):
     """Perform a filtered walk of data dir, using include_dirs and exclude_dirs restriction"""
     result = filtered_walk("data", filter_fn=self.filter_fn,
                            include_dirs=["nophix"], exclude_dirs=["fastqc"])
     # Only the nophix file outside fastqc survives both restrictions
     self.assertEqual(set(result), set(['data/nophix/file1.txt']))
Example #11
0
 def hs_metrics(self):
     """Run picard CalculateHsMetrics on bam files of the configured
     hs_file_type found under the project/flowcell directory.
     """
     if not self._check_pargs(["project", "targets"]):
         return
     if not self.pargs.baits:
         # Default baits to the targets file
         self.pargs.baits = self.pargs.targets
     self.log.info("hs_metrics: This is a temporary solution for calculating hs metrics for samples using picard tools")
     pattern = "{}.bam$".format(self.pargs.hs_file_type)
     def filter_fn(f):
         return re.search(pattern, f) is not None
     ### FIX ME: this isn't caught by _process_args
     path =  self.pargs.flowcell if self.pargs.flowcell else self.pargs.project
     basedir = os.path.abspath(os.path.join(self.app.controller._meta.root_path, self.app.controller._meta.path_id))
     # Restrict the walk to directories containing sample configurations
     # (dropped a dead 'flist = []' that was immediately overwritten)
     samples = find_samples(basedir, **vars(self.pargs))
     inc_dirs = [os.path.dirname(x) for x in samples]
     flist = filtered_walk(os.path.join(self.config.get(self.app.controller._meta.label, "root"), path), filter_fn=filter_fn, exclude_dirs=['nophix', 'alignments', 'fastqc', 'fastq_screen'], include_dirs=inc_dirs)
     if not query_yes_no("Going to run hs_metrics on {} files. Are you sure you want to continue?".format(len(flist)), force=self.pargs.force):
         return
     for f in flist:
         self.log.info("running CalculateHsMetrics on {}".format(f))
         ### Command line arguments must be separate list items; see
         ### http://stackoverflow.com/questions/9795249/issues-with-wrapping-java-program-with-pythons-subprocess-module
         cl = ["java",
               "-{}".format(self.pargs.java_opts),
               "-jar", "{}/CalculateHsMetrics.jar".format(os.getenv("PICARD_HOME")),
               "INPUT={}".format(f),
               "TARGET_INTERVALS={}".format(os.path.abspath(self.pargs.targets)),
               "BAIT_INTERVALS={}".format(os.path.abspath(self.pargs.baits)),
               "OUTPUT={}".format(f.replace(".bam", ".hs_metrics")),
               "VALIDATION_STRINGENCY=SILENT"]
         out = self.app.cmd.command(cl)
         if out:
             self.app._output_data["stdout"].write(out.rstrip())
Example #12
0
def remove_files(f, **kw):
    """Remove old files from the working directory of *f*, preserving
    configuration, command, fastq and numbered log files.

    :param f: file whose containing directory is cleaned
    :param kw: must supply 'force' and 'dry_run' flags
    """
    ## Patterns of files that must be kept
    keep_files = ["-post_process.yaml$", "-post_process.yaml.bak$", "-bcbb-config.yaml$", "-bcbb-config.yaml.bak$",  "-bcbb-command.txt$", "-bcbb-command.txt.bak$", "_[0-9]+.fastq$", "_[0-9]+.fastq.gz$", "_[0-9]+_fastq.txt.gz$", "_[0-9]+_fastq.txt$",
                  "^[0-9][0-9]_.*.txt$", "JOBID", "PID"]
    pattern = "|".join(keep_files)
    def remove_filter_fn(fn):
        # A file is removable when it matches none of the keep patterns
        return re.search(pattern, fn) is None

    workdir = os.path.dirname(f)
    # Locals renamed to stop shadowing this function's own name
    files_to_remove = filtered_walk(workdir, remove_filter_fn)
    dirs_to_remove = filtered_walk(workdir, remove_filter_fn, get_dirs=True)
    if len(files_to_remove) == 0:
        # Nothing to remove (replaces a dead 'pass' branch)
        return
    if query_yes_no("Going to remove {} files and {} directories... Are you sure you want to continue?".format(len(files_to_remove), len(dirs_to_remove)), force=kw['force']):
        for x in files_to_remove:
            dry_unlink(x, dry_run=kw['dry_run'])
        ## Sort directories longest-first so we don't accidentally try to remove a non-empty dir
        for x in sorted(dirs_to_remove, key=len, reverse=True):
            dry_rmdir(x, dry_run=kw['dry_run'])
Example #13
0
    def _to_casava_structure(self, fc):
        """Transfer a flowcell's sample files into a casava-style
        project layout (<project>/data/<sample>/<fc_id>) and rewrite the
        per-sample bcbb config files accordingly.

        :param fc: flowcell object providing samples, fc_id() and path
        """
        transfer_status = {}
        outdir_pfx = os.path.abspath(os.path.join(self.app.config.get("project", "root"), self.pargs.project, "data"))
        if self.pargs.transfer_dir:
            # An explicit transfer dir overrides the project name
            outdir_pfx = os.path.abspath(
                os.path.join(self.app.config.get("project", "root"), self.pargs.transfer_dir, "data")
            )
        for sample in fc:
            key = "{}_{}".format(sample["lane"], sample["sequence"])
            sources = {"files": self._prune_sequence_files(sample["files"]), "results": sample["results"]}
            outdir = os.path.join(outdir_pfx, sample["name"], fc.fc_id())
            dirs = {
                "data": os.path.abspath(os.path.join(outdir_pfx, sample["name"], fc.fc_id())),
                "intermediate": os.path.abspath(os.path.join(outdir_pfx, sample["name"], fc.fc_id())),
            }
            self._make_output_dirs(dirs)
            # Per-sample view of the flowcell, restricted to this lane/name
            fc_new = fc.subset("lane", sample["lane"]).subset("name", sample["name"])
            targets = {
                "files": [src.replace(fc.path, dirs["data"]) for src in sources["files"]],
                "results": [src.replace(fc.path, dirs["intermediate"]) for src in sources["results"]],
            }

            fc_new.lane_files = dict(
                (k, [os.path.join(outdir, os.path.basename(x)) for x in v]) for k, v in fc_new.lane_files.items()
            )
            fc_new.set_entry(key, "files", targets["files"])
            fc_new.set_entry(key, "results", targets["results"])
            ## Copy sample files - currently not doing lane files
            self._transfer_files(sources, targets)
            self.app.cmd.write(
                os.path.join(dirs["data"], "{}-bcbb-pm-config.yaml".format(sample["name"])), fc_new.as_yaml()
            )
            transfer_status[sample["name"]] = {"files": len(sources["files"]), "results": len(sources["results"])}
        ## Rewrite platform_args; only keep time, workdir, account, partition, outpath and jobname
        # NOTE(review): 'dirs' below is the loop variable from the LAST
        # sample iteration; this raises NameError if fc is empty and only
        # rewrites configs in the last sample's data dir — confirm intended
        pattern = "-post_process.yaml$"

        def pp_yaml_filter(f):
            return re.search(pattern, f) != None

        ppfiles = filtered_walk(dirs["data"], pp_yaml_filter)
        for pp in ppfiles:
            self.app.log.debug("Rewriting platform args for {}".format(pp))
            # NOTE(review): yaml.load without an explicit Loader is unsafe on
            # untrusted input and deprecated in newer PyYAML — confirm source
            with open(pp, "r") as fh:
                conf = yaml.load(fh)
            if not conf:
                self.app.log.warn("No configuration for {}".format(pp))
                continue
            newconf = prune_pp_platform_args(conf)
            if newconf == conf:
                continue
            self.app.cmd.safe_unlink(pp)
            self.app.cmd.write(pp, yaml.safe_dump(newconf, default_flow_style=False, allow_unicode=True, width=1000))

        # Write transfer summary
        self.app._output_data["stderr"].write("Transfer summary\n")
        self.app._output_data["stderr"].write("{:<18}{:>18}{:>18}\n".format("Sample", "Transferred files", "Results"))
        # NOTE: dict.iteritems implies this module targets Python 2
        for k, v in transfer_status.iteritems():
            self.app._output_data["stderr"].write("{:<18}{:>18}{:>18}\n".format(k, v["files"], v["results"]))
Example #14
0
 def test_filtered_walk(self):
     """Perform a filtered walk of data dir"""
     expected = [
         'data/file1.txt', 'data/alignments/file1.txt',
         'data/nophix/file1.txt', 'data/nophix/fastqc/file1.txt',
         'data/fastqc/file1.txt', 'data/fastqc/nophix/file1.txt'
     ]
     # With no dir restrictions every matching file is returned
     result = filtered_walk("data", filter_fn=self.filter_fn)
     self.assertEqual(set(result), set(expected))
Example #15
0
 def ls(self):
     """List the entity's files, filtered by extension when configured."""
     if self._meta.path_id == "":
         self._ls(self._meta.root_path, filter_output=True)
     else:
         if self._meta.file_ext:
             # NOTE(review): 'pattern' is built here but never used; the walk
             # filters with 'file_filter', which is not defined in this
             # method — presumably a module-level helper. Confirm it applies
             # the intended extension pattern.
             pattern = "|".join(["{}$".format(x) for x in self._meta.file_ext])
             flist = filtered_walk(os.path.join(self._meta.root_path, self._meta.path_id), file_filter)
             if flist:
                 self.app._output_data["stdout"].write("\n".join(flist))
         else:
             self._ls(os.path.join(self._meta.root_path, self._meta.path_id))
Example #16
0
 def test_remove_files(self):
     """Test removing files"""
     keep_files = ["-post_process.yaml$", "-post_process.yaml.bak$", "-bcbb-config.yaml$", "-bcbb-config.yaml.bak$",  "-bcbb-command.txt$", "-bcbb-command.txt.bak$", "_[0-9]+.fastq$", "_[0-9]+.fastq.gz$", "^[0-9][0-9]_.*.txt$"]
     keep_pattern = "|".join(keep_files)
     def remove_filter_fn(f):
         # Candidate for removal only if no keep pattern matches
         return re.search(keep_pattern, f) is None
     for sample_conf in find_samples(j_doe_00_05):
         candidates = filtered_walk(os.path.dirname(sample_conf), remove_filter_fn)
         self.assertNotIn("01_analysis_start.txt", [os.path.basename(x) for x in candidates])
Example #17
0
 def test_remove_dirs(self):
     """Test removing directories before rerunning pipeline"""
     keep_files = ["-post_process.yaml$", "-post_process.yaml.bak$", "-bcbb-config.yaml$", "-bcbb-config.yaml.bak$",  "-bcbb-command.txt$", "-bcbb-command.txt.bak$", "_[0-9]+.fastq$", "_[0-9]+.fastq.gz$"]
     keep_pattern = "|".join(keep_files)
     def remove_filter_fn(f):
         return re.search(keep_pattern, f) is None
     for sample_conf in find_samples(j_doe_00_05):
         # fastqc output dirs should be removable
         candidates = filtered_walk(os.path.dirname(sample_conf), remove_filter_fn, get_dirs=True)
         self.assertIn("fastqc", [os.path.basename(x) for x in candidates])
Example #18
0
 def ls(self):
     """List the entity's files, restricted by extension when configured."""
     if self._meta.path_id == "":
         self._ls(self._meta.root_path, filter_output=True)
     else:
         if self._meta.file_ext:
             # NOTE(review): 'pattern' is computed but unused; the walk uses
             # an externally-defined 'file_filter' — verify that filter
             # matches the extension pattern built here.
             pattern = "|".join(["{}$".format(x) for x in self._meta.file_ext])
             flist = filtered_walk(os.path.join(self._meta.root_path, self._meta.path_id), file_filter)
             if flist:
                 self.app._output_data["stdout"].write("\n".join(flist))
         else:
             self._ls(os.path.join(self._meta.root_path, self._meta.path_id))
Example #19
0
 def test_filtered_walk_exclude(self):
     """Perform a filtered walk of data dir, using exclude_dirs restriction"""
     expected = set([
         'data/file1.txt', 'data/alignments/file1.txt',
         'data/fastqc/file1.txt'
     ])
     # Excluding nophix prunes that subtree wherever it appears
     result = filtered_walk("data", filter_fn=self.filter_fn, exclude_dirs=["nophix"])
     self.assertEqual(set(result), expected)
Example #20
0
    def remove_finished(self):
        """Remove files and directories from sample directories that have
        been marked as finished, leaving a removal marker behind.
        """
        if not self._check_pargs(["project"]):
            return
        # Don't filter out files
        def filter_fn(f):
            return True

        slist = os.listdir(
            os.path.join(self._meta.root_path, self._meta.path_id))
        for s in slist:
            spath = os.path.join(self._meta.root_path, self._meta.path_id, s)
            if not os.path.isdir(spath):
                continue
            if not os.path.exists(os.path.join(spath, FINISHED_FILE)):
                self.app.log.info("Sample {} not finished; skipping".format(s))
                continue
            flist = filtered_walk(spath, filter_fn)
            dlist = filtered_walk(spath, filter_fn, get_dirs=True)
            if os.path.exists(os.path.join(spath, REMOVED_FILE)):
                self.app.log.info(
                    "Sample {} already removed; skipping".format(s))
                continue
            if len(flist) > 0 and not query_yes_no(
                    "Will remove directory {} containing {} files; continue?".
                    format(s, len(flist)),
                    force=self.pargs.force):
                continue
            self.app.log.info("Removing {} files from {}".format(
                len(flist), spath))
            for f in flist:
                # Keep the FINISHED_FILE marker itself
                if f == os.path.join(spath, FINISHED_FILE):
                    continue
                self.app.cmd.safe_unlink(f)
            self.app.log.info("Removing {} directories from {}".format(
                len(dlist), spath))
            # Reverse-sorted so subdirectories go before their parents
            for d in sorted(dlist, reverse=True):
                self.app.cmd.safe_rmdir(d)
            if not self.pargs.dry_run:
                # Record removal time in a marker file
                with open(os.path.join(spath, REMOVED_FILE), "w") as fh:
                    t_utc = utc_time()
                    fh.write(t_utc)
Example #21
0
 def test_filtered_walk_include(self):
     """Perform a filtered walk of data dir, using include_dirs restriction"""
     self.pattern = "file2.txt"
     expected = set([
         'data/nophix/file2.txt', 'data/nophix/fastqc/file2.txt',
         'data/fastqc/nophix/file2.txt'
     ])
     # Only files under a nophix directory (at any depth) are returned
     result = filtered_walk("data", filter_fn=self.filter_fn, include_dirs=["nophix"])
     self.assertEqual(set(result), expected)
Example #22
0
 def test_casava_transfer(self):
     """Test transfer of casava data from production to project"""
     self.app = self.make_app(argv = ['production', 'transfer', 'J.Doe_00_03', '--debug', '--force', '--quiet'], extensions=[])
     handler.register(ProductionController)
     self._run_app()
     os.chdir(filedir)
     project_dir = os.path.abspath(os.path.join(filedir, "data", "projects", "j_doe_00_03"))
     pattern = ".fastq(.gz)?$"
     def fastq_filter(f):
         return re.search(pattern, f) is not None
     # Exactly two fastq files should have been transferred
     self.assertEqual(len(filtered_walk(project_dir, fastq_filter)), 2)
Example #23
0
 def setUpClass(cls):
     """Stage test data: copy j_doe_00_01 to j_doe_00_04 and record one config."""
     if os.getcwd() != filedir:
         os.chdir(filedir)
     LOG.info("Copy tree {} to {}".format(j_doe_00_01, j_doe_00_04))
     if not os.path.exists(j_doe_00_04):
         shutil.copytree(j_doe_00_01, j_doe_00_04)
     pattern = "-bcbb-config.yaml$"
     def yaml_filter(f):
         return re.search(pattern, f) is not None
     yaml_files = filtered_walk(j_doe_00_04, yaml_filter)
     # Write only the first config file path to the sample file
     with open(SAMPLEFILE, "w") as fh:
         fh.write("\n".join(yaml_files[0:1]))
Example #24
0
 def clean(self):
     """Remove files matching the configured extensions (plain or compressed)."""
     if not self._check_pargs(["project"]):
         return
     self._meta.pattern = "|".join(["{}(.gz|.bz2)?$".format(x) for x in self._meta.file_ext])
     root = os.path.join(self._meta.root_path, self._meta.path_id)
     flist = filtered_walk(root, self._filter_fn, include_dirs=self._meta.include_dirs)
     if not flist:
         self.app.log.info("No files matching pattern '{}' found".format(self._meta.pattern))
         return
     preview = ",".join([os.path.basename(x) for x in flist[0:10]])
     if not query_yes_no("Going to remove {} files ({}...). Are you sure you want to continue?".format(len(flist), preview), force=self.pargs.force):
         return
     for f in flist:
         self.app.log.info("removing {}".format(f))
         self.app.cmd.safe_unlink(f)
Example #25
0
 def clean(self):
     """Remove files matching the configured extensions, with or without
     .gz/.bz2 compression suffixes, after user confirmation.
     """
     if not self._check_pargs(["project"]):
         return
     # Build the match pattern from the entity's file extensions
     self._meta.pattern = "|".join(["{}(.gz|.bz2)?$".format(x) for x in self._meta.file_ext])
     flist = filtered_walk(os.path.join(self._meta.root_path, self._meta.path_id), self._filter_fn, include_dirs=self._meta.include_dirs)
     if len(flist) == 0:
         self.app.log.info("No files matching pattern '{}' found".format(self._meta.pattern))
         return
     # Confirm, previewing at most the first 10 file names
     if len(flist) > 0 and not query_yes_no("Going to remove {} files ({}...). Are you sure you want to continue?".format(len(flist), ",".join([os.path.basename(x) for x in flist[0:10]])), force=self.pargs.force):
         return
     for f in flist:
         self.app.log.info("removing {}".format(f))
         self.app.cmd.safe_unlink(f)
Example #26
0
    def _compress(self, label="compress"):
        """Run the configured compression program on all files matching the
        entity's filter, or on a single explicitly given input file.

        :param label: verb used for logging and the drmaa log file name
        """
        if self.pargs.input_file:
            flist = [self.pargs.input_file]
        else:
            flist = filtered_walk(os.path.join(self._meta.root_path, self._meta.path_id), self._filter_fn)

        if len(flist) == 0:
            self.app.log.info("No files matching pattern '{}' found".format(self._meta.pattern))
            return
        # NOTE(review): declining the prompt exits the whole process via
        # sys.exit() rather than returning like sibling commands — confirm
        if len(flist) > 0 and not query_yes_no("Going to {} {} files ({}...). Are you sure you want to continue?".format(label, len(flist), ",".join([os.path.basename(x) for x in flist[0:10]])), force=self.pargs.force):
            sys.exit()
        for f in flist:
            self.log.info("{}ing {}".format(label, f))
            # One compression job per file; the drmaa log is written next to it
            self.app.cmd.command([self._meta.compress_prog, self._meta.compress_opt, "%s" % f], label, ignore_error=True, **{'workingDirectory':os.path.dirname(f), 'outputPath':os.path.join(os.path.dirname(f), "{}-{}-drmaa.log".format(label, os.path.basename(f)))})
Example #27
0
    def _compress(self, label="compress"):
        """Compress matching files (or one explicit input file) via the
        configured compression program, one job per file.

        :param label: verb used for logging and the drmaa log file name
        """
        if self.pargs.input_file:
            flist = [self.pargs.input_file]
        else:
            flist = filtered_walk(os.path.join(self._meta.root_path, self._meta.path_id), self._filter_fn)

        if not flist:
            self.app.log.info("No files matching pattern '{}' found".format(self._meta.pattern))
            return
        preview = ",".join([os.path.basename(x) for x in flist[0:10]])
        if not query_yes_no("Going to {} {} files ({}...). Are you sure you want to continue?".format(label, len(flist), preview), force=self.pargs.force):
            sys.exit()
        for fname in flist:
            self.log.info("{}ing {}".format(label, fname))
            fdir = os.path.dirname(fname)
            log_path = os.path.join(fdir, "{}-{}-drmaa.log".format(label, os.path.basename(fname)))
            self.app.cmd.command([self._meta.compress_prog, self._meta.compress_opt, "%s" % fname],
                                 label, ignore_error=True,
                                 workingDirectory=fdir, outputPath=log_path)
Example #28
0
    def setUpClass(cls):
        """Stage test data: copy the j_doe_00_01 tree to j_doe_00_04 and
        record the first bcbb config file found in SAMPLEFILE.
        """
        if not os.getcwd() == filedir:
            os.chdir(filedir)
        LOG.info("Copy tree {} to {}".format(j_doe_00_01, j_doe_00_04))
        if not os.path.exists(j_doe_00_04):
            shutil.copytree(j_doe_00_01, j_doe_00_04)
        pattern = "-bcbb-config.yaml$"

        def yaml_filter(f):
            return re.search(pattern, f) != None

        yaml_files = filtered_walk(j_doe_00_04, yaml_filter)
        # Only the first config file is written to the sample file
        with open(SAMPLEFILE, "w") as fh:
            fh.write("\n".join(yaml_files[0:1]))
Example #29
0
 def best_practice(self):
     """Deliver best-practice analysis results to a project's INBOX.

     Locates bcbb sample configuration files under the analysis root,
     optionally restricts delivery to a single flowcell, and transfers
     yaml/metrics (and optionally bam/vcf) result files to the uppmax
     project INBOX, reporting an estimated delivery size if requested.
     """
     if not self._check_pargs(["project", "uppmax_project"]):
         return
     project_path = os.path.normpath(os.path.join("/proj", self.pargs.uppmax_project))
     if not os.path.exists(project_path):
         self.log.warn("No such project {}; skipping".format(self.pargs.uppmax_project))
         return
     if self.pargs.outdir:
         outpath = os.path.join(project_path, "INBOX", self.pargs.outdir)
     else:
         outpath = os.path.join(project_path, "INBOX", self.pargs.statusdb_project_name) if self.pargs.statusdb_project_name else os.path.join(project_path, "INBOX", self.pargs.project)
     if not query_yes_no("Going to deliver data to {}; continue?".format(outpath)):
         return
     if not os.path.exists(outpath):
         self.app.cmd.safe_makedir(outpath)
     basedir = os.path.abspath(os.path.join(self._meta.root_path, self._meta.path_id))
     flist = find_samples(basedir, **vars(self.pargs))
     if self.pargs.flowcell:
         flist = [fl for fl in flist if os.path.basename(os.path.dirname(fl)) == self.pargs.flowcell]
     if not len(flist) > 0:
         self.log.info("No samples/sample configuration files found")
         return
     # Build the pattern of deliverable file types before defining the
     # filter closure that reads it.
     plist = [".*.yaml$", ".*.metrics$"]
     if not self.pargs.no_bam:
         plist.append(".*-{}.bam$".format(self.pargs.bam_file_type))
         plist.append(".*-{}.bam.bai$".format(self.pargs.bam_file_type))
     if not self.pargs.no_vcf:
         plist.append(".*.vcf$")
         plist.append(".*.vcf.gz$")
         plist.append(".*.tbi$")
         plist.append(".*.tsv$")
     pattern = "|".join(plist)
     def filter_fn(f):
         if not pattern:
             return
         return re.search(pattern, f) != None
     size = 0
     for f in flist:
         path = os.path.dirname(f)
         sources = filtered_walk(path, filter_fn=filter_fn, exclude_dirs=BCBIO_EXCLUDE_DIRS)
         # Mirror the source tree layout under the INBOX destination.
         targets = [src.replace(basedir, outpath) for src in sources]
         self._transfer_files(sources, targets)
         if self.pargs.size:
             statinfo = [os.stat(src).st_size for src in sources]
             size = size + sum(statinfo)
     if self.pargs.size:
         self.app._output_data['stderr'].write("\n********************************\nEstimated delivery size: {:.1f}G\n********************************".format(size/1e9))
Example #30
0
 def _from_casava_structure(self):
     """Get information from casava structure"""
     if not self._check_pargs(["project"]):
         return
     pattern = "-bcbb-config.yaml$"
     def bcbb_yaml_filter(fname):
         # Match bcbb sample configuration files only.
         return re.search(pattern, fname) is not None
     root = os.path.join(self._meta.root_path, self._meta.path_id)
     fc_list = []
     for sample_conf in filtered_walk(root, bcbb_yaml_filter):
         fc = Flowcell(sample_conf)
         fc_new = fc.subset("sample_prj", self.pargs.project)
         fc_new.collect_files(os.path.dirname(sample_conf))
         fc_list.append(fc_new)
     return fc_list
Example #31
0
 def _from_casava_structure(self):
     """Collect per-sample flowcell information from a casava-style
     directory tree, restricted to the requested project."""
     if not self._check_pargs(["project"]):
         return
     config_pattern = "-bcbb-config.yaml$"
     def bcbb_yaml_filter(f):
         return bool(re.search(config_pattern, f))
     samples = filtered_walk(os.path.join(self._meta.root_path, self._meta.path_id), bcbb_yaml_filter)
     flowcells = []
     for s in samples:
         subset = Flowcell(s).subset("sample_prj", self.pargs.project)
         subset.collect_files(os.path.dirname(s))
         flowcells.append(subset)
     return flowcells
Example #32
0
    def clean(self):
        """Remove files matching the controller's configured file
        patterns (optionally gzip/bzip2-compressed), after asking the
        user for confirmation.
        """
        # Each configured pattern may also match a .gz or .bz2 variant.
        pattern = "|".join("{}(.gz|.bz2)?$".format(p) for p in self._meta.file_pat)

        def clean_filter(fname):
            if not pattern:
                return
            return re.search(pattern, fname) is not None

        root = os.path.join(self._meta.root_path, self._meta.path_id)
        flist = filtered_walk(root, clean_filter, include_dirs=self._meta.include_dirs)
        if not flist:
            self.app.log.info("No files matching pattern {} found".format(pattern))
            return
        preview = ",".join(os.path.basename(x) for x in flist[0:10])
        question = "Going to remove {} files ({}...). Are you sure you want to continue?".format(len(flist), preview)
        if not query_yes_no(question, force=self.pargs.force):
            return
        for fname in flist:
            self.app.log.info("removing {}".format(fname))
            self.app.cmd.safe_unlink(fname)
def get_report_copy_list(proj_name, reportpath, dest_proj_path, sample_copy_list):
    """Extend *sample_copy_list* with project and per-flowcell summary
    PDF reports found under *reportpath*.

    :param proj_name: project name used to build report file names
    :param reportpath: directory tree to search for PDF reports
    :param dest_proj_path: destination directory for copied reports
    :param sample_copy_list: existing copy list; also used to derive
        the flowcell ids to look for

    :returns: sample_copy_list plus [source, destination, name] entries
        for each matching report
    """
    fcid = get_run_info(sample_copy_list)
    project_report_name = proj_name + '_project_summary.pdf'
    to_copy = []
    for report in filtered_walk(reportpath, is_pdf):
        basename = report.split('/')[-1]
        if basename == project_report_name:
            to_copy.append([report, dest_proj_path, project_report_name])
        for flowcell in fcid:
            sample_report_name = proj_name + '_' + flowcell + '_sample_summary.pdf'
            if basename == sample_report_name:
                to_copy.append([report, dest_proj_path, sample_report_name])
    return sample_copy_list + to_copy
Example #34
0
def get_report_copy_list(proj_name, reportpath, dest_proj_path,
                         sample_copy_list):
    """Collect summary PDF reports to deliver for a project.

    Walks *reportpath* for PDF files and records the project summary
    report plus one sample summary report per flowcell derived from
    *sample_copy_list*.

    :returns: *sample_copy_list* extended with
        ``[source, dest_dir, file_name]`` entries
    """
    flowcell_ids = get_run_info(sample_copy_list)
    pdf_reports = filtered_walk(reportpath, is_pdf)
    project_summary = proj_name + '_project_summary.pdf'

    extra = []
    for report in pdf_reports:
        name = report.split('/')[-1]
        if name == project_summary:
            extra.append([report, dest_proj_path, project_summary])
        for fc in flowcell_ids:
            sample_summary = proj_name + '_' + fc + '_sample_summary.pdf'
            if name == sample_summary:
                extra.append([report, dest_proj_path, sample_summary])

    return sample_copy_list + extra
Example #35
0
 def run(self):
     """Rerun the bcbb pipeline for samples in a project.

     Gathers bcbb sample configuration files (optionally restricted to
     one sample or to failed samples only), rewrites analysis settings
     into a -pm-bcbb-analysis-config.yaml copy, and launches
     ``automated_initial_analysis.py`` once per sample configuration.
     """
     if not self._check_pargs(["project", "post_process", "analysis_type"]):
         return
     ## Gather sample yaml files
     pattern = "-bcbb-config.yaml$"
     flist = []
     if self.pargs.sample:
         if os.path.exists(self.pargs.sample):
             # The sample argument is a file listing config paths.
             with open(self.pargs.sample) as fh:
                 flist = [x.rstrip() for x in fh.readlines()]
         else:
             # Restrict the walk pattern to the named sample.
             pattern = "{}{}".format(self.pargs.sample, pattern)
     def bcbb_yaml_filter(f):
         return re.search(pattern, f) != None
     if not flist:
         flist = filtered_walk(os.path.join(self.app.controller._meta.project_root, self.pargs.project, "data"), bcbb_yaml_filter)
     if self.pargs.only_failed:
         # Compute each sample's status once and reuse the result
         # (previously _sample_status was called twice per file).
         status = {x: self._sample_status(x) for x in flist}
         flist = [x for x in flist if status[x] == "FAIL"]
     if len(flist) == 0 and self.pargs.sample:
         self.app.log.info("No such sample {}".format(self.pargs.sample))
     if len(flist) > 0 and not query_yes_no("Going to start {} jobs... Are you sure you want to continue?".format(len(flist)), force=self.pargs.force):
         return
     for f in flist:
         with open(f) as fh:
             # NOTE(review): yaml.load without an explicit Loader is
             # unsafe on untrusted input; these configs are
             # project-generated, so behavior is kept as-is.
             config = yaml.load(fh)
         if self.pargs.analysis_type:
             config["details"][0]["multiplex"][0]["analysis"] = self.pargs.analysis_type
             config["details"][0]["analysis"] = self.pargs.analysis_type
         if config["details"][0]["genome_build"] == 'unknown':
             config["details"][0]["genome_build"] = self.pargs.genome_build
         ## Check if files exist: if they don't, then change the suffix
         config["details"][0]["multiplex"][0]["files"].sort()
         if not os.path.exists(config["details"][0]["multiplex"][0]["files"][0]):
             if os.path.splitext(config["details"][0]["multiplex"][0]["files"][0])[1] == ".gz":
                 config["details"][0]["multiplex"][0]["files"] = [x.replace(".gz", "") for x in config["details"][0]["multiplex"][0]["files"]]
             else:
                 config["details"][0]["multiplex"][0]["files"] = ["{}.gz".format(x) for x in config["details"][0]["multiplex"][0]["files"]]
         config_file = f.replace("-bcbb-config.yaml", "-pm-bcbb-analysis-config.yaml")
         self.app.cmd.write(config_file, yaml.dump(config))
         ## Run automated_initial_analysis.py
         cur_dir = os.getcwd()
         new_dir = os.path.abspath(os.path.dirname(f))
         os.chdir(new_dir)
         self.app.cmd.command(['automated_initial_analysis.py', os.path.abspath(self.pargs.post_process), new_dir, config_file])
         os.chdir(cur_dir)
Example #36
0
    def collect_files(self, path, project=None):
        """Collect files for a given project.

        FIXME: does not work entirely for casava-like folder structure"""
        # Optionally narrow the flowcell to the requested project.
        fc = self.subset("sample_prj", project) if project else self
        pattern = "|".join(fc.glob_pfx_str())

        def file_filter(fname):
            if not pattern:
                return
            return re.search(pattern, fname) is not None

        for match in filtered_walk(path, file_filter):
            self.classify_file(match)
        fc.path = path
        return fc
Example #37
0
    def test_casava_transfer(self):
        """Test transfer of casava data from production to project"""
        # Run the production transfer controller non-interactively
        # (--force skips confirmation prompts).
        self.app = self.make_app(argv=[
            'production', 'transfer', 'J.Doe_00_03', '--debug', '--force',
            '--quiet'
        ],
                                 extensions=[])
        handler.register(ProductionController)
        self._run_app()
        # Return to the test directory before inspecting results.
        os.chdir(filedir)
        j_doe_00_03 = os.path.abspath(
            os.path.join(filedir, "data", "projects", "j_doe_00_03"))
        pattern = ".fastq(.gz)?$"

        def fastq_filter(f):
            # Match fastq files, compressed or not.
            return re.search(pattern, f) != None

        # The transfer is expected to leave exactly two fastq files
        # in the destination project tree.
        fastq_files = filtered_walk(j_doe_00_03, fastq_filter)
        self.assertEqual(len(fastq_files), 2)
Example #38
0
    def _compress(self, pattern, label="compress"):
        """Run the configured (de)compression program on all files
        matching *pattern*, or on a single explicit input file.

        :param pattern: regular expression selecting files to process
        :param label: operation label used in logs and prompts
        """
        def compress_filter(fname):
            if not pattern:
                return
            return re.search(pattern, fname) is not None

        if self.pargs.input_file:
            flist = [self.pargs.input_file]
        else:
            root = os.path.join(self._meta.root_path, self._meta.path_id)
            flist = filtered_walk(root, compress_filter)

        if not flist:
            self.app.log.info("No files matching pattern {} found".format(pattern))
            return
        names = ",".join(os.path.basename(x) for x in flist[0:10])
        if not query_yes_no("Going to {} {} files ({}...). Are you sure you want to continue?".format(label, len(flist), names), force=self.pargs.force):
            sys.exit()
        for fname in flist:
            self.log.info("{}ing {}".format(label, fname))
            self.app.cmd.command([self._meta.compress_prog, self._meta.compress_opt, "%s" % fname], label, ignore_error=True)
Example #39
0
    def test_remove_files(self):
        """Test removing files"""
        # Files matching any of these patterns must be preserved; the
        # removal filter selects everything else.
        keep_files = [
            "-post_process.yaml$", "-post_process.yaml.bak$",
            "-bcbb-config.yaml$", "-bcbb-config.yaml.bak$",
            "-bcbb-command.txt$", "-bcbb-command.txt.bak$", "_[0-9]+.fastq$",
            "_[0-9]+.fastq.gz$", "^[0-9][0-9]_.*.txt$"
        ]
        keep_pattern = "|".join(keep_files)

        def remove_filter_fn(fname):
            return re.search(keep_pattern, fname) is None

        for sample_conf in find_samples(j_doe_00_05):
            workdir = os.path.dirname(sample_conf)
            remove_files = filtered_walk(workdir, remove_filter_fn)
            basenames = [os.path.basename(x) for x in remove_files]
            self.assertNotIn("01_analysis_start.txt", basenames)
Example #40
0
    def collect_files(self, path, project=None):
        """Collect files for a given project.

        :param path: path to search in
        :param project: optional project used to subset the flowcell
        """
        if project:
            fc = self.subset("sample_prj", project)
        else:
            fc = self
        file_pattern = "|".join(fc.glob_pfx_str())

        def file_filter(f):
            if not file_pattern:
                return
            return re.search(file_pattern, f) is not None

        matches = filtered_walk(path, file_filter)
        for m in matches:
            self.classify_file(m)
        fc.path = path
        return fc
Example #41
0
    def test_remove_dirs(self):
        """Test removing directories before rerunning pipeline"""
        # Anything matching these patterns is kept; the filter marks
        # everything else for removal.
        keep = "|".join([
            "-post_process.yaml$", "-post_process.yaml.bak$",
            "-bcbb-config.yaml$", "-bcbb-config.yaml.bak$",
            "-bcbb-command.txt$", "-bcbb-command.txt.bak$", "_[0-9]+.fastq$",
            "_[0-9]+.fastq.gz$"
        ])

        def remove_filter_fn(f):
            return re.search(keep, f) is None

        for sample_conf in find_samples(j_doe_00_05):
            workdir = os.path.dirname(sample_conf)
            remove_dirs = filtered_walk(workdir,
                                        remove_filter_fn,
                                        get_dirs=True)
            self.assertIn("fastqc", [os.path.basename(x) for x in remove_dirs])
def get_file_copy_list(proj_base_dir, dest_proj_path, fcid, deliver_all_fcs,
                       deliver_nophix, skip_list):
    """Build a list of fastq files to deliver for a project.

    :param proj_base_dir: root of the project directory tree to search
    :param dest_proj_path: destination project directory
    :param fcid: flowcell id; when *deliver_all_fcs* is False only this
        flowcell's directory is included in the walk
    :param deliver_all_fcs: if True, search all flowcell directories
    :param deliver_nophix: if True, deliver files whose parent dir is
        "nophix"; otherwise files directly under the run directory
    :param skip_list: directory names to exclude from the walk

    :returns: list of ``[source_file, dest_run_path, dest_file_name]``
        entries
    """
    to_copy = []
    for fqfile in filtered_walk(
            proj_base_dir,
            is_fastq,
            include_dirs=[fcid] if not deliver_all_fcs else None,
            exclude_dirs=skip_list):

        # Get the run_name and sample_name from the path
        # (layout assumed: <sample>/<run>/... relative to proj_base_dir)
        sample_name, run_name, _ = os.path.relpath(fqfile,
                                                   proj_base_dir).split(
                                                       os.sep, 2)
        # Run names are assumed to be "<date>_<flowcell id>".
        date, fc_id = run_name.split('_')

        # Skip if we deliver from nophix and the parent dir is not nophix (or vice versa)
        pdir = os.path.basename(os.path.dirname(fqfile))
        if deliver_nophix and pdir != "nophix":
            continue
        if not deliver_nophix and pdir != run_name:
            continue

        # Skip if a compressed version of the current file exists
        if os.path.exists("{:s}.gz".format(fqfile)):
            print("WARNING: Both compressed and non-compressed versions of {:s} exists! " \
                  "Is compression/decompression in progress? Will deliver compressed version " \
                  "but you should make sure that the delivered files are complete!".format(fqfile))
            continue

        print("DEBUG: source_delivery_path = {:s}".format(
            os.path.dirname(fqfile)))

        fname = os.path.basename(fqfile)
        print(fname)

        # Destination mirrors the <sample>/<run> layout; the file name
        # is rewritten by create_final_name.
        dest_run_path = os.path.join(dest_proj_path, sample_name, run_name)
        dest_file_name = create_final_name(fname, date, fc_id, sample_name)
        to_copy.append([fqfile, dest_run_path, dest_file_name])
    return to_copy
Example #43
0
def purge_alignments(path,
                     ftype="sam",
                     keep="last",
                     dry_run=False,
                     force=False,
                     fsize=MINFILESIZE):
    """Cleanup sam and bam files. In some cases, sam files persist. If
    the corresponding bam file exists, replace the sam file contents
    with a message that the file has been removed to save space.

    In general, several bam files are produced in an analysis. By
    grouping bam files by prefix, either the most recent file is
    retained for further reference, or a specific analysis is kept.

    :param path: directory tree to search
    :param ftype: "sam" or "bam"
    :param keep: retention rule; "last" keeps the last file after
        sorting the group by file name length
    :param dry_run: if True, only log what would be done
    :param force: if True, skip the confirmation prompt
    :param fsize: minimum file size passed to _purge_by_sample
    """
    if ftype == "sam":
        pattern = ".sam$"
    elif ftype == "bam":
        pattern = ".bam$"
    else:
        LOG.warn("ftype must be one of 'sam' or 'bam'")
        return
    LOG.debug(
        "running purge_alignments in path {} with pattern {} keep rule {}".
        format(path, pattern, keep))

    def purge_filter(f):
        if not pattern:
            return
        return re.search(pattern, f) != None

    flist = filtered_walk(path, purge_filter, exclude_dirs=["realign-split"])
    if len(flist) == 0:
        LOG.info("No {} files found in {}".format(ftype, path))
        return
    if len(flist) > 0 and not query_yes_no(
            "Going to remove/cleanup {} {} files ({}...). Are you sure you want to continue?"
            .format(len(flist), ftype, ",".join(
                [os.path.basename(x) for x in flist[0:10]])),
            force=force):
        return
    if ftype == "sam":
        for f in flist:
            LOG.info("Purging {} file {}".format(ftype, f))
            dry_unlink(f, dry_run)
            # Leave a stub explaining why the sam file is gone.
            if os.path.exists(f.replace(".sam", ".bam")):
                dry_write(
                    f, "File removed to save disk space: SAM converted to BAM",
                    dry_run)
        return
    elif ftype == "bam":
        # Group bam files by sample prefix, then by directory.
        samples = {}
        for f in flist:
            # Raw string; the previous "\_" escape was redundant inside
            # a character class and triggers an invalid-escape warning.
            m = re.search(r"([0-9A-Za-z_]+)-.*", os.path.basename(f))
            if not m:
                LOG.debug("Couldn't determine prefix for {}".format(f))
                continue
            sid = m.groups()[0]

            if sid not in samples:
                samples[sid] = {}
            dname = os.path.dirname(f)
            if dname not in samples[sid]:
                samples[sid][dname] = []
            samples[sid][dname].append(f)

        saved_size = 0
        # Plain keys()/items() iteration works on both Python 2 and 3;
        # the previous iterkeys()/iteritems()/cmp-sort were py2-only.
        for k in samples:
            for d, files in samples[k].items():
                if not files or len(files) == 1:
                    continue
                # Sort by name length; "last" keeps the longest name
                # (treated as the most recent/derived file).
                files.sort(key=len)
                if keep == "last":
                    LOG.info(
                        "Keeping file {} and removing all files with common prefix: {}"
                        .format(
                            os.path.basename(files[len(files) - 1]), ", ".join(
                                [os.path.basename(x) for x in files[0:-1]])))
                saved_size = _purge_by_sample(files, dry_run,
                                              int(fsize)) + saved_size
        LOG.info("Will save approximately {:.1f}G space".format(saved_size /
                                                                1e9))
Example #44
0
 def test_filtered_walk_get_dirs(self):
     """Perform a filtered walk of data dir, getting dirs"""
     # With get_dirs=True no directories match the filter.
     dirs = filtered_walk("data", filter_fn=self.filter_fn, include_dirs=["nophix"], exclude_dirs=["fastqc"], get_dirs=True)
     self.assertEqual(set(dirs), set([]))
     # With get_dirs=False the matching file inside nophix is returned.
     files = filtered_walk("data", filter_fn=self.filter_fn, include_dirs=["nophix"], exclude_dirs=["fastqc"], get_dirs=False)
     self.assertEqual(set(files), set(['data/nophix/file1.txt']))
Example #45
0
 def test_filtered_walk_include_exclude(self):
     """Perform a filtered walk of data dir, using include_dirs and exclude_dirs restriction"""
     result = filtered_walk("data", filter_fn=self.filter_fn,
                            include_dirs=["nophix"], exclude_dirs=["fastqc"])
     # Only the nophix file outside fastqc survives both restrictions.
     self.assertEqual(set(result), set(['data/nophix/file1.txt']))
Example #46
0
 def test_filtered_walk_exclude(self):
     """Perform a filtered walk of data dir, using exclude_dirs restriction"""
     result = filtered_walk("data", filter_fn=self.filter_fn, exclude_dirs=["nophix"])
     expected = set(['data/file1.txt', 'data/alignments/file1.txt',
                     'data/fastqc/file1.txt'])
     self.assertEqual(set(result), expected)
Example #47
0
 def test_filtered_walk_include(self):
     """Perform a filtered walk of data dir, using include_dirs restriction"""
     # Switch the instance pattern so file2.txt is matched instead.
     self.pattern = "file2.txt"
     result = filtered_walk("data", filter_fn=self.filter_fn, include_dirs=["nophix"])
     expected = set(['data/nophix/file2.txt', 'data/nophix/fastqc/file2.txt',
                     'data/fastqc/nophix/file2.txt'])
     self.assertEqual(set(result), expected)
Example #48
0
 def test_filtered_walk(self):
     """Perform a filtered walk of data dir"""
     result = filtered_walk("data", filter_fn=self.filter_fn)
     expected = set(['data/file1.txt', 'data/alignments/file1.txt',
                     'data/nophix/file1.txt', 'data/nophix/fastqc/file1.txt',
                     'data/fastqc/file1.txt', 'data/fastqc/nophix/file1.txt'])
     self.assertEqual(set(result), expected)
Example #49
0
    def _to_casava_structure(self, fc):
        """Transfer flowcell sample files and results into a
        casava-like project directory structure.

        For each sample, copies sequence and result files to
        ``<project>/data/<sample>/<flowcell>``, writes a rewritten bcbb
        configuration per sample, prunes platform args in any
        -post_process.yaml files, and writes a transfer summary to
        stderr.

        :param fc: flowcell object to transfer
        """
        transfer_status = {}
        outdir_pfx = os.path.abspath(
            os.path.join(self.app.config.get("project", "root"),
                         self.pargs.project, "data"))
        if self.pargs.transfer_dir:
            outdir_pfx = os.path.abspath(
                os.path.join(self.app.config.get("project", "root"),
                             self.pargs.transfer_dir, "data"))
        for sample in fc:
            key = "{}_{}".format(sample['lane'], sample['sequence'])
            sources = {
                "files": self._prune_sequence_files(sample['files']),
                "results": sample['results']
            }
            outdir = os.path.join(outdir_pfx, sample['name'], fc.fc_id())
            dirs = {
                "data":
                os.path.abspath(
                    os.path.join(outdir_pfx, sample['name'], fc.fc_id())),
                "intermediate":
                os.path.abspath(
                    os.path.join(outdir_pfx, sample['name'], fc.fc_id()))
            }
            self._make_output_dirs(dirs)
            fc_new = fc.subset("lane",
                               sample['lane']).subset("name", sample['name'])
            targets = {
                "files": [
                    src.replace(fc.path, dirs["data"])
                    for src in sources['files']
                ],
                "results": [
                    src.replace(fc.path, dirs["intermediate"])
                    for src in sources['results']
                ]
            }

            fc_new.lane_files = dict(
                (k, [os.path.join(outdir, os.path.basename(x)) for x in v])
                for k, v in fc_new.lane_files.items())
            fc_new.set_entry(key, 'files', targets['files'])
            fc_new.set_entry(key, 'results', targets['results'])
            ## Copy sample files - currently not doing lane files
            self._transfer_files(sources, targets)
            self.app.cmd.write(
                os.path.join(dirs["data"],
                             "{}-bcbb-pm-config.yaml".format(sample['name'])),
                fc_new.as_yaml())
            transfer_status[sample['name']] = {
                'files': len(sources['files']),
                'results': len(sources['results'])
            }
        ## Rewrite platform_args; only keep time, workdir, account, partition, outpath and jobname
        pattern = "-post_process.yaml$"

        def pp_yaml_filter(f):
            return re.search(pattern, f) != None

        ppfiles = filtered_walk(dirs["data"], pp_yaml_filter)
        for pp in ppfiles:
            self.app.log.debug("Rewriting platform args for {}".format(pp))
            with open(pp, "r") as fh:
                conf = yaml.load(fh)
            if not conf:
                self.app.log.warn("No configuration for {}".format(pp))
                continue
            newconf = prune_pp_platform_args(conf)
            if newconf == conf:
                continue
            self.app.cmd.safe_unlink(pp)
            self.app.cmd.write(
                pp,
                yaml.safe_dump(newconf,
                               default_flow_style=False,
                               allow_unicode=True,
                               width=1000))

        # Write transfer summary
        self.app._output_data["stderr"].write("Transfer summary\n")
        self.app._output_data["stderr"].write("{:<18}{:>18}{:>18}\n".format(
            "Sample", "Transferred files", "Results"))
        # items() matches the .items() usage above and works on both
        # Python 2 and 3 (the previous iteritems() was py2-only).
        for k, v in transfer_status.items():
            self.app._output_data["stderr"].write(
                "{:<18}{:>18}{:>18}\n".format(k, v['files'], v['results']))
Example #50
0
def purge_alignments(path, ftype="sam", keep="last", dry_run=False, force=False, fsize=MINFILESIZE):
    """Cleanup sam and bam files. In some cases, sam files persist. If
    the corresponding bam file exists, replace the sam file contents
    with a message that the file has been removed to save space.
    
    In general, several bam files are produced in an analysis. By
    grouping bam files by prefix, either the most recent file is
    retained for further reference, or a specific analysis is kept.

    :param path: directory tree to search
    :param ftype: "sam" or "bam"
    :param keep: retention rule; "last" keeps the last file after
        sorting the group by file name length
    :param dry_run: if True, only log what would be done
    :param force: if True, skip the confirmation prompt
    :param fsize: minimum file size passed to _purge_by_sample
    """
    if ftype == "sam":
        pattern = ".sam$"
    elif ftype == "bam":
        pattern = ".bam$"
    else:
        LOG.warn("ftype must be one of 'sam' or 'bam'")
        return
    LOG.debug("running purge_alignments in path {} with pattern {} keep rule {}".format(path, pattern, keep))
    def purge_filter(f):
        if not pattern:
            return
        return re.search(pattern, f) != None
    
    flist = filtered_walk(path, purge_filter, exclude_dirs=["realign-split"])
    if len(flist) == 0:
        LOG.info("No {} files found in {}".format(ftype, path))
        return
    if len(flist) > 0 and not query_yes_no("Going to remove/cleanup {} {} files ({}...). Are you sure you want to continue?".format(len(flist), ftype, ",".join([os.path.basename(x) for x in flist[0:10]])), force=force):
        return
    if ftype == "sam":
        for f in flist:
            LOG.info("Purging {} file {}".format(ftype, f))
            dry_unlink(f, dry_run)
            # Leave a stub explaining why the sam file is gone.
            if os.path.exists(f.replace(".sam", ".bam")):
                dry_write(f, "File removed to save disk space: SAM converted to BAM", dry_run)
        return
    elif ftype == "bam":
        # Group bam files by sample prefix, then by directory.
        samples = {}
        for f in flist:
            m = re.search("([0-9A-Za-z\_]+)-.*", os.path.basename(f))
            if not m:
                LOG.debug("Couldn't determine prefix for {}".format(f))
                continue
            sid = m.groups()[0]
            
            if not sid in samples.keys():
                samples[sid] = {}
            dname = os.path.dirname(f) 
            if not dname in samples[sid].keys():
                samples[sid][dname] = []
            samples[sid][dname].append(f)

        saved_size = 0
        # NOTE(review): iterkeys()/iteritems() and cmp-style sort are
        # Python 2 only — confirm the target runtime.
        for k in samples.iterkeys():
            for d, files  in samples[k].iteritems():
                if not files or len(files) == 1:
                    continue
                # Sort by name length; "last" keeps the longest name.
                files.sort(lambda x,y: cmp(len(x), len(y)))
                if keep == "last":
                    LOG.info("Keeping file {} and removing all files with common prefix: {}".format(os.path.basename(files[len(files)-1]), ", ".join([os.path.basename(x) for x in files[0:-1]])))
                saved_size = _purge_by_sample(files, dry_run, int(fsize)) + saved_size
        LOG.info("Will save approximately {:.1f}G space".format(saved_size / 1e9))
Example #51
0
def flowcell_remove_status(archive_dir, swestore_dir, to_remove="to_remove"):
    """This function looks for flowcells that could be deleted
    from archive and returns a list of flowcells with a KEEP/RM
    flag. The rules are
    
    1. the flowcell is in archive to_remove file
    2. pbzip ran without error
    3. the tarball filesize looks ok
    4. checksum irods is ok

    :param archive_dir: archive directory
    :param swestore_dir: base dir for swestore
    :param to_remove: to remove file name

    :returns: dict with 'stdout'/'stderr' StringIO buffers holding the
        status report
    """
    output_data = {'stdout':StringIO(), 'stderr':StringIO()}
    ## Check for ils
    try:
        proc = subprocess.Popen(["ils"], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        (stdout, stderr) = proc.communicate()
        proc.wait()
        proc = subprocess.Popen(["icd", os.path.basename(os.path.dirname(archive_dir))],  stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        (stdout, stderr) = proc.communicate()
        proc.wait()
    # Narrowed from a bare except: a bare except also swallows
    # SystemExit/KeyboardInterrupt.
    except Exception:
        LOG.warn("No such command 'ils': please load the irods module")
        return output_data
    ## make flowcell dictionary based on to_remove contents
    to_remove_file = os.path.join(archive_dir, to_remove)
    with open(to_remove_file) as fh:
        remove_list = fh.readlines()
    flowcells = {k.replace("./", "").rstrip():{'in_archive':False, 'pbzip_exit':1, 'tarball_size':0, 'irods_checksum':1} for k in remove_list if k.rstrip() != ''}

    ## Look for compress logs
    pattern = "slurm.*.out$"
    def compress_fn(f):
        return re.search(pattern, f) != None
    compress_log_files = filtered_walk(os.path.join(archive_dir, "compress_logs"), compress_fn)
    for f in compress_log_files:
        with open(f) as fh:
            compress_str = "".join([x.strip() for x in fh.readlines()])
        m = re.search("Compressing[ ]+([0-9A-Za-z_\-]+)\.\.\..*Exit code:[ ]+([0-9]+)", compress_str)
        if m:
            if not m.groups()[0] in flowcells.keys():
                LOG.warn("flowcell {} present in to_remove but not in archive".format(m.groups()[0]))
            else:
                flowcells[m.groups()[0]]['pbzip_exit'] = m.groups()[1]
        else:
            LOG.warn("{}: no match for {}".format(f, compress_str))

    ## Get tarball sizes and check if in archive 
    ## Loop through flowcells and perform ichksum
    for k in flowcells.keys():
        LOG.debug("Getting tarball size, archive presence and ichksum for {}".format(k))
        fcdir = os.path.join(archive_dir, k)
        if os.path.exists(fcdir):
            flowcells[k]['in_archive'] = True
        fctar = os.path.join(swestore_dir, "drophere2archive", "{}.tar.bz2".format(k))
        try:
            cl = ["ichksum", os.path.basename(fctar)]
            proc = subprocess.Popen(cl, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            (stdout, stderr) = proc.communicate()
            proc.wait()
            flowcells[k]['irods_checksum'] = stdout.split("\n")[1]
        # Narrowed from a bare except; best-effort, so log and move on.
        except Exception:
            LOG.warn("command {} failed".format(" ".join(cl)))
        if not os.path.exists(fctar):
            continue
        else:
            LOG.debug("tarball exists: {}".format(fctar))
            statinfo = os.stat(fctar)
            flowcells[k]['tarball_size'] = float(int(statinfo.st_size) / 1e9)

    output_data["stdout"].write("\nFlowcell archive status\n")
    output_data["stdout"].write("=======================\n")
    # Typo fixed in the report text: "exict" -> "exit".
    output_data["stdout"].write("\nThe table lists those flowcells still present in archive. The exit code for pbzip should be 0\nfor success. A non-existing tarball has size 0.\n\n")
    output_data["stdout"].write("{:<40}{:>12}{:>20}{:>60}\n".format("Flowcell", "pbzip_exit", "tarball_size (G)", 'irods_checksum'))
    output_data["stdout"].write("{:<40}{:>12}{:>20}{:>60}\n".format("--------", "----------", "----------------", '--------------'))

    for k in sorted(flowcells.keys()):
        if not flowcells[k]['in_archive']:
            continue
        output_data["stdout"].write("{:<40}{:>12}{:>20.2f}{:>60}\n".format(k, flowcells[k]['pbzip_exit'], flowcells[k]['tarball_size'], flowcells[k]['irods_checksum'] ))
    return output_data
Example #52
0
def flowcell_remove_status(archive_dir, swestore_dir, to_remove="to_remove"):
    """This function looks for flowcells that could be deleted
    from archive and returns a list of flowcells with a KEEP/RM
    flag. The rules are

    1. the flowcell is in archive to_remove file
    2. pbzip ran without error
    3. the tarball filesize looks ok
    4. checksum irods is ok

    :param archive_dir: archive directory
    :param swestore_dir: base dir for swestore
    :param to_remove: to remove file name

    :returns: dict with 'stdout' and 'stderr' StringIO buffers holding
      the formatted status table
    """
    output_data = {'stdout': StringIO(), 'stderr': StringIO()}
    ## Check that the irods commands are available and cd to the right
    ## collection; bail out early if not, everything below needs irods.
    try:
        proc = subprocess.Popen(["ils"],
                                stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE)
        # communicate() waits for process termination; no extra wait() needed
        proc.communicate()
        proc = subprocess.Popen(
            ["icd", os.path.basename(os.path.dirname(archive_dir))],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE)
        proc.communicate()
    except OSError:
        # Popen raises OSError when the executable cannot be found
        LOG.warn("No such command 'ils': please load the irods module")
        return output_data
    ## Make flowcell dictionary based on to_remove contents; every listed
    ## flowcell starts out as "not in archive" with failing status fields.
    to_remove_file = os.path.join(archive_dir, to_remove)
    with open(to_remove_file) as fh:
        remove_list = fh.readlines()
    flowcells = {
        k.replace("./", "").rstrip(): {
            'in_archive': False,
            'pbzip_exit': 1,
            'tarball_size': 0,
            'irods_checksum': 1
        }
        for k in remove_list if k.rstrip() != ''
    }

    ## Collect pbzip exit codes from the slurm compress logs
    pattern = "slurm.*.out$"

    def compress_fn(f):
        return re.search(pattern, f) is not None

    compress_log_files = filtered_walk(
        os.path.join(archive_dir, "compress_logs"), compress_fn)
    for f in compress_log_files:
        with open(f) as fh:
            compress_str = "".join([x.strip() for x in fh.readlines()])
        m = re.search(
            r"Compressing[ ]+([0-9A-Za-z_\-]+)\.\.\..*Exit code:[ ]+([0-9]+)",
            compress_str)
        if not m:
            LOG.warn("{}: no match for {}".format(f, compress_str))
        elif m.group(1) not in flowcells:
            LOG.warn("flowcell {} present in to_remove but not in archive".
                     format(m.group(1)))
        else:
            flowcells[m.group(1)]['pbzip_exit'] = m.group(2)

    ## Get tarball sizes and check if in archive
    ## Loop through flowcells and perform ichksum
    for k in flowcells.keys():
        LOG.debug(
            "Getting tarball size, archive presence and ichksum for {}".format(
                k))
        fcdir = os.path.join(archive_dir, k)
        if os.path.exists(fcdir):
            flowcells[k]['in_archive'] = True
        fctar = os.path.join(swestore_dir, "drophere2archive",
                             "{}.tar.bz2".format(k))
        cl = ["ichksum", os.path.basename(fctar)]
        try:
            proc = subprocess.Popen(cl,
                                    stdout=subprocess.PIPE,
                                    stderr=subprocess.PIPE)
            (stdout, stderr) = proc.communicate()
            # ichksum reports the checksum on its second output line
            flowcells[k]['irods_checksum'] = stdout.split("\n")[1]
        except (OSError, IndexError):
            # OSError: ichksum missing; IndexError: unexpected output shape
            LOG.warn("command {} failed".format(" ".join(cl)))
        if os.path.exists(fctar):
            LOG.debug("tarball exists: {}".format(fctar))
            statinfo = os.stat(fctar)
            flowcells[k]['tarball_size'] = float(int(statinfo.st_size) / 1e9)

    output_data["stdout"].write("\nFlowcell archive status\n")
    output_data["stdout"].write("=======================\n")
    output_data["stdout"].write(
        "\nThe table lists those flowcells still present in archive. The exit code for pbzip should be 0\nfor success. A non-existing tarball has size 0.\n\n"
    )
    output_data["stdout"].write("{:<40}{:>12}{:>20}{:>60}\n".format(
        "Flowcell", "pbzip_exit", "tarball_size (G)", 'irods_checksum'))
    output_data["stdout"].write("{:<40}{:>12}{:>20}{:>60}\n".format(
        "--------", "----------", "----------------", '--------------'))

    for k in sorted(flowcells.keys()):
        if not flowcells[k]['in_archive']:
            continue
        output_data["stdout"].write("{:<40}{:>12}{:>20.2f}{:>60}\n".format(
            k, flowcells[k]['pbzip_exit'], flowcells[k]['tarball_size'],
            flowcells[k]['irods_checksum']))
    return output_data
Example #53
0
def setUpModule():
    """Set up test files for scilifelab pipeline tests. The setup
    covers some typical situations, such as multiplexing, samples run
    on several flowcells, and same sample being run on several lanes
    in one flowcell.

    In short, the setup
    - downloads data from 1000 genomes (exome data from chr11, 0-2Mb)
    - generates fastq files in an archive folder
    - installs genome references (phix, hg19)
    - downloads dbsnp data for chr11, 0-2Mb
    - runs run_bcbb_pipeline.py -s to install fastq files to production folder
    - runs automated_initial_analysis.py
    """
    def _matcher(pattern):
        """Return a predicate matching file names against *pattern*.

        Binds the pattern at creation time, so each filter is explicit
        (the previous code rebound a shared local and relied on the
        closure's late binding).
        """
        def match_fn(f):
            return re.search(pattern, f) is not None
        return match_fn

    metrics_filter = _matcher("14_write_metrics.txt")

    ## If every sample already has its metrics file, the pipeline has
    ## been run and there is nothing to set up.
    n = sum([len(filtered_walk(os.path.join(PROJECTDIR, x), metrics_filter)) for x in PROJECTS])
    if n == NSAMPLES:
        LOG.info("All samples have been run, requirements for downstream tests satisfied")
        return
    LOG.info("Running setUpModule")
    _check_requirements()
    ## Add function to check existence of output files
    _install_1000g_test_files(os.path.join(os.path.dirname(__file__), "data", "production"))
    _install_phix()
    dbsnp = _install_dbsnp_entrez()
    (omni_out, hapmap_out, mills_out) = _install_training_data()

    _download_ucsc_genome_and_index()
    ## Install post_process file
    with open(POSTPROCESS, "w") as fh:
        fh.write(PPTEMPLATE.render(**{'store_dir':ARCHIVE, 'base_dir':PRODUCTION, 'dbsnp':dbsnp, 'omni':omni_out, 'hapmap':hapmap_out, 'mills':mills_out}))
    ## Install index files
    for k, v in index_files.iteritems():
        if not os.path.exists(os.path.dirname(v['file'])):
            safe_makedir(os.path.dirname(v['file']))
        with open(v['file'], "w") as fh:
            fh.write(v['data'].getvalue())
    ## Make production dir
    if not os.path.exists(PRODUCTION):
        safe_makedir(PRODUCTION)

    ## Install files in production with run_bcbb_pipeline.py
    for k in FLOWCELL.keys():
        install = False
        for ss in SAMPLESHEETS[k].split("\n"):
            vals = ss.split(",")
            if vals[0]=="FCID":
                continue
            outdir = os.path.join(PRODUCTION, "{}".format(vals[5].replace("__", ".")), "{}".format(vals[2]), "{}_{}".format(FLOWCELL[k].split("_")[0],FLOWCELL[k].split("_")[-1]))
            r1 = os.path.join(outdir, "{}_{}_L00{}_R1_001.fastq.gz".format(vals[2], vals[4], vals[1]))
            r2 = os.path.join(outdir, "{}_{}_L00{}_R2_001.fastq.gz".format(vals[2], vals[4], vals[1]))
            LOG.info("Looking for {} and {}".format(r1, r2))
            if not os.path.exists(r1) or not os.path.exists(r2):
                install = True
                break
        if install:
            LOG.info("Installing files with run_bcbb_pipeline.py for flowcell {}".format(k))
            cl = ["run_bcbb_pipeline.py", "-s", "-g", POSTPROCESS, os.path.join(ARCHIVE, FLOWCELL[k])]
            subprocess.check_call(cl)
        else:
            LOG.info("All files present; not running run_bcbb_pipeline.py")

    ## Run pipeline on samples: collect the bcbb config files
    config_filter = _matcher("-bcbb-config.yaml$")
    ## http://stackoverflow.com/questions/952914/making-a-flat-list-out-of-list-of-lists-in-python
    ## [item for sublist in l for item in sublist]
    yamlfiles = [item for sublist in [filtered_walk(os.path.join(PROJECTDIR, x), config_filter) for x in PROJECTS] for item in sublist]
    orig_dir = os.path.abspath(os.curdir)
    for yamlconfig in yamlfiles:
        try:
            LOG.info("cding to {}".format(os.path.abspath(os.curdir)))
            os.chdir(os.path.dirname(yamlconfig))
            LOG.info("cding to {}".format(os.path.dirname(yamlconfig)))
            cl = ["automated_initial_analysis.py", POSTPROCESS, os.path.join(os.path.pardir, os.path.basename(os.path.dirname(yamlconfig))), yamlconfig]
            if not os.path.exists(os.path.join(os.path.dirname(yamlconfig), "14_write_metrics.txt")):
                LOG.info("Running pipeline: {}".format(" ".join(cl)))
                subprocess.check_call(cl)
        finally:
            ## Always return to the original directory, even if the
            ## pipeline run raises.
            os.chdir(orig_dir)
            LOG.info("Finished pipeline run and cd back to {}".format(orig_dir))