Example #1
def setup_merged_samples(flist, sample_group_fn=_group_samples, **kw):
    """Setup analysis that merges multiple sample runs.

    :param flist: list of file names, by default *-bcbb-config.yaml files
    :param sample_group_fn: function that groups files into samples and sample runs. The function takes flist as input.

    :returns: updated flist with config files for merged samples
    """
    new_flist = []
    sample_d = sample_group_fn(flist)
    for k, v in sample_d.items():
        if len(v) > 1:
            f = v[list(v)[0]]
            out_d = os.path.join(os.path.dirname(os.path.dirname(f)), MERGED_SAMPLE_OUTPUT_DIR)
            LOG.info("Sample {} has {} sample runs; setting up merge analysis in {}".format(k, len(v), out_d))
            dry_makedir(out_d, dry_run=False)
            pp = kw.get("post_process") if kw.get("post_process", None) else f.replace("-bcbb-config.yaml", "-post_process.yaml")
            with open(pp) as fh:
                conf = yaml.safe_load(fh)
            conf = update_pp_platform_args(conf, jobname="{}_total".format(k), workdir=out_d, output="{}_total-bcbb.log".format(k))
            pp_new = os.path.join(out_d, os.path.basename(pp))
            dry_unlink(pp_new, dry_run=kw.get('dry_run', True))
            dry_write(pp_new, yaml.safe_dump(conf, default_flow_style=False, allow_unicode=True, width=1000), dry_run=kw.get('dry_run', True))
            # Set up the merged bcbb-config file
            config_files = list(v.values())
            bcbb_config = merge_sample_config(config_files, sample=k, out_d=out_d, dry_run=kw.get('dry_run', True))
            bcbb_config_file = os.path.join(out_d, os.path.basename(config_files[0]))
            bcbb_config = sort_sample_config_fastq(bcbb_config)
            if not os.path.exists(bcbb_config_file) or kw.get('new_config', False):
                dry_unlink(bcbb_config_file, dry_run=kw.get('dry_run', True))
                dry_write(bcbb_config_file, yaml.safe_dump(bcbb_config, default_flow_style=False, allow_unicode=True, width=1000), dry_run=kw.get('dry_run', True))
            new_flist.append(bcbb_config_file)
    return new_flist
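
For reference, a minimal sketch of the input shape setup_merged_samples expects: the grouping function maps each sample name to a dict of sample runs whose values are the per-run *-bcbb-config.yaml paths. The helper below is a hypothetical stand-in for _group_samples, assuming a .../<sample>/<run>/<sample>-bcbb-config.yaml layout:

import os

def group_samples_sketch(flist):
    # Hypothetical grouping: {sample: {run: config_path}}
    sample_d = {}
    for f in flist:
        sample = os.path.basename(f).replace("-bcbb-config.yaml", "")
        run = os.path.basename(os.path.dirname(f))
        sample_d.setdefault(sample, {})[run] = f
    return sample_d

# merged = setup_merged_samples(flist, sample_group_fn=group_samples_sketch, dry_run=True)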
Example #2
def setup_merged_samples(flist, sample_group_fn=_group_samples, **kw):
    """Setup analysis that merges multiple sample runs.

    :param flist: list of file names, by default *-bcbb-config.yaml files
    :param sample_group_fn: function that groups files into samples and sample runs. The function takes flist as input.

    :returns: updated flist with config files for merged samples
    """
    new_flist = []
    sample_d = sample_group_fn(flist)
    for k, v in sample_d.items():
        if v:
            f = v[list(v)[0]]
            out_d = os.path.join(os.path.dirname(os.path.dirname(f)), MERGED_SAMPLE_OUTPUT_DIR)
            LOG.info("Sample {} has {} sample runs; setting up merge analysis in {}".format(k, len(v), out_d))
            dry_makedir(out_d, dry_run=False)
            pp = kw.get("post_process",f.replace("-bcbb-config.yaml", "-post_process.yaml"))
            with open(pp) as fh:
                conf = yaml.safe_load(fh)
            conf = update_pp_platform_args(conf, jobname="{}_total".format(k), workdir=out_d, output="{}_total-bcbb.log".format(k))
            pp_new = os.path.join(out_d, os.path.basename(pp))
            dry_unlink(pp_new, dry_run=kw.get('dry_run', True))
            dry_write(pp_new, yaml.safe_dump(conf, default_flow_style=False, allow_unicode=True, width=1000), dry_run=kw.get('dry_run', True))
            # Set up the merged bcbb-config file
            config_files = list(v.values())
            bcbb_config = merge_sample_config(config_files, sample=k, out_d=out_d, dry_run=kw.get('dry_run', True))
            bcbb_config_file = os.path.join(out_d, os.path.basename(config_files[0]))
            bcbb_config = sort_sample_config_fastq(bcbb_config, path=out_d)
            if not os.path.exists(bcbb_config_file) or kw.get('new_config', False):
                dry_unlink(bcbb_config_file, dry_run=kw.get('dry_run', True))
                dry_write(bcbb_config_file, yaml.safe_dump(bcbb_config, default_flow_style=False, allow_unicode=True, width=1000), dry_run=kw.get('dry_run', True))
            new_flist.append(bcbb_config_file)
    return new_flist
Example #3
def _purge_by_sample(files, dry_run, fsize=MINFILESIZE):
    """Purge BAM files that are superseded by a later file sharing their prefix.

    Assumes files is sorted so that a superseding file directly follows the
    file it replaces. Files smaller than fsize are left untouched.

    :returns: number of bytes saved
    """
    saved_size = 0
    for i in range(len(files) - 1):
        f1 = os.path.basename(files[i])
        f2 = os.path.basename(files[i + 1])
        if f2.startswith(os.path.splitext(f1)[0]):
            statinfo = os.stat(files[i])
            # Don't bother purging files below the size threshold
            if statinfo.st_size < fsize:
                continue
            saved_size += statinfo.st_size
            LOG.info("Purging bam file {}".format(files[i]))
            dry_unlink(files[i], dry_run)
            dry_write(files[i], "File removed to save disk space: Moved to {}".format(files[i + 1]), dry_run)
    return saved_size
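
_purge_by_sample assumes the incoming list is ordered so that a superseding file directly follows the file it replaces; the purge decision is just a basename-prefix test. A self-contained illustration with hypothetical file names:

import os

files = ["TEST_1-sort.bam", "TEST_1-sort-dup.bam", "OTHER_2-sort.bam"]
for a, b in zip(files, files[1:]):
    stem = os.path.splitext(os.path.basename(a))[0]
    # TEST_1-sort-dup.bam starts with TEST_1-sort, so TEST_1-sort.bam is purgeable
    print("{} superseded by {}: {}".format(a, b, os.path.basename(b).startswith(stem)))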
Example #4
def remove_files(f, **kw):
    ## Remove old files if requested, keeping config, command, fastq and log files
    keep_files = [
        r"-post_process\.yaml$", r"-post_process\.yaml\.bak$",
        r"-bcbb-config\.yaml$", r"-bcbb-config\.yaml\.bak$",
        r"-bcbb-command\.txt$", r"-bcbb-command\.txt\.bak$",
        r"_[0-9]+\.fastq$", r"_[0-9]+\.fastq\.gz$",
        r"_[0-9]+_fastq\.txt\.gz$", r"_[0-9]+_fastq\.txt$",
        r"^[0-9][0-9]_.*\.txt$", "JOBID", "PID"
    ]
    pattern = "|".join(keep_files)

    def remove_filter_fn(f):
        return re.search(pattern, f) is None

    workdir = os.path.dirname(f)
    files_to_remove = filtered_walk(workdir, remove_filter_fn)
    dirs_to_remove = filtered_walk(workdir, remove_filter_fn, get_dirs=True)
    if not files_to_remove:
        return
    if query_yes_no(
            "Going to remove {} files and {} directories... Are you sure you want to continue?"
            .format(len(files_to_remove), len(dirs_to_remove)),
            force=kw['force']):
        for x in files_to_remove:
            dry_unlink(x, dry_run=kw['dry_run'])
        ## Remove the deepest directories first so we don't try to remove a non-empty dir
        for x in sorted(dirs_to_remove, key=len, reverse=True):
            dry_rmdir(x, dry_run=kw['dry_run'])
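
The keep patterns are easiest to sanity-check in isolation. A quick sketch (with made-up file names) of how remove_filter_fn classifies paths — anything that matches no keep pattern is scheduled for removal:

import re

keep_pattern = "|".join([r"-bcbb-config\.yaml$", r"_[0-9]+\.fastq\.gz$", "JOBID"])
for name in ["TEST-bcbb-config.yaml", "TEST_1.fastq.gz", "tophat/TEST-sort.bam", "JOBID"]:
    action = "remove" if re.search(keep_pattern, name) is None else "keep"
    print("{} -> {}".format(name, action))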
Example #5
def _purge_by_sample(files, dry_run, fsize=MINFILESIZE):
    """Purge BAM files that are superseded by a later file sharing their
    prefix. Files smaller than fsize are left untouched. Returns the
    number of bytes saved."""
    saved_size = 0
    for i in range(len(files) - 1):
        f1 = os.path.basename(files[i])
        f2 = os.path.basename(files[i + 1])
        if f2.startswith(os.path.splitext(f1)[0]):
            statinfo = os.stat(files[i])
            if statinfo.st_size < fsize:
                continue
            saved_size += statinfo.st_size
            LOG.info("Purging bam file {}".format(files[i]))
            dry_unlink(files[i], dry_run)
            dry_write(
                files[i],
                "File removed to save disk space: Moved to {}".format(
                    files[i + 1]), dry_run)
    return saved_size
Example #6
def remove_files(f, **kw):
    ## Remove old files if requested, keeping config, command, fastq and log files
    keep_files = [r"-post_process\.yaml$", r"-post_process\.yaml\.bak$", r"-bcbb-config\.yaml$", r"-bcbb-config\.yaml\.bak$", r"-bcbb-command\.txt$", r"-bcbb-command\.txt\.bak$", r"_[0-9]+\.fastq$", r"_[0-9]+\.fastq\.gz$", r"_[0-9]+_fastq\.txt\.gz$", r"_[0-9]+_fastq\.txt$",
                  r"^[0-9][0-9]_.*\.txt$", "JOBID", "PID"]
    pattern = "|".join(keep_files)

    def remove_filter_fn(f):
        return re.search(pattern, f) is None

    workdir = os.path.dirname(f)
    files_to_remove = filtered_walk(workdir, remove_filter_fn)
    dirs_to_remove = filtered_walk(workdir, remove_filter_fn, get_dirs=True)
    if not files_to_remove:
        return
    if query_yes_no("Going to remove {} files and {} directories... Are you sure you want to continue?".format(len(files_to_remove), len(dirs_to_remove)), force=kw['force']):
        for x in files_to_remove:
            dry_unlink(x, dry_run=kw['dry_run'])
        ## Remove the deepest directories first so we don't try to remove a non-empty dir
        for x in sorted(dirs_to_remove, key=len, reverse=True):
            dry_rmdir(x, dry_run=kw['dry_run'])
Example #7
def purge_alignments(path,
                     ftype="sam",
                     keep="last",
                     dry_run=False,
                     force=False,
                     fsize=MINFILESIZE):
    """Cleanup sam and bam files. In some cases, sam files persist. If
    the corresponding bam file exists, replace the sam file contents
    with a message that the file has been removed to save space.
    
    In general, several bam files are produced in an analysis. By
    grouping bam files by prefix, either the most recent file is
    retained for further reference, or a specific analysis is kept.
    """
    if ftype == "sam":
        pattern = ".sam$"
    elif ftype == "bam":
        pattern = ".bam$"
    else:
        LOG.warn("ftype must be one of 'sam' or 'bam'")
        return
    LOG.debug(
        "running purge_alignments in path {} with pattern {} keep rule {}".
        format(path, pattern, keep))

    def purge_filter(f):
        if not pattern:
            return
        return re.search(pattern, f) != None

    flist = filtered_walk(path, purge_filter, exclude_dirs=["realign-split"])
    if not flist:
        LOG.info("No {} files found in {}".format(ftype, path))
        return
    if not query_yes_no(
            "Going to remove/cleanup {} {} files ({}...). Are you sure you want to continue?"
            .format(len(flist), ftype, ",".join(
                [os.path.basename(x) for x in flist[0:10]])),
            force=force):
        return
    if ftype == "sam":
        for f in flist:
            LOG.info("Purging {} file {}".format(ftype, f))
            dry_unlink(f, dry_run)
            if os.path.exists(f.replace(".sam", ".bam")):
                dry_write(
                    f, "File removed to save disk space: SAM converted to BAM",
                    dry_run)
        return
    elif ftype == "bam":
        samples = {}
        for f in flist:
            m = re.search("([0-9A-Za-z\_]+)-.*", os.path.basename(f))
            if not m:
                LOG.debug("Couldn't determine prefix for {}".format(f))
                continue
            sid = m.groups()[0]
            dname = os.path.dirname(f)
            samples.setdefault(sid, {}).setdefault(dname, []).append(f)

        saved_size = 0
        for k in samples:
            for d, files in samples[k].items():
                if len(files) < 2:
                    continue
                files.sort(key=len)
                if keep == "last":
                    LOG.info(
                        "Keeping file {} and removing all files with common prefix: {}"
                        .format(os.path.basename(files[-1]),
                                ", ".join([os.path.basename(x) for x in files[:-1]])))
                saved_size += _purge_by_sample(files, dry_run, int(fsize))
        LOG.info("Will save approximately {:.1f}G of space".format(saved_size / 1e9))
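
The BAM branch first buckets files by sample-id prefix and by directory before deciding what to keep. The grouping step on its own, run against hypothetical paths:

import os
import re

samples = {}
for f in ["tophat/S1_1-sort.bam", "tophat/S1_1-sort-dup.bam", "bwa/S2_2-sort.bam"]:
    m = re.search(r"([0-9A-Za-z_]+)-.*", os.path.basename(f))
    if m:
        samples.setdefault(m.groups()[0], {}).setdefault(os.path.dirname(f), []).append(f)
print(samples)
# {'S1_1': {'tophat': ['tophat/S1_1-sort.bam', 'tophat/S1_1-sort-dup.bam']},
#  'S2_2': {'bwa': ['bwa/S2_2-sort.bam']}}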
Example #8
def setup_sample(f, analysis, amplicon=False, genome_build="hg19", **kw):
    """Setup config files, making backups and writing new files

    :param path: root path in which to search for samples
    :param dry_run: dry run flag
    """
    if not os.path.exists(f):
        return
    with open(f) as fh:
        config = yaml.safe_load(fh)
    ## Check for correctly formatted config
    if not config.get("details", None):
        LOG.warning("Couldn't find 'details' section in config file {}: aborting setup!".format(f))
        return

    ## Save file to backup if backup doesn't exist
    f_bak = f.replace("-bcbb-config.yaml", "-bcbb-config.yaml.bak")
    if not os.path.exists(f_bak):
        LOG.info("Making backup of {} in {}".format(f, f_bak))
        dry_backup(os.path.abspath(f), dry_run=kw['dry_run'])

    ## Save command file to backup if it doesn't exist
    cmdf = f.replace("-bcbb-config.yaml", "-bcbb-command.txt")
    if os.path.exists(cmdf):
        cmdf_bak = cmdf.replace("-bcbb-command.txt", "-bcbb-command.txt.bak")
        if not os.path.exists(cmdf_bak):
            LOG.info("Making backup of {} in {}".format(cmdf, cmdf_bak))
            dry_backup(os.path.abspath(cmdf), dry_run=kw['dry_run'])

    ## Save post_process file to backup if it doesn't exist
    ppf = f.replace("-bcbb-config.yaml", "-post_process.yaml")
    if os.path.exists(ppf):
        ppf_bak = ppf.replace("-post_process.yaml", "-post_process.yaml.bak")
        if not os.path.exists(ppf_bak):
            LOG.info("Making backup of {} in {}".format(ppf, ppf_bak))
            dry_backup(ppf, dry_run=kw['dry_run'])

    if analysis:
        config = update_sample_config(config, "analysis", analysis)
    if genome_build:
        config = update_sample_config(config, "genome_build", genome_build)
    config = sort_sample_config_fastq(config)

    ## Remove config file and rewrite
    dry_unlink(f, kw['dry_run'])
    dry_write(f, yaml.safe_dump(config, default_flow_style=False, allow_unicode=True, width=1000), dry_run=kw['dry_run'])

    ## Setup post process only if not provided at command line
    if not kw.get("post_process", None):
        ppfile = f.replace("-bcbb-config.yaml", "-post_process.yaml")
        with open(ppfile) as fh:
            pp = yaml.safe_load(fh)
        ## Need to set working directory to path of bcbb-config.yaml file
        if pp.get('distributed', {}).get('platform_args', None):
            platform_args = pp['distributed']['platform_args'].split()
            if "-D" in platform_args:
                platform_args[platform_args.index("-D")+1] = os.path.dirname(f)
            elif "--workdir" in platform_args:
                platform_args[platform_args.index("--workdir")+1] = os.path.dirname(f)
            pp['distributed']['platform_args'] = " ".join(platform_args)
        ## Change keys for all analyses
        for anl in pp.get('custom_algorithms', {}):
            if kw.get('baits', None):
                pp['custom_algorithms'][anl]['hybrid_bait'] = kw['baits']
            if kw.get('targets', None):
                pp['custom_algorithms'][anl]['hybrid_target'] = kw['targets']
            if amplicon:
                pp['custom_algorithms'][anl]['mark_duplicates'] = False
        if amplicon:
            LOG.info("setting amplicon analysis")
            pp['algorithm']['mark_duplicates'] = False
        if kw.get('galaxy_config', None):
            pp['galaxy_config'] = kw['galaxy_config']
        if kw.get('distributed', None):
            LOG.info("setting distributed execution")
            pp['algorithm']['num_cores'] = 'messaging'
        else:
            LOG.info("setting parallell execution")
            pp['algorithm']['num_cores'] = kw['num_cores']
        if kw.get('snpEff', None):
            LOG.info("setting snpEff to {}".format(kw["snpEff"]))
            pp['program']['snpEff'] = kw['snpEff']
        dry_unlink(ppfile, dry_run=kw['dry_run'])
        dry_write(ppfile, yaml.safe_dump(pp, default_flow_style=False, allow_unicode=True, width=1000), dry_run=kw['dry_run'])
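
The platform_args update is plain list surgery on the scheduler arguments: locate the workdir flag and overwrite the token that follows it. A standalone sketch with a hypothetical SLURM-style argument string:

import os

f = "/proj/data/J.Doe_00_01/TEST_1/TEST_1-bcbb-config.yaml"  # hypothetical config path
platform_args = "-A a2010002 -p node -t 35:00:00 -D /old/workdir".split()
if "-D" in platform_args:
    platform_args[platform_args.index("-D") + 1] = os.path.dirname(f)
elif "--workdir" in platform_args:
    platform_args[platform_args.index("--workdir") + 1] = os.path.dirname(f)
print(" ".join(platform_args))
# -A a2010002 -p node -t 35:00:00 -D /proj/data/J.Doe_00_01/TEST_1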
Example #9
def purge_alignments(path, ftype="sam", keep="last", dry_run=False, force=False, fsize=MINFILESIZE):
    """Cleanup sam and bam files. In some cases, sam files persist. If
    the corresponding bam file exists, replace the sam file contents
    with a message that the file has been removed to save space.
    
    In general, several bam files are produced in an analysis. By
    grouping bam files by prefix, either the most recent file is
    retained for further reference, or a specific analysis is kept.
    """
    if ftype == "sam":
        pattern = ".sam$"
    elif ftype == "bam":
        pattern = ".bam$"
    else:
        LOG.warn("ftype must be one of 'sam' or 'bam'")
        return
    LOG.debug("running purge_alignments in path {} with pattern {} keep rule {}".format(path, pattern, keep))
    def purge_filter(f):
        if not pattern:
            return
        return re.search(pattern, f) != None
    
    flist = filtered_walk(path, purge_filter, exclude_dirs=["realign-split"])
    if not flist:
        LOG.info("No {} files found in {}".format(ftype, path))
        return
    if not query_yes_no("Going to remove/cleanup {} {} files ({}...). Are you sure you want to continue?".format(len(flist), ftype, ",".join([os.path.basename(x) for x in flist[0:10]])), force=force):
        return
    if ftype == "sam":
        for f in flist:
            LOG.info("Purging {} file {}".format(ftype, f))
            dry_unlink(f, dry_run)
            if os.path.exists(f.replace(".sam", ".bam")):
                dry_write(f, "File removed to save disk space: SAM converted to BAM", dry_run)
        return
    elif ftype == "bam":
        samples = {}
        for f in flist:
            m = re.search("([0-9A-Za-z\_]+)-.*", os.path.basename(f))
            if not m:
                LOG.debug("Couldn't determine prefix for {}".format(f))
                continue
            sid = m.groups()[0]
            dname = os.path.dirname(f)
            samples.setdefault(sid, {}).setdefault(dname, []).append(f)

        saved_size = 0
        for k in samples:
            for d, files in samples[k].items():
                if len(files) < 2:
                    continue
                files.sort(key=len)
                if keep == "last":
                    LOG.info("Keeping file {} and removing all files with common prefix: {}".format(os.path.basename(files[-1]), ", ".join([os.path.basename(x) for x in files[:-1]])))
                saved_size += _purge_by_sample(files, dry_run, int(fsize))
        LOG.info("Will save approximately {:.1f}G of space".format(saved_size / 1e9))
Example #10
def setup_sample(f, analysis, amplicon=False, genome_build="hg19", **kw):
    """Setup config files, making backups and writing new files

    :param path: root path in which to search for samples
    :param dry_run: dry run flag
    """
    if not os.path.exists(f):
        return
    with open(f) as fh:
        config = yaml.safe_load(fh)
    ## Check for correctly formatted config
    if not config.get("details", None):
        LOG.warning("Couldn't find 'details' section in config file {}: aborting setup!".format(f))
        return

    ## Save file to backup if backup doesn't exist
    f_bak = f.replace("-bcbb-config.yaml", "-bcbb-config.yaml.bak")
    if not os.path.exists(f_bak):
        LOG.info("Making backup of {} in {}".format(f, f_bak))
        dry_backup(os.path.abspath(f), dry_run=kw['dry_run'])

    ## Save command file to backup if it doesn't exist
    cmdf = f.replace("-bcbb-config.yaml", "-bcbb-command.txt")
    if os.path.exists(cmdf):
        cmdf_bak = cmdf.replace("-bcbb-command.txt", "-bcbb-command.txt.bak")
        if not os.path.exists(cmdf_bak):
            LOG.info("Making backup of {} in {}".format(cmdf, cmdf_bak))
            dry_backup(os.path.abspath(cmdf), dry_run=kw['dry_run'])

    ## Save post_process file to backup if it doesn't exist
    ppf = f.replace("-bcbb-config.yaml", "-post_process.yaml")
    if os.path.exists(ppf):
        ppf_bak = ppf.replace("-post_process.yaml", "-post_process.yaml.bak")
        if not os.path.exists(ppf_bak):
            LOG.info("Making backup of {} in {}".format(ppf, ppf_bak))
            dry_backup(ppf, dry_run=kw['dry_run'])

    if analysis:
        config = update_sample_config(config, "analysis", analysis)
    if genome_build:
        config = update_sample_config(config, "genome_build", genome_build)
    config = sort_sample_config_fastq(config)

    ## Remove config file and rewrite
    dry_unlink(f, kw['dry_run'])
    dry_write(f, yaml.safe_dump(config, default_flow_style=False, allow_unicode=True, width=1000), dry_run=kw['dry_run'])

    ## Setup post process only if not provided at command line
    if not kw.get("post_process", None):
        ppfile = f.replace("-bcbb-config.yaml", "-post_process.yaml")
        with open(ppfile) as fh:
            pp = yaml.safe_load(fh)
        ## Need to set working directory to path of bcbb-config.yaml file
        if pp.get('distributed', {}).get('platform_args', None):
            platform_args = pp['distributed']['platform_args'].split()
            if "-D" in platform_args:
                platform_args[platform_args.index("-D")+1] = os.path.dirname(f)
            elif "--workdir" in platform_args:
                platform_args[platform_args.index("--workdir")+1] = os.path.dirname(f)
            pp['distributed']['platform_args'] = " ".join(platform_args)
        ## Change keys for all analyses
        for anl in pp.get('custom_algorithms', {}):
            if kw.get('baits', None):
                pp['custom_algorithms'][anl]['hybrid_bait'] = kw['baits']
            if kw.get('targets', None):
                pp['custom_algorithms'][anl]['hybrid_target'] = kw['targets']
            if amplicon:
                pp['custom_algorithms'][anl]['mark_duplicates'] = False
        if amplicon:
            LOG.info("setting amplicon analysis")
            pp['algorithm']['mark_duplicates'] = False
        if kw.get('galaxy_config', None):
            pp['galaxy_config'] = kw['galaxy_config']
        if kw.get('distributed', None):
            LOG.info("setting distributed execution")
            pp['algorithm']['num_cores'] = 'messaging'
        elif kw.get('num_cores', None):
            LOG.info("setting parallell execution")
            pp['algorithm']['num_cores'] = kw['num_cores']
        if kw.get('snpEff', None):
            LOG.info("setting snpEff to {}".format(kw["snpEff"]))
            pp['program']['snpEff'] = kw['snpEff']
        dry_unlink(ppfile, dry_run=kw['dry_run'])
        dry_write(ppfile, yaml.safe_dump(pp, default_flow_style=False, allow_unicode=True, width=1000), dry_run=kw['dry_run'])