Example 1
def setup_merged_samples(flist, sample_group_fn=_group_samples, **kw):
    """Setup analysis that merges multiple sample runs.

    :param flist: list of file names, by default *-bcbb-config.yaml files
    :param sample_group_fn: function that groups files into samples and sample runs. The function takes flist as input.

    :returns: updated flist with config files for merged samples
    """
    new_flist = []
    sample_d = sample_group_fn(flist)
    for k, v in sample_d.items():
        if len(v) > 1:
            f = next(iter(v.values()))
            out_d = os.path.join(os.path.dirname(os.path.dirname(f)), MERGED_SAMPLE_OUTPUT_DIR)
            LOG.info("Sample {} has {} sample runs; setting up merge analysis in {}".format(k, len(v), out_d))
            dry_makedir(out_d, dry_run=False)
            pp = kw.get("post_process") if kw.get("post_process", None) else f.replace("-bcbb-config.yaml", "-post_process.yaml")
            with open(pp) as fh:
                conf = yaml.safe_load(fh)
            conf = update_pp_platform_args(conf, **{'jobname': "{}_total".format(k), 'workdir': out_d, 'output': "{}_total-bcbb.log".format(k) })
            pp_new = os.path.join(out_d, os.path.basename(pp))
            dry_unlink(pp_new, dry_run=kw.get('dry_run', True))
            dry_write(pp_new, yaml.safe_dump(conf, default_flow_style=False, allow_unicode=True, width=1000), dry_run=kw.get('dry_run', True))
            # Setup merged bcbb-config file
            bcbb_config = merge_sample_config(list(v.values()), sample=k, out_d=out_d, dry_run=kw.get('dry_run', True))
            bcbb_config_file = os.path.join(out_d, os.path.basename(list(v.values())[0]))
            bcbb_config = sort_sample_config_fastq(bcbb_config)
            if not os.path.exists(bcbb_config_file) or kw.get('new_config', False):
                dry_unlink(bcbb_config_file, dry_run=kw.get('dry_run', True))
                dry_write(bcbb_config_file, yaml.safe_dump(bcbb_config, default_flow_style=False, allow_unicode=True, width=1000), dry_run=kw.get('dry_run', True))
            new_flist.append(bcbb_config_file)
    return new_flist
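
A minimal usage sketch for setup_merged_samples. The grouping function and the path layout (<project>/<sample>/<flowcell>/<sample>-bcbb-config.yaml) below are assumptions for illustration; the default _group_samples helper lives in the surrounding module and is not shown here.

import collections
import os

def group_by_sample(flist):
    # Hypothetical grouping function: map sample name -> {flowcell: config path}
    sample_d = collections.defaultdict(dict)
    for f in flist:
        flowcell = os.path.basename(os.path.dirname(f))
        sample = os.path.basename(os.path.dirname(os.path.dirname(f)))
        sample_d[sample][flowcell] = f
    return dict(sample_d)

flist = ["/proj/P001_101/AAXX/P001_101-bcbb-config.yaml",
         "/proj/P001_101/BBXX/P001_101-bcbb-config.yaml"]
# dry_run=True previews the merge setup without touching the filesystem
merged_flist = setup_merged_samples(flist, sample_group_fn=group_by_sample, dry_run=True)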
Example 2
def _purge_by_sample(files, dry_run, fsize=MINFILESIZE):
    """Purge each bam file whose successor in ``files`` extends its name
    stem, replacing its contents with a pointer to the superseding file.
    Returns the number of bytes saved."""
    saved_size = 0
    for i in range(0, len(files)-1):
        f1 = os.path.basename(files[i])
        f2 = os.path.basename(files[i+1])
        # files[i+1] supersedes files[i] if its name extends files[i]'s stem,
        # e.g. a sorted bam followed by the deduplicated bam derived from it
        if f2.startswith(os.path.splitext(f1)[0]):
            statinfo = os.stat(files[i])
            if statinfo.st_size < fsize:
                continue
            saved_size += statinfo.st_size
            LOG.info("Purging bam file {}".format(files[i]))
            dry_unlink(files[i], dry_run)
            dry_write(files[i], "File removed to save disk space: Moved to {}".format(files[i+1]), dry_run)
    return saved_size
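
A self-contained sketch of the pairing rule above (file names are made up): files arrive sorted shortest-name-first (see purge_alignments in Example 5), and a file is purged when its successor's name extends its stem, i.e. when a later processing step has superseded it.

import os

files = sorted(["1_AAXX_1-sort-dup.bam", "1_AAXX_1-sort.bam",
                "1_AAXX_1-sort-dup-gatkrecal.bam"], key=len)
for f1, f2 in zip(files, files[1:]):
    # "1_AAXX_1-sort-dup.bam" extends the stem "1_AAXX_1-sort", so the sorted
    # bam is superseded by the deduplicated one, and so on down the chain
    print(f1, "->", "purge" if f2.startswith(os.path.splitext(f1)[0]) else "keep")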
Example 3
def run_halo(path=None, project=None, batch_size=8, **kw):
    """Run halo application. Setup parameter files and call
    halo_pipeline.sh script.

    :param path: path in which to search for samples
    :param project: project name
    :param batch_size: number of samples to run in each project config file
    """
    plist = sorted(find_samples(path, **kw))
    plist_chunks = [plist[x:x + batch_size] for x in range(0, len(plist), batch_size)]
    i = 0
    param_list = []
    for pl in plist_chunks:
        i += 1
        outfile = os.path.join(path, "{}_{}_halo.projectrc".format(project, i))
        param = {'cl':None, 'platform_args':None, 'workingDirectory':None}
        label = '{}_halo_{}'.format(project[0:3].replace(".", "_"), i)
        d = {'samples' : '"{}"'.format(" ".join([os.path.basename(x) for x in pl])),
             'indir' : path,
             'baits_file' : kw.get('baits', ""),
             'targets_file' : kw.get('targets', ""),
             'target_region' : kw.get('target_region', ""),
             'output' : os.path.join(os.path.dirname(outfile), "{}.out".format(label)),
             'error' : os.path.join(os.path.dirname(outfile), "{}.err".format(label))
             }
        if kw.get("setup", False):
            dry_write(outfile, PROJECTTEMPLATE.render(**d), dry_run=kw.get("dry_run", False))
        if not os.path.exists(outfile):
            LOG.warn("No such configuration file {}; rerun command with '--setup' option")
            return []
        if kw.get("config", None) and os.path.basename(outfile) != kw.get("config", None):
            continue
        param['cl'] = [HALOSCRIPT, "-c", HALORC, outfile]
        param['platform_args'] = ['--output', "{}.out".format(label),
                                  '--error', "{}.err".format(label),
                                  '--job-name', label]
        param['workingDirectory'] = os.path.dirname(outfile)
        param_list.append(param)
    return param_list
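
The batching idiom above is worth isolating. This self-contained snippet (sample names are hypothetical) shows how the sorted sample list is split into batch_size-sized chunks, one projectrc file per chunk:

plist = ["P001_{}".format(i) for i in range(101, 108)]
batch_size = 3
chunks = [plist[x:x + batch_size] for x in range(0, len(plist), batch_size)]
print(chunks)
# [['P001_101', 'P001_102', 'P001_103'], ['P001_104', 'P001_105', 'P001_106'], ['P001_107']]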
Example 4
def setup_sample(f, analysis, amplicon=False, genome_build="hg19", **kw):
    """Setup config files, making backups and writing new files

    :param f: bcbb config file (*-bcbb-config.yaml)
    :param analysis: analysis name to set
    :param amplicon: flag amplicon analysis (disables duplicate marking)
    :param genome_build: genome build to set
    :param dry_run: dry run flag (passed via kw)
    """
    if not os.path.exists(f):
        return
    with open(f) as fh:
        config = yaml.safe_load(fh)
    ## Check for correctly formatted config
    if not config.get("details", None):
        LOG.warn("Couldn't find 'details' section in config file {}: aborting setup!".format(f))
        return

    ## Save file to backup if backup doesn't exist
    f_bak = f.replace("-bcbb-config.yaml", "-bcbb-config.yaml.bak")
    if not os.path.exists(f_bak):
        LOG.info("Making backup of {} in {}".format(f, f_bak))
        dry_backup(os.path.abspath(f), dry_run=kw['dry_run'])

    ## Save command file to backup if it doesn't exist
    cmdf = f.replace("-bcbb-config.yaml", "-bcbb-command.txt")
    if os.path.exists(cmdf):
        cmdf_bak = cmdf.replace("-bcbb-command.txt", "-bcbb-command.txt.bak")
        if not os.path.exists(cmdf_bak):
            LOG.info("Making backup of {} in {}".format(cmdf, cmdf_bak))
            dry_backup(os.path.abspath(cmdf), dry_run=kw['dry_run'])

    ## Save post_process file to backup if it doesn't exist
    ppf = f.replace("-bcbb-config.yaml", "-post_process.yaml")
    if os.path.exists(ppf):
        ppf_bak = ppf.replace("-post_process.yaml", "-post_process.yaml.bak")
        if not os.path.exists(ppf_bak):
            LOG.info("Making backup of {} in {}".format(ppf, ppf_bak))
            dry_backup(ppf, dry_run=kw['dry_run'])

    if analysis:
        config = update_sample_config(config, "analysis", analysis)
    if genome_build:
        config = update_sample_config(config, "genome_build", genome_build)
    config = sort_sample_config_fastq(config)

    ## Remove config file and rewrite
    dry_unlink(f, kw['dry_run'])
    dry_write(f, yaml.safe_dump(config, default_flow_style=False, allow_unicode=True, width=1000), dry_run=kw['dry_run'])

    ## Setup post process only if not provided at command line
    if not kw.get("post_process", None):
        ppfile = f.replace("-bcbb-config.yaml", "-post_process.yaml")
        with open(ppfile) as fh:
            pp = yaml.safe_load(fh)
        ## Need to set working directory to path of bcbb-config.yaml file
        if pp.get('distributed', {}).get('platform_args', None):
            platform_args = pp['distributed']['platform_args'].split()
            if "-D" in platform_args:
                platform_args[platform_args.index("-D")+1] = os.path.dirname(f)
            elif "--workdir" in platform_args:
                platform_args[platform_args.index("--workdir")+1] = os.path.dirname(f)
            pp['distributed']['platform_args'] = " ".join(platform_args)
        ## Change keys for all analyses
        for anl in pp.get('custom_algorithms',{}).keys():
            if kw.get('baits', None):
                pp['custom_algorithms'][anl]['hybrid_bait'] = kw['baits']
            if kw.get('targets', None):
                pp['custom_algorithms'][anl]['hybrid_target'] = kw['targets']
            if amplicon:
                pp['custom_algorithms'][anl]['mark_duplicates'] = False
        if amplicon:
            LOG.info("setting amplicon analysis")
            pp['algorithm']['mark_duplicates'] = False
        if kw.get('galaxy_config', None):
            pp['galaxy_config'] = kw['galaxy_config']
        if kw.get('distributed', None):
            LOG.info("setting distributed execution")
            pp['algorithm']['num_cores'] = 'messaging'
        elif kw.get('num_cores', None):
            LOG.info("setting parallel execution")
            pp['algorithm']['num_cores'] = kw['num_cores']
        if kw.get('snpEff', None):
            LOG.info("setting snpEff to {}".format(kw["snpEff"]))
            pp['program']['snpEff'] = kw['snpEff']
        dry_unlink(ppfile, dry_run=kw['dry_run'])
        dry_write(ppfile, yaml.safe_dump(pp, default_flow_style=False, allow_unicode=True, width=1000), dry_run=kw['dry_run'])
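
A standalone sketch of the platform_args rewrite above (the path and scheduler flags are made up): the token following "-D" or "--workdir" is replaced with the directory of the bcbb config file, so the scheduler runs the job next to its inputs.

import os

f = "/proj/J.Doe_00_01/P001_101/P001_101-bcbb-config.yaml"
platform_args = "-A a2010001 -D /old/workdir -t 24:00:00".split()
if "-D" in platform_args:
    platform_args[platform_args.index("-D") + 1] = os.path.dirname(f)
print(" ".join(platform_args))
# -A a2010001 -D /proj/J.Doe_00_01/P001_101 -t 24:00:00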
Example 5
def purge_alignments(path, ftype="sam", keep="last", dry_run=False, force=False, fsize=MINFILESIZE):
    """Cleanup sam and bam files. In some cases, sam files persist. If
    the corresponding bam file exists, replace the sam file contents
    with a message that the file has been removed to save space.
    
    In general, several bam files are produced in an analysis. By
    grouping bam files by prefix, either the most recent file is
    retained for further reference, or a specific analysis is kept.
    """
    if ftype == "sam":
        pattern = ".sam$"
    elif ftype == "bam":
        pattern = ".bam$"
    else:
        LOG.warn("ftype must be one of 'sam' or 'bam'")
        return
    LOG.debug("running purge_alignments in path {} with pattern {} keep rule {}".format(path, pattern, keep))
    def purge_filter(f):
        if not pattern:
            return
        return re.search(pattern, f) is not None

    flist = filtered_walk(path, purge_filter, exclude_dirs=["realign-split"])
    if len(flist) == 0:
        LOG.info("No {} files found in {}".format(ftype, path))
        return
    if len(flist) > 0 and not query_yes_no("Going to remove/cleanup {} {} files ({}...). Are you sure you want to continue?".format(len(flist), ftype, ",".join([os.path.basename(x) for x in flist[0:10]])), force=force):
        return
    if ftype == "sam":
        for f in flist:
            LOG.info("Purging {} file {}".format(ftype, f))
            dry_unlink(f, dry_run)
            if os.path.exists(f.replace(".sam", ".bam")):
                dry_write(f, "File removed to save disk space: SAM converted to BAM", dry_run)
        return
    elif ftype == "bam":
        samples = {}
        for f in flist:
            m = re.search(r"([0-9A-Za-z_]+)-.*", os.path.basename(f))
            if not m:
                LOG.debug("Couldn't determine prefix for {}".format(f))
                continue
            sid = m.group(1)

            if sid not in samples:
                samples[sid] = {}
            dname = os.path.dirname(f)
            if dname not in samples[sid]:
                samples[sid][dname] = []
            samples[sid][dname].append(f)

        saved_size = 0
        for k in samples:
            for d, files in samples[k].items():
                if not files or len(files) == 1:
                    continue
                files.sort(key=len)
                if keep == "last":
                    LOG.info("Keeping file {} and removing all files with common prefix: {}".format(os.path.basename(files[len(files)-1]), ", ".join([os.path.basename(x) for x in files[0:-1]])))
                saved_size = _purge_by_sample(files, dry_run, int(fsize)) + saved_size
        LOG.info("Will save approximately {:.1f}G space".format(saved_size / 1e9))
Example 6
def setup_sample(f, analysis, amplicon=False, genome_build="hg19", **kw):
    """Setup config files, making backups and writing new files

    :param f: bcbb config file (*-bcbb-config.yaml)
    :param analysis: analysis name to set
    :param amplicon: flag amplicon analysis (disables duplicate marking)
    :param genome_build: genome build to set
    :param dry_run: dry run flag (passed via kw)
    """
    if not os.path.exists(f):
        return
    with open(f) as fh:
        config = yaml.safe_load(fh)
    ## Check for correctly formatted config
    if not config.get("details", None):
        LOG.warn(
            "Couldn't find 'details' section in config file {}: aborting setup!"
            .format(f))
        return

    ## Save file to backup if backup doesn't exist
    f_bak = f.replace("-bcbb-config.yaml", "-bcbb-config.yaml.bak")
    if not os.path.exists(f_bak):
        LOG.info("Making backup of {} in {}".format(f, f_bak))
        dry_backup(os.path.abspath(f), dry_run=kw['dry_run'])

    ## Save command file to backup if it doesn't exist
    cmdf = f.replace("-bcbb-config.yaml", "-bcbb-command.txt")
    if os.path.exists(cmdf):
        cmdf_bak = cmdf.replace("-bcbb-command.txt", "-bcbb-command.txt.bak")
        if not os.path.exists(cmdf_bak):
            LOG.info("Making backup of {} in {}".format(cmdf, cmdf_bak))
            dry_backup(os.path.abspath(cmdf), dry_run=kw['dry_run'])

    ## Save post_process file to backup if it doesn't exist
    ppf = f.replace("-bcbb-config.yaml", "-post_process.yaml")
    if os.path.exists(ppf):
        ppf_bak = ppf.replace("-post_process.yaml", "-post_process.yaml.bak")
        if not os.path.exists(ppf_bak):
            LOG.info("Making backup of {} in {}".format(ppf, ppf_bak))
            dry_backup(ppf, dry_run=kw['dry_run'])

    if analysis:
        config = update_sample_config(config, "analysis", analysis)
    if genome_build:
        config = update_sample_config(config, "genome_build", genome_build)
    config = sort_sample_config_fastq(config)

    ## Remove config file and rewrite
    dry_unlink(f, kw['dry_run'])
    dry_write(f,
              yaml.safe_dump(config,
                             default_flow_style=False,
                             allow_unicode=True,
                             width=1000),
              dry_run=kw['dry_run'])

    ## Setup post process only if not provided at command line
    if not kw.get("post_process", None):
        ppfile = f.replace("-bcbb-config.yaml", "-post_process.yaml")
        with open(ppfile) as fh:
            pp = yaml.safe_load(fh)
        ## Need to set working directory to path of bcbb-config.yaml file
        if pp.get('distributed', {}).get('platform_args', None):
            platform_args = pp['distributed']['platform_args'].split()
            if "-D" in platform_args:
                platform_args[platform_args.index("-D") +
                              1] = os.path.dirname(f)
            elif "--workdir" in platform_args:
                platform_args[platform_args.index("--workdir") +
                              1] = os.path.dirname(f)
            pp['distributed']['platform_args'] = " ".join(platform_args)
        ## Change keys for all analyses
        for anl in pp.get('custom_algorithms', {}).keys():
            if kw.get('baits', None):
                pp['custom_algorithms'][anl]['hybrid_bait'] = kw['baits']
            if kw.get('targets', None):
                pp['custom_algorithms'][anl]['hybrid_target'] = kw['targets']
            if amplicon:
                pp['custom_algorithms'][anl]['mark_duplicates'] = False
        if amplicon:
            LOG.info("setting amplicon analysis")
            pp['algorithm']['mark_duplicates'] = False
        if kw.get('galaxy_config', None):
            pp['galaxy_config'] = kw['galaxy_config']
        if kw.get('distributed', None):
            LOG.info("setting distributed execution")
            pp['algorithm']['num_cores'] = 'messaging'
        elif kw.get('num_cores', None):
            LOG.info("setting parallell execution")
            pp['algorithm']['num_cores'] = kw['num_cores']
        if kw.get('snpEff', None):
            LOG.info("setting snpEff to {}".format(kw["snpEff"]))
            pp['program']['snpEff'] = kw['snpEff']
        dry_unlink(ppfile, dry_run=kw['dry_run'])
        dry_write(ppfile,
                  yaml.safe_dump(pp,
                                 default_flow_style=False,
                                 allow_unicode=True,
                                 width=1000),
                  dry_run=kw['dry_run'])
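
A hypothetical invocation of setup_sample (the path, analysis name, and option values are made up): run with dry_run=True first to preview the config rewrite.

setup_sample("/proj/J.Doe_00_01/P001_101/AAXX/P001_101-bcbb-config.yaml",
             analysis="Align_standard_seqcap", amplicon=True,
             genome_build="hg19", num_cores=8, dry_run=True)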