Example no. 1
def main():
    """main function
    """

    default_parser = default_argparser(CFG_DIR)
    parser = argparse.ArgumentParser(description=__doc__.format(
        PIPELINE_NAME=PIPELINE_NAME, PIPELINE_VERSION=get_pipeline_version()),
                                     parents=[default_parser])

    parser._optionals.title = "Arguments"
    # pipeline specific args
    parser.add_argument('--sample-bam-map', required=True,
                        help="Yaml file listing BAM file input (value)"
                        " per sample (key; reused for output filenames here)")

    args = parser.parse_args()

    # FIXME how to remove the arguments from the argparser in the first place?
    assert not args.sample_cfg, ("Usual sample config not supported. Replaced in this pipeline with --sample-bam-map")

    # Repeatable -v and -q for setting logging level.
    # See https://www.reddit.com/r/Python/comments/3nctlm/what_python_tools_should_i_be_using_on_every/
    # and https://gist.github.com/andreas-wilm/b6031a84a33e652680d4
    logger.setLevel(logging.WARN + 10*args.quiet - 10*args.verbose)
    aux_logger.setLevel(logging.WARN + 10*args.quiet - 10*args.verbose)

    if os.path.exists(args.outdir):
        logger.fatal("Output directory %s already exists", args.outdir)
        sys.exit(1)


    # turn arguments into cfg_dict (gets merged with other configs late)
    #
    cfg_dict = dict()
    cfg_dict['readunits'] = dict()
    cfg_dict['samples'] = dict()

    with open(args.sample_bam_map) as fh:
        sample_bam_map = dict(yaml.safe_load(fh))
    for sample, bam in sample_bam_map.items():
        assert os.path.exists(bam)
        # if we have relative paths, make them abs relative to cfgfile
        if not os.path.isabs(bam):
            bam = os.path.abspath(os.path.join(os.path.dirname(args.sample_bam_map), bam))
            sample_bam_map[sample] = bam
    cfg_dict['sample_bam_map'] = sample_bam_map

    pipeline_handler = PipelineHandler(
        PIPELINE_NAME, PIPELINE_BASEDIR,
        args, cfg_dict,
        cluster_cfgfile=get_cluster_cfgfile(CFG_DIR))
    pipeline_handler.setup_env()
    pipeline_handler.submit(args.no_run)
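
A minimal sketch of the --sample-bam-map input assumed above: the YAML simply maps sample names (keys) to BAM paths (values), and yaml.safe_load yields the plain dict that main() then post-processes. File names here are hypothetical.

import yaml

example_yaml = """
sampleA: data/sampleA.bam
sampleB: /abs/path/sampleB.bam
"""
# yaml.safe_load turns the mapping into a dict: {sample: bam_path}
sample_bam_map = dict(yaml.safe_load(example_yaml))
assert sample_bam_map["sampleA"] == "data/sampleA.bam"
# relative paths such as data/sampleA.bam are then made absolute relative
# to the directory containing the YAML file, as done in main() above.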
Example no. 2
def LDRefine(args):
	logger.info("Preparing LD-based genotype refinement pipeline...")
	print_parameters_given(args)

	assert os.path.exists("varCall"), "Cannot detect the directory of varaiant detection.\nWEScall varCall has to be run before LD-based genotype refinement."

	assert args.num_record_per_file > 0, "Number of records per file has to be larger than 0!"

	assert args.num_overlap_record >= 0, "Number of overlapping records cannot be negative!"

	assert args.num_record_per_file > args.num_overlap_record, "Number of records per file has to be larger than the number of overlapping records."

	if not os.path.exists("LDRefine"):
		os.mkdir("LDRefine")

	LDRefine_cfg = dict()
	LDRefine_cfg["num_record_per_file"] = args.num_record_per_file
	LDRefine_cfg["num_overlap_record"] = args.num_overlap_record

	PIPELINE_BASEDIR = os.path.join(os.path.dirname(sys.argv[0]))
	CFG_DIR = os.path.join(PIPELINE_BASEDIR, "cfg")

	path_cluster_cfg = os.path.join(PIPELINE_BASEDIR, "cfg", "cluster.LDRefine.yaml")

	# has to merge cluster
	with open(path_cluster_cfg, 'r') as fh:
		cluster_cfg = yaml.safe_load(fh)

	# turn arguments into user_data that gets merged into pipeline config
	#
	# generic data first
	user_data = dict()
	user_data['cluster'] = cluster_cfg
	user_data['LDRefine'] = LDRefine_cfg

	pipeline_handler = PipelineHandler(
		"WEScall_LDRefine", PIPELINE_BASEDIR, 
		"LDRefine",user_data,
		Snakefile="pipelines/LDRefine/Snakefile.beagle."+get_seq_type_from_user_cfg(args.userCfg),
		cluster_cfgfile=path_cluster_cfg,
		user_cfgfile=args.userCfg)

	pipeline_handler.setup_env()
	pipeline_handler.submit(no_run=True)
Example no. 3
def varCall(args):
	logger.info("Preparing varCall pipeline...")
	print_parameters_given(args)

	logger.info("Validating sample index ...")
	validate_sample_list_file(args)

	logger.info("Validating user config file ...")
	validate_user_cfg(args)

	logger.info("Checking existence of essenstial resource files...")
	check_resource_files_for_varCall()

	logger.info("Checking dependencies...")
	check_dependencies()

	pipeline_handler = PipelineHandler(
		"WEScall_varCall",
		PIPELINE_BASEDIR,
		Snakefile="pipelines/varCall/Snakefile."+get_seq_type_from_user_cfg(args.userCfg),
		outdir="./varCall",
		user_data="",
		user_cfgfile=args.userCfg,
		cluster_cfgfile=CFG_DIR+"/cluster.varCall.yaml"
		)


	os.system("mkdir -p ./varCall/data")
	shutil.copy2(args.sample_list,"./varCall/data/samples.index")

	# automatically generate the pedigree file for the user
	# Since WEScall does not utilize pedigree information, the pedigree file
	# is just a formality so as to let the pipeline run
	with open(args.sample_list) as f_in, open("./varCall/data/samples.ped","w") as f_out:
		for line in f_in:
			record = line.strip().split("\t")
			f_out.write("{smp}\t{smp}\t{smp}\t0\t0\n".format(smp=record[0]))

	pipeline_handler.setup_env()
	pipeline_handler.submit(no_run=True)
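
A small sketch of the pedigree formality above: for each (hypothetical) tab-separated line in the sample index, varCall() writes one placeholder record that repeats the sample name and pads the remaining pedigree fields with zeros.

line = "NA12878\tNA12878.bam\n"  # hypothetical samples.index entry
record = line.strip().split("\t")
ped_line = "{smp}\t{smp}\t{smp}\t0\t0\n".format(smp=record[0])
assert ped_line == "NA12878\tNA12878\tNA12878\t0\t0\n"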
Example no. 4
def main():
    """main function
    """

    parser = argparse.ArgumentParser(description=__doc__.format(
        PIPELINE_NAME=PIPELINE_NAME, PIPELINE_VERSION=get_pipeline_version()))

    # generic args
    parser.add_argument('-o', "--outdir", required=True,
                        help="Output directory (may not exist)")
    parser.add_argument('--name',
                        help="Give this analysis run a name (used in email and report)")
    parser.add_argument('--no-mail', action='store_true',
                        help="Don't send mail on completion")
    #site = get_site()
    default = get_default_queue('slave')
    parser.add_argument('-w', '--slave-q', default=default,
                        help="Queue to use for slave jobs (default: {})".format(default))
    default = get_default_queue('master')
    parser.add_argument('-m', '--master-q', default=default,
                        help="Queue to use for master job (default: {})".format(default))
    parser.add_argument('-n', '--no-run', action='store_true')
    parser.add_argument('-v', '--verbose', action='count', default=0,
                        help="Increase verbosity")
    parser.add_argument('-q', '--quiet', action='count', default=0,
                        help="Decrease verbosity")
    cfg_group = parser.add_argument_group('Configuration files (advanced)')
    cfg_group.add_argument('--sample-cfg',
                           help="Config-file (YAML) listing samples and readunits."
                           " Collides with -1, -2 and -s")
    for name, descr in [("references", "reference sequences"),
                        ("params", "parameters"),
                        ("modules", "modules")]:
        default = os.path.abspath(os.path.join(CFG_DIR, "{}.yaml".format(name)))
        cfg_group.add_argument('--{}-cfg'.format(name),
                               default=default,
                               help="Config-file (yaml) for {}. (default: {})".format(descr, default))
        
    # pipeline specific args
    parser.add_argument('-1', "--fq1", nargs="+",
                        help="FastQ file/s (gzip only)."
                        " Multiple input files supported (auto-sorted)."
                        " Note: each file (or pair) gets a unique read-group id."
                        " Collides with --sample-cfg.")
    parser.add_argument('-2', "--fq2", nargs="+",
                        help="FastQ file/s (if paired) (gzip only). See also --fq1")
    parser.add_argument('-s', "--sample",
                        help="Sample name. Collides with --sample-cfg.")
    parser.add_argument('-t', "--seqtype", required=True,
                        choices=['WGS', 'WES', 'targeted'],
                        help="Sequencing type")
    parser.add_argument('-l', "--intervals",
                        help="Intervals file (e.g. bed file) listing regions of interest."
                        " Required for WES and targeted sequencing.")
    parser.add_argument('-D', '--dont-mark-dups', action='store_true',
                        help="Don't mark duplicate reads")

    args = parser.parse_args()

    # Repeatable -v and -q for setting logging level.
    # See https://www.reddit.com/r/Python/comments/3nctlm/what_python_tools_should_i_be_using_on_every/
    # and https://gist.github.com/andreas-wilm/b6031a84a33e652680d4
    # script -vv -> DEBUG
    # script -v -> INFO
    # script -> WARNING
    # script -q -> ERROR
    # script -qq -> CRITICAL
    # script -qqq -> no logging at all
    logger.setLevel(logging.WARN + 10*args.quiet - 10*args.verbose)
    aux_logger.setLevel(logging.WARN + 10*args.quiet - 10*args.verbose)

    if os.path.exists(args.outdir):
        logger.fatal("Output directory %s already exists", args.outdir)
        sys.exit(1)

    # samples is a dictionary with sample names as key (mostly just
    # one) and readunit keys as value. readunits is a dict with
    # readunits (think: fastq pairs with attributes) as value
    if args.sample_cfg:
        if any([args.fq1, args.fq2, args.sample]):
            logger.fatal("Config file overrides fastq and sample input arguments."
                         " Use one or the other")
            sys.exit(1)
        if not os.path.exists(args.sample_cfg):
            logger.fatal("Config file %s does not exist", args.sample_cfg)
            sys.exit(1)
        samples, readunits = get_samples_and_readunits_from_cfgfile(args.sample_cfg)
    else:
        if not all([args.fq1, args.sample]):
            logger.fatal("Need at least fq1 and sample without config file")
            sys.exit(1)

        readunits = get_readunits_from_args(args.fq1, args.fq2)
        # all readunits go into this one sample specified on the command-line
        samples = dict()
        samples[args.sample] = list(readunits.keys())

    # FIXME how to?
    #for p in ['bwa', 'samtools']:
    #    if not ref_is_indexed(args.reffa, p):
    #        logger.fatal("Reference '%s' doesn't appear to be indexed with %s", args.reffa, p)
    #        sys.exit(1)

    if args.seqtype in ['WES', 'targeted']:
        if not args.intervals:
            logger.fatal("Analysis of exome and targeted sequence runs requires a bed file")
            sys.exit(1)
        else:
            if not os.path.exists(args.intervals):
                logger.fatal("Intervals file %s does not exist", args.sample_cfg)
                sys.exit(1)
            logger.warning("Compatilibity between interval file and"
                           " reference not checked")# FIXME

    # turn arguments into user_data that gets merged into pipeline config
    #
    # generic data first
    user_data = dict()
    user_data['mail_on_completion'] = not args.no_mail
    user_data['readunits'] = readunits
    user_data['samples'] = samples
    if args.name:
        user_data['analysis_name'] = args.name
    


    user_data['seqtype'] = args.seqtype
    user_data['intervals'] = args.intervals
    user_data['mark_dups'] = not args.dont_mark_dups

    pipeline_handler = PipelineHandler(
        PIPELINE_NAME, PIPELINE_BASEDIR,
        args.outdir, user_data,
        master_q=args.master_q,
        slave_q=args.slave_q,
        params_cfgfile=args.params_cfg,
        modules_cfgfile=args.modules_cfg,
        refs_cfgfile=args.references_cfg,
        cluster_cfgfile=get_cluster_cfgfile(CFG_DIR))
    
    pipeline_handler.setup_env()
    pipeline_handler.submit(args.no_run)
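
A short sketch of the -v/-q arithmetic shared by these scripts: starting from logging.WARNING, each --quiet adds 10 and each --verbose subtracts 10, reproducing the mapping spelled out in the comments (-vv -> DEBUG ... -qq -> CRITICAL).

import logging

def level_from_counts(verbose, quiet):
    """Mirror of 'logging.WARN + 10*quiet - 10*verbose' used above."""
    return logging.WARNING + 10 * quiet - 10 * verbose

assert level_from_counts(2, 0) == logging.DEBUG      # -vv
assert level_from_counts(1, 0) == logging.INFO       # -v
assert level_from_counts(0, 0) == logging.WARNING    # default
assert level_from_counts(0, 1) == logging.ERROR      # -q
assert level_from_counts(0, 2) == logging.CRITICAL   # -qq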
Example no. 5
def main():
    """main function
    """

    default_parser = default_argparser(CFG_DIR)
    parser = configargparse.ArgumentParser(description=__doc__.format(
        PIPELINE_NAME=PIPELINE_NAME, PIPELINE_VERSION=get_pipeline_version()),
                                           parents=[default_parser])

    parser._optionals.title = "Arguments"
    # pipeline specific args
    parser.add_argument(
        "--control-fq1",
        nargs="+",
        help="Control FastQ file/s (gzip only)."
        " Multiple input files supported (auto-sorted)."
        " Note: each file (or pair) gets a unique read-group id."
        " Collides with --sample-cfg.")
    parser.add_argument(
        '--control-fq2',
        nargs="+",
        help=
        "Control FastQ file/s (if paired) (gzip only). See also --control-fq1")
    parser.add_argument(
        "--treatment-fq1",
        nargs="+",
        help="Treatment FastQ file/s (gzip only)."
        " Multiple input files supported (auto-sorted)."
        " Note: each file (or pair) gets a unique read-group id."
        " Collides with --sample-cfg.")
    parser.add_argument(
        '--treatment-fq2',
        nargs="+",
        help=
        "Treatment FastQ file/s (if paired) (gzip only). See also --treatment-fq1"
    )
    parser.add_argument(
        '--control-bam',
        help="Advanced: Injects control BAM (overwrites control-fq options)."
        " WARNING: reference and postprocessing need to match pipeline requirements"
    )
    parser.add_argument(
        '--treatment-bam',
        help="Advanced: Injects treatment BAM (overwrites treatment-fq options)."
        " WARNING: reference and postprocessing need to match pipeline requirements"
    )
    choices = ['bwa-aln', 'bwa-mem']
    default = choices[0]
    parser.add_argument('--mapper',
                        default=default,
                        choices=choices,
                        help="Mapper to use. One of {}. Default {}".format(
                            ",".join(choices), default))

    choices = ['TF', 'histone-narrow', 'histone-broad']  #, 'open-chromatin']
    parser.add_argument('-t',
                        '--peak-type',
                        required=True,
                        choices=choices,
                        help="Peak type. One of {}".format(",".join(choices)))
    parser.add_argument('--skip-macs2',
                        action='store_true',
                        help="Don't run MACS2")
    parser.add_argument('--skip-dfilter',
                        action='store_true',
                        help="Don't run DFilter")

    args = parser.parse_args()

    # Repeatable -v and -q for setting logging level.
    # See https://www.reddit.com/r/Python/comments/3nctlm/what_python_tools_should_i_be_using_on_every/
    # and https://gist.github.com/andreas-wilm/b6031a84a33e652680d4
    # script -vv -> DEBUG
    # script -v -> INFO
    # script -> WARNING
    # script -q -> ERROR
    # script -qq -> CRITICAL
    # script -qqq -> no logging at all
    logger.setLevel(logging.WARN + 10 * args.quiet - 10 * args.verbose)
    aux_logger.setLevel(logging.WARN + 10 * args.quiet - 10 * args.verbose)

    if os.path.exists(args.outdir):
        logger.fatal("Output directory %s already exists", args.outdir)
        sys.exit(1)

    # samples is a dictionary with sample names as key (mostly just
    # one) and readunit keys as value. readunits is a dict with
    # readunits (think: fastq pairs with attributes) as value
    if args.sample_cfg:
        if any([
                args.control_fq1, args.control_fq2, args.treatment_fq1,
                args.treatment_fq2, args.control_bam, args.treatment_bam
        ]):
            logger.fatal(
                "Config file overrides fastq and sample input arguments."
                " Use one or the other")
            sys.exit(1)
        if not os.path.exists(args.sample_cfg):
            logger.fatal("Config file %s does not exist", args.sample_cfg)
            sys.exit(1)
        samples, readunits = get_samples_and_readunits_from_cfgfile(
            args.sample_cfg)
    else:
        samples = dict()

        if args.control_bam:
            control_readunits = dict()
            samples["control"] = []
            assert os.path.exists(args.control_bam)
        else:
            if not all([args.control_fq1, args.treatment_fq1]):
                logger.fatal(
                    "Need at least control-fq1 and treatment-fq1 without config file")
                sys.exit(1)
            control_readunits = get_readunits_from_args(
                args.control_fq1, args.control_fq2)
            samples["control"] = list(control_readunits.keys())

        if args.treatment_bam:
            treatment_readunits = dict()
            samples["treatment"] = []
            assert os.path.exists(args.treatment_bam)
        else:
            treatment_readunits = get_readunits_from_args(
                args.treatment_fq1, args.treatment_fq2)
            samples["treatment"] = list(treatment_readunits.keys())

        readunits = dict(control_readunits)
        readunits.update(treatment_readunits)

    assert sorted(samples) == sorted(["control", "treatment"])

    # turn arguments into cfg_dict that gets merged into pipeline config
    #
    cfg_dict = dict()
    cfg_dict['readunits'] = readunits
    cfg_dict['samples'] = samples

    # either paired-end or not, but no mix allowed
    if all([ru.get('fq2') for ru in readunits.values()]):
        cfg_dict['paired_end'] = True
    elif not any([ru.get('fq2') for ru in readunits.values()]):
        cfg_dict['paired_end'] = False
    else:
        logger.fatal("Mixed paired-end and single-end not allowed")
        sys.exit(1)
    cfg_dict['peak_type'] = args.peak_type
    cfg_dict['mapper'] = args.mapper
    cfg_dict['skip_macs2'] = args.skip_macs2
    cfg_dict['skip_dfilter'] = args.skip_dfilter

    pipeline_handler = PipelineHandler(
        PIPELINE_NAME,
        PIPELINE_BASEDIR,
        args,
        cfg_dict,
        cluster_cfgfile=get_cluster_cfgfile(CFG_DIR))
    pipeline_handler.setup_env()

    if args.control_bam or args.treatment_bam:
        raise NotImplementedError("BAM injection not implemented yet")

    pipeline_handler.submit(args.no_run)
Example no. 6
def main():
    """main function
    """

    default_parser = default_argparser(CFG_DIR)
    parser = configargparse.ArgumentParser(description=__doc__.format(
        PIPELINE_NAME=PIPELINE_NAME, PIPELINE_VERSION=get_pipeline_version()),
                                           parents=[default_parser])

    parser._optionals.title = "Arguments"
    # pipeline specific args
    parser.add_argument(
        "--normal-fq1",
        nargs="+",
        help="Normal FastQ file/s (gzip only)."
        " Multiple input files supported (auto-sorted)."
        " Note: each file (or pair) gets a unique read-group id."
        " Collides with --sample-cfg.")
    parser.add_argument(
        '--normal-fq2',
        nargs="+",
        help=
        "Normal FastQ file/s (if paired) (gzip only). See also --normal-fq1")
    parser.add_argument(
        "--tumor-fq1",
        nargs="+",
        help="Tumor FastQ file/s (gzip only)."
        " Multiple input files supported (auto-sorted)."
        " Note: each file (or pair) gets a unique read-group id."
        " Collides with --sample-cfg.")
    parser.add_argument(
        '--tumor-fq2',
        nargs="+",
        help="Tumor FastQ file/s (if paired) (gzip only). See also --tumor-fq1"
    )
    parser.add_argument('-t',
                        "--seqtype",
                        required=True,
                        choices=['WGS', 'WES', 'targeted'],
                        help="Sequencing type")
    parser.add_argument('-l',
                        "--bed",
                        help="Bed file listing regions of interest."
                        " Required for WES and targeted sequencing.")
    parser.add_argument('-D',
                        '--dont-mark-dups',
                        action='store_true',
                        help="Don't mark duplicate reads")
    parser.add_argument(
        '--normal-bam',
        help="Advanced: Injects normal BAM (overwrites normal-fq options)."
        " WARNING: reference and postprocessing need to match pipeline requirements"
    )
    parser.add_argument(
        '--tumor-bam',
        help="Advanced: Injects tumor BAM (overwrites tumor-fq options)."
        " WARNING: reference and postprocessing need to match pipeline requirements"
    )

    args = parser.parse_args()

    # Repeatable -v and -q for setting logging level.
    # See https://www.reddit.com/r/Python/comments/3nctlm/what_python_tools_should_i_be_using_on_every/
    # and https://gist.github.com/andreas-wilm/b6031a84a33e652680d4
    # script -vv -> DEBUG
    # script -v -> INFO
    # script -> WARNING
    # script -q -> ERROR
    # script -qq -> CRITICAL
    # script -qqq -> no logging at all
    logger.setLevel(logging.WARN + 10 * args.quiet - 10 * args.verbose)
    aux_logger.setLevel(logging.WARN + 10 * args.quiet - 10 * args.verbose)

    if os.path.exists(args.outdir):
        logger.fatal("Output directory %s already exists", args.outdir)
        sys.exit(1)

    # samples is a dictionary with sample names as key (mostly just
    # one) and readunit keys as value. readunits is a dict with
    # readunits (think: fastq pairs with attributes) as value
    if args.sample_cfg:
        if any([
                args.normal_fq1, args.normal_fq2, args.tumor_fq1,
                args.tumor_fq2, args.normal_bam, args.tumor_bam
        ]):
            logger.fatal(
                "Config file overrides fastq and sample input arguments."
                " Use one or the other")
            sys.exit(1)
        if not os.path.exists(args.sample_cfg):
            logger.fatal("Config file %s does not exist", args.sample_cfg)
            sys.exit(1)
        samples, readunits = get_samples_and_readunits_from_cfgfile(
            args.sample_cfg)
    else:
        samples = dict()

        if args.normal_bam:
            normal_readunits = dict()
            samples["normal"] = []
            assert os.path.exists(args.normal_bam)
        else:
            if not all([args.normal_fq1, args.tumor_fq1]):
                logger.fatal(
                    "Need at least normal-fq1 and tumor-fq1 without config file")
                sys.exit(1)
            normal_readunits = get_readunits_from_args(args.normal_fq1,
                                                       args.normal_fq2)
            samples["normal"] = list(normal_readunits.keys())

        if args.tumor_bam:
            tumor_readunits = dict()
            samples["tumor"] = []
            assert os.path.exists(args.tumor_bam)
        else:
            tumor_readunits = get_readunits_from_args(args.tumor_fq1,
                                                      args.tumor_fq2)
            samples["tumor"] = list(tumor_readunits.keys())

        readunits = dict(normal_readunits)
        readunits.update(tumor_readunits)

    assert sorted(samples) == sorted(["normal", "tumor"])

    # FIXME how to
    # if not os.path.exists(reffa):
    #    logger.fatal("Reference '%s' doesn't exist", reffa)
    #    sys.exit(1)
    #
    #for p in ['bwa', 'samtools']:
    #    if not ref_is_indexed(reffa, p):
    #        logger.fatal("Reference '%s' doesn't appear to be indexed with %s", reffa, p)
    #        sys.exit(1)

    if args.seqtype in ['WES', 'targeted']:
        if not args.bed:
            logger.fatal(
                "Analysis of exome and targeted sequence runs requires a bed file"
            )
            sys.exit(1)
        else:
            if not os.path.exists(args.bed):
                logger.fatal("Bed file %s does not exist", args.sample_cfg)
                sys.exit(1)
            logger.warning("Compatilibity between bed file and"
                           " reference not checked")  # FIXME

    # turn arguments into cfg_dict that gets merged into pipeline config
    #
    cfg_dict = dict()
    cfg_dict['readunits'] = readunits
    cfg_dict['samples'] = samples

    cfg_dict['seqtype'] = args.seqtype
    cfg_dict['intervals'] = os.path.abspath(args.bed) if args.bed else None
    cfg_dict['mark_dups'] = not args.dont_mark_dups

    pipeline_handler = PipelineHandler(
        PIPELINE_NAME,
        PIPELINE_BASEDIR,
        args,
        cfg_dict,
        cluster_cfgfile=get_cluster_cfgfile(CFG_DIR))

    pipeline_handler.setup_env()

    # inject existing BAM by symlinking (everything upstream is temporary anyway)
    for sample, bam in [("normal", args.normal_bam),
                        ("tumor", args.tumor_bam)]:
        if bam:
            # target as defined in Snakefile!
            target = os.path.join(
                args.outdir, "out", sample,
                "{}.bwamem.lofreq.dedup.lacer.bam".format(sample))
            os.makedirs(os.path.dirname(target))
            os.symlink(os.path.abspath(bam), target)

    pipeline_handler.submit(args.no_run)
Example no. 7
def main():
    """main function
    """

    parser = argparse.ArgumentParser(description=__doc__.format(
        PIPELINE_NAME=PIPELINE_NAME, PIPELINE_VERSION=get_pipeline_version()))

    # generic args
    parser.add_argument('-c', "--config",
                        help="Config file (YAML) listing samples and readunits."
                        " Collides with -1, -2 and -s")
    parser.add_argument('-o', "--outdir", required=True,
                        help="Output directory (must not exist)")
    parser.add_argument('--no-mail', action='store_true',
                        help="Don't send mail on completion")
    site = get_site()
    default = DEFAULT_SLAVE_Q.get(site, None)
    parser.add_argument('-w', '--slave-q', default=default,
                        help="Queue to use for slave jobs (default: {})".format(default))
    default = DEFAULT_MASTER_Q.get(site, None)
    parser.add_argument('-m', '--master-q', default=default,
                        help="Queue to use for master job (default: {})".format(default))
    parser.add_argument('-n', '--no-run', action='store_true')
    parser.add_argument('-v', '--verbose', action='count', default=0,
                        help="Increase verbosity")
    parser.add_argument('-q', '--quiet', action='count', default=0,
                        help="Decrease verbosity")

    # pipeline specific args
    parser.add_argument('-1', "--fq1", nargs="+",
                        help="FastQ file/s (gzip only)."
                        " Multiple input files supported (auto-sorted)."
                        " Note: each file (or pair) gets a unique read-group id."
                        " Collides with -c.")
    parser.add_argument('-2', "--fq2", nargs="+",
                        help="FastQ file/s (if paired) (gzip only). See also --fq1")
    parser.add_argument('-s', "--sample",
                        help="Sample name. Collides with -c.")
    fake_pipeline_handler = PipelineHandler("FAKE", PIPELINE_BASEDIR, "FAKE", None)
    default_cfg = fake_pipeline_handler.read_default_config()
    default = default_cfg['references']['genome']
    parser.add_argument('-r', "--reffa", default=default,
                        help=argparse.SUPPRESS)
                        # WARN do not change. this is just to set args.reffa (used later).
                        # any change here would require changes in dbsnp, hapmap, g1k, omni and mills as well
    parser.add_argument('-t', "--seqtype", required=True,
                        choices=['WGS', 'WES', 'targeted'],
                        help="Sequencing type")
    parser.add_argument('-l', "--intervals",
                        help="Intervals file (e.g. bed file) listing regions of interest."
                        " Required for WES and targeted sequencing.")

    args = parser.parse_args()

    # Repeatable -v and -q for setting logging level.
    # See https://www.reddit.com/r/Python/comments/3nctlm/what_python_tools_should_i_be_using_on_every/
    # and https://gist.github.com/andreas-wilm/b6031a84a33e652680d4
    # script -vv -> DEBUG
    # script -v -> INFO
    # script -> WARNING
    # script -q -> ERROR
    # script -qq -> CRITICAL
    # script -qqq -> no logging at all
    logger.setLevel(logging.WARN + 10*args.quiet - 10*args.verbose)
    aux_logger.setLevel(logging.WARN + 10*args.quiet - 10*args.verbose)

    if os.path.exists(args.outdir):
        logger.fatal("Output directory %s already exists", args.outdir)
        sys.exit(1)

    # samples is a dictionary with sample names as key (mostly just
    # one) and readunit keys as value. readunits is a dict with
    # readunits (think: fastq pairs with attributes) as value
    if args.config:
        if any([args.fq1, args.fq2, args.sample]):
            logger.fatal("Config file overrides fastq and sample input arguments."
                         " Use one or the other")
            sys.exit(1)
        if not os.path.exists(args.config):
            logger.fatal("Config file %s does not exist", args.config)
            sys.exit(1)
        samples, readunits = get_samples_and_readunits_from_cfgfile(args.config)
    else:
        if not all([args.fq1, args.sample]):
            logger.fatal("Need at least fq1 and sample without config file")
            sys.exit(1)

        readunits = get_readunits_from_args(args.fq1, args.fq2)
        # all readunits go into this one sample specified on the command-line
        samples = dict()
        samples[args.sample] = list(readunits.keys())

    if args.seqtype in ['WES', 'targeted']:
        if not args.intervals:
            logger.fatal("Analysis of exome and targeted sequence runs requires a bed file")
            sys.exit(1)
        else:
            if not os.path.exists(args.intervals):
                logger.fatal("Intervals file %s does not exist", args.config)
                sys.exit(1)
            logger.warning("Compatilibity between interval file and"
                           " reference not checked")# FIXME

    # turn arguments into user_data that gets merged into pipeline config
    #
    # generic data first
    user_data = dict()
    user_data['mail_on_completion'] = not args.no_mail
    user_data['readunits'] = readunits
    user_data['samples'] = samples

    user_data['num_chroms'] = len(list(chroms_and_lens_from_from_fasta(args.reffa)))
    user_data['seqtype'] = args.seqtype
    user_data['intervals'] = args.intervals# always safe, might be used for WGS as well
    user_data['mark_dups'] = MARK_DUPS

    pipeline_handler = PipelineHandler(
        PIPELINE_NAME, PIPELINE_BASEDIR,
        args.outdir, user_data, site=site,
        master_q=args.master_q, slave_q=args.slave_q)
    pipeline_handler.setup_env()
    pipeline_handler.submit(args.no_run)
Example no. 8
def main():
    """main function
    """

    default_parser = default_argparser(CFG_DIR)
    parser = argparse.ArgumentParser(description=__doc__.format(
        PIPELINE_NAME=PIPELINE_NAME, PIPELINE_VERSION=get_pipeline_version()),
                                     parents=[default_parser])

    parser._optionals.title = "Arguments"
    # pipeline specific args
    parser.add_argument(
        '-1',
        "--fq1",
        nargs="+",
        help="FastQ file/s (gzip only)."
        " Multiple input files supported (auto-sorted)."
        " Note: each file (or pair) gets a unique read-group id."
        " Collides with --sample-cfg.")
    parser.add_argument(
        '-2',
        "--fq2",
        nargs="+",
        help="FastQ file/s (if paired) (gzip only). See also --fq1")
    parser.add_argument('-s',
                        "--sample",
                        help="Sample name. Collides with --sample-cfg.")

    args = parser.parse_args()

    # Repeatable -v and -q for setting logging level.
    # See https://www.reddit.com/r/Python/comments/3nctlm/what_python_tools_should_i_be_using_on_every/
    # and https://gist.github.com/andreas-wilm/b6031a84a33e652680d4
    # script -vv -> DEBUG
    # script -v -> INFO
    # script -> WARNING
    # script -q -> ERROR
    # script -qq -> CRITICAL
    # script -qqq -> no logging at all
    logger.setLevel(logging.WARN + 10 * args.quiet - 10 * args.verbose)
    aux_logger.setLevel(logging.WARN + 10 * args.quiet - 10 * args.verbose)

    if os.path.exists(args.outdir):
        logger.fatal("Output directory %s already exists", args.outdir)
        sys.exit(1)

    # samples is a dictionary with sample names as key (mostly just
    # one) and readunit keys as value. readunits is a dict with
    # readunits (think: fastq pairs with attributes) as value
    if args.sample_cfg:
        if any([args.fq1, args.fq2, args.sample]):
            logger.fatal(
                "Config file overrides fastq and sample input arguments."
                " Use one or the other")
            sys.exit(1)
        if not os.path.exists(args.sample_cfg):
            logger.fatal("Config file %s does not exist", args.sample_cfg)
            sys.exit(1)
        samples, readunits = get_samples_and_readunits_from_cfgfile(
            args.sample_cfg)
    else:
        if not all([args.fq1, args.sample]):
            logger.fatal("Need at least fq1 and sample without config file")
            sys.exit(1)

        readunits = get_readunits_from_args(args.fq1, args.fq2)
        # all readunits go into this one sample specified on the command-line
        samples = dict()
        samples[args.sample] = list(readunits.keys())

    # FIXME now exported to ref.cfg. how to auto check there?
    #if not os.path.exists(args.reffa):
    #    logger.fatal("Reference '%s' doesn't exist", args.reffa)
    #    sys.exit(1)
    #
    #for p in ['bwa', 'samtools']:
    #    if not ref_is_indexed(args.reffa, p):
    #        logger.fatal("Reference '%s' doesn't appear to be indexed"
    #                     " with %s", args.reffa, p)
    #        sys.exit(1)

    # turn arguments into cfg_dict that gets merged into pipeline config
    #
    cfg_dict = dict()
    cfg_dict['readunits'] = readunits
    cfg_dict['samples'] = samples
    cfg_dict['mark_dups'] = MARK_DUPS

    pipeline_handler = PipelineHandler(
        PIPELINE_NAME,
        PIPELINE_BASEDIR,
        args,
        cfg_dict,
        cluster_cfgfile=get_cluster_cfgfile(CFG_DIR))
    pipeline_handler.setup_env()
    pipeline_handler.submit(args.no_run)
Example no. 9
def main():
    """main function
    """

    parser = argparse.ArgumentParser(description=__doc__.format(
        PIPELINE_NAME=PIPELINE_NAME, PIPELINE_VERSION=get_pipeline_version()))

    # generic args
    parser.add_argument('-c', "--config",
                        help="Config file (YAML) listing samples and readunits."
                        " Collides with -1, -2 and -s")
    parser.add_argument('-o', "--outdir", required=True,
                        help="Output directory (must not exist)")
    parser.add_argument('--no-mail', action='store_true',
                        help="Don't send mail on completion")
    site = get_site()
    default = DEFAULT_SLAVE_Q.get(site, None)
    parser.add_argument('-w', '--slave-q', default=default,
                        help="Queue to use for slave jobs (default: {})".format(default))
    default = DEFAULT_MASTER_Q.get(site, None)
    parser.add_argument('-m', '--master-q', default=default,
                        help="Queue to use for master job (default: {})".format(default))
    parser.add_argument('-n', '--no-run', action='store_true')
    parser.add_argument('-v', '--verbose', action='count', default=0,
                        help="Increase verbosity")
    parser.add_argument('-q', '--quiet', action='count', default=0,
                        help="Decrease verbosity")

    # pipeline specific args
    parser.add_argument('-1', "--fq1", nargs="+",
                        help="FastQ file/s (gzip only)."
                        " Multiple input files supported (auto-sorted)."
                        " Note: each file (or pair) gets a unique read-group id."
                        " Collides with -c.")
    parser.add_argument('-2', "--fq2", nargs="+",
                        help="FastQ file/s (if paired) (gzip only). See also --fq1")
    parser.add_argument('-s', "--sample",
                        help="Sample name. Collides with -c.")
    parser.add_argument('-C', "--cuffdiff", action='store_true',
                        dest="run_cuffdiff",
                        help="Also run cuffdiff")
    parser.add_argument('-S', '--stranded', action='store_true',
                        help="Stranded library prep (default is unstranded)")

    args = parser.parse_args()

    # Repeatable -v and -q for setting logging level.
    # See https://www.reddit.com/r/Python/comments/3nctlm/what_python_tools_should_i_be_using_on_every/
    # and https://gist.github.com/andreas-wilm/b6031a84a33e652680d4
    # script -vv -> DEBUG
    # script -v -> INFO
    # script -> WARNING
    # script -q -> ERROR
    # script -qq -> CRITICAL
    # script -qqq -> no logging at all
    logger.setLevel(logging.WARN + 10*args.quiet - 10*args.verbose)
    aux_logger.setLevel(logging.WARN + 10*args.quiet - 10*args.verbose)

    if os.path.exists(args.outdir):
        logger.fatal("Output directory %s already exists", args.outdir)
        sys.exit(1)

    # samples is a dictionary with sample names as key (mostly just
    # one) and readunit keys as value. readunits is a dict with
    # readunits (think: fastq pairs with attributes) as value
    if args.config:
        if any([args.fq1, args.fq2, args.sample]):
            logger.fatal("Config file overrides fastq and sample input arguments."
                         " Use one or the other")
            sys.exit(1)
        if not os.path.exists(args.config):
            logger.fatal("Config file %s does not exist", args.config)
            sys.exit(1)
        samples, readunits = get_samples_and_readunits_from_cfgfile(args.config)
    else:
        if not all([args.fq1, args.sample]):
            logger.fatal("Need at least fq1 and sample without config file")
            sys.exit(1)

        readunits = get_readunits_from_args(args.fq1, args.fq2)
        # all readunits go into this one sample specified on the command-line
        samples = dict()
        samples[args.sample] = list(readunits.keys())

    # FIXME checks on reffa index (currently not exposed via args)


    # turn arguments into user_data that gets merged into pipeline config
    #
    # generic data first
    user_data = dict()
    user_data['mail_on_completion'] = not args.no_mail
    user_data['readunits'] = readunits
    user_data['samples'] = samples

    user_data['stranded'] = args.stranded
    user_data['run_cuffdiff'] = args.run_cuffdiff
    user_data['paired_end'] = any(ru.get('fq2') for ru in readunits.values())
    if user_data['paired_end']:
        assert all(ru.get('fq2') for ru in readunits.values()), (
            "Can't handle mix of paired-end and single-end")

    pipeline_handler = PipelineHandler(
        PIPELINE_NAME, PIPELINE_BASEDIR,
        args.outdir, user_data, site=site,
        master_q=args.master_q, slave_q=args.slave_q)
    pipeline_handler.setup_env()
    pipeline_handler.submit(args.no_run)
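
A sketch of the paired-end consistency check used above, with hypothetical readunits: each readunit carries an 'fq1' entry and, for paired-end data, an 'fq2' entry, and mixing paired-end with single-end units is rejected.

readunits = {
    "ru1": {"fq1": "s_R1_L001.fastq.gz", "fq2": "s_R2_L001.fastq.gz"},
    "ru2": {"fq1": "s_R1_L002.fastq.gz", "fq2": "s_R2_L002.fastq.gz"},
}  # hypothetical readunits, shaped like the output of get_readunits_from_args()
paired_end = any(ru.get('fq2') for ru in readunits.values())
if paired_end:
    assert all(ru.get('fq2') for ru in readunits.values()), (
        "Can't handle mix of paired-end and single-end")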
Example no. 10
def main():
    """main function
    """

    default_parser = default_argparser(CFG_DIR, with_readunits=True)
    parser = configargparse.ArgumentParser(description=__doc__.format(
        PIPELINE_NAME=PIPELINE_NAME, PIPELINE_VERSION=get_pipeline_version()),
                                           parents=[default_parser])

    parser._optionals.title = "Arguments"
    # pipeline specific args
    parser.add_argument("--cuffdiff",
                        action='store_true',
                        dest="run_cuffdiff",
                        help="Also run cuffdiff")
    choices = ["none", "forward", "reverse"]
    default = "none"
    parser.add_argument(
        '--stranded',
        choices=choices,
        default=default,
        help=
        "Stranded library prep (default is {}; Following RSEM definition but see also"
        " http://chipster.csc.fi/manual/library-type-summary.html)".format(
            default))
    parser.add_argument(
        '--rsem-estimate-rspd',
        action='store_true',
        help="Estimate read start position distribution in RSEM")
    args = parser.parse_args()

    # Repeatable -v and -q for setting logging level.
    # See https://www.reddit.com/r/Python/comments/3nctlm/what_python_tools_should_i_be_using_on_every/
    # and https://gist.github.com/andreas-wilm/b6031a84a33e652680d4
    # script -vv -> DEBUG
    # script -v -> INFO
    # script -> WARNING
    # script -q -> ERROR
    # script -qq -> CRITICAL
    # script -qqq -> no logging at all
    logger.setLevel(logging.WARN + 10 * args.quiet - 10 * args.verbose)
    aux_logger.setLevel(logging.WARN + 10 * args.quiet - 10 * args.verbose)

    if os.path.exists(args.outdir):
        logger.fatal("Output directory %s already exists", args.outdir)
        sys.exit(1)

    # samples is a dictionary with sample names as key (mostly just
    # one) and readunit keys as value. readunits is a dict with
    # readunits (think: fastq pairs with attributes) as value
    if args.sample_cfg:
        if any([args.fq1, args.fq2, args.sample]):
            logger.fatal(
                "Config file overrides fastq and sample input arguments."
                " Use one or the other")
            sys.exit(1)
        if not os.path.exists(args.sample_cfg):
            logger.fatal("Config file %s does not exist", args.sample_cfg)
            sys.exit(1)
        samples, readunits = get_samples_and_readunits_from_cfgfile(
            args.sample_cfg)
    else:
        if not all([args.fq1, args.sample]):
            logger.fatal("Need at least fq1 and sample without config file")
            sys.exit(1)

        readunits = get_readunits_from_args(args.fq1, args.fq2)
        # all readunits go into this one sample specified on the command-line
        samples = dict()
        samples[args.sample] = list(readunits.keys())

    # FIXME add checks on reffa index (currently not exposed via args)

    # turn arguments into cfg_dict that gets merged into pipeline config
    #
    cfg_dict = dict()
    cfg_dict['readunits'] = readunits
    cfg_dict['samples'] = samples
    cfg_dict['rsem_extra_args'] = ''
    if args.rsem_estimate_rspd:
        cfg_dict['rsem_extra_args'] += ' --estimate-rspd'
    cfg_dict['stranded'] = args.stranded
    cfg_dict['run_cuffdiff'] = args.run_cuffdiff
    cfg_dict['paired_end'] = any(ru.get('fq2') for ru in readunits.values())
    if cfg_dict['paired_end']:
        assert all(ru.get('fq2') for ru in readunits.values()), (
            "Can't handle mix of paired-end and single-end")

    pipeline_handler = PipelineHandler(
        PIPELINE_NAME,
        PIPELINE_BASEDIR,
        args,
        cfg_dict,
        cluster_cfgfile=get_cluster_cfgfile(CFG_DIR))
    pipeline_handler.setup_env()
    pipeline_handler.submit(args.no_run)
Example no. 11
def main():
    """main function
    """

    # FIXME ugly and code duplication in bcl2fastq_dbupdate.py
    mongo_status_script = os.path.abspath(
        os.path.join(os.path.dirname(sys.argv[0]), "mongo_status.py"))
    assert os.path.exists(mongo_status_script)

    default_parser = default_argparser(CFG_DIR,
                                       allow_missing_cfgfile=True,
                                       allow_missing_outdir=True,
                                       default_db_logging=True)
    parser = argparse.ArgumentParser(description=__doc__.format(
        PIPELINE_NAME=PIPELINE_NAME, PIPELINE_VERSION=get_pipeline_version()),
                                     parents=[default_parser])
    parser._optionals.title = "Arguments"
    # pipeline specific args
    parser.add_argument('-r',
                        "--runid",
                        help="Run ID plus flowcell ID (clashes with -d)")
    parser.add_argument(
        '-d',
        "--rundir",
        help=
        "BCL input directory (clashes with -r; you also probably want to disable logging)"
    )
    parser.add_argument('-t',
                        "--testing",
                        action='store_true',
                        help="Use MongoDB test server")
    parser.add_argument('--no-archive',
                        action='store_true',
                        help="Don't archieve this analysis")
    parser.add_argument(
        '-l',
        '--lanes',
        type=int,
        nargs="*",
        help="Limit run to given lane/s (multiples separated by space")
    parser.add_argument(
        '-i',
        '--mismatches',
        type=int,
        help="Max. number of allowed barcode mismatches (0>=x<=2)"
        " setting a value here overrides the default settings read from ELM)")

    args = parser.parse_args()

    # Repeatable -v and -q for setting logging level.
    # See https://www.reddit.com/r/Python/comments/3nctlm/what_python_tools_should_i_be_using_on_every/
    # and https://gist.github.com/andreas-wilm/b6031a84a33e652680d4
    # script -vv -> DEBUG
    # script -v -> INFO
    # script -> WARNING
    # script -q -> ERROR
    # script -qq -> CRITICAL
    # script -qqq -> no logging at all
    logger.setLevel(logging.WARN + 10 * args.quiet - 10 * args.verbose)
    aux_logger.setLevel(logging.WARN + 10 * args.quiet - 10 * args.verbose)

    if args.mismatches is not None:
        if args.mismatches > 2 or args.mismatches < 0:
            logger.fatal("Number of mismatches must be between 0-2")
            sys.exit(1)

    lane_info = ''
    lane_nos = []
    if args.lanes:
        lane_info = '--tiles '
        for lane in args.lanes:
            if lane > 8 or lane < 1:
                logger.fatal("Lane number must be between 1-8")
                sys.exit(1)
            else:
                lane_info += 's_{}'.format(lane) + ','
        lane_info = lane_info.rstrip()
        lane_info = lane_info[:-1]
        lane_nos = list(args.lanes)

    if args.runid and args.rundir:
        logger.fatal(
            "Cannot use run-id and input directory arguments simultaneously")
        sys.exit(1)
    elif args.runid:
        rundir = run_folder_for_run_id(args.runid)
    elif args.rundir:
        rundir = os.path.abspath(args.rundir)
    else:
        logger.fatal("Need either run-id or input directory")
        sys.exit(1)
    if not os.path.exists(rundir):
        logger.fatal("Expected run directory %s does not exist", rundir)
        sys.exit(1)
    logger.info("Rundir is %s", rundir)

    if not args.outdir:
        outdir = get_bcl2fastq_outdir(args.runid)
        args.outdir = outdir
    else:
        outdir = args.outdir
    if os.path.exists(outdir):
        logger.fatal("Output directory %s already exists", outdir)
        sys.exit(1)
    # create now so that generate_bcl2fastq_cfg.py can run
    os.makedirs(outdir)

    # catch cases where rundir was user provided and looks weird
    try:
        _, runid, flowcellid = get_machine_run_flowcell_id(rundir)
        run_num = runid + "_" + flowcellid
    except:
        run_num = "UNKNOWN-" + rundir.split("/")[-1]

    # call generate_bcl2fastq_cfg
    #
    # FIXME ugly assumes same directory (just like import above). better to import and run main()?
    generate_bcl2fastq = os.path.join(os.path.dirname(sys.argv[0]),
                                      "generate_bcl2fastq_cfg.py")
    assert os.path.exists(generate_bcl2fastq)
    cmd = [generate_bcl2fastq, '-r', rundir, '-o', outdir]
    if args.testing:
        cmd.append("-t")
    logger.debug("Executing %s", ' '.join(cmd))
    try:
        res = subprocess.check_output(cmd, stderr=subprocess.STDOUT)
    except subprocess.CalledProcessError as e:
        logger.fatal("The following command failed with return code %s: %s",
                     e.returncode, ' '.join(cmd))
        logger.fatal("Output: %s", e.output.decode())
        logger.fatal("Exiting")
        os.rmdir(outdir)
        sys.exit(1)
    # generate_bcl2fastq is normally quiet. if there's output, make caller aware of it
    # use sys instead of logger to avoid double logging
    if res:
        sys.stderr.write(res.decode())

    # just created files
    muxinfo_cfg = os.path.join(outdir, MUXINFO_CFG)
    status_cfg = os.path.join(outdir, STATUS_CFG)

    # NOTE: signal for failed runs is exit 0 from generate_bcl2fastq and missing output files
    #
    if any([not os.path.exists(x) for x in [muxinfo_cfg]]):
        # one missing means all should be missing
        assert all([not os.path.exists(x) for x in [muxinfo_cfg]])
        # Check status as seqrunfailed or non-bcl run
        with open(status_cfg, 'r') as fh:
            status = fh.read().strip()
        update_run_status(mongo_status_script, run_num, outdir, status,
                          args.testing)
        sys.exit(0)

    # turn arguments into cfg_dict that gets merged into pipeline config
    cfg_dict = {
        'rundir': rundir,
        'lanes_arg': lane_info,
        'no_archive': args.no_archive,
        'run_num': run_num
    }

    mux_units = get_mux_units_from_cfgfile(muxinfo_cfg, lane_nos)
    if args.mismatches is not None:
        mux_units = [
            mu._replace(barcode_mismatches=args.mismatches) for mu in mux_units
        ]
    os.unlink(muxinfo_cfg)

    cfg_dict['units'] = dict()
    for mu in mux_units:
        # special case: mux split across multiple lanes. make lanes a list
        # and add in extra lanes if needed.
        k = mu.mux_dir
        mu_dict = dict(mu._asdict())
        cfg_dict['units'][k] = mu_dict

    # create mongodb update command, used later, after submission
    mongo_update_cmd = "{} -r {} -s STARTED".format(mongo_status_script,
                                                    cfg_dict['run_num'])
    mongo_update_cmd += " -a $ANALYSIS_ID -o {}".format(
        outdir)  # set in run.sh
    if args.testing:
        mongo_update_cmd += " -t"

    pipeline_handler = PipelineHandler(
        PIPELINE_NAME,
        PIPELINE_BASEDIR,
        args,
        cfg_dict,
        logger_cmd=mongo_update_cmd,
        cluster_cfgfile=get_cluster_cfgfile(CFG_DIR))

    pipeline_handler.setup_env()
    pipeline_handler.submit(args.no_run)
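
A sketch of the --lanes handling in the example above: for, say, --lanes 1 3 the loop assembles the bcl2fastq tiles argument "--tiles s_1,s_3", with the trailing comma stripped afterwards (the lane values here are hypothetical).

lanes = [1, 3]  # hypothetical --lanes values
lane_info = '--tiles '
for lane in lanes:
    lane_info += 's_{}'.format(lane) + ','
lane_info = lane_info.rstrip()[:-1]  # drop the trailing comma
assert lane_info == '--tiles s_1,s_3'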
Example no. 12
def main():
    """main function
    """

    default_parser = default_argparser(CFG_DIR, with_readunits=True)
    parser = configargparse.ArgumentParser(description=__doc__.format(
        PIPELINE_NAME=PIPELINE_NAME, PIPELINE_VERSION=get_pipeline_version()),
                                           parents=[default_parser])

    parser._optionals.title = "Arguments"
    # pipeline specific args
    parser.add_argument('-t',
                        "--seqtype",
                        required=True,
                        choices=['WGS', 'WES', 'targeted'],
                        help="Sequencing type")
    parser.add_argument('-l',
                        "--bed",
                        help="Bed file listing regions of interest."
                        " Required for WES and targeted sequencing.")
    default = 4
    parser.add_argument(
        "-c",
        "--hc-nct",
        default=default,
        type=int,
        help="Number of Haplotype Caller threads (per region cluster)."
        " Values>1 reported to make Haplotype Caller unstable (default={})".
        format(default))
    default = 100
    parser.add_argument(
        '-i',
        "--interval-padding",
        default=default,
        help="Interval padding (for non-WGS only; default = {})".format(
            default))
    parser.add_argument(
        '-j',
        "--joint-calls",
        action='store_true',
        help="Perform joint/cohort calling (requires multisample input)")
    parser.add_argument(
        '--raw-bam',
        help=
        "Advanced: Injects raw (pre-dedup, pre-BQSR etc.) BAM (overwrites fq options)."
        " WARNING: reference needs to match pipeline requirements")
    parser.add_argument(
        '--proc-bam',
        help=
        "Advanced: Injects processed (post-dedup, post-BQSR etc.) BAM (overwrites fq options)."
        " WARNING: reference and pre-processing need to match pipeline requirements"
    )
    # FIXME can be achieved with --until rule as well
    parser.add_argument('--bam-only',
                        action='store_true',
                        help="Only process up until BAM file")
    parser.add_argument('--gvcf-only',
                        action='store_true',
                        help="Only process up until GVCF file")

    args = parser.parse_args()

    # Repeatable -v and -q for setting logging level.
    # See https://www.reddit.com/r/Python/comments/3nctlm/what_python_tools_should_i_be_using_on_every/
    # and https://gist.github.com/andreas-wilm/b6031a84a33e652680d4
    # script -vv -> DEBUG
    # script -v -> INFO
    # script -> WARNING
    # script -q -> ERROR
    # script -qq -> CRITICAL
    # script -qqq -> no logging at all
    logger.setLevel(logging.WARN + 10 * args.quiet - 10 * args.verbose)
    aux_logger.setLevel(logging.WARN + 10 * args.quiet - 10 * args.verbose)

    if os.path.exists(args.outdir):
        logger.fatal("Output directory %s already exists", args.outdir)
        sys.exit(1)

    # samples is a dictionary with sample names as key (mostly just
    # one) and readunit keys as value. readunits is a dict with
    # readunits (think: fastq pairs with attributes) as value
    if args.sample_cfg:
        if any([args.fq1, args.fq2, args.sample, args.raw_bam, args.proc_bam]):
            logger.fatal(
                "Config file overrides fastq, sample and BAM arguments."
                " Use one or the other")
            sys.exit(1)
        if not os.path.exists(args.sample_cfg):
            logger.fatal("Config file %s does not exist", args.sample_cfg)
            sys.exit(1)
        samples, readunits = get_samples_and_readunits_from_cfgfile(
            args.sample_cfg)

    else:  # no sample config, so input is either fastq or existing bam
        samples = dict()

        if not args.sample:
            logger.fatal("Need sample name if not using config file")
            sys.exit(1)

        if args.raw_bam or args.proc_bam:
            assert not args.fq1, ("BAM injection overwrites fastq arguments")

            if args.raw_bam:
                assert os.path.exists(args.raw_bam)
                assert not args.proc_bam, (
                    "Cannot inject raw and processed BAM")
            if args.proc_bam:
                assert os.path.exists(args.proc_bam)
                assert not args.raw_bam, (
                    "Cannot inject raw and processed BAM")

            readunits = dict()
            samples[args.sample] = []

        elif args.fq1:

            readunits = get_readunits_from_args(args.fq1, args.fq2)
            # all readunits go into this one sample specified on the command-line
            samples[args.sample] = list(readunits.keys())

        else:
            logger.fatal(
                "Need at least one fastq files as argument if not using config file"
            )
            sys.exit(1)

    if args.seqtype in ['WES', 'targeted']:
        if not args.bed:
            logger.fatal(
                "Analysis of exome and targeted sequence runs requires a bed file"
            )
            sys.exit(1)
        else:
            if not os.path.exists(args.bed):
                logger.fatal("Bed file %s does not exist", args.sample_cfg)
                sys.exit(1)

    if args.joint_calls:
        if len(samples) < 2:
            logger.fatal("Need at least two samples for joint calling")
            sys.exit(1)

    # turn arguments into cfg_dict (gets merged with other configs late)
    #
    cfg_dict = dict()
    cfg_dict['readunits'] = readunits
    cfg_dict['samples'] = samples
    cfg_dict['seqtype'] = args.seqtype
    cfg_dict['intervals'] = os.path.abspath(
        args.bed
    ) if args.bed else None  # always safe, might be used for WGS as well
    cfg_dict['mark_dups'] = MARK_DUPS
    cfg_dict['bam_only'] = args.bam_only
    cfg_dict['gvcf_only'] = args.gvcf_only
    cfg_dict['hc_nct'] = args.hc_nct
    cfg_dict['joint_calls'] = args.joint_calls
    cfg_dict['interval_padding'] = args.interval_padding
    pipeline_handler = PipelineHandler(
        PIPELINE_NAME,
        PIPELINE_BASEDIR,
        args,
        cfg_dict,
        cluster_cfgfile=get_cluster_cfgfile(CFG_DIR))
    pipeline_handler.setup_env()

    # Inject existing BAM by symlinking (everything upstream is temporary anyway)
    # WARNING: filename has to match definition in Snakefile!
    if args.raw_bam:
        target = os.path.join(args.outdir, "out", args.sample,
                              "{}.bwamem.bam".format(args.sample))
        os.makedirs(os.path.dirname(target))
        os.symlink(os.path.abspath(args.raw_bam), target)

        src_bai = os.path.abspath(args.raw_bam) + ".bai"
        if os.path.exists(src_bai):
            os.symlink(src_bai, target + ".bai")

    elif args.proc_bam:
        target = os.path.join(args.outdir, "out", args.sample,
                              "{}.bwamem".format(args.sample))
        if cfg_dict['mark_dups']:
            target += ".dedup"
        if cfg_dict['seqtype'] != 'targeted':
            target += ".bqsr"
        target += ".bam"
        os.makedirs(os.path.dirname(target))
        os.symlink(os.path.abspath(args.proc_bam), target)
        if os.path.exists(os.path.abspath(args.proc_bam) + ".bai"):
            os.symlink(
                os.path.abspath(args.proc_bam) + ".bai", target + ".bai")

    pipeline_handler.submit(args.no_run)
Example 13
def main():
    """main function
    """

    default_parser = default_argparser(CFG_DIR, with_readunits=True)
    parser = configargparse.ArgumentParser(description=__doc__.format(
        PIPELINE_NAME=PIPELINE_NAME, PIPELINE_VERSION=get_pipeline_version()),
                                           parents=[default_parser])

    parser._optionals.title = "Arguments"
    # pipeline specific args
    default = 2000
    parser.add_argument(
        "--fragment-length",
        type=int,
        default=default,
        help="Fragment length argument for Bowtie (default {})".format(
            default))
    default = 200
    parser.add_argument(
        "--extsize",
        type=int,
        default=default,
        help=
        "extsize argument for MACS2; only used for single-end reads (default {})"
        .format(default))
    default = -100
    parser.add_argument(
        "--shift",
        type=int,
        default=default,
        help=
        "shift argument for MACS2; only used for single-end reads (default {})"
        .format(default))
    default = 250
    parser.add_argument(
        "--peak-ext-bp",
        type=int,
        default=default,
        help="Extension around peaks for bed creation (default {})".format(
            default))
    args = parser.parse_args()

    # Repeatable -v and -q for setting logging level.
    # See https://www.reddit.com/r/Python/comments/3nctlm/what_python_tools_should_i_be_using_on_every/
    # and https://gist.github.com/andreas-wilm/b6031a84a33e652680d4
    # script -vv -> DEBUG
    # script -v -> INFO
    # script -> WARNING
    # script -q -> ERROR
    # script -qq -> CRITICAL
    # script -qqq -> no logging at all
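    # i.e. level = logging.WARN (30) + 10*quiet - 10*verbose,
    # e.g. -vv: 30 + 0 - 20 = 10 == logging.DEBUG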
    logger.setLevel(logging.WARN + 10 * args.quiet - 10 * args.verbose)
    aux_logger.setLevel(logging.WARN + 10 * args.quiet - 10 * args.verbose)

    if os.path.exists(args.outdir):
        logger.fatal("Output directory %s already exists", args.outdir)
        sys.exit(1)

    # samples is a dictionary with sample names as key (mostly just
    # one) and readunit keys as value. readunits is a dict with
    # readunits (think: fastq pairs with attributes) as value
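    # illustrative shapes only (readunit keys and extra fields vary):
    #   readunits = {'ru-1': {'fq1': 'a_R1.fastq.gz', 'fq2': 'a_R2.fastq.gz'}}
    #   samples = {'sampleA': ['ru-1']}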
    if args.sample_cfg:
        if any([args.fq1, args.fq2, args.sample]):
            logger.fatal(
                "Config file overrides fastq and sample input arguments."
                " Use one or the other")
            sys.exit(1)
        if not os.path.exists(args.sample_cfg):
            logger.fatal("Config file %s does not exist", args.sample_cfg)
            sys.exit(1)
        samples, readunits = get_samples_and_readunits_from_cfgfile(
            args.sample_cfg)
    else:
        if not all([args.fq1, args.sample]):
            logger.fatal("Need at least fq1 and sample without config file")
            sys.exit(1)

        readunits = get_readunits_from_args(args.fq1, args.fq2)
        # all readunits go into this one sample specified on the command-line
        samples = dict()
        samples[args.sample] = list(readunits.keys())

    # turn arguments into cfg_dict that gets merged into pipeline config
    #
    cfg_dict = dict()
    cfg_dict['readunits'] = readunits
    cfg_dict['samples'] = samples
    cfg_dict['paired_end'] = any(ru.get('fq2') for ru in readunits.values())
    if cfg_dict['paired_end']:
        assert all(ru.get('fq2') for ru in readunits.values()), (
            "Can't handle mix of paired-end and single-end")
    cfg_dict['mapper'] = 'bowtie2'  # FIXME fixed for now
    # cfg_dict["bowtie2_custom_args"]
    # cfg_dict['platform']
    # cfg_dict['center']
    # cfg_dict["macs2_custom_args"]

    cfg_dict['fragment_length'] = args.fragment_length
    cfg_dict['shift'] = args.shift
    cfg_dict['extsize'] = args.extsize
    cfg_dict["peak_ext_bp"] = args.peak_ext_bp

    pipeline_handler = PipelineHandler(
        PIPELINE_NAME,
        PIPELINE_BASEDIR,
        args,
        cfg_dict,
        cluster_cfgfile=get_cluster_cfgfile(CFG_DIR))
    pipeline_handler.setup_env()
    pipeline_handler.submit(args.no_run)
Example 14
def main():
    """main function
    """

    parser = argparse.ArgumentParser(description=__doc__.format(
        PIPELINE_NAME=PIPELINE_NAME, PIPELINE_VERSION=get_pipeline_version()))

    # generic args
    parser.add_argument('-o', "--outdir", required=True,
                        help="Output directory (must not exist)")
    parser.add_argument('--name',
                        help="Give this analysis run a name (used in email and report)")
    parser.add_argument('--no-mail', action='store_true',
                        help="Don't send mail on completion")
    site = get_site()
    default = DEFAULT_SLAVE_Q.get(site, None)
    parser.add_argument('-w', '--slave-q', default=default,
                        help="Queue to use for slave jobs (default: {})".format(default))
    default = DEFAULT_MASTER_Q.get(site, None)
    parser.add_argument('-m', '--master-q', default=default,
                        help="Queue to use for master job (default: {})".format(default))
    parser.add_argument('-n', '--no-run', action='store_true')
    parser.add_argument('-v', '--verbose', action='count', default=0,
                        help="Increase verbosity")
    parser.add_argument('-q', '--quiet', action='count', default=0,
                        help="Decrease verbosity")
    cfg_group = parser.add_argument_group('Configuration files (advanced)')
    cfg_group.add_argument('--prev-cfg',
                           help="Previously used config. Also used to infer path to precalculated BAM files")
    for name, descr in [("references", "reference sequences"),
                        ("params", "parameters"),
                        ("modules", "modules")]:
        default = os.path.abspath(os.path.join(CFG_DIR, "{}.yaml".format(name)))
        cfg_group.add_argument('--{}-cfg'.format(name),
                               default=default,
                               help="Config-file (yaml) for {}. (default: {})".format(descr, default))

    # pipeline specific args
    #parser.add_argument('-1', "--fq1", nargs="+",
    #                    help="FastQ file/s (gzip only)."
    #                    " Multiple input files supported (auto-sorted)."
    #                    " Note: each file (or pair) gets a unique read-group id."
    #                    " Collides with --sample-cfg.")
    #parser.add_argument('-2', "--fq2", nargs="+",
    #                    help="FastQ file/s (if paired) (gzip only). See also --fq1")
    #parser.add_argument('-s', "--sample",
    #                    help="Sample name. Collides with --sample-cfg.")
    #parser.add_argument('-t', "--seqtype", required=True,
    #                    choices=['WGS', 'WES', 'targeted'],
    #                    help="Sequencing type")
    #parser.add_argument('-l', "--intervals",
    #                    help="Intervals file (e.g. bed file) listing regions of interest."
    #                    " Required for WES and targeted sequencing.")

    args = parser.parse_args()

    # Repeatable -v and -q for setting logging level.
    # See https://www.reddit.com/r/Python/comments/3nctlm/what_python_tools_should_i_be_using_on_every/
    # and https://gist.github.com/andreas-wilm/b6031a84a33e652680d4
    # script -vv -> DEBUG
    # script -v -> INFO
    # script -> WARNING
    # script -q -> ERROR
    # script -qq -> CRITICAL
    # script -qqq -> no logging at all
    logger.setLevel(logging.WARN + 10*args.quiet - 10*args.verbose)
    aux_logger.setLevel(logging.WARN + 10*args.quiet - 10*args.verbose)

    if os.path.exists(args.outdir):
        logger.fatal("Output directory %s already exists", args.outdir)
        sys.exit(1)


    # samples is a dictionary with sample names as key (mostly just
    # one) and readunit keys as value. readunits is a dict with
    # readunits (think: fastq pairs with attributes) as value
    #if args.sample_cfg:
    #    if any([args.fq1, args.fq2, args.sample]):
    #        logger.fatal("Config file overrides fastq and sample input arguments."
    #                     " Use one or the other")
    #        sys.exit(1)
    #    if not os.path.exists(args.sample_cfg):
    #        logger.fatal("Config file %s does not exist", args.sample_cfg)
    #        sys.exit(1)
    #    samples, readunits = get_samples_and_readunits_from_cfgfile(args.sample_cfg)
    #else:
    #    if not all([args.fq1, args.sample]):
    #        logger.fatal("Need at least fq1 and sample without config file")
    #        sys.exit(1)
    #
    #    readunits = get_readunits_from_args(args.fq1, args.fq2)
    #    # all readunits go into this one sample specified on the command-line
    #    samples = dict()
    #    samples[args.sample] = list(readunits.keys())
    #
    #if args.seqtype in ['WES', 'targeted']:
    #    if not args.intervals:
    #        logger.fatal("Analysis of exome and targeted sequence runs requires a bed file")
    #        sys.exit(1)
    #    else:
    #        if not os.path.exists(args.intervals):
    #            logger.fatal("Intervals file %s does not exist", args.sample_cfg)
    #            sys.exit(1)
    #        logger.warning("Compatilibity between interval file and"
    #                       " reference not checked")# FIXME

    with open(args.prev_cfg, 'r') as stream:
        try:
            prev_cfg = yaml.safe_load(stream)
        except yaml.YAMLError as exc:
            logger.fatal("Error loading %s", REST_CFG)
            raise
    #import pdb; pdb.set_trace()
    #sys.stderr.write("TMP DEBUG {}\n".format(prev_cfg))
    
    # turn arguments into user_data that gets merged into pipeline config
    #
    # generic data first
    user_data = dict()
    user_data['mail_on_completion'] = not args.no_mail
    #user_data['readunits'] = prev_cfg['readunits'] 
    user_data['readunits'] = dict()# None won't work
    #user_data['samples'] = samples
    user_data['samples'] = prev_cfg['samples']
    if args.name:
        user_data['analysis_name'] = args.name
    #user_data['seqtype'] = args.seqtype
    user_data['seqtype'] = 'WGS'# SG10K
    #user_data['intervals'] = args.intervals# always safe, might be used for WGS as well
    user_data['intervals'] = None#SG10K
    user_data['mark_dups'] = None# SG10K doesn't matter
    user_data['precalc_bam_dir'] = os.path.join(
        os.path.abspath(os.path.dirname(args.prev_cfg)), "out")
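    # e.g. --prev-cfg /path/to/prev-run/conf.yaml implies precalculated BAMs
    # under /path/to/prev-run/out (directory layout of the previous run assumed)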

    pipeline_handler = PipelineHandler(
        PIPELINE_NAME, PIPELINE_BASEDIR,
        args.outdir, user_data, site=site,
        master_q=args.master_q,
        slave_q=args.slave_q,
        params_cfgfile=args.params_cfg,
        modules_cfgfile=args.modules_cfg,
        refs_cfgfile=args.references_cfg,
        cluster_cfgfile=get_cluster_cfgfile(CFG_DIR))
    pipeline_handler.setup_env()
    pipeline_handler.submit(args.no_run)
Example 15
def main():
    """main function
    """

    default_parser = default_argparser(CFG_DIR, allow_missing_cfgfile=True)
    parser = argparse.ArgumentParser(description=__doc__.format(
        PIPELINE_NAME=PIPELINE_NAME, PIPELINE_VERSION=get_pipeline_version()),
                                     parents=[default_parser])

    parser._optionals.title = "Arguments"
    # pipeline specific args
    parser.add_argument(
        '-1',
        "--fq1",
        nargs="+",
        help="FastQ file/s (gzip only)."
        " Multiple input files supported (auto-sorted)."
        " Note: each file (or pair) gets a unique read-group id."
        " Collides with --sample-cfg.")
    parser.add_argument(
        '-2',
        "--fq2",
        nargs="+",
        help="FastQ file/s (if paired) (gzip only). See also --fq1")
    parser.add_argument('-s',
                        "--sample",
                        help="Sample name. Collides with --sample-cfg.")
    parser.add_argument(
        '-r', "--reffa", required=True,
        help="Reference genome")  # FIXME create local copy for indexing?
    args = parser.parse_args()

    # Repeatable -v and -q for setting logging level.
    # See https://www.reddit.com/r/Python/comments/3nctlm/what_python_tools_should_i_be_using_on_every/
    # and https://gist.github.com/andreas-wilm/b6031a84a33e652680d4
    # script -vv -> DEBUG
    # script -v -> INFO
    # script -> WARNING
    # script -q -> ERROR
    # script -qq -> CRITICAL
    # script -qqq -> no logging at all
    logger.setLevel(logging.WARN + 10 * args.quiet - 10 * args.verbose)
    aux_logger.setLevel(logging.WARN + 10 * args.quiet - 10 * args.verbose)

    if os.path.exists(args.outdir):
        logger.fatal("Output directory %s already exists", args.outdir)
        sys.exit(1)

    # samples is a dictionary with sample names as key (mostly just
    # one) and readunit keys as value. readunits is a dict with
    # readunits (think: fastq pairs with attributes) as value
    if args.sample_cfg:
        if any([args.fq1, args.fq2, args.sample]):
            logger.fatal("Config file overrides fastq and sample arguments."
                         " Use one or the other")
            sys.exit(1)
        if not os.path.exists(args.sample_cfg):
            logger.fatal("Config file %s does not exist", args.sample_cfg)
            sys.exit(1)
        samples, readunits = get_samples_and_readunits_from_cfgfile(
            args.sample_cfg)

    else:  # no sample config, so input is either fastq or existing bam
        samples = dict()

        if not args.sample:
            logger.fatal("Need sample name if not using config file")
            sys.exit(1)

        if args.fq1:
            readunits = get_readunits_from_args(args.fq1, args.fq2)
            # all readunits go into this one sample specified on the command-line
            samples[args.sample] = list(readunits.keys())

        else:
            logger.fatal(
                "Need at least one fastq files as argument if not using config file"
            )
            sys.exit(1)

    for ru in readunits.values():
        assert ru['fq2'], (
            "FastQ R2 missing, but assemblers assume paired-end reads")

    # turn arguments into cfg_dict (gets merged with other configs late)
    #
    cfg_dict = dict()
    cfg_dict['readunits'] = readunits
    cfg_dict['samples'] = samples
    assert os.path.exists(args.reffa)
    # FIXME only works because yaml missing and thus not overwritten
    cfg_dict['references'] = {'genome': os.path.abspath(args.reffa)}

    pipeline_handler = PipelineHandler(
        PIPELINE_NAME,
        PIPELINE_BASEDIR,
        args,
        cfg_dict,
        cluster_cfgfile=get_cluster_cfgfile(CFG_DIR))
    pipeline_handler.setup_env()
    pipeline_handler.submit(args.no_run)
Example 16
def main():
    """main function
    """

    default_parser = default_argparser(CFG_DIR)
    parser = argparse.ArgumentParser(description=__doc__.format(
        PIPELINE_NAME=PIPELINE_NAME, PIPELINE_VERSION=get_pipeline_version()),
                                     parents=[default_parser])

    parser._optionals.title = "Arguments"
    # pipeline specific args
    parser.add_argument(
        '-1',
        "--fq1",
        nargs="+",
        help="FastQ file/s (gzip only)."
        " Multiple input files supported (auto-sorted)."
        " Note: each file (or pair) gets a unique read-group id."
        " Collides with --sample-cfg.")
    parser.add_argument(
        '-2',
        "--fq2",
        nargs="+",
        help="FastQ file/s (if paired) (gzip only). See also --fq1")
    parser.add_argument('-s',
                        "--sample",
                        help="Sample name. Collides with --sample-cfg.")
    parser.add_argument('-C',
                        "--cuffdiff",
                        action='store_true',
                        dest="run_cuffdiff",
                        help="Also run cuffdiff")
    parser.add_argument('-S',
                        '--stranded',
                        action='store_true',
                        help="Stranded library prep (default is unstranded)")

    args = parser.parse_args()

    # Repeatable -v and -q for setting logging level.
    # See https://www.reddit.com/r/Python/comments/3nctlm/what_python_tools_should_i_be_using_on_every/
    # and https://gist.github.com/andreas-wilm/b6031a84a33e652680d4
    # script -vv -> DEBUG
    # script -v -> INFO
    # script -> WARNING
    # script -q -> ERROR
    # script -qq -> CRITICAL
    # script -qqq -> no logging at all
    logger.setLevel(logging.WARN + 10 * args.quiet - 10 * args.verbose)
    aux_logger.setLevel(logging.WARN + 10 * args.quiet - 10 * args.verbose)

    if os.path.exists(args.outdir):
        logger.fatal("Output directory %s already exists", args.outdir)
        sys.exit(1)

    # samples is a dictionary with sample names as key (mostly just
    # one) and readunit keys as value. readunits is a dict with
    # readunits (think: fastq pairs with attributes) as value
    if args.sample_cfg:
        if any([args.fq1, args.fq2, args.sample]):
            logger.fatal(
                "Config file overrides fastq and sample input arguments."
                " Use one or the other")
            sys.exit(1)
        if not os.path.exists(args.sample_cfg):
            logger.fatal("Config file %s does not exist", args.sample_cfg)
            sys.exit(1)
        samples, readunits = get_samples_and_readunits_from_cfgfile(
            args.sample_cfg)
    else:
        if not all([args.fq1, args.sample]):
            logger.fatal("Need at least fq1 and sample without config file")
            sys.exit(1)

        readunits = get_readunits_from_args(args.fq1, args.fq2)
        # all readunits go into this one sample specified on the command-line
        samples = dict()
        samples[args.sample] = list(readunits.keys())

    # FIXME checks on reffa index (currently not exposed via args)

    # turn arguments into cfg_dict that gets merged into pipeline config
    #
    cfg_dict = dict()
    cfg_dict['readunits'] = readunits
    cfg_dict['samples'] = samples

    cfg_dict['stranded'] = args.stranded
    cfg_dict['run_cuffdiff'] = args.run_cuffdiff
    cfg_dict['paired_end'] = any(ru.get('fq2') for ru in readunits.values())
    if cfg_dict['paired_end']:
        assert all(ru.get('fq2') for ru in readunits.values()), (
            "Can't handle mix of paired-end and single-end")

    pipeline_handler = PipelineHandler(
        PIPELINE_NAME,
        PIPELINE_BASEDIR,
        args,
        cfg_dict,
        cluster_cfgfile=get_cluster_cfgfile(CFG_DIR))
    pipeline_handler.setup_env()
    pipeline_handler.submit(args.no_run)
def main():
    """main function
    """

    default_parser = default_argparser(CFG_DIR, with_readunits=True)
    parser = configargparse.ArgumentParser(description=__doc__.format(
        PIPELINE_NAME=PIPELINE_NAME, PIPELINE_VERSION=get_pipeline_version()),
                                           parents=[default_parser])

    parser._optionals.title = "Arguments"
    # pipeline specific args
    default = ['kraken', 'metaphlan2']
    parser.add_argument("-p",
                        "--profilers",
                        nargs='+',
                        default=default,
                        help="Profilers to run (default = {}".format(
                            ", ".join(default)))

    args = parser.parse_args()

    # Repeatable -v and -q for setting logging level.
    # See https://www.reddit.com/r/Python/comments/3nctlm/what_python_tools_should_i_be_using_on_every/
    # and https://gist.github.com/andreas-wilm/b6031a84a33e652680d4
    # script -vv -> DEBUG
    # script -v -> INFO
    # script -> WARNING
    # script -q -> ERROR
    # script -qq -> CRITICAL
    # script -qqq -> no logging at all
    logger.setLevel(logging.WARN + 10 * args.quiet - 10 * args.verbose)
    aux_logger.setLevel(logging.WARN + 10 * args.quiet - 10 * args.verbose)

    if os.path.exists(args.outdir):
        logger.fatal("Output directory %s already exists", args.outdir)
        sys.exit(1)

    # samples is a dictionary with sample names as key (mostly just
    # one) and readunit keys as value. readunits is a dict with
    # readunits (think: fastq pairs with attributes) as value
    if args.sample_cfg:
        if any([args.fq1, args.fq2, args.sample]):
            logger.fatal("Config file overrides fastq and sample arguments."
                         " Use one or the other")
            sys.exit(1)
        if not os.path.exists(args.sample_cfg):
            logger.fatal("Config file %s does not exist", args.sample_cfg)
            sys.exit(1)
        samples, readunits = get_samples_and_readunits_from_cfgfile(
            args.sample_cfg)

    else:  # no sample config, so input is either fastq or existing bam
        samples = dict()

        if not args.sample:
            logger.fatal("Need sample name if not using config file")
            sys.exit(1)

        if args.fq1:
            readunits = get_readunits_from_args(args.fq1, args.fq2)
            # all readunits go into this one sample specified on the command-line
            samples[args.sample] = list(readunits.keys())

        else:
            logger.fatal(
                "Need at least one fastq files as argument if not using config file"
            )
            sys.exit(1)

    for ru in readunits.values():
        assert ru['fq2'], (
            "FastQ R2 missing, but pipelines requires paired-end reads")

    # turn arguments into cfg_dict (gets merged with other configs late)
    #
    cfg_dict = dict()
    cfg_dict['readunits'] = readunits
    cfg_dict['samples'] = samples
    cfg_dict['profilers'] = args.profilers

    pipeline_handler = PipelineHandler(
        PIPELINE_NAME,
        PIPELINE_BASEDIR,
        args,
        cfg_dict,
        cluster_cfgfile=get_cluster_cfgfile(CFG_DIR))
    pipeline_handler.setup_env()
    pipeline_handler.submit(args.no_run)
Example 18
def main():
    """main function
    """

    default_parser = default_argparser(CFG_DIR, with_readunits=True)
    parser = configargparse.ArgumentParser(description=__doc__.format(
        PIPELINE_NAME=PIPELINE_NAME, PIPELINE_VERSION=get_pipeline_version()),
                                           parents=[default_parser])

    parser._optionals.title = "Arguments"
    # pipeline specific args
    parser.add_argument('-t',
                        "--seqtype",
                        required=True,
                        choices=['WGS', 'WES', 'targeted'],
                        help="Sequencing type")
    parser.add_argument('-l',
                        "--bed",
                        help="Bed file listing regions of interest."
                        " Required for WES and targeted sequencing.")
    parser.add_argument('-D',
                        '--dont-mark-dups',
                        action='store_true',
                        help="Don't mark duplicate reads")
    # raw bam not possible because the pipeline splits on the fly into chromosomes
    parser.add_argument(
        '--proc-bam',
        help="Advanced: Injects processed BAM (overwrites fq options)."
        " WARNING: reference and pre-processing need to match pipeline requirements"
    )
    parser.add_argument('--bam-only',
                        action='store_true',
                        help="Don't call variants, just process BAM file")

    args = parser.parse_args()

    # Repeatable -v and -q for setting logging level.
    # See https://www.reddit.com/r/Python/comments/3nctlm/what_python_tools_should_i_be_using_on_every/
    # and https://gist.github.com/andreas-wilm/b6031a84a33e652680d4
    # script -vv -> DEBUG
    # script -v -> INFO
    # script -> WARNING
    # script -q -> ERROR
    # script -qq -> CRITICAL
    # script -qqq -> no logging at all
    logger.setLevel(logging.WARN + 10 * args.quiet - 10 * args.verbose)
    aux_logger.setLevel(logging.WARN + 10 * args.quiet - 10 * args.verbose)

    if os.path.exists(args.outdir):
        logger.fatal("Output directory %s already exists", args.outdir)
        sys.exit(1)

    # samples is a dictionary with sample names as key (mostly just
    # one) and readunit keys as value. readunits is a dict with
    # readunits (think: fastq pairs with attributes) as value
    if args.sample_cfg:
        if any([args.fq1, args.fq2, args.sample, args.proc_bam]):
            logger.fatal("Config file overrides fastq and sample arguments."
                         " Use one or the other")
            sys.exit(1)
        if not os.path.exists(args.sample_cfg):
            logger.fatal("Config file %s does not exist", args.sample_cfg)
            sys.exit(1)
        samples, readunits = get_samples_and_readunits_from_cfgfile(
            args.sample_cfg)

    else:  # no sample config, so input is either fastq or existing bam
        samples = dict()

        if not args.sample:
            logger.fatal("Need sample name if not using config file")
            sys.exit(1)

        if args.proc_bam:
            assert not args.fq1, ("BAM injection overwrites fastq arguments")

            assert os.path.exists(args.proc_bam)

            readunits = dict()
            samples[args.sample] = []

        elif args.fq1:

            readunits = get_readunits_from_args(args.fq1, args.fq2)
            # all readunits go into this one sample specified on the command-line
            samples[args.sample] = list(readunits.keys())

        else:
            logger.fatal(
                "Need at least one fastq files as argument if not using config file"
            )
            sys.exit(1)

    if args.seqtype in ['WES', 'targeted']:
        if not args.bed:
            logger.fatal(
                "Analysis of exome and targeted sequence runs requires a bed file"
            )
            sys.exit(1)
        else:
            if not os.path.exists(args.bed):
                logger.fatal("Bed file %s does not exist", args.sample_cfg)
                sys.exit(1)

    # turn arguments into cfg_dict (gets merged with other configs late)
    #
    cfg_dict = dict()
    cfg_dict['readunits'] = readunits
    cfg_dict['samples'] = samples
    cfg_dict['seqtype'] = args.seqtype
    cfg_dict['intervals'] = os.path.abspath(
        args.bed
    ) if args.bed else None  # always safe, might be used for WGS as well
    cfg_dict['mark_dups'] = not args.dont_mark_dups
    cfg_dict['bam_only'] = args.bam_only

    pipeline_handler = PipelineHandler(
        PIPELINE_NAME,
        PIPELINE_BASEDIR,
        args,
        cfg_dict,
        cluster_cfgfile=get_cluster_cfgfile(CFG_DIR))
    pipeline_handler.setup_env()

    # Inject existing BAM by symlinking (everything upstream is temporary anyway)
    # WARNING: filename has to match definition in Snakefile!
    if args.proc_bam:
        target = os.path.join(args.outdir, "out", args.sample,
                              "{}.bwamem.lofreq".format(args.sample))
        if cfg_dict['mark_dups']:
            target += ".dedup"
        target += ".lacer.bam"
        os.makedirs(os.path.dirname(target))
        os.symlink(os.path.abspath(args.proc_bam), target)

    pipeline_handler.submit(args.no_run)
def main():
    """main function
    """

    default_parser = default_argparser(CFG_DIR, with_readunits=True)
    parser = configargparse.ArgumentParser(description=__doc__.format(
        PIPELINE_NAME=PIPELINE_NAME, PIPELINE_VERSION=get_pipeline_version()),
                                           parents=[default_parser])

    parser._optionals.title = "Arguments"
    # pipeline specific args
    #/
    args = parser.parse_args()

    # Repeatable -v and -q for setting logging level.
    # See https://www.reddit.com/r/Python/comments/3nctlm/what_python_tools_should_i_be_using_on_every/
    # and https://gist.github.com/andreas-wilm/b6031a84a33e652680d4
    # script -vv -> DEBUG
    # script -v -> INFO
    # script -> WARNING
    # script -q -> ERROR
    # script -qq -> CRITICAL
    # script -qqq -> no logging at all
    logger.setLevel(logging.WARN + 10 * args.quiet - 10 * args.verbose)
    aux_logger.setLevel(logging.WARN + 10 * args.quiet - 10 * args.verbose)

    if os.path.exists(args.outdir):
        logger.fatal("Output directory %s already exists", args.outdir)
        sys.exit(1)

    # samples is a dictionary with sample names as key (mostly just
    # one) and readunit keys as value. readunits is a dict with
    # readunits (think: fastq pairs with attributes) as value
    if args.sample_cfg:
        if any([args.fq1, args.fq2, args.sample]):
            logger.fatal(
                "Config file overrides fastq and sample input arguments."
                " Use one or the other")
            sys.exit(1)
        if not os.path.exists(args.sample_cfg):
            logger.fatal("Config file %s does not exist", args.sample_cfg)
            sys.exit(1)
        samples, readunits = get_samples_and_readunits_from_cfgfile(
            args.sample_cfg)
    else:
        if not all([args.fq1, args.sample]):
            logger.fatal("Need at least fq1 and sample without config file")
            sys.exit(1)

        readunits = get_readunits_from_args(args.fq1, args.fq2)
        # all readunits go into this one sample specified on the command-line
        samples = dict()
        samples[args.sample] = list(readunits.keys())

    # FIXME implement checks on reffa index (currently not exposed via args)

    # turn arguments into cfg_dict that gets merged into pipeline config
    #
    cfg_dict = dict()
    cfg_dict['readunits'] = readunits
    cfg_dict['samples'] = samples

    pipeline_handler = PipelineHandler(
        PIPELINE_NAME,
        PIPELINE_BASEDIR,
        args,
        cfg_dict,
        cluster_cfgfile=get_cluster_cfgfile(CFG_DIR))
    pipeline_handler.setup_env()
    pipeline_handler.submit(args.no_run)
Example 20
def main():
    """main function
    """

    parser = argparse.ArgumentParser(
        description=__doc__.format(PIPELINE_NAME=PIPELINE_NAME, PIPELINE_VERSION=get_pipeline_version())
    )

    # generic args
    parser.add_argument("-o", "--outdir", required=True, help="Output directory (may not exist)")
    parser.add_argument("--name", help="Give this analysis run a name (used in email and report)")
    parser.add_argument("--no-mail", action="store_true", help="Don't send mail on completion")
    # site = get_site()
    default = get_default_queue("slave")
    parser.add_argument(
        "-w", "--slave-q", default=default, help="Queue to use for slave jobs (default: {})".format(default)
    )
    default = get_default_queue("master")
    parser.add_argument(
        "-m", "--master-q", default=default, help="Queue to use for master job (default: {})".format(default)
    )
    parser.add_argument("-n", "--no-run", action="store_true")
    parser.add_argument("-v", "--verbose", action="count", default=0, help="Increase verbosity")
    parser.add_argument("-q", "--quiet", action="count", default=0, help="Decrease verbosity")
    cfg_group = parser.add_argument_group("Configuration files (advanced)")
    cfg_group.add_argument(
        "--sample-cfg", help="Config-file (YAML) listing samples and readunits." " Collides with -1, -2 and -s"
    )
    for name, descr in [("references", "reference sequences"), ("params", "parameters"), ("modules", "modules")]:
        default = os.path.abspath(os.path.join(CFG_DIR, "{}.yaml".format(name)))
        cfg_group.add_argument(
            "--{}-cfg".format(name),
            default=default,
            help="Config-file (yaml) for {}. (default: {})".format(descr, default),
        )

    # pipeline specific args
    parser.add_argument(
        "--normal-fq1",
        nargs="+",
        help="Normal FastQ file/s (gzip only)."
        " Multiple input files supported (auto-sorted)."
        " Note: each file (or pair) gets a unique read-group id."
        " Collides with --sample-cfg.",
    )
    parser.add_argument("--normal-fq2", nargs="+", help="Normal FastQ file/s (if paired) (gzip only). See also --fq1")
    parser.add_argument(
        "--tumor-fq1",
        nargs="+",
        help="Tumor FastQ file/s (gzip only)."
        " Multiple input files supported (auto-sorted)."
        " Note: each file (or pair) gets a unique read-group id."
        " Collides with --sample-cfg.",
    )
    parser.add_argument("--tumor-fq2", nargs="+", help="Tumor FastQ file/s (if paired) (gzip only). See also --fq1")
    parser.add_argument("-t", "--seqtype", required=True, choices=["WGS", "WES", "targeted"], help="Sequencing type")
    parser.add_argument(
        "-l",
        "--intervals",
        help="Intervals file (e.g. bed file) listing regions of interest." " Required for WES and targeted sequencing.",
    )
    parser.add_argument("-D", "--dont-mark-dups", action="store_true", help="Don't mark duplicate reads")
    parser.add_argument(
        "--normal-bam",
        help="Advanced: Injects normal BAM (overwrites normal-fq options)."
        " WARNING: reference and postprocessing need to match pipeline requirements",
    )
    parser.add_argument(
        "--tumor-bam",
        help="Advanced: Injects tumor BAM (overwrites tumor-fq options)."
        " WARNING: reference and postprocessing need to match pipeline requirements",
    )

    args = parser.parse_args()

    # Repeatable -v and -q for setting logging level.
    # See https://www.reddit.com/r/Python/comments/3nctlm/what_python_tools_should_i_be_using_on_every/
    # and https://gist.github.com/andreas-wilm/b6031a84a33e652680d4
    # script -vv -> DEBUG
    # script -v -> INFO
    # script -> WARNING
    # script -q -> ERROR
    # script -qq -> CRITICAL
    # script -qqq -> no logging at all
    logger.setLevel(logging.WARN + 10 * args.quiet - 10 * args.verbose)
    aux_logger.setLevel(logging.WARN + 10 * args.quiet - 10 * args.verbose)

    if os.path.exists(args.outdir):
        logger.fatal("Output directory %s already exists", args.outdir)
        sys.exit(1)

    # samples is a dictionary with sample names as key (mostly just
    # one) and readunit keys as value. readunits is a dict with
    # readunits (think: fastq pairs with attributes) as value
    if args.sample_cfg:
        if any([args.normal_fq1, args.normal_fq2, args.tumor_fq1, args.tumor_fq2, args.normal_bam, args.tumor_bam]):
            logger.fatal("Config file overrides fastq and sample input arguments." " Use one or the other")
            sys.exit(1)
        if not os.path.exists(args.sample_cfg):
            logger.fatal("Config file %s does not exist", args.sample_cfg)
            sys.exit(1)
        samples, readunits = get_samples_and_readunits_from_cfgfile(args.sample_cfg)
    else:
        samples = dict()

        if args.normal_bam:
            normal_readunits = dict()
            samples["normal"] = []
            assert os.path.exists(args.normal_bam)
        else:
            if not all([args.normal_fq1, args.tumor_fq1]):
                logger.fatal("Need at least fq1 and sample without config file")
                sys.exit(1)
            normal_readunits = get_readunits_from_args(args.normal_fq1, args.normal_fq2)
            samples["normal"] = list(normal_readunits.keys())

        if args.tumor_bam:
            tumor_readunits = dict()
            samples["tumor"] = []
            assert os.path.exists(args.tumor_bam)
        else:
            tumor_readunits = get_readunits_from_args(args.tumor_fq1, args.tumor_fq2)
            samples["tumor"] = list(tumor_readunits.keys())

        readunits = dict(normal_readunits)
        readunits.update(tumor_readunits)

    assert sorted(samples) == sorted(["normal", "tumor"])

    # FIXME how to
    # if not os.path.exists(reffa):
    #    logger.fatal("Reference '%s' doesn't exist", reffa)
    #    sys.exit(1)
    #
    # for p in ['bwa', 'samtools']:
    #    if not ref_is_indexed(reffa, p):
    #        logger.fatal("Reference '%s' doesn't appear to be indexed with %s", reffa, p)
    #        sys.exit(1)

    if args.seqtype in ["WES", "targeted"]:
        if not args.intervals:
            logger.fatal("Analysis of exome and targeted sequence runs requires a bed file")
            sys.exit(1)
        else:
            if not os.path.exists(args.intervals):
                logger.fatal("Intervals file %s does not exist", args.sample_cfg)
                sys.exit(1)
            logger.warning("Compatilibity between interval file and" " reference not checked")  # FIXME

    # turn arguments into user_data that gets merged into pipeline config
    #
    # generic data first
    user_data = dict()
    user_data["mail_on_completion"] = not args.no_mail
    user_data["readunits"] = readunits
    user_data["samples"] = samples
    if args.name:
        user_data["analysis_name"] = args.name

    user_data["seqtype"] = args.seqtype
    user_data["intervals"] = args.intervals
    user_data["mark_dups"] = not args.dont_mark_dups

    pipeline_handler = PipelineHandler(
        PIPELINE_NAME,
        PIPELINE_BASEDIR,
        args.outdir,
        user_data,
        master_q=args.master_q,
        slave_q=args.slave_q,
        params_cfgfile=args.params_cfg,
        modules_cfgfile=args.modules_cfg,
        refs_cfgfile=args.references_cfg,
        cluster_cfgfile=get_cluster_cfgfile(CFG_DIR),
    )

    pipeline_handler.setup_env()

    # inject existing BAM by symlinking (everything upstream is temporary anyway)
    for sample, bam in [("normal", args.normal_bam), ("tumor", args.tumor_bam)]:
        if bam:
            # target as defined in Snakefile!
            target = os.path.join(args.outdir, "out", sample, "{}.bwamem.lofreq.dedup.lacer.bam".format(sample))
            os.makedirs(os.path.dirname(target))
            os.symlink(os.path.abspath(bam), target)

    pipeline_handler.submit(args.no_run)
Example 21
def main():
    """main function
    """

    # FIXME ugly and code duplication in bcl2fastq_dbupdate.py
    mongo_status_script = os.path.abspath(os.path.join(
        os.path.dirname(sys.argv[0]), "mongo_status.py"))
    assert os.path.exists(mongo_status_script)

    parser = argparse.ArgumentParser(description=__doc__.format(
        PIPELINE_NAME=PIPELINE_NAME, PIPELINE_VERSION=get_pipeline_version()))
    parser.add_argument('-r', "--runid",
                        help="Run ID plus flowcell ID (clashes with -d)")
    parser.add_argument('-d', "--rundir",
                        help="BCL input directory (clashes with -r)")
    parser.add_argument('-o', "--outdir",
                        help="Output directory (must not exist; required if called by user)")
    parser.add_argument('-t', "--testing", action='store_true',
                        help="Use MongoDB test server")
    parser.add_argument('--no-archive', action='store_true',
                        help="Don't archieve this analysis")
    parser.add_argument('--no-mail', action='store_true',
                        help="Don't send mail on completion")
    site = get_site()
    default = DEFAULT_SLAVE_Q.get(site, None)
    parser.add_argument('-w', '--slave-q', default=default,
                        help="Queue to use for slave jobs (default: {})".format(default))
    default = DEFAULT_MASTER_Q.get(site, None)
    parser.add_argument('-m', '--master-q', default=default,
                        help="Queue to use for master job (default: {})".format(default))
    parser.add_argument('-l', '--lanes', type=int, nargs="*",
                        help="Limit run to given lane/s (multiples separated by space")
    parser.add_argument('-i', '--mismatches', type=int,
                        help="Max. number of allowed barcode mismatches (0>=x<=2)"
                        " setting a value here overrides the default settings read from ELM)")
    parser.add_argument('-n', '--no-run', action='store_true')
    parser.add_argument('-v', '--verbose', action='count', default=0,
                        help="Increase verbosity")
    parser.add_argument('-q', '--quiet', action='count', default=0,
                        help="Decrease verbosity")


    args = parser.parse_args()

    # Repeatable -v and -q for setting logging level.
    # See https://www.reddit.com/r/Python/comments/3nctlm/what_python_tools_should_i_be_using_on_every/
    # and https://gist.github.com/andreas-wilm/b6031a84a33e652680d4
    # script -vv -> DEBUG
    # script -v -> INFO
    # script -> WARNING
    # script -q -> ERROR
    # script -qq -> CRITICAL
    # script -qqq -> no logging at all
    logger.setLevel(logging.WARN + 10*args.quiet - 10*args.verbose)
    aux_logger.setLevel(logging.WARN + 10*args.quiet - 10*args.verbose)

    if args.mismatches is not None:
        if args.mismatches > 2 or args.mismatches < 0:
            logger.fatal("Number of mismatches must be between 0-2")
            sys.exit(1)

    lane_info = ''
    lane_nos = []
    if args.lanes:
        lane_info = '--tiles '
        for lane in args.lanes:
            if lane > 8 or lane < 1:
                logger.fatal("Lane number must be between 1-8")
                sys.exit(1)
            else:
                lane_info += 's_{}'.format(lane)+','
        lane_info = lane_info.rstrip()
        lane_info = lane_info[:-1]
        lane_nos = list(args.lanes)
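        # e.g. --lanes 1 3 yields lane_info '--tiles s_1,s_3' and lane_nos [1, 3]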


    if args.runid and args.rundir:
        logger.fatal("Cannot use run-id and input directory arguments simultaneously")
        sys.exit(1)
    elif args.runid:
        rundir = run_folder_for_run_id(args.runid)
    elif args.rundir:
        rundir = os.path.abspath(args.rundir)
    else:
        logger.fatal("Need either run-id or input directory")
        sys.exit(1)
    if not os.path.exists(rundir):
        logger.fatal("Expected run directory {} does not exist".format(rundir))
    logger.info("Rundir is {}".format(rundir))

    if not args.outdir:
        outdir = get_bcl2fastq_outdir(args.runid)
    else:
        outdir = args.outdir
    if os.path.exists(outdir):
        logger.fatal("Output directory %s already exists", outdir)
        sys.exit(1)
    # create now so that generate_bcl2fastq_cfg.py can run
    os.makedirs(outdir)
    


    # catch cases where rundir was user provided and looks weird
    try:
        _, runid, flowcellid = get_machine_run_flowcell_id(rundir)
        run_num = runid + "_" + flowcellid
    except Exception:
        run_num = "UNKNOWN-" + rundir.split("/")[-1]


    # call generate_bcl2fastq_cfg
    #
    # FIXME ugly assumes same directory (just like import above). better to import and run main()?
    generate_bcl2fastq = os.path.join(
        os.path.dirname(sys.argv[0]), "generate_bcl2fastq_cfg.py")
    assert os.path.exists(generate_bcl2fastq)
    cmd = [generate_bcl2fastq, '-r', rundir, '-o', outdir]
    if args.testing:
        cmd.append("-t")
    logger.debug("Executing {}".format(' ' .join(cmd)))
    try:
        res = subprocess.check_output(cmd, stderr=subprocess.STDOUT)
    except subprocess.CalledProcessError as e:
        logger.fatal("The following command failed with return code {}: {}".format(
            e.returncode, ' '.join(cmd)))
        logger.fatal("Output: {}".format(e.output.decode()))
        logger.fatal("Exiting")
        sys.exit(1)
    # generate_bcl2fastq is normally quiet. if there's output, make caller aware of it
    # use sys instead of logger to avoid double logging
    if res:
        sys.stderr.write(res.decode())

    # just created files
    muxinfo_cfg = os.path.join(outdir, MUXINFO_CFG)
    samplesheet_csv = os.path.join(outdir, SAMPLESHEET_CSV)
    usebases_cfg = os.path.join(outdir, USEBASES_CFG)

    # NOTE: signal for failed runs is exit 0 from generate_bcl2fastq and missing output files
    #
    if any([not os.path.exists(x) for x in [muxinfo_cfg, samplesheet_csv, usebases_cfg]]):
        # one missing means all should be missing
        assert all([not os.path.exists(x) for x in [muxinfo_cfg, samplesheet_csv, usebases_cfg]])
        seqrunfailed(mongo_status_script, run_num, outdir, args.testing)
        sys.exit(0)


    # turn arguments into user_data that gets merged into pipeline config
    user_data = {'rundir': rundir,
                 'lanes_arg': lane_info,
                 'samplesheet_csv': samplesheet_csv,
                 'no_archive': args.no_archive,
                 'mail_on_completion': not args.no_mail,
                 'run_num': run_num}


    usebases_arg = ''
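    # usebases_cfg is expected to hold a single 'usebases' key with a list of
    # use-bases-mask strings, e.g. (mask values illustrative only):
    #   usebases:
    #     - "Y151,I8,Y151"
    # each entry becomes one '--use-bases-mask' argument below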
    with open(usebases_cfg, 'r') as stream:
        try:
            d = yaml.safe_load(stream)
            assert 'usebases' in d
            assert len(d) == 1# make sure usebases is only key
            for ub in d['usebases']:
                #print (ub)
                usebases_arg += '--use-bases-mask {} '.format(ub)
            #user_data = {'usebases_arg' : usebases_arg}
        except yaml.YAMLError as exc:
            logger.fatal(exc)
            raise
    user_data['usebases_arg'] = usebases_arg
    os.unlink(usebases_cfg)


    mux_units = get_mux_units_from_cfgfile(muxinfo_cfg, lane_nos)
    if args.mismatches is not None:
        mux_units = [mu._replace(barcode_mismatches=args.mismatches)
                     for mu in mux_units]
    os.unlink(muxinfo_cfg)


    user_data['units'] = dict()
    for mu in mux_units:
        # special case: mux split across multiple lanes. make lanes a list
        # and add in extra lanes if needed.
        k = mu.mux_dir
        mu_dict = dict(mu._asdict())
        user_data['units'][k] = mu_dict

    # create mongodb update command, used later, after queueing
    mongo_update_cmd = "{} -r {} -s STARTED".format(mongo_status_script, user_data['run_num'])
    mongo_update_cmd += " -a $ANALYSIS_ID -o {}".format(outdir)# set in run.sh
    if args.testing:
        mongo_update_cmd += " -t"

    # NOTE: bcl2fastq has a special run template, so we need to
    # interfere with the default pipeline_handler. Plenty of
    # opportunity to shoot yourself in the foot

    pipeline_handler = PipelineHandler(
        PIPELINE_NAME, PIPELINE_BASEDIR, outdir, user_data,
        site=site, master_q=args.master_q, slave_q=args.slave_q)
    # use local run template
    pipeline_handler.run_template = os.path.join(
        PIPELINE_BASEDIR, "run.template.{}.sh".format(pipeline_handler.site))
    assert os.path.exists(pipeline_handler.run_template)
    pipeline_handler.setup_env()
    # final mongo update line in run_out
    tmp_run_out = pipeline_handler.run_out + ".tmp"
    with open(pipeline_handler.run_out) as fh_in, \
         open(tmp_run_out, 'w') as fh_out:
        for line in fh_in:
            line = line.replace("@MONGO_UPDATE_CMD@", mongo_update_cmd)
            fh_out.write(line)
    shutil.move(tmp_run_out, pipeline_handler.run_out)
    pipeline_handler.submit(args.no_run)
Example 22
def main():
    """main function
    """

    default_parser = default_argparser(CFG_DIR)
    parser = argparse.ArgumentParser(description=__doc__.format(
        PIPELINE_NAME=PIPELINE_NAME, PIPELINE_VERSION=get_pipeline_version()),
                                     parents=[default_parser])

    parser._optionals.title = "Arguments"
    # pipeline specific args
    parser.add_argument('-1', "--fq1", nargs="+",
                        help="FastQ file/s (gzip only)."
                        " Multiple input files supported (auto-sorted)."
                        " Note: each file (or pair) gets a unique read-group id."
                        " Collides with --sample-cfg.")
    parser.add_argument('-2', "--fq2", nargs="+",
                        help="FastQ file/s (if paired) (gzip only). See also --fq1")
    parser.add_argument('-s', "--sample",
                        help="Sample name. Collides with --sample-cfg.")
    parser.add_argument('-D', '--dont-mark-dups', action='store_true',
                        help="Don't mark duplicate reads")

    args = parser.parse_args()

    # Repeatable -v and -q for setting logging level.
    # See https://www.reddit.com/r/Python/comments/3nctlm/what_python_tools_should_i_be_using_on_every/
    # and https://gist.github.com/andreas-wilm/b6031a84a33e652680d4
    # script -vv -> DEBUG
    # script -v -> INFO
    # script -> WARNING
    # script -q -> ERROR
    # script -qq -> CRITICAL
    # script -qqq -> no logging at all
    logger.setLevel(logging.WARN + 10*args.quiet - 10*args.verbose)
    aux_logger.setLevel(logging.WARN + 10*args.quiet - 10*args.verbose)

    if os.path.exists(args.outdir):
        logger.fatal("Output directory %s already exists", args.outdir)
        sys.exit(1)

    # samples is a dictionary with sample names as key (mostly just
    # one) and readunit keys as value. readunits is a dict with
    # readunits (think: fastq pairs with attributes) as value
    if args.sample_cfg:
        if any([args.fq1, args.fq2, args.sample]):
            logger.fatal("Config file overrides fastq and sample input arguments."
                         " Use one or the other")
            sys.exit(1)
        if not os.path.exists(args.sample_cfg):
            logger.fatal("Config file %s does not exist", args.sample_cfg)
            sys.exit(1)
        samples, readunits = get_samples_and_readunits_from_cfgfile(args.sample_cfg)
    else:
        if not all([args.fq1, args.sample]):
            logger.fatal("Need at least fq1 and sample without config file")
            sys.exit(1)

        readunits = get_readunits_from_args(args.fq1, args.fq2)
        # all readunits go into this one sample specified on the command-line
        samples = dict()
        samples[args.sample] = list(readunits.keys())

    # turn arguments into cfg_dict that gets merged into pipeline config
    #
    cfg_dict = dict()
    cfg_dict['readunits'] = readunits
    cfg_dict['samples'] = samples

    cfg_dict['mark_dups'] = not args.dont_mark_dups

    # create mongodb update command, used later, after submission
    #mongo_update_cmd = "true"{} -r {} -s STARTED".format(mongo_status_script, cfg_dict['run_num'])
    #mongo_update_cmd += " -a $ANALYSIS_ID -o {}".format(outdir)# set in run.sh
    #if args.testing:
    #    mongo_update_cmd += " -t"

    pipeline_handler = PipelineHandler(
        PIPELINE_NAME, PIPELINE_BASEDIR,
        args, cfg_dict,
        cluster_cfgfile=get_cluster_cfgfile(CFG_DIR))

    pipeline_handler.setup_env()
    pipeline_handler.submit(args.no_run)
Example 23
def main():
    """main function
    """

    default_parser = default_argparser(CFG_DIR, with_readunits=True)
    parser = configargparse.ArgumentParser(description=__doc__.format(
        PIPELINE_NAME=PIPELINE_NAME, PIPELINE_VERSION=get_pipeline_version()),
                                           parents=[default_parser])

    parser._optionals.title = "Arguments"
    # pipeline specific args
    #/
    parser.add_argument('-c',
                        "--cell-barcodes",
                        required=True,
                        help="File listing cell barcodes")
    d = 200
    parser.add_argument(
        "--frag-len",
        default=d,
        type=int,
        help="Estimated fragment length (default={})".format(d))
    d = 20.0
    parser.add_argument(
        '--frag-len-sd',
        default=d,
        type=float,
        help="Estimated fragment length standard deviation (default={})".
        format(d))
    parser.add_argument(
        '--dedup',
        action="store_true",
        help="Run UMI-based deduplication (slow for large data-sets!)")
    args = parser.parse_args()

    # Repeatable -v and -q for setting logging level.
    # See https://www.reddit.com/r/Python/comments/3nctlm/what_python_tools_should_i_be_using_on_every/
    # and https://gist.github.com/andreas-wilm/b6031a84a33e652680d4
    # script -vv -> DEBUG
    # script -v -> INFO
    # script -> WARNING
    # script -q -> ERROR
    # script -qq -> CRITICAL
    # script -qqq -> no logging at all
    logger.setLevel(logging.WARN + 10 * args.quiet - 10 * args.verbose)
    aux_logger.setLevel(logging.WARN + 10 * args.quiet - 10 * args.verbose)

    if os.path.exists(args.outdir):
        logger.fatal("Output directory %s already exists", args.outdir)
        sys.exit(1)

    # samples is a dictionary with sample names as key (mostly just
    # one) and readunit keys as value. readunits is a dict with
    # readunits (think: fastq pairs with attributes) as value
    if args.sample_cfg:
        if any([args.fq1, args.fq2, args.sample]):
            logger.fatal(
                "Config file overrides fastq and sample input arguments."
                " Use one or the other")
            sys.exit(1)
        if not os.path.exists(args.sample_cfg):
            logger.fatal("Config file '%s' does not exist", args.sample_cfg)
            sys.exit(1)
        samples, readunits = get_samples_and_readunits_from_cfgfile(
            args.sample_cfg)
    else:
        if not all([args.fq1, args.sample]):
            logger.fatal("Need at least fq1 and sample without config file")
            sys.exit(1)

        readunits = get_readunits_from_args(args.fq1, args.fq2)
        # all readunits go into this one sample specified on the command-line
        samples = dict()
        samples[args.sample] = list(readunits.keys())

    for ru in readunits.values():
        assert ru['fq2'], (
            "FastQ R2 missing, but pipeline requires paired-end reads")

    # turn arguments into cfg_dict that gets merged into pipeline config
    #
    cfg_dict = dict()
    cfg_dict['readunits'] = readunits
    cfg_dict['samples'] = samples

    if not os.path.exists(args.cell_barcodes):
        logger.fatal("Cellular barcodes file '%s' does not exist",
                     args.cell_barcodes)
        sys.exit(1)

    cfg_dict['cell_barcodes'] = os.path.abspath(args.cell_barcodes)
    cfg_dict['frag_len'] = args.frag_len
    cfg_dict['frag_len_sd'] = args.frag_len_sd
    cfg_dict['no_dedup'] = not args.dedup
    cfg_dict['scrnapipe_transform'] = os.path.abspath(
        os.path.join(PIPELINE_BASEDIR, 'aux/transform.json'))
    cfg_dict['scrna_conf_template'] = os.path.abspath(
        os.path.join(PIPELINE_BASEDIR, 'aux/scrna.conf.template'))
    cfg_dict['adapters'] = os.path.abspath(
        os.path.join(PIPELINE_BASEDIR, 'aux/adapters.fa'))

    pipeline_handler = PipelineHandler(
        PIPELINE_NAME,
        PIPELINE_BASEDIR,
        args,
        cfg_dict,
        cluster_cfgfile=get_cluster_cfgfile(CFG_DIR))
    pipeline_handler.setup_env()
    pipeline_handler.submit(args.no_run)