Example #1
def run_folder_for_run_id(runid_and_flowcellid, site=None, basedir_map=SEQDIR_BASE):
    """runid has to contain flowcell id

    AKA $RAWSEQDIR

    run_folder_for_run_id('HS004-PE-R00139_BC6A7HANXX')
    >>> "/mnt/seq/userrig/HS004/HS004-PE-R00139_BC6A7HANXX"
    if machineid eq MS00
    """

    if not site:
        site = get_site()
    if site not in basedir_map:
        raise ValueError(site)
    basedir = basedir_map[site]

    machineid, runid, flowcellid = get_machine_run_flowcell_id(
        runid_and_flowcellid)

    if machineid.startswith('MS00'):
        # FIXME untested and unclear for NSCC
        rundir = "{}/{}/MiSeqOutput/{}_{}".format(basedir, machineid, runid, flowcellid)
    else:
        rundir = "{}/{}/{}_{}".format(basedir, machineid, runid, flowcellid)

    return rundir
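A minimal usage sketch (values taken from the docstring above; SEQDIR_BASE and get_site() come from the surrounding package):

rundir = run_folder_for_run_id('HS004-PE-R00139_BC6A7HANXX')
# -> '/mnt/seq/userrig/HS004/HS004-PE-R00139_BC6A7HANXX'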
Example #2
def get_sample_info(child, rows, mux_analysis_list, mux_id, fastq_data_dir,
                    run_num_flowcell, sample_info):
    """Collects sample info from ELM JOSN
    """
    sample_cfg = {}
    site = get_site()
    ctime, _ = generate_window(1)
    _, _, flowcellid = get_machine_run_flowcell_id(run_num_flowcell)
    mux_analysis_list.add(mux_id)
    sample_id = child['libraryId']
    sample_cfg['requestor'] = rows['requestor']
    sample_cfg['ctime'] = ctime
    sample_cfg['site'] = site
    try:
        sample_cfg['pipeline_name'] = legacy_mapper['pipeline_mapper'][
            child['Analysis']]
    except KeyError as e:
        sample_cfg['pipeline_name'] = child['Analysis']
        logger.warning(str(e) + " Pipeline not mappped to newer version")
        return sample_info
    pipeline_version = get_pipeline_version(child.get('pipeline_version'))
    sample_cfg['pipeline_version'] = pipeline_version
    #sample_cfg['pipeline_params'] = 'params'
    ref_info = get_reference_info(child['Analysis'],
                                  sample_cfg['pipeline_version'], child['genome'])
    if not ref_info:
        logger.info("ref_info not available")
        return sample_info
    cmdline_info = get_cmdline_info(child)
    sample_cfg['references_cfg'] = ref_info
    if cmdline_info:
        sample_cfg['cmdline'] = cmdline_info

    readunits_dict = {}
    status, fq1, fq2 = check_fastq(fastq_data_dir, child['libraryId'],
                                   rows['laneId'])
    if status:
        ru = ReadUnit(run_num_flowcell, flowcellid, child['libraryId'],
                      rows['laneId'], None, fq1, fq2)
        k = key_for_readunit(ru)
        readunits_dict[k] = dict(ru._asdict())
        sample_cfg['readunits'] = readunits_dict
        if sample_info.get(sample_id, {}).get('readunits', {}):
            sample_info[sample_id]['readunits'].update(readunits_dict)
        else:
            sample_info[sample_id] = sample_cfg
    return sample_info
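A sketch of the readunits bookkeeping used above, with a stand-in namedtuple (field names inferred from the ReadUnit() call; values hypothetical, and the real ReadUnit and key_for_readunit come from this package):

from collections import namedtuple

# Stand-in for the package's ReadUnit namedtuple
ReadUnit = namedtuple('ReadUnit', ['run_id', 'flowcell_id', 'library_id',
                                   'lane_id', 'rg_id', 'fq1', 'fq2'])
ru = ReadUnit('HS004-PE-R00139_BC6A7HANXX', 'BC6A7HANXX', 'LIB-1',
              '1', None, 'lib_R1.fastq.gz', 'lib_R2.fastq.gz')
key = '.'.join([ru.run_id, ru.library_id, ru.lane_id])  # stand-in for key_for_readunit()
readunits = {key: dict(ru._asdict())}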
Example #3
def mongodb_conn(use_test_server=False):
    """Return connection to MongoDB server"""
    site = get_site()
    assert site in mongo_conns
    if use_test_server:
        logger.info("Using test MongoDB server")
        constr = mongo_conns[site]['test']
    else:
        logger.info("Using production MongoDB server")
        constr = mongo_conns[site]['production']

    try:
        connection = pymongo.MongoClient(constr)
    except pymongo.errors.ConnectionFailure:
        logger.fatal("Could not connect to the MongoDB server")
        return None
    logger.debug("Database connection established")
    return connection
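A hypothetical caller, mirroring the use in Example #10 below (gisds.pipeline_runs is the collection queried there):

connection = mongodb_conn(use_test_server=True)
if connection is None:
    sys.exit(1)
dbcol = connection.gisds.pipeline_runs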
Example #4
def get_downstream_outdir(requestor, pipeline_version, pipeline_name, site=None,
                          basedir_map=OUTDIR_BASE,
                          base_pipelinedir_map=PRODUCTION_PIPELINE_VERSION):
    """generate downstream output directory
    """
    if not site:
        site = get_site()
    if site not in basedir_map:
        raise ValueError(site)
    if site not in base_pipelinedir_map:
        raise ValueError(site)
    if is_devel_version():
        basedir = basedir_map[site]['devel']
        if not pipeline_version:
            pipeline_version = base_pipelinedir_map[site]['devel']
    else:
        basedir = basedir_map[site]['production']
        if not pipeline_version:
            pipeline_version = os.readlink(base_pipelinedir_map[site]['production'])
    outdir = "{basedir}/{requestor}/{pversion}/{pname}/{ts}".format(
        basedir=basedir, requestor=requestor, pversion=pipeline_version, pname=pipeline_name,
        ts=generate_timestamp())
    return outdir
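A hypothetical call; the path shape follows the format string above and the timestamp comes from generate_timestamp():

outdir = get_downstream_outdir('jdoe', '2.0.0', 'variant-calling-gatk')
# -> "<basedir>/jdoe/2.0.0/variant-calling-gatk/<timestamp>"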
Example #5
def get_bcl2fastq_outdir(runid_and_flowcellid, site=None, basedir_map=OUTDIR_BASE):
    """FIXME:add-doc
    """

    if not site:
        site = get_site()
    if site not in basedir_map:
        raise ValueError(site)

    if is_devel_version():
        basedir = basedir_map[site]['devel']
    else:
        basedir = basedir_map[site]['production']

    machineid, runid, flowcellid = get_machine_run_flowcell_id(
        runid_and_flowcellid)

    outdir = "{basedir}/{mid}/{rid}_{fid}/bcl2fastq_{ts}".format(
        basedir=basedir, mid=machineid, rid=runid, fid=flowcellid,
        ts=generate_timestamp())
    return outdir
Example #6
def get_reference_info(analysis, pipeline_version, ref, site=None):
    """reference yaml for each library
    """
    if not site:
        site = get_site()
    basedir = legacy_mapper['cronjob_base'][site]
    if ref == 'human_g1k_v37':
        ref = 'b37'
    try:
        new_analysis = legacy_mapper['pipeline_mapper'][analysis]
    except KeyError as e:
        logger.warning(str(e) + " Pipeline not mappped to newer version")
        return None
    ref_info = glob.glob(os.path.join(basedir, pipeline_version,
                                      new_analysis, 'cfg', '*' + ref + '*.yaml'))
    if ref_info:
        with open(ref_info[0], 'r') as f:
            try:
                doc = yaml.safe_load(f)
                return doc
            except yaml.YAMLError as exc:
                logger.warning(exc)
                return None
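A hypothetical lookup, assuming the legacy_mapper and cfg layout above; note that 'human_g1k_v37' is normalized to 'b37' before the glob:

ref_info = get_reference_info('Genotyping', '2.0.0', 'human_g1k_v37')
# parsed reference YAML as a dict, or None if the pipeline or YAML isn't found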
Example #7
def main():
    """main function
    """

    parser = argparse.ArgumentParser(description=__doc__.format(
        PIPELINE_NAME=PIPELINE_NAME, PIPELINE_VERSION=get_pipeline_version()))

    # generic args
    parser.add_argument('-c', "--config",
                        help="Config file (YAML) listing samples and readunits."
                        " Collides with -1, -2 and -s")
    parser.add_argument('-o', "--outdir", required=True,
                        help="Output directory (must not exist)")
    parser.add_argument('--no-mail', action='store_true',
                        help="Don't send mail on completion")
    site = get_site()
    default = DEFAULT_SLAVE_Q.get(site, None)
    parser.add_argument('-w', '--slave-q', default=default,
                        help="Queue to use for slave jobs (default: {})".format(default))
    default = DEFAULT_MASTER_Q.get(site, None)
    parser.add_argument('-m', '--master-q', default=default,
                        help="Queue to use for master job (default: {})".format(default))
    parser.add_argument('-n', '--no-run', action='store_true')
    parser.add_argument('-v', '--verbose', action='count', default=0,
                        help="Increase verbosity")
    parser.add_argument('-q', '--quiet', action='count', default=0,
                        help="Decrease verbosity")

    # pipeline specific args
    parser.add_argument('-1', "--fq1", nargs="+",
                        help="FastQ file/s (gzip only)."
                        " Multiple input files supported (auto-sorted)."
                        " Note: each file (or pair) gets a unique read-group id."
                        " Collides with -c.")
    parser.add_argument('-2', "--fq2", nargs="+",
                        help="FastQ file/s (if paired) (gzip only). See also --fq1")
    parser.add_argument('-s', "--sample",
                        help="Sample name. Collides with -c.")
    fake_pipeline_handler = PipelineHandler("FAKE", PIPELINE_BASEDIR, "FAKE", None)
    default_cfg = fake_pipeline_handler.read_default_config()
    default = default_cfg['references']['genome']
    parser.add_argument('-r', "--reffa", default=default,
                        help=argparse.SUPPRESS)
                        # WARN do not change. this is just to set args.reffa (used later).
                        # any change here would require changes in dbsnp, hapmap, g1k, omni and mills as well
    parser.add_argument('-t', "--seqtype", required=True,
                        choices=['WGS', 'WES', 'targeted'],
                        help="Sequencing type")
    parser.add_argument('-l', "--intervals",
                        help="Intervals file (e.g. bed file) listing regions of interest."
                        " Required for WES and targeted sequencing.")

    args = parser.parse_args()

    # Repeatable -v and -q for setting logging level.
    # See https://www.reddit.com/r/Python/comments/3nctlm/what_python_tools_should_i_be_using_on_every/
    # and https://gist.github.com/andreas-wilm/b6031a84a33e652680d4
    # script -vv -> DEBUG
    # script -v -> INFO
    # script -> WARNING
    # script -q -> ERROR
    # script -qq -> CRITICAL
    # script -qqq -> no logging at all
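    # i.e. level = WARNING (30) + 10*quiet - 10*verbose, so e.g. -vv yields 10 == DEBUG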
    logger.setLevel(logging.WARN + 10*args.quiet - 10*args.verbose)
    aux_logger.setLevel(logging.WARN + 10*args.quiet - 10*args.verbose)

    if os.path.exists(args.outdir):
        logger.fatal("Output directory %s already exists", args.outdir)
        sys.exit(1)

    # samples is a dictionary with sample names as key (mostly just
    # one) and readunit keys as value. readunits is a dict with
    # readunits (think: fastq pairs with attributes) as value
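    # Example shapes (keys and values hypothetical):
    #   readunits = {'SAMPLE1-L001': {'fq1': 'r1.fastq.gz', 'fq2': 'r2.fastq.gz', ...}}
    #   samples = {'SAMPLE1': ['SAMPLE1-L001']}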
    if args.config:
        if any([args.fq1, args.fq2, args.sample]):
            logger.fatal("Config file overrides fastq and sample input arguments."
                         " Use one or the other")
            sys.exit(1)
        if not os.path.exists(args.config):
            logger.fatal("Config file %s does not exist", args.config)
            sys.exit(1)
        samples, readunits = get_samples_and_readunits_from_cfgfile(args.config)
    else:
        if not all([args.fq1, args.sample]):
            logger.fatal("Need at least fq1 and sample without config file")
            sys.exit(1)

        readunits = get_readunits_from_args(args.fq1, args.fq2)
        # all readunits go into this one sample specified on the command-line
        samples = dict()
        samples[args.sample] = list(readunits.keys())

    if args.seqtype in ['WES', 'targeted']:
        if not args.intervals:
            logger.fatal("Analysis of exome and targeted sequence runs requires a bed file")
            sys.exit(1)
        else:
            if not os.path.exists(args.intervals):
                logger.fatal("Intervals file %s does not exist", args.config)
                sys.exit(1)
            logger.warning("Compatilibity between interval file and"
                           " reference not checked")# FIXME

    # turn arguments into user_data that gets merged into pipeline config
    #
    # generic data first
    user_data = dict()
    user_data['mail_on_completion'] = not args.no_mail
    user_data['readunits'] = readunits
    user_data['samples'] = samples

    user_data['num_chroms'] = len(list(chroms_and_lens_from_fasta(args.reffa)))
    user_data['seqtype'] = args.seqtype
    user_data['intervals'] = args.intervals  # always safe, might be used for WGS as well
    user_data['mark_dups'] = MARK_DUPS

    pipeline_handler = PipelineHandler(
        PIPELINE_NAME, PIPELINE_BASEDIR,
        args.outdir, user_data, site=site,
        master_q=args.master_q, slave_q=args.slave_q)
    pipeline_handler.setup_env()
    pipeline_handler.submit(args.no_run)
Example #8
def get_lib_details(run_num_flowcell, mux_list, testing):
    """Lib info collection from ELM per run
    """
    _, run_num, flowcellid = get_machine_run_flowcell_id(run_num_flowcell)
    # Call rest service to get component libraries
    if testing:
        rest_url = rest_services['run_details']['testing'].replace("run_num", run_num)
        logger.info("development server")
    else:
        rest_url = rest_services['run_details']['production'].replace("run_num", run_num)
        logger.info("production server")
    response = requests.get(rest_url)
    if response.status_code != requests.codes.ok:
        response.raise_for_status()
    rest_data = response.json()
    logger.debug("rest_data from %s: %s", rest_url, rest_data)
    sample_info = {}
    if rest_data.get('runId') is None:
        logger.info("JSON data is empty for run num %s", run_num)
        return sample_info
    for mux_id, out_dir in mux_list:
        fastq_data_dir = os.path.join(out_dir[0], 'out', "Project_"+mux_id)
        if os.path.exists(fastq_data_dir):
            for rows in rest_data['lanes']:
                if mux_id in rows['libraryId']:
                    if "MUX" in rows['libraryId']:
                        for child in rows['Children']:
                            if child['Analysis'] != "Sequence only":
                                ctime, _ = generate_window(1)
                                sample_dict = {}
                                sample = child['libraryId']
                                sample_dict['requestor'] = rows['requestor']
                                sample_dict['ctime'] = ctime
                                sample_dict['pipeline_name'] = child['Analysis']
                                sample_dict['pipeline_version'] = child.get('pipeline_version')
                                sample_dict['pipeline_params'] = 'params'
                                sample_dict['site'] = get_site()
                                out_dir = get_downstream_outdir(sample_dict['requestor'], \
                                    sample_dict['pipeline_version'], sample_dict['pipeline_name'])
                                sample_dict['out_dir'] = out_dir
                                readunits_dict = {}
                                status, fq1, fq2 = check_fastq(fastq_data_dir, child['libraryId'],\
                                    rows['laneId'])
                                if status:
                                    ru = ReadUnit(run_num_flowcell, flowcellid, child['libraryId'],\
                                        rows['laneId'], None, fq1, fq2)
                                    k = key_for_read_unit(ru)
                                    readunits_dict[k] = dict(ru._asdict())
                                    sample_dict['readunits'] = readunits_dict
                                    if sample_info.get(sample, {}).get('readunits'):
                                        sample_info[sample]['readunits'].update(readunits_dict)
                                    else:
                                        sample_info[sample] = sample_dict
                    else:
                        if rows['Analysis'] != "Sequence only":
                            sample = rows['libraryId']
                            status, fq1, fq2 = check_fastq(fastq_data_dir, rows['libraryId'], \
                                rows['laneId'])
                            if status:
                                ctime, _ = generate_window(1)
                                sample_dict = {}
                                readunits_dict = {}
                                ru = ReadUnit(run_num_flowcell, flowcellid, rows['libraryId'], \
                                    rows['laneId'], None, fq1, fq2)
                                k = key_for_read_unit(ru)
                                readunits_dict[k] = dict(ru._asdict())
                                sample_dict['readunits'] = readunits_dict
                                sample_info[sample] = sample_dict
    return sample_info
Example #9
def main():
    """main function
    """

    parser = argparse.ArgumentParser(description=__doc__.format(
        PIPELINE_NAME=PIPELINE_NAME, PIPELINE_VERSION=get_pipeline_version()))

    # generic args
    parser.add_argument('-c', "--config",
                        help="Config file (YAML) listing samples and readunits."
                        " Collides with -1, -2 and -s")
    parser.add_argument('-o', "--outdir", required=True,
                        help="Output directory (must not exist)")
    parser.add_argument('--no-mail', action='store_true',
                        help="Don't send mail on completion")
    site = get_site()
    default = DEFAULT_SLAVE_Q.get(site, None)
    parser.add_argument('-w', '--slave-q', default=default,
                        help="Queue to use for slave jobs (default: {})".format(default))
    default = DEFAULT_MASTER_Q.get(site, None)
    parser.add_argument('-m', '--master-q', default=default,
                        help="Queue to use for master job (default: {})".format(default))
    parser.add_argument('-n', '--no-run', action='store_true')
    parser.add_argument('-v', '--verbose', action='count', default=0,
                        help="Increase verbosity")
    parser.add_argument('-q', '--quiet', action='count', default=0,
                        help="Decrease verbosity")

    # pipeline specific args
    parser.add_argument('-1', "--fq1", nargs="+",
                        help="FastQ file/s (gzip only)."
                        " Multiple input files supported (auto-sorted)."
                        " Note: each file (or pair) gets a unique read-group id."
                        " Collides with -c.")
    parser.add_argument('-2', "--fq2", nargs="+",
                        help="FastQ file/s (if paired) (gzip only). See also --fq1")
    parser.add_argument('-s', "--sample",
                        help="Sample name. Collides with -c.")
    parser.add_argument('-C', "--cuffdiff", action='store_true',
                        dest="run_cuffdiff",
                        help="Also run cuffdiff")
    parser.add_argument('-S', '--stranded', action='store_true',
                        help="Stranded library prep (default is unstranded)")

    args = parser.parse_args()

    # Repeatable -v and -q for setting logging level.
    # See https://www.reddit.com/r/Python/comments/3nctlm/what_python_tools_should_i_be_using_on_every/
    # and https://gist.github.com/andreas-wilm/b6031a84a33e652680d4
    # script -vv -> DEBUG
    # script -v -> INFO
    # script -> WARNING
    # script -q -> ERROR
    # script -qq -> CRITICAL
    # script -qqq -> no logging at all
    logger.setLevel(logging.WARN + 10*args.quiet - 10*args.verbose)
    aux_logger.setLevel(logging.WARN + 10*args.quiet - 10*args.verbose)

    if os.path.exists(args.outdir):
        logger.fatal("Output directory %s already exists", args.outdir)
        sys.exit(1)

    # samples is a dictionary with sample names as key (mostly just
    # one) and readunit keys as value. readunits is a dict with
    # readunits (think: fastq pairs with attributes) as value
    if args.config:
        if any([args.fq1, args.fq2, args.sample]):
            logger.fatal("Config file overrides fastq and sample input arguments."
                         " Use one or the other")
            sys.exit(1)
        if not os.path.exists(args.config):
            logger.fatal("Config file %s does not exist", args.config)
            sys.exit(1)
        samples, readunits = get_samples_and_readunits_from_cfgfile(args.config)
    else:
        if not all([args.fq1, args.sample]):
            logger.fatal("Need at least fq1 and sample without config file")
            sys.exit(1)

        readunits = get_readunits_from_args(args.fq1, args.fq2)
        # all readunits go into this one sample specified on the command-line
        samples = dict()
        samples[args.sample] = list(readunits.keys())

    # FIXME checks on reffa index (currently not exposed via args)


    # turn arguments into user_data that gets merged into pipeline config
    #
    # generic data first
    user_data = dict()
    user_data['mail_on_completion'] = not args.no_mail
    user_data['readunits'] = readunits
    user_data['samples'] = samples

    user_data['stranded'] = args.stranded
    user_data['run_cuffdiff'] = args.run_cuffdiff
    user_data['paired_end'] = any(ru.get('fq2') for ru in readunits.values())
    if user_data['paired_end']:
        assert all(ru.get('fq2') for ru in readunits.values()), (
            "Can't handle mix of paired-end and single-end")

    pipeline_handler = PipelineHandler(
        PIPELINE_NAME, PIPELINE_BASEDIR,
        args.outdir, user_data, site=site,
        master_q=args.master_q, slave_q=args.slave_q)
    pipeline_handler.setup_env()
    pipeline_handler.submit(args.no_run)
Example #10
def main():
    """main function
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        '-n',
        "--dryrun",
        action='store_true',
        help="Don't actually update DB (best used in conjunction with -v -v)")
    parser.add_argument('-t',
                        "--testing",
                        action='store_true',
                        help="Use MongoDB test-server. Don't do anything")
    default = 14
    parser.add_argument(
        '-w',
        '--win',
        type=int,
        default=default,
        help="Number of days to look back (default {})".format(default))
    parser.add_argument('-v',
                        '--verbose',
                        action='count',
                        default=0,
                        help="Increase verbosity")
    parser.add_argument('-q',
                        '--quiet',
                        action='count',
                        default=0,
                        help="Decrease verbosity")
    args = parser.parse_args()

    # Repeatable -v and -q for setting logging level.
    # See https://www.reddit.com/r/Python/comments/3nctlm/what_python_tools_should_i_be_using_on_every
    LOGGER.setLevel(logging.WARN + 10 * args.quiet - 10 * args.verbose)

    if not is_production_user():
        LOGGER.warning("Not a production user. Exiting")
        sys.exit(1)

    connection = mongodb_conn(args.testing)
    if connection is None:
        sys.exit(1)
    dbcol = connection.gisds.pipeline_runs
    site = get_site()
    epoch_now, epoch_then = generate_window(args.win)
    cursor = dbcol.find({
        "ctime": {
            "$gt": epoch_then,
            "$lt": epoch_now
        },
        "site": site
    })
    LOGGER.info("Looping through {} jobs".format(cursor.count()))
    for job in cursor:
        dbid = job['_id']
        # only set here to avoid code duplication below
        try:
            out_dir = job['execution']['out_dir']
        except KeyError:
            out_dir = None

        # no execution dict means start a new analysis
        if not job.get('execution'):
            LOGGER.info('Job {} to be started'.format(dbid))
            # determine out_dir and set in DB
            # out_dir_override will take precedence over generating out_dir with get_downstream_outdir function
            if job.get('out_dir_override'):
                out_dir = job.get('out_dir_override')
                if os.path.exists(out_dir):
                    mux = os.path.basename(out_dir)
                    if not args.dryrun:
                        LOGGER.critical(
                            "Analysis for {} already exists under {}. Please start the analysis manually"
                            .format(mux, out_dir))
                        res = dbcol.update_one(
                            {"_id": ObjectId(dbid)},
                            {"$set": {
                                "execution.status": "MANUAL"
                            }})
                        assert res.modified_count == 1, (
                            "Modified {} documents instead of 1".format(
                                res.modified_count))
                        sys.exit(1)
            else:
                out_dir = get_downstream_outdir(job['requestor'],
                                                job['pipeline_version'],
                                                job['pipeline_name'])
            # Note, since execution (key) exists, accidental double
            # starts are prevented even before start time etc is
            # logged via flagfiles.  No active logging here so that
            # flag files logging just works.

            if args.dryrun:
                LOGGER.info("Dry run: not starting job {}".format(dbid))
                continue
            status = start_cmd_execution(job, site, out_dir, args.testing)
            if status:
                res = dbcol.update_one(
                    {"_id": ObjectId(dbid)},
                    {"$set": {
                        "execution.out_dir": out_dir
                    }})
                assert res.modified_count == 1, (
                    "Modified {} documents instead of 1".format(
                        res.modified_count))
            else:
                LOGGER.warning("Job {} could not be started".format(dbid))
        elif job['execution'].get('status') == "MANUAL":
            continue
        elif list_starterflags(out_dir):  # out_dir cannot be None because it's part of the execution dict
            LOGGER.info(
                'Job {} in {} started but not yet logged as such in DB'.format(
                    dbid, out_dir))

            matches = list_starterflags(out_dir)
            assert len(matches) == 1, (
                "Got several starter flags in {}".format(out_dir))
            sflag = StarterFlag(matches[0])
            assert sflag.dbid == str(dbid)
            set_started(dbcol,
                        sflag.dbid,
                        str(sflag.timestamp),
                        dryrun=args.dryrun)
            os.unlink(sflag.filename)

        elif job['execution'].get('status') in ['STARTED', 'RESTART']:
            LOGGER.info(
                'Job %s in %s set as re|started so checking on completion',
                dbid, out_dir)
            set_completion_if(dbcol, dbid, out_dir, dryrun=args.dryrun)

        else:
            # job complete
            LOGGER.debug('Job %s in %s should be completed', dbid, out_dir)
    LOGGER.info("Successful program exit")
Example #11
def main():
    """main function
    """

    # FIXME ugly and code duplication in bcl2fastq_dbupdate.py
    mongo_status_script = os.path.abspath(os.path.join(
        os.path.dirname(sys.argv[0]), "mongo_status.py"))
    assert os.path.exists(mongo_status_script)

    parser = argparse.ArgumentParser(description=__doc__.format(
        PIPELINE_NAME=PIPELINE_NAME, PIPELINE_VERSION=get_pipeline_version()))
    parser.add_argument('-r', "--runid",
                        help="Run ID plus flowcell ID (clashes with -d)")
    parser.add_argument('-d', "--rundir",
                        help="BCL input directory (clashes with -r)")
    parser.add_argument('-o', "--outdir",
                        help="Output directory (must not exist; required if called by user)")
    parser.add_argument('-t', "--testing", action='store_true',
                        help="Use MongoDB test server")
    parser.add_argument('--no-archive', action='store_true',
                        help="Don't archieve this analysis")
    parser.add_argument('--no-mail', action='store_true',
                        help="Don't send mail on completion")
    site = get_site()
    default = DEFAULT_SLAVE_Q.get(site, None)
    parser.add_argument('-w', '--slave-q', default=default,
                        help="Queue to use for slave jobs (default: {})".format(default))
    default = DEFAULT_MASTER_Q.get(site, None)
    parser.add_argument('-m', '--master-q', default=default,
                        help="Queue to use for master job (default: {})".format(default))
    parser.add_argument('-l', '--lanes', type=int, nargs="*",
                        help="Limit run to given lane/s (multiples separated by space)")
    parser.add_argument('-i', '--mismatches', type=int,
                        help="Max. number of allowed barcode mismatches (0<=x<=2)."
                        " Setting a value here overrides the default settings read from ELM")
    parser.add_argument('-n', '--no-run', action='store_true')
    parser.add_argument('-v', '--verbose', action='count', default=0,
                        help="Increase verbosity")
    parser.add_argument('-q', '--quiet', action='count', default=0,
                        help="Decrease verbosity")


    args = parser.parse_args()

    # Repeatable -v and -q for setting logging level.
    # See https://www.reddit.com/r/Python/comments/3nctlm/what_python_tools_should_i_be_using_on_every/
    # and https://gist.github.com/andreas-wilm/b6031a84a33e652680d4
    # script -vv -> DEBUG
    # script -v -> INFO
    # script -> WARNING
    # script -q -> ERROR
    # script -qq -> CRITICAL
    # script -qqq -> no logging at all
    logger.setLevel(logging.WARN + 10*args.quiet - 10*args.verbose)
    aux_logger.setLevel(logging.WARN + 10*args.quiet - 10*args.verbose)

    if args.mismatches is not None:
        if args.mismatches > 2 or args.mismatches < 0:
            logger.fatal("Number of mismatches must be between 0-2")
            sys.exit(1)

    lane_info = ''
    lane_nos = []
    if args.lanes:
        for lane in args.lanes:
            if lane > 8 or lane < 1:
                logger.fatal("Lane number must be between 1 and 8")
                sys.exit(1)
        lane_info = '--tiles ' + ','.join('s_{}'.format(lane) for lane in args.lanes)
        lane_nos = list(args.lanes)
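        # e.g. args.lanes == [1, 2] yields lane_info == '--tiles s_1,s_2'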


    if args.runid and args.rundir:
        logger.fatal("Cannot use run-id and input directory arguments simultaneously")
        sys.exit(1)
    elif args.runid:
        rundir = run_folder_for_run_id(args.runid)
    elif args.rundir:
        rundir = os.path.abspath(args.rundir)
    else:
        logger.fatal("Need either run-id or input directory")
        sys.exit(1)
    if not os.path.exists(rundir):
        logger.fatal("Expected run directory {} does not exist".format(rundir))
        sys.exit(1)
    logger.info("Rundir is {}".format(rundir))

    if not args.outdir:
        outdir = get_bcl2fastq_outdir(args.runid)
    else:
        outdir = args.outdir
    if os.path.exists(outdir):
        logger.fatal("Output directory %s already exists", outdir)
        sys.exit(1)
    # create now so that generate_bcl2fastq_cfg.py can run
    os.makedirs(outdir)
    


    # catch cases where rundir was user provided and looks weird
    try:
        _, runid, flowcellid = get_machine_run_flowcell_id(rundir)
        run_num = runid + "_" + flowcellid
    except Exception:
        run_num = "UNKNOWN-" + rundir.split("/")[-1]


    # call generate_bcl2fastq_cfg
    #
    # FIXME ugly assumes same directory (just like import above). better to import and run main()?
    generate_bcl2fastq = os.path.join(
        os.path.dirname(sys.argv[0]), "generate_bcl2fastq_cfg.py")
    assert os.path.exists(generate_bcl2fastq)
    cmd = [generate_bcl2fastq, '-r', rundir, '-o', outdir]
    if args.testing:
        cmd.append("-t")
    logger.debug("Executing {}".format(' ' .join(cmd)))
    try:
        res = subprocess.check_output(cmd, stderr=subprocess.STDOUT)
    except subprocess.CalledProcessError as e:
        logger.fatal("The following command failed with return code {}: {}".format(
            e.returncode, ' '.join(cmd)))
        logger.fatal("Output: {}".format(e.output.decode()))
        logger.fatal("Exiting")
        sys.exit(1)
    # generate_bcl2fastq is normally quiet. if there's output, make caller aware of it
    # use sys instead of logger to avoid double logging
    if res:
        sys.stderr.write(res.decode())

    # just created files
    muxinfo_cfg = os.path.join(outdir, MUXINFO_CFG)
    samplesheet_csv = os.path.join(outdir, SAMPLESHEET_CSV)
    usebases_cfg = os.path.join(outdir, USEBASES_CFG)

    # NOTE: signal for failed runs is exit 0 from generate_bcl2fastq and missing output files
    #
    if any([not os.path.exists(x) for x in [muxinfo_cfg, samplesheet_csv, usebases_cfg]]):
        # one missing means all should be missing
        assert all([not os.path.exists(x) for x in [muxinfo_cfg, samplesheet_csv, usebases_cfg]])
        seqrunfailed(mongo_status_script, run_num, outdir, args.testing)
        sys.exit(0)


    # turn arguments into user_data that gets merged into pipeline config
    user_data = {'rundir': rundir,
                 'lanes_arg': lane_info,
                 'samplesheet_csv': samplesheet_csv,
                 'no_archive': args.no_archive,
                 'mail_on_completion': not args.no_mail,
                 'run_num': run_num}


    usebases_arg = ''
    with open(usebases_cfg, 'r') as stream:
        try:
            d = yaml.safe_load(stream)
            assert 'usebases' in d
            assert len(d) == 1  # make sure usebases is the only key
            for ub in d['usebases']:
                usebases_arg += '--use-bases-mask {} '.format(ub)
        except yaml.YAMLError as exc:
            logger.fatal(exc)
            raise
    user_data['usebases_arg'] = usebases_arg
    os.unlink(usebases_cfg)
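    # e.g. a usebases cfg of {'usebases': ['Y151,I8,Y151']} yields
    # usebases_arg == '--use-bases-mask Y151,I8,Y151 ' (mask value hypothetical)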


    mux_units = get_mux_units_from_cfgfile(muxinfo_cfg, lane_nos)
    if args.mismatches is not None:
        mux_units = [mu._replace(barcode_mismatches=args.mismatches)
                     for mu in mux_units]
    os.unlink(muxinfo_cfg)


    user_data['units'] = dict()
    for mu in mux_units:
        # special case: mux split across multiple lanes. make lanes a list
        # and add in extra lanes if needed.
        k = mu.mux_dir
        mu_dict = dict(mu._asdict())
        user_data['units'][k] = mu_dict

    # create mongodb update command, used later, after queueing
    mongo_update_cmd = "{} -r {} -s STARTED".format(mongo_status_script, user_data['run_num'])
    mongo_update_cmd += " -a $ANALYSIS_ID -o {}".format(outdir)# set in run.sh
    if args.testing:
        mongo_update_cmd += " -t"

    # NOTE: bcl2fastq has a special run template, so we need to
    # interfere with the default pipeline_handler.  plenty of
    # opportunity to shoot yourself in the foot

    pipeline_handler = PipelineHandler(
        PIPELINE_NAME, PIPELINE_BASEDIR, outdir, user_data,
        site=site, master_q=args.master_q, slave_q=args.slave_q)
    # use local run template
    pipeline_handler.run_template = os.path.join(
        PIPELINE_BASEDIR, "run.template.{}.sh".format(pipeline_handler.site))
    assert os.path.exists(pipeline_handler.run_template)
    pipeline_handler.setup_env()
    # final mongo update line in run_out
    tmp_run_out = pipeline_handler.run_out + ".tmp"
    with open(pipeline_handler.run_out) as fh_in, \
         open(tmp_run_out, 'w') as fh_out:
        for line in fh_in:
            line = line.replace("@MONGO_UPDATE_CMD@", mongo_update_cmd)
            fh_out.write(line)
    shutil.move(tmp_run_out, pipeline_handler.run_out)
    pipeline_handler.submit(args.no_run)
Example #12
def main():
    """main function
    """

    parser = argparse.ArgumentParser(description=__doc__.format(
        PIPELINE_NAME=PIPELINE_NAME, PIPELINE_VERSION=get_pipeline_version()))

    # generic args
    parser.add_argument('-o', "--outdir", required=True,
                        help="Output directory (must not exist)")
    parser.add_argument('--name',
                        help="Give this analysis run a name (used in email and report)")
    parser.add_argument('--no-mail', action='store_true',
                        help="Don't send mail on completion")
    site = get_site()
    default = DEFAULT_SLAVE_Q.get(site, None)
    parser.add_argument('-w', '--slave-q', default=default,
                        help="Queue to use for slave jobs (default: {})".format(default))
    default = DEFAULT_MASTER_Q.get(site, None)
    parser.add_argument('-m', '--master-q', default=default,
                        help="Queue to use for master job (default: {})".format(default))
    parser.add_argument('-n', '--no-run', action='store_true')
    parser.add_argument('-v', '--verbose', action='count', default=0,
                        help="Increase verbosity")
    parser.add_argument('-q', '--quiet', action='count', default=0,
                        help="Decrease verbosity")
    cfg_group = parser.add_argument_group('Configuration files (advanced)')
    cfg_group.add_argument('--prev-cfg',
                           help="Previously used config. Also used to infer path to precalculated BAM files")
    for name, descr in [("references", "reference sequences"),
                        ("params", "parameters"),
                        ("modules", "modules")]:
        default = os.path.abspath(os.path.join(CFG_DIR, "{}.yaml".format(name)))
        cfg_group.add_argument('--{}-cfg'.format(name),
                               default=default,
                               help="Config-file (yaml) for {}. (default: {})".format(descr, default))

    # pipeline specific args
    #parser.add_argument('-1', "--fq1", nargs="+",
    #                    help="FastQ file/s (gzip only)."
    #                    " Multiple input files supported (auto-sorted)."
    #                    " Note: each file (or pair) gets a unique read-group id."
    #                    " Collides with --sample-cfg.")
    #parser.add_argument('-2', "--fq2", nargs="+",
    #                    help="FastQ file/s (if paired) (gzip only). See also --fq1")
    #parser.add_argument('-s', "--sample",
    #                    help="Sample name. Collides with --sample-cfg.")
    #parser.add_argument('-t', "--seqtype", required=True,
    #                    choices=['WGS', 'WES', 'targeted'],
    #                    help="Sequencing type")
    #parser.add_argument('-l', "--intervals",
    #                    help="Intervals file (e.g. bed file) listing regions of interest."
    #                    " Required for WES and targeted sequencing.")

    args = parser.parse_args()

    # Repeatable -v and -q for setting logging level.
    # See https://www.reddit.com/r/Python/comments/3nctlm/what_python_tools_should_i_be_using_on_every/
    # and https://gist.github.com/andreas-wilm/b6031a84a33e652680d4
    # script -vv -> DEBUG
    # script -v -> INFO
    # script -> WARNING
    # script -q -> ERROR
    # script -qq -> CRITICAL
    # script -qqq -> no logging at all
    logger.setLevel(logging.WARN + 10*args.quiet - 10*args.verbose)
    aux_logger.setLevel(logging.WARN + 10*args.quiet - 10*args.verbose)

    if os.path.exists(args.outdir):
        logger.fatal("Output directory %s already exists", args.outdir)
        sys.exit(1)


    # samples is a dictionary with sample names as key (mostly just
    # one) and readunit keys as value. readunits is a dict with
    # readunits (think: fastq pairs with attributes) as value
    #if args.sample_cfg:
    #    if any([args.fq1, args.fq2, args.sample]):
    #        logger.fatal("Config file overrides fastq and sample input arguments."
    #                     " Use one or the other")
    #        sys.exit(1)
    #    if not os.path.exists(args.sample_cfg):
    #        logger.fatal("Config file %s does not exist", args.sample_cfg)
    #        sys.exit(1)
    #    samples, readunits = get_samples_and_readunits_from_cfgfile(args.sample_cfg)
    #else:
    #    if not all([args.fq1, args.sample]):
    #        logger.fatal("Need at least fq1 and sample without config file")
    #        sys.exit(1)
    #
    #    readunits = get_readunits_from_args(args.fq1, args.fq2)
    #    # all readunits go into this one sample specified on the command-line
    #    samples = dict()
    #    samples[args.sample] = list(readunits.keys())
    #
    #if args.seqtype in ['WES', 'targeted']:
    #    if not args.intervals:
    #        logger.fatal("Analysis of exome and targeted sequence runs requires a bed file")
    #        sys.exit(1)
    #    else:
    #        if not os.path.exists(args.intervals):
    #            logger.fatal("Intervals file %s does not exist", args.sample_cfg)
    #            sys.exit(1)
    #        logger.warning("Compatilibity between interval file and"
    #                       " reference not checked")# FIXME

    if not args.prev_cfg or not os.path.exists(args.prev_cfg):
        logger.fatal("Missing or non-existent previous config %s", args.prev_cfg)
        sys.exit(1)
    with open(args.prev_cfg, 'r') as stream:
        try:
            prev_cfg = yaml.safe_load(stream)
        except yaml.YAMLError:
            logger.fatal("Error loading %s", args.prev_cfg)
            raise
    
    # turn arguments into user_data that gets merged into pipeline config
    #
    # generic data first
    user_data = dict()
    user_data['mail_on_completion'] = not args.no_mail
    user_data['readunits'] = dict()  # None won't work
    user_data['samples'] = prev_cfg['samples']
    if args.name:
        user_data['analysis_name'] = args.name
    user_data['seqtype'] = 'WGS'  # SG10K
    user_data['intervals'] = None  # SG10K
    user_data['mark_dups'] = None  # doesn't matter for SG10K
    user_data['precalc_bam_dir'] = os.path.join(
        os.path.abspath(os.path.dirname(args.prev_cfg)), "out")

    pipeline_handler = PipelineHandler(
        PIPELINE_NAME, PIPELINE_BASEDIR,
        args.outdir, user_data, site=site,
        master_q=args.master_q,
        slave_q=args.slave_q,
        params_cfgfile=args.params_cfg,
        modules_cfgfile=args.modules_cfg,
        refs_cfgfile=args.references_cfg,
        cluster_cfgfile=get_cluster_cfgfile(CFG_DIR))
    pipeline_handler.setup_env()
    pipeline_handler.submit(args.no_run)
Example #13
def main(toaddr):
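    """Send a test email to toaddr"""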
    subject = "Test email from {} version {}".format(get_site(),
                                                     get_pipeline_version())
    body = "Email wursts.\n\nSincerely,\nRPD"
    send_mail(subject, body, toaddr=toaddr, ccaddr=None, pass_exception=False)