Example #1
def ingest(app, csv_file, output_dir, force=False):
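    """Read sample rows from csv_file, run the B38 preprocessor on each source
    directory, and record a ComputeWorkflowSample row for each in the database."""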
    Session = open_db(app.database)

    default_job_options = {
        'memory_in_gb': 5,
        'queue': app.queue,
        'docker': app.docker,
    }
    if app.job_group is not None:
        default_job_options['group'] = app.job_group

    preprocessor = B38Preprocessor(output_dir,
                                   job_runner=LsfJob(default_job_options),
                                   force=force)

    columns = {
        'Compute Workflow Execution': 'compute_workflow_execution',
        'Work Order': 'work_order',
        'DNA': 'ingest_sample_name',
        'WOI': 'woi',
        'Working Directory': 'source_directory'
    }
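    # Skip duplicate rows keyed on (source directory, sample name, work order).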
    seen = set()
    with open(csv_file) as f:
        reader = csv.DictReader(f, delimiter=',')
        for row in reader:
            output_json = {column: row[key] for key, column in columns.items()}
            seen_key = (output_json['source_directory'],
                        output_json['ingest_sample_name'],
                        output_json['work_order'])
            if seen_key in seen:
                logger.info(
                    'Duplicate row with identical source directory, sample name and work order. Skipping...'
                )
                continue
            seen.add(seen_key)

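            # A None result from the preprocessor marks the source directory invalid.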
            outdir = preprocessor(output_json['source_directory'])
            is_valid = False
            analysis_cram_path = None
            analysis_gvcf_path = None
            if outdir is not None:
                is_valid = True
                analysis_cram_path = outdir
                analysis_gvcf_path = outdir
            session = Session()
            session.add(
                ComputeWorkflowSample(
                    source_work_order=output_json['work_order'],
                    ingest_sample_name=output_json['ingest_sample_name'],
                    source_directory=output_json['source_directory'],
                    woi=output_json['woi'],
                    valid_source_directory=is_valid,
                    analysis_cram_path=analysis_cram_path,
                    analysis_gvcf_path=analysis_gvcf_path))
            session.commit()
Example #2
    def test2_lsf_job(self, subprocess_patch):
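        """LsfJob should build correct bsub command lines and report launch failures."""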
        laimsapp = LaimsApp()
        config = laimsapp.lsf_job_options()
        config.pop("queue", None)
        config.pop("stdout", None)
        print(config)
        job = LsfJob(config)
        self.assertIsInstance(job, LsfJob)

        available_opts = LsfJob.available_options
        self.assertEqual(len(available_opts), 9,
                         "available options count is 9")

        expected_cmd = [
            'bsub', '-a', 'docker(registry.gsc.wustl.edu/mgi/laims:latest)',
            '-N', '-u', '*****@*****.**', 'echo', 'hello', 'world'
        ]
        self.assertEqual(job.bsub_cmd(['echo', 'hello', 'world']),
                         expected_cmd)

        job.created_options["stdout"] = "/var/log/out"
        expected_cmd = [
            'bsub', '-M', '10000000', '-R',
            '"select[mem>10000] rusage[mem=10000]"', '-a',
            'docker(hello-world)', "-oo", "/var/log/out", '-N', '-u',
            '*****@*****.**', 'echo', 'hello', 'world'
        ]
        self.assertEqual(
            job.bsub_cmd(['echo', 'hello', 'world'], {
                "docker": "hello-world",
                "memory_in_gb": 10
            }), expected_cmd)

        job.created_options["stdout"] = "/var/log"
        expected_cmd = [
            'bsub', '-M', '10000000', '-R',
            '"select[mem>10000] rusage[mem=10000]"', '-a',
            'docker(hello-world)', "-oo", "/var/log/log1.out", '-N', '-u',
            '*****@*****.**', 'echo', 'hello', 'world'
        ]
        self.assertEqual(
            job.bsub_cmd(
                ['echo', 'hello', 'world'], {
                    "docker": "hello-world",
                    "memory_in_gb": 10,
                    "stdout_bn": "log1.out"
                }), expected_cmd)

        # A non-zero subprocess return code should make launch() report failure.
        subprocess_patch.return_value = 1
        self.assertFalse(
            job.launch(['echo', 'hello', 'world'], {"docker": "hello-world"}))
Example #3
def verify_bulk_gvcfs(tsv_path, reference_path):
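    """Queue one `laims verify-gvcf` LSF job per gVCF listed in the TSV."""
    # Do not preserve the submitting environment inside the Docker container.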
    os.environ['LSF_DOCKER_PRESERVE_ENVIRONMENT'] = 'false'

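    # Use the app's default LSF options, but request 10 GB of memory per job.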
    job_opts = LaimsApp().lsf_job_options()
    job_opts["memory_in_gb"] = 10
    job_runner = LsfJob(job_opts)

    with open(tsv_path) as f:
        reader = csv.reader(f, delimiter='\t')
        for row in reader:
            interval = get_interval_from_path(row[0])
            cmd = [
                "laims", "verify-gvcf", "--gvcf-path", row[0],
                "--reference-path", reference_path, "--interval", interval
            ]
            # Name the job's log file after the gVCF being verified.
            job_runner.launch(cmd, cmd_options={
                "stdout_bn": ".".join([os.path.basename(row[0]), "out"]),
            })
Example #4
def oldband(app, output_dir, workorders):
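    """Submit per-chromosome GATK jobs that rewrite each sample's gVCFs
    with the old banding scheme, one job per chromosome per work order."""
    # Submit jobs with a clean environment instead of inheriting the shell's.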
    os.environ['LSF_NO_INHERIT_ENVIRONMENT'] = 'true'
    default_job_options = {
        'memory_in_gb': 10,
        'queue': app.queue,
        'docker': 'registry.gsc.wustl.edu/genome/gatk-3.5-0-g36282e4:1',
    }
    if app.job_group is not None:
        default_job_options['group'] = app.job_group
    job_runner = LsfJob(default_job_options)

    logdir = os.path.join(output_dir, 'log')

    Session = open_db(app.database)
    cmd = OldbandandRewriteGvcfCmd(
        java='/usr/bin/java',
        max_mem='8G',
        max_stack='8G',
        gatk_jar='/opt/GenomeAnalysisTK.jar',
        reference='/gscmnt/gc2802/halllab/ccdg_resources/genomes/human/GRCh38DH/all_sequences.fa',
        break_multiple=1000000)
    for wo in workorders:
        session = Session()
        for sample in session.query(ComputeWorkflowSample).filter(
                ComputeWorkflowSample.source_work_order == wo):
            if sample.analysis_cram_verifyed:
                cram_path = sample.analysis_cram_path

                sample_name = os.path.basename(cram_path)
                cram_file = os.path.join(sample.analysis_cram_path,
                                         '{}.cram'.format(sample_name))

                oldband_path = os.path.join(sample.analysis_gvcf_path,
                                            'oldbanded_gvcfs')
                force_make_dirs(oldband_path)

                stdout_dir = os.path.join(logdir, sample_name)

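                # One re-banding job per chromosome in the `chromosomes` list.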
                for chrom in chromosomes:
                    new_gzvcf = '{0}.{1}.g.vcf.gz'.format(sample_name, chrom)
                    output_gzvcf = os.path.join(oldband_path, new_gzvcf)
                    if not os.path.exists(output_gzvcf) or not os.path.exists(
                            output_gzvcf + '.tbi'):
                        stdout = os.path.join(stdout_dir,
                                              new_gzvcf + '.oldbanded.log')
                        cmdline = cmd(cram_file, output_gzvcf, chrom)
                        lsf_options = {
                            'stdout': stdout,
                        }
                        job_runner.launch(cmdline, lsf_options)

                # Re-band the extended (non-primary) contigs together in one job.
                chrom_string = ' -L '.join(ext_chromosomes)
                new_gzvcf = '{0}.extChr.g.vcf.gz'.format(sample_name)
                output_gzvcf = os.path.join(oldband_path, new_gzvcf)
                if not os.path.exists(output_gzvcf) or not os.path.exists(
                        output_gzvcf + '.tbi'):
                    script = os.path.join(oldband_path, 'oldband_extChr.sh')
                    cmdline = cmd(cram_file, output_gzvcf, chrom_string)
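                    # The wrapper script deletes itself once the command succeeds.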
                    cmdline += ' && rm -f {0}'.format(script)
                    with open(script, 'w') as f:
                        f.write('#!/bin/bash\n')
                        f.write(cmdline)
                        f.write('\n')
                    stdout = os.path.join(stdout_dir,
                                          new_gzvcf + '.oldbanded.log')
                    lsf_options = {
                        'stdout': stdout,
                    }
                    job_runner.launch('/bin/bash {0}'.format(script),
                                      lsf_options)
def downsample_and_recall(app, inputs, output_dir):
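    """Render Cromwell inputs and config for the gathered downsample-and-recall
    CWL workflow, then launch Cromwell as a single LSF job."""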
    log_dir = os.path.join(output_dir, 'logs')
    os.mkdir(log_dir)
    os.mkdir(os.path.join(output_dir, 'results'))

    cromwell_job_opts = {
        'memory_in_gb': 32,
        'queue': app.queue,
        'docker': app.docker,
        'stdout': os.path.join(log_dir, 'cromwell.log'),
    }
    if app.job_group is not None:
        cromwell_job_opts['group'] = app.job_group
    job_runner = LsfJob(cromwell_job_opts)

    # Intervals: one per autosome, plus X, Y, and the extended-contig list.
    chrs = [["chr{}".format(c)] for c in range(1, 23)]
    chrs.extend([
        ["chrX"],
        ["chrY"],
        ["/gscmnt/gc2802/halllab/ccdg_resources/genomes/human/GRCh38DH/all_sequences.filtered-chromosome.ext.list"]
    ])

    workflow_inputs = {
        'reference': '/gscmnt/gc2802/halllab/ccdg_resources/genomes/human/GRCh38DH/all_sequences.fa',
        'downsample_strategy': 'ConstantMemory',
        'downsample_seed': 1,
        'emit_reference_confidence': 'GVCF',
        'max_alternate_alleles': 3,
        'variant_index_type': 'LINEAR',
        'variant_index_parameter': 128000,
        'read_filter': 'OverclippedRead',
        'intervals': chrs,
        'qc_minimum_mapping_quality': 0,
        'qc_minimum_base_quality': 0,
        'crams_to_downsample': [], #filled in from "inputs" file below
    }

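    # Each input TSV row: CRAM path, downsample ratio, freemix contamination.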
    with open(inputs) as fh:
        reader = csv.reader(fh, delimiter='\t')
        for row in reader:
            cram_path, ratio, freemix = row[0], row[1], row[2]
            workflow_inputs['crams_to_downsample'].append({
                'cram': {'class': 'File', 'path': cram_path},
                'downsample_ratio': ratio,
                'contamination': freemix,
            })

    input_yaml_path = os.path.join(output_dir, 'inputs.yaml')
    with open(input_yaml_path, 'w') as yaml_fh:
        yaml.dump(workflow_inputs, yaml_fh)

    # Render the Cromwell config from the Jinja template shipped in share_dir.
    fs_loader = FileSystemLoader(searchpath=LaimsApp().share_dir)
    env = Environment(loader=fs_loader, autoescape=True)
    template = env.get_template('cromwell.config.jinja')

    cromwell_config_path = os.path.join(output_dir, 'cromwell.config')
    template.stream(
        log_dir=log_dir,
        output_dir=output_dir,
        lsf_queue=app.queue,
        lsf_job_group=app.job_group,
    ).dump(cromwell_config_path)

    cmd = [
        '/usr/bin/java', '-Dconfig.file=' + cromwell_config_path, '-Xmx24g', '-jar', '/opt/cromwell.jar', 'run',
        '-t', 'cwl', '-i', input_yaml_path, 'https://raw.githubusercontent.com/tmooney/cancer-genomics-workflow/downsample_and_recall/definitions/pipelines/gathered_downsample_and_recall.cwl' #TODO get a more canonical URL once things are merged
    ]
    job_runner.launch(cmd)