Example #1
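These examples are excerpts from a single module. A minimal set of the standard-library imports and logger they assume is given below; the project-specific names (open_db, B38Preprocessor, LsfJob, ComputeWorkflowSample, QcTable, the directory wrappers, and the force_* helpers) come from the surrounding package, and a few are sketched after the relevant examples.

import csv
import logging
import os
import subprocess
import sys

logger = logging.getLogger(__name__)
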
def ingest(app, csv_file, output_dir, force=False):
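    """Read the work-order CSV, preprocess each sample's source directory,
    and record the results in the tracking database."""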
    Session = open_db(app.database)

    default_job_options = {
        'memory_in_gb': 5,
        'queue': app.queue,
        'docker': app.docker,
    }
    if app.job_group is not None:
        default_job_options['group'] = app.job_group

    preprocessor = B38Preprocessor(output_dir,
                                   job_runner=LsfJob(default_job_options),
                                   force=force)

    columns = {
        'Compute Workflow Execution': 'compute_workflow_execution',
        'Work Order': 'work_order',
        'DNA': 'ingest_sample_name',
        'WOI': 'woi',
        'Working Directory': 'source_directory'
    }
    seen = set()
    with open(csv_file) as f:
        reader = csv.DictReader(f)  # comma is the default delimiter
        for row in reader:
            # Rename the CSV headers to their database field names.
            output_json = {
                field: row[header]
                for header, field in columns.items()
            }
            seen_key = (output_json['source_directory'],
                        output_json['ingest_sample_name'],
                        output_json['work_order'])
            if seen_key in seen:
                logger.info(
                    'Duplicate row with identical source directory, sample '
                    'name and work order. Skipping...')
                continue
            seen.add(seen_key)

            outdir = preprocessor(output_json['source_directory'])
            # The preprocessor returns the output directory on success and
            # None on failure, so outdir doubles as both analysis paths.
            is_valid = outdir is not None
            session = Session()
            session.add(
                ComputeWorkflowSample(
                    source_work_order=output_json['work_order'],
                    ingest_sample_name=output_json['ingest_sample_name'],
                    source_directory=output_json['source_directory'],
                    woi=output_json['woi'],
                    valid_source_directory=is_valid,
                    analysis_cram_path=outdir,
                    analysis_gvcf_path=outdir))
            session.commit()
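
Every example obtains its session factory from open_db, which is defined elsewhere. A minimal sketch, assuming SQLAlchemy over a SQLite file (the real helper may differ):

from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker


def open_db(database):
    # Hypothetical sketch: connect to the tracking database and return a
    # session factory, matching the Session = open_db(...) usage above.
    engine = create_engine('sqlite:///{0}'.format(database))
    return sessionmaker(bind=engine)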
Example #2
def generate(app, workorders):
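    """Collect completed qc directories for the given work orders and
    write a combined qc table to stdout."""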
    Session = open_db(app.database)
    table = QcTable()
    for wo in workorders:
        session = Session()
        for sample in session.query(ComputeWorkflowSample).filter(
                ComputeWorkflowSample.source_work_order == wo):
            # n.b. 'verifyed' is the column's spelling in the model
            if sample.analysis_cram_verifyed:
                qc_dir = QcDirectory(
                    os.path.join(sample.analysis_gvcf_path, 'qc'))
                if qc_dir.is_complete:
                    logger.info('Adding qc for {0}'.format(
                        sample.analysis_gvcf_path))
                    table.add(qc_dir.sample_name(), qc_dir,
                              sample.ingest_sample_name)
    table.write(sys.stdout)
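
QcDirectory is also defined outside this listing. A rough sketch of the interface generate() relies on; the completeness check and naming convention here are assumptions, not the real implementation:

import glob
import os


class QcDirectory(object):
    def __init__(self, path):
        self.path = path

    @property
    def is_complete(self):
        # Assumption: complete once any qc metric files have been synced in;
        # the real completeness rule is project-specific.
        return bool(glob.glob(os.path.join(self.path, '*')))

    def sample_name(self):
        # Assumption: qc directories live at <analysis_dir>/qc, so the
        # sample name is the parent directory's basename.
        return os.path.basename(os.path.dirname(self.path))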
Example #3
def check_analysis_dir(app):
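    """Re-verify every sample's analysis and qc directories, updating the
    verification flags in the database."""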
    Session = open_db(app.database)
    session = Session()
    for sample in session.query(ComputeWorkflowSample):
        if (sample.analysis_cram_path is None
                or sample.analysis_gvcf_path is None):
            logger.warning(
                'No analysis directory in database for {0}. Is the source directory invalid?'
                .format(sample.source_directory))
            continue
        if (not sample.analysis_cram_verifyed
                or not sample.analysis_gvcfs_verified):
            directory = AnalysisDirectory(sample.analysis_gvcf_path)
            is_complete = directory.is_complete
            sample.analysis_gvcfs_verified = is_complete
            sample.analysis_cram_verifyed = is_complete
            if not is_complete:
                # Log the source directory so processing can be re-attempted
                logger.warning(
                    '{0} is incomplete; examine it and re-attempt pre-processing of the gvcfs.'
                    .format(sample.source_directory))
            else:
                qc_directory = QcDirectory(
                    os.path.join(sample.analysis_gvcf_path, 'qc'))
                qc_synced = qc_directory.is_complete
                if not qc_synced:
                    logger.warning(
                        '{0} has a missing or incomplete qc directory. Attempt to resync.'
                        .format(sample.source_directory))
                else:
                    logger.info('{0} complete.'.format(
                        sample.source_directory))

    session.commit()
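
Both this example and Example #4 below wrap analysis directories in AnalysisDirectory, which is not shown either. A plausible sketch, with the expected file patterns purely assumed:

import glob
import os


class AnalysisDirectory(object):
    # Assumed output patterns; the real list is project-specific.
    expected_patterns = ('*.cram', '*.cram.crai', '*.g.vcf.gz')

    def __init__(self, path):
        self.path = path
        # Map each pattern to its matching files, mirroring the
        # output_file_dict['*.cram'][0] lookup in Example #4.
        self.output_file_dict = {
            pattern: sorted(glob.glob(os.path.join(path, pattern)))
            for pattern in self.expected_patterns
        }

    @property
    def is_complete(self):
        # Complete only if every expected pattern matched at least one file.
        return all(self.output_file_dict[p] for p in self.expected_patterns)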
Example #4
def call_svs(app, workorders):
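    """Drive the SV-calling stages (cnvnator, extract_sv_reads, lumpy,
    svtyper) for each verified sample, advancing the sequential stages
    one step per invocation."""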
    Session = open_db(app.database)
    for workorder in workorders:
        session = Session()
        for sample in session.query(ComputeWorkflowSample).filter(
                ComputeWorkflowSample.source_work_order == workorder):
            if (sample.analysis_cram_verifyed
                    and not sample.analysis_sv_verified):
                if sample.analysis_sv_path is None:
                    sample.analysis_sv_path = os.path.join(
                        sample.analysis_cram_path, 'sv')
                directory = AnalysisDirectory(sample.analysis_gvcf_path)
                cram_file = directory.output_file_dict['*.cram'][0]
                filename = os.path.basename(cram_file)
                sample_name = filename.split('.cram')[0]

                sv_directory = AnalysisSvDirectory(sample.analysis_sv_path)
                complete = True
                if not sv_directory.staging_complete():
                    # stage directory
                    force_make_dirs(sample.analysis_sv_path)
                    force_symlink(
                        cram_file,
                        os.path.join(sample.analysis_sv_path, filename))
                    force_symlink(
                        cram_file + '.crai',
                        os.path.join(sample.analysis_sv_path,
                                     filename + '.crai'))
                # The stage scripts receive the bare cram filename, so run
                # them from inside the SV directory.
                os.chdir(sample.analysis_sv_path)
                if not sv_directory.cnvnator_complete():
                    # launch cnvnator
                    complete = False
                    print(
                        subprocess.check_output([
                            '/bin/bash',
                            '/gscuser/dlarson/src/internal-sv-pipeline/cnvnator_histogram.sh',
                            filename
                        ]))
                if not sv_directory.extract_complete():
                    # launch extract_sv_reads; lumpy and svtyper wait for it
                    # via the elif chain below
                    complete = False
                    print(
                        subprocess.check_output([
                            '/bin/bash',
                            '/gscuser/dlarson/src/internal-sv-pipeline/extract_sv_reads.sh',
                            filename
                        ]))
                elif not sv_directory.lumpy_complete():
                    # launch lumpy once read extraction has finished
                    complete = False
                    subprocess.call([
                        '/bin/bash',
                        '/gscuser/dlarson/src/internal-sv-pipeline/lumpy.sh',
                        filename
                    ])
                elif not sv_directory.svtyper_complete():
                    # launch svtyper once lumpy has finished
                    complete = False
                    subprocess.call([
                        '/bin/bash',
                        '/gscuser/dlarson/src/internal-sv-pipeline/genotype.sh',
                        filename
                    ])
                sample.analysis_sv_verified = complete
                session.commit()
                if complete:
                    logger.info("{0} complete".format(sample_name))
        session.close()
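
call_svs() leans on two small filesystem helpers, force_make_dirs and force_symlink, defined elsewhere in the project. A plausible sketch of each:

import errno
import os


def force_make_dirs(path):
    # Hypothetical sketch: os.makedirs, but tolerate an existing directory.
    try:
        os.makedirs(path)
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise


def force_symlink(source, link_name):
    # Hypothetical sketch: replace any existing link so staging is rerunnable.
    try:
        os.symlink(source, link_name)
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise
        os.remove(link_name)
        os.symlink(source, link_name)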
Example #5
def oldband(app, output_dir, workorders):
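    """Submit LSF jobs that oldband and rewrite each verified sample's
    gvcfs: one job per chromosome plus one batched job for the extra
    contigs."""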
    os.environ['LSF_NO_INHERIT_ENVIRONMENT'] = 'true'
    default_job_options = {
        'memory_in_gb': 10,
        'queue': app.queue,
        'docker': 'registry.gsc.wustl.edu/genome/gatk-3.5-0-g36282e4:1',
    }
    if app.job_group is not None:
        default_job_options['group'] = app.job_group
    job_runner = LsfJob(default_job_options)

    logdir = os.path.join(output_dir, 'log')

    Session = open_db(app.database)
    cmd = OldbandandRewriteGvcfCmd(
        java='/usr/bin/java',
        max_mem='8G',
        max_stack='8G',
        gatk_jar='/opt/GenomeAnalysisTK.jar',
        reference=('/gscmnt/gc2802/halllab/ccdg_resources/genomes/human/'
                   'GRCh38DH/all_sequences.fa'),
        break_multiple=1000000)
    for wo in workorders:
        session = Session()
        for sample in session.query(ComputeWorkflowSample).filter(
                ComputeWorkflowSample.source_work_order == wo):
            if (sample.analysis_cram_verifyed):
                cram_path = sample.analysis_cram_path

                sample_name = os.path.basename(cram_path)
                cram_file = os.path.join(sample.analysis_cram_path,
                                         '{}.cram'.format(sample_name))

                oldband_path = os.path.join(sample.analysis_gvcf_path,
                                            'oldbanded_gvcfs')
                force_make_dirs(oldband_path)

                stdout_dir = os.path.join(logdir, sample_name)
                force_make_dirs(stdout_dir)  # ensure the log dir exists

                for chrom in chromosomes:
                    new_gzvcf = '{0}.{1}.g.vcf.gz'.format(sample_name, chrom)
                    output_gzvcf = os.path.join(oldband_path, new_gzvcf)
                    if not os.path.exists(output_gzvcf) or not os.path.exists(
                            output_gzvcf + '.tbi'):
                        stdout = os.path.join(stdout_dir,
                                              new_gzvcf + '.oldbanded.log')
                        cmdline = cmd(cram_file, output_gzvcf, chrom)
                        lsf_options = {
                            'stdout': stdout,
                        }
                        job_runner.launch(cmdline, lsf_options)

                # batch the extra (non-primary) contigs into one job
                chrom_string = ' -L '.join(ext_chromosomes)
                new_gzvcf = '{0}.extChr.g.vcf.gz'.format(sample_name)
                output_gzvcf = os.path.join(oldband_path, new_gzvcf)
                if not os.path.exists(output_gzvcf) or not os.path.exists(
                        output_gzvcf + '.tbi'):
                    script = os.path.join(oldband_path, 'oldband_extChr.sh')
                    cmdline = cmd(cram_file, output_gzvcf, chrom_string)
                    # the script removes itself once the command succeeds
                    cmdline += ' && rm -f {0}'.format(script)
                    with open(script, 'w') as f:
                        f.write('#!/bin/bash\n')
                        f.write(cmdline)
                        f.write('\n')
                    stdout = os.path.join(stdout_dir,
                                          new_gzvcf + '.oldbanded.log')
                    lsf_options = {
                        'stdout': stdout,
                    }
                    job_runner.launch('/bin/bash {0}'.format(script),
                                      lsf_options)
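
oldband() iterates over module-level chromosomes and ext_chromosomes lists that are not part of this listing. For GRCh38 they would plausibly look like:

# Hypothetical sketch: primary contigs get one oldbanding job apiece, and
# the remaining ("extChr") contigs are batched into a single job.
chromosomes = ['chr{0}'.format(i) for i in range(1, 23)] + ['chrX', 'chrY']
ext_chromosomes = ['chrM', 'chrEBV']  # plus alt/decoy contigs, presumably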