Example #1
    def __call__(self, target_dir):
        # Validate the realignment directory, then submit CRAM copy and gVCF rewrite jobs
        d = Build38RealignmentDirectory(target_dir)
        validator = B38DirectoryValidator(d)
        if validator.valid_directory() or self.force:
            logger.info('Directory valid for processing')
            outdir = self.output_directory(d)
            logger.info('Output directory is {0}'.format(outdir))
            utils.force_make_dirs(outdir)

            stdout_dir = os.path.join(self.logdir, d.sample_name())
            utils.force_make_dirs(stdout_dir)

            # always submit a CRAM transfer because we use rsync
            # and it checks these things...
            copy_stdout = os.path.join(stdout_dir, 'cram_copy.log')
            cram_shortcutter = Shortcutter(d, outdir, '.cram_file_md5s.json',
                                           lambda x: x.cram_files())
            cram, crai = d.cram_files()
            new_cram = os.path.basename(cram)
            output_cram = os.path.join(outdir, new_cram)
            output_crai = output_cram + '.crai'
            if not (cram_shortcutter.can_shortcut(cram, output_cram)
                    and cram_shortcutter.can_shortcut(crai, output_crai)):
                cram_copy_cmd = RsyncCmd()
                cram_copy_cmdline = cram_copy_cmd(d.cram_file(), outdir)
                script_file = os.path.join(stdout_dir, 'cram_copy.sh')
                with open(script_file, 'w') as f:
                    f.write(cram_copy_cmdline + "\n")
                self.lsf_job_runner.launch(['/bin/bash', script_file],
                                           {'stdout': copy_stdout})

            shortcutter = Shortcutter(d, outdir, '.gvcf_file_md5s.json',
                                      lambda x: x.all_gvcf_files())
            for gvcf in d.all_gvcf_files():
                new_gzvcf = os.path.basename(gvcf)
                output_gzvcf = os.path.join(outdir, new_gzvcf)
                if not shortcutter.can_shortcut(gvcf, output_gzvcf):
                    cmd = RewriteGvcfCmd(
                        reference='/gscmnt/gc2802/halllab/ccdg_resources/genomes/human/GRCh38DH/all_sequences.fa',
                    )
                    cmdline = cmd(gvcf, output_gzvcf)
                    script_file = os.path.join(stdout_dir, new_gzvcf + '.sh')
                    with open(script_file, 'w') as f:
                        f.write(cmdline + "\n")
                    stdout = os.path.join(stdout_dir, new_gzvcf + '.log')
                    lsf_options = {
                        'stdout': stdout,
                    }
                    self.lsf_job_runner.launch(['/bin/bash', script_file],
                                               lsf_options)
            # Sync QC files
            qc_outdir = os.path.join(outdir, 'qc')
            utils.force_make_dirs(qc_outdir)
            self._qc_files(d, qc_outdir, stdout_dir)
            return outdir
        else:
            logger.warning('Invalid for processing')
            return None
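
The skip logic in this example hinges on Shortcutter, whose implementation is not shown. Below is a minimal sketch of a checksum-cache helper that could satisfy the same can_shortcut() contract; the class name Md5Shortcutter, the cache layout, and the constructor behavior are assumptions for illustration, not the project's actual code.

import hashlib
import json
import os


class Md5Shortcutter(object):
    """Hypothetical stand-in for Shortcutter: cache source-file MD5s in a
    JSON file under the output directory and report whether a job can be
    skipped because its output already exists for an unchanged input."""

    def __init__(self, directory, outdir, cache_name, file_lister):
        self.cache_path = os.path.join(outdir, cache_name)
        self.files = file_lister(directory)
        try:
            with open(self.cache_path) as f:
                self.cache = json.load(f)
        except (IOError, ValueError):
            self.cache = {}

    def _md5(self, path):
        digest = hashlib.md5()
        with open(path, 'rb') as f:
            for chunk in iter(lambda: f.read(1 << 20), b''):
                digest.update(chunk)
        return digest.hexdigest()

    def can_shortcut(self, src, dst):
        # Skip only when the output exists and the input checksum matches
        # what was recorded the last time this output was produced.
        current = self._md5(src)
        if os.path.exists(dst) and self.cache.get(src) == current:
            return True
        self.cache[src] = current
        with open(self.cache_path, 'w') as f:
            json.dump(self.cache, f)
        return False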
Example #2
    def __call__(self, target_dir):
        # Validate the realignment directory, then submit CRAM copy and gVCF rewrite jobs
        d = Build38RealignmentDirectory(target_dir)
        validator = B38DirectoryValidator(d)
        if validator.valid_directory() or self.force:
            logger.info('Directory valid for processing')
            outdir = self.output_directory(d)
            logger.info('Output directory is {0}'.format(outdir))
            utils.force_make_dirs(outdir)

            stdout_dir = os.path.join(self.logdir, d.sample_name())
            utils.force_make_dirs(stdout_dir)

            # always submit a CRAM transfer because we use rsync
            # and it checks these things...
            copy_stdout = os.path.join(stdout_dir, 'cram_copy.log')
            cram_shortcutter = Shortcutter(d, outdir, '.cram_file_md5s.json',
                                           lambda x: x.cram_files())
            cram, crai = d.cram_files()
            new_cram = os.path.basename(cram)
            output_cram = os.path.join(outdir, new_cram)
            output_crai = output_cram + '.crai'
            if not (cram_shortcutter.can_shortcut(cram, output_cram)
                    and cram_shortcutter.can_shortcut(crai, output_crai)):
                cram_copy_cmd = RsyncCmd()
                cram_copy_cmdline = cram_copy_cmd(d.cram_file(), outdir)
                self.lsf_job_runner.launch(cram_copy_cmdline,
                                           {'stdout': copy_stdout})

            shortcutter = Shortcutter(d, outdir, '.gvcf_file_md5s.json',
                                      lambda x: x.all_gvcf_files())
            for gvcf in d.all_gvcf_files():
                new_gzvcf = os.path.basename(gvcf)
                output_gzvcf = os.path.join(outdir, new_gzvcf)
                if not shortcutter.can_shortcut(gvcf, output_gzvcf):
                    cmd = RewriteGvcfCmd(
                        java='/gapp/x64linux/opt/java/jdk/jdk1.8.0_60/bin/java',
                        max_mem='3500M',
                        max_stack='3500M',
                        gatk_jar='/gscmnt/gc2802/halllab/ccdg_resources/lib/GenomeAnalysisTK-3.5-0-g36282e4.jar',
                        reference='/gscmnt/gc2802/halllab/ccdg_resources/genomes/human/GRCh38DH/all_sequences.fa',
                        break_multiple=1000000)
                    cmdline = cmd(gvcf, output_gzvcf)
                    stdout = os.path.join(stdout_dir, new_gzvcf + '.log')
                    lsf_options = {
                        'stdout': stdout,
                    }
                    self.lsf_job_runner.launch(cmdline, lsf_options)
            # Sync QC files
            qc_outdir = os.path.join(outdir, 'qc')
            utils.force_make_dirs(qc_outdir)
            self._qc_files(d, qc_outdir, stdout_dir)
            return outdir
        else:
            logger.warning('Invalid for processing')
            return None
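
Example #2 differs from Example #1 mainly in how work reaches LSF: the command line is passed straight to lsf_job_runner.launch() instead of being wrapped in a shell script first. The real LsfJob class is not shown in these examples; the following is a rough, hypothetical sketch of a launch() wrapper around bsub, with the option names and memory handling assumed rather than taken from the source.

import subprocess


class BsubJobRunner(object):
    """Hypothetical LSF runner exposing the launch() interface used above;
    the bsub flags and option names here are assumptions, not LsfJob."""

    def __init__(self, default_options=None):
        self.default_options = default_options or {}

    def launch(self, cmdline, options=None):
        opts = dict(self.default_options)
        opts.update(options or {})
        bsub = ['bsub']
        if 'stdout' in opts:
            bsub += ['-oo', opts['stdout']]
        if 'queue' in opts:
            bsub += ['-q', opts['queue']]
        if 'group' in opts:
            bsub += ['-g', opts['group']]
        if 'memory_in_gb' in opts:
            # Memory units for -M depend on the cluster's LSF configuration.
            mem = str(int(opts['memory_in_gb']) * 1000)
            bsub += ['-M', mem, '-R', 'rusage[mem={0}]'.format(mem)]
        # launch() is called with either an argv list (Example #1) or a
        # plain command string (Example #2); handle both.
        if isinstance(cmdline, list):
            bsub += cmdline
        else:
            bsub += ['/bin/bash', '-c', cmdline]
        return subprocess.check_call(bsub)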
Example #3
def call_svs(app, workorders):
    Session = open_db(app.database)
    for workorder in workorders:
        session = Session()
        for sample in session.query(ComputeWorkflowSample).filter(
                ComputeWorkflowSample.source_work_order == workorder):
            if (sample.analysis_cram_verifyed
                    and sample.analysis_sv_verified != True):
                if sample.analysis_sv_path is None:
                    sample.analysis_sv_path = os.path.join(
                        sample.analysis_cram_path, 'sv')
                directory = AnalysisDirectory(sample.analysis_gvcf_path)
                cram_file = directory.output_file_dict['*.cram'][0]
                filename = os.path.basename(cram_file)
                sample_name = filename.split('.cram')[0]

                sv_directory = AnalysisSvDirectory(sample.analysis_sv_path)
                complete = True
                if not sv_directory.staging_complete():
                    # stage directory
                    force_make_dirs(sample.analysis_sv_path)
                    force_symlink(
                        cram_file,
                        os.path.join(sample.analysis_sv_path, filename))
                    force_symlink(
                        cram_file + '.crai',
                        os.path.join(sample.analysis_sv_path,
                                     filename + '.crai'))
                os.chdir(sample.analysis_sv_path)
                if not sv_directory.cnvnator_complete():
                    # launch cnvnator
                    complete = False
                    print(
                        subprocess.check_output([
                            '/bin/bash',
                            '/gscuser/dlarson/src/internal-sv-pipeline/cnvnator_histogram.sh',
                            filename
                        ]))
                if not sv_directory.extract_complete():
                    # launch extract_sv_reads
                    complete = False
                    print(
                        subprocess.check_output([
                            '/bin/bash',
                            '/gscuser/dlarson/src/internal-sv-pipeline/extract_sv_reads.sh',
                            filename
                        ]))
                elif not sv_directory.lumpy_complete():
                    # launch lumpy
                    complete = False
                    subprocess.call([
                        '/bin/bash',
                        '/gscuser/dlarson/src/internal-sv-pipeline/lumpy.sh',
                        filename
                    ])
                elif not sv_directory.svtyper_complete():
                    complete = False
                    subprocess.call([
                        '/bin/bash',
                        '/gscuser/dlarson/src/internal-sv-pipeline/genotype.sh',
                        filename
                    ])
                sample.analysis_sv_verified = complete
                session.commit()
                if complete:
                    logger.info("{0} complete".format(sample_name))
        session.close()
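
This example leans on two idempotent filesystem helpers, force_make_dirs and force_symlink, imported from elsewhere in the module. A plausible minimal implementation is sketched below; it is an assumption about their behavior (create if missing, replace if present), not the actual source.

import errno
import os


def force_make_dirs(path):
    # Create the directory tree; ignore the error if it already exists.
    try:
        os.makedirs(path)
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise


def force_symlink(source, link_name):
    # Create a symlink, replacing whatever is already at link_name.
    try:
        os.symlink(source, link_name)
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise
        os.remove(link_name)
        os.symlink(source, link_name)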
Example #4
def oldband(app, output_dir, workorders):
    os.environ['LSF_NO_INHERIT_ENVIRONMENT'] = 'true'
    default_job_options = {
        'memory_in_gb': 10,
        'queue': app.queue,
        'docker': 'registry.gsc.wustl.edu/genome/gatk-3.5-0-g36282e4:1',
    }
    if app.job_group is not None:
        default_job_options['group'] = app.job_group
    job_runner = LsfJob(default_job_options)

    logdir = os.path.join(output_dir, 'log')

    Session = open_db(app.database)
    cmd = OldbandandRewriteGvcfCmd(
        java='/usr/bin/java',
        max_mem='8G',
        max_stack='8G',
        gatk_jar='/opt/GenomeAnalysisTK.jar',
        reference='/gscmnt/gc2802/halllab/ccdg_resources/genomes/human/GRCh38DH/all_sequences.fa',
        break_multiple=1000000)
    for wo in workorders:
        session = Session()
        for sample in session.query(ComputeWorkflowSample).filter(
                ComputeWorkflowSample.source_work_order == wo):
            if sample.analysis_cram_verifyed:
                cram_path = sample.analysis_cram_path

                sample_name = os.path.basename(cram_path)
                cram_file = os.path.join(sample.analysis_cram_path,
                                         '{}.cram'.format(sample_name))

                oldband_path = os.path.join(sample.analysis_gvcf_path,
                                            'oldbanded_gvcfs')
                force_make_dirs(oldband_path)

                stdout_dir = os.path.join(logdir, sample_name)
                # the LSF stdout files below land here, so create it up front
                force_make_dirs(stdout_dir)

                for chrom in chromosomes:
                    new_gzvcf = '{0}.{1}.g.vcf.gz'.format(sample_name, chrom)
                    output_gzvcf = os.path.join(oldband_path, new_gzvcf)
                    if not os.path.exists(output_gzvcf) or not os.path.exists(
                            output_gzvcf + '.tbi'):
                        stdout = os.path.join(stdout_dir,
                                              new_gzvcf + '.oldbanded.log')
                        cmdline = cmd(cram_file, output_gzvcf, chrom)
                        lsf_options = {
                            'stdout': stdout,
                        }
                        job_runner.launch(cmdline, lsf_options)

                # do ext
                chrom_string = ' -L '.join(ext_chromosomes)
                new_gzvcf = '{0}.extChr.g.vcf.gz'.format(sample_name)
                output_gzvcf = os.path.join(oldband_path, new_gzvcf)
                if not os.path.exists(output_gzvcf) or not os.path.exists(
                        output_gzvcf + '.tbi'):
                    script = os.path.join(oldband_path, 'oldband_extChr.sh')
                    cmdline = cmd(cram_file, output_gzvcf, chrom_string)
                    cmdline += ' && rm -f {0}'.format(script)
                    with open(script, 'w') as f:
                        f.write('#!/bin/bash\n')
                        f.write(cmdline)
                        f.write('\n')
                    stdout = os.path.join(stdout_dir,
                                          new_gzvcf + '.oldbanded.log')
                    lsf_options = {
                        'stdout': stdout,
                    }
                    job_runner.launch('/bin/bash {0}'.format(script),
                                      lsf_options)
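
The extended-chromosome block relies on a small string trick: joining the contig names with ' -L ' yields one -L interval flag per contig once the command template prepends its own -L. A toy illustration follows; the template string and contig names are made up for the example and are not the real OldbandandRewriteGvcfCmd.

# Hypothetical template: assume the command object interpolates its region
# argument after a single "-L", so a pre-joined string fans out into one
# -L flag per contig.
ext_chromosomes = ['chrX', 'chrY', 'chrM']  # illustrative values only
chrom_string = ' -L '.join(ext_chromosomes)
template = 'java -jar GenomeAnalysisTK.jar -T HaplotypeCaller -L {region} ...'
print(template.format(region=chrom_string))
# java -jar GenomeAnalysisTK.jar -T HaplotypeCaller -L chrX -L chrY -L chrM ...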
Example #5
    def __init__(self, dest_dir, job_runner, force=False):
        self.dest_dir = dest_dir
        self.logdir = os.path.join(self.dest_dir, 'log')
        self.lsf_job_runner = job_runner
        self.force = force
        utils.force_make_dirs(self.logdir)
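
Example #5 is the constructor of the callable shown in Examples #1 and #2: it records the destination directory, a job runner, and a force flag, and creates the log directory up front. A hedged usage sketch follows; the class name GvcfSyncCommand and the option values are invented for illustration and do not appear in the examples above.

# Hypothetical wiring of the pieces above; GvcfSyncCommand and the option
# values are illustrative, not identifiers from these examples.
job_runner = LsfJob({'queue': 'long', 'memory_in_gb': 10})
syncer = GvcfSyncCommand('/dest/build38_gvcfs', job_runner, force=False)
outdir = syncer('/path/to/realigned/sample_dir')
if outdir is None:
    logger.warning('sample directory was not valid for processing')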