Ejemplo n.º 1
0
def spread(in_fastqs,
           out_fastqs,
           tech='docker',
           input_parameters=None,
           remove_infiles=False):
    """Write and launch a script that spreads input FASTQs over output FASTQs.

    The generated bash script runs ``concat.py -spread -bgzip`` through the
    container command built for ``input_parameters['somaticseq_image']``. It
    is saved as ``<output_directory>/logs/<script>`` and then handed to the
    command named by ``input_parameters['action']``.

    Args:
        in_fastqs: Iterable of input FASTQ paths (host side).
        out_fastqs: Iterable of output FASTQ paths (host side).
        tech: Container technology, forwarded to
            ``container.container_params``.
        input_parameters: Settings dict. Keys missing from it are filled in
            from ``DEFAULT_PARAMS``; a dict supplied by the caller is updated
            in place. ``None`` starts from an empty dict.
        remove_infiles: When true, an ``rm`` of the input files is appended
            to the script after the container command.

    Returns:
        Path of the generated script.
    """
    # A literal ``{}`` default is created once at definition time and would be
    # filled in by the loop below, so every later call would share that dict.
    if input_parameters is None:
        input_parameters = {}

    for param_i in DEFAULT_PARAMS:
        if param_i not in input_parameters:
            input_parameters[param_i] = DEFAULT_PARAMS[param_i]

    # Both collections are walked more than once below; materialise them so
    # one-shot iterables (generators) are not silently exhausted.
    in_fastqs = list(in_fastqs)
    out_fastqs = list(out_fastqs)

    logdir = os.path.join(input_parameters['output_directory'], 'logs')
    outfile = os.path.join(logdir, input_parameters['script'])

    all_paths = in_fastqs + out_fastqs
    spread_line, fileDict = container.container_params(
        input_parameters['somaticseq_image'],
        tech=tech,
        files=all_paths,
        extra_args=input_parameters['extra_docker_options'])

    # Paths as they are seen from inside the container.
    infastq_string = ' '.join(
        [fileDict[file_i]['mount_path'] for file_i in in_fastqs])
    outfastq_string = ' '.join(
        [fileDict[file_i]['mount_path'] for file_i in out_fastqs])

    with open(outfile, 'w') as out:

        out.write("#!/bin/bash\n\n")

        out.write(f'#$ -o {logdir}\n')
        out.write(f'#$ -e {logdir}\n')
        out.write('#$ -S /bin/bash\n')
        out.write('#$ -l h_vmem={}G\n'.format(input_parameters['MEM']))
        out.write('set -e\n\n')

        out.write(
            'echo -e "Start at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2\n\n'
        )  # Do not change this: picard_fractional uses this to end the copying.

        out.write(f'{spread_line} \\\n')
        out.write(
            'concat.py -spread -bgzip -nt {} -infiles {} -outfiles {} \n'.
            format(input_parameters['threads'], infastq_string,
                   outfastq_string))

        if remove_infiles:
            # Host-side paths: this line runs outside the container.
            out.write('rm {}\n\n'.format(' '.join(in_fastqs)))

        out.write('\necho -e "Done at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2\n')

    # "Run" the script that was generated
    command_line = '{} {}'.format(input_parameters['action'], outfile)
    subprocess.call(command_line, shell=True)

    return outfile
Ejemplo n.º 2
0
def gz(infiles,
       outfq,
       tech='docker',
       input_parameters=DEFAULT_PARAMS,
       remove_infiles=False):
    """Generate and launch a script that re-compresses files with bgzip.

    The script pipes ``zcat`` of all ``infiles`` into ``bgzip`` inside the
    container built for ``input_parameters['tabix_image']`` and writes the
    result to ``outfq``. It is stored as ``<output_directory>/logs/<script>``
    and then passed to the executable named by ``input_parameters['action']``.

    Args:
        infiles: Iterable of input file paths (host side).
        outfq: Output file path (host side).
        tech: Container technology, forwarded to
            ``container.container_params``.
        input_parameters: Settings dict; keys it lacks are copied in from
            ``DEFAULT_PARAMS``.
        remove_infiles: When true, an ``rm`` of the inputs is appended to the
            script.

    Returns:
        Path of the generated script.
    """
    # Back-fill every setting the caller did not provide.
    for key in DEFAULT_PARAMS:
        input_parameters.setdefault(key, DEFAULT_PARAMS[key])

    log_directory = os.path.join(input_parameters['output_directory'], 'logs')
    script_path = os.path.join(log_directory, input_parameters['script'])

    # Every input plus the single output has to be visible to the container.
    tabix_command, mounts = container.container_params(
        input_parameters['tabix_image'],
        tech=tech,
        files=[*infiles, outfq],
        extra_args=input_parameters['extra_docker_options'])

    container_output = mounts[outfq]['mount_path']
    container_inputs = ' '.join(
        mounts[path_i]['mount_path'] for path_i in infiles)

    with open(script_path, 'w') as script:

        # Script preamble; the "#$" lines look like scheduler directives.
        script.write('#!/bin/bash\n\n')
        script.writelines([
            f'#$ -o {log_directory}\n',
            f'#$ -e {log_directory}\n',
            '#$ -S /bin/bash\n',
            f"#$ -l h_vmem={input_parameters['MEM']}G\n",
            'set -e\n\n',
        ])

        # Do not change this: picard_fractional uses this to end the copying.
        script.write(
            'echo -e "Start at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2\n\n')

        thread_count = input_parameters['threads']
        script.write(f'{tabix_command} bash -c \\\n')
        script.write(
            f'"zcat {container_inputs} | bgzip -@{thread_count} > {container_output}"\n'
        )

        if remove_infiles:
            # Host-side paths: this line runs outside the container.
            script.write('rm ' + ' '.join(infiles) + '\n\n')

        script.write(
            '\necho -e "Done at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2\n')

    # "Run" the script that was generated (argv form, no shell involved).
    subprocess.call((input_parameters['action'], script_path))

    return script_path
Ejemplo n.º 3
0
def tumor_normal(input_parameters, tech='docker'):
    """Write and launch a paired tumor-normal VarDict calling script.

    The generated bash script optionally splits the region BED with
    ``split_mergedBed.py``, runs VarDict on the tumor/normal BAM pair, and
    converts the output with ``testsomatic.R`` and ``var2vcf_paired.pl``. It
    is saved as ``<output_directory>/logs/<script>`` and then passed to the
    command named by ``input_parameters['action']``.

    Args:
        input_parameters: Settings dict. Keys missing from it are filled in
            from ``DEFAULT_PARAMS`` (the dict is updated in place). When
            ``inclusion_region`` is empty, a whole-genome BED is derived from
            ``<genome_reference>.fai`` and written to
            ``<output_directory>/genome.bed``.
        tech: Container technology, forwarded to
            ``container.container_params``.

    Returns:
        Path of the generated script.

    Raises:
        AssertionError: If the normal BAM, tumor BAM or genome reference does
            not exist.
    """
    for param_i in DEFAULT_PARAMS:
        if param_i not in input_parameters:
            input_parameters[param_i] = DEFAULT_PARAMS[param_i]

    # The following are required:
    assert os.path.exists(input_parameters['normal_bam'])
    assert os.path.exists(input_parameters['tumor_bam'])
    assert os.path.exists(input_parameters['genome_reference'])

    logdir = os.path.join(input_parameters['output_directory'], 'logs')
    outfile = os.path.join(logdir, input_parameters['script'])

    # Only paths that were actually supplied get mounted.
    all_paths = [
        path_i for path_i in (input_parameters['normal_bam'],
                              input_parameters['tumor_bam'],
                              input_parameters['genome_reference'],
                              input_parameters['output_directory'],
                              input_parameters['inclusion_region'])
        if path_i
    ]

    container_line, fileDict = container.container_params(
        input_parameters['vardict_image'],
        tech=tech,
        files=all_paths,
        extra_args=input_parameters['extra_docker_options'])

    # Mounted paths for all the input files and output directory:
    mounted_genome_reference = fileDict[
        input_parameters['genome_reference']]['mount_path']
    mounted_tumor_bam = fileDict[input_parameters['tumor_bam']]['mount_path']
    mounted_normal_bam = fileDict[input_parameters['normal_bam']]['mount_path']
    mounted_outdir = fileDict[
        input_parameters['output_directory']]['mount_path']

    minVAF = input_parameters['minimum_VAF']

    total_bases = 0
    num_lines = 0

    if input_parameters['inclusion_region']:

        bed_file = input_parameters['inclusion_region']
        # The BED as seen from inside the VarDict container.
        vardict_bed = fileDict[bed_file]['mount_path']

        with open(bed_file) as bed:
            line_i = bed.readline().rstrip()
            # Skip leading "track" header lines.
            while line_i.startswith('track'):
                line_i = bed.readline().rstrip()
            # Stops at end of file or at the first blank line.
            while line_i:
                item = line_i.split('\t')
                total_bases += int(item[2]) - int(item[1])
                num_lines += 1
                line_i = bed.readline().rstrip()

    else:

        fai_file = input_parameters['genome_reference'] + '.fai'
        bed_file = os.path.join(input_parameters['output_directory'],
                                'genome.bed')
        # genome.bed lives in the output directory, which is mounted for the
        # VarDict container, so address it through that mount.
        vardict_bed = '{}/genome.bed'.format(mounted_outdir)

        with open(fai_file) as fai, open(bed_file, 'w') as wgs_bed:
            for line_i in fai:

                item = line_i.split('\t')

                total_bases += int(item[1])
                num_lines += 1

                wgs_bed.write('{}\t{}\t{}\n'.format(item[0], '0', item[1]))

    # However the "bed_file" is defined here, create a dockered line and mount dictionary for it:
    bed_split_line, bedDict = container.container_params(
        'lethalfang/somaticseq:{}'.format(VERSION), tech,
        (bed_file, input_parameters['output_directory']))

    mounted_bed = bedDict[bed_file]['mount_path']

    # An empty BED has no average region size; guarding on num_lines avoids a
    # ZeroDivisionError and simply leaves such a BED unsplit.
    needs_split = bool(input_parameters['process_bed']) or (
        num_lines > 0 and total_bases / num_lines > 50000)

    with open(outfile, 'w') as out:

        out.write("#!/bin/bash\n\n")

        out.write(f'#$ -o {logdir}\n')
        out.write(f'#$ -e {logdir}\n')
        out.write('#$ -S /bin/bash\n')
        out.write('#$ -l h_vmem={}\n'.format(input_parameters['MEM']))
        out.write('set -e\n\n')

        out.write('echo -e "Start at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2\n\n')

        # Decide if Bed file needs to be "split" such that each line has a small enough region
        if needs_split:
            out.write(f'{bed_split_line} \\\n')
            out.write('/opt/somaticseq/utilities/split_mergedBed.py \\\n')
            out.write('-infile {} -outfile {}/split_regions.bed\n\n'.format(
                mounted_bed,
                bedDict[input_parameters['output_directory']]['mount_path']))

            vardict_bed = '{}/split_regions.bed'.format(mounted_outdir)

        out.write(f'{container_line} bash -c \\\n')
        out.write('"/opt/VarDict-1.7.0/bin/VarDict \\\n')

        if input_parameters['vardict_arguments']:
            out.write('{} \\\n'.format(input_parameters['vardict_arguments']))

        out.write('-G {} \\\n'.format(mounted_genome_reference))
        out.write('-f {} -h \\\n'.format(minVAF))
        out.write('-b \'{}|{}\' \\\n'.format(mounted_tumor_bam,
                                             mounted_normal_bam))
        # Always a container-side path: VarDict runs inside the container and
        # cannot resolve the host-side location of the BED.
        out.write('-Q 1 -c 1 -S 2 -E 3 -g 4 {} \\\n'.format(vardict_bed))
        out.write('> {}/vardict.var"\n\n'.format(mounted_outdir))

        out.write('\n')

        out.write(f'{container_line} \\\n')
        out.write(
            'bash -c "cat {}/vardict.var | awk \'NR!=1\' | /opt/VarDict/testsomatic.R | /opt/VarDict/var2vcf_paired.pl -N \'TUMOR|NORMAL\' -f {} \\\n'
            .format(mounted_outdir, minVAF))
        out.write('> {}/{}"\n\n'.format(mounted_outdir,
                                        input_parameters['outfile']))

        out.write('\necho -e "Done at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2\n')

    # "Run" the script that was generated
    command_line = '{} {}'.format(input_parameters['action'], outfile)
    subprocess.call(command_line, shell=True)

    return outfile
Ejemplo n.º 4
0
def tumor_normal(input_parameters=DEFAULT_PARAMS, tech='docker'):
    """Write and launch a paired tumor-normal ``lofreq somatic`` script.

    The generated bash script is saved as ``<output_directory>/logs/<script>``
    and then passed to the command named by ``input_parameters['action']``.

    Args:
        input_parameters: Settings dict. Keys missing from it are filled in
            from ``DEFAULT_PARAMS`` (the dict is updated in place).
            ``inclusion_region`` may be empty, in which case no ``-l`` region
            file is passed to LoFreq.
        tech: Container technology, forwarded to
            ``container.container_params``.

    Returns:
        Path of the generated script.

    Raises:
        AssertionError: If the tumor BAM, normal BAM, genome reference, dbSNP
            file or its ``.tbi`` index does not exist.
    """
    for param_i in DEFAULT_PARAMS:
        if param_i not in input_parameters:
            input_parameters[param_i] = DEFAULT_PARAMS[param_i]

    # The following are required:
    assert os.path.exists(input_parameters['tumor_bam'])
    assert os.path.exists(input_parameters['normal_bam'])
    assert os.path.exists(input_parameters['genome_reference'])
    assert os.path.exists(input_parameters['dbsnp_gz'])
    assert os.path.exists(input_parameters['dbsnp_gz'] + '.tbi')

    logdir = os.path.join(input_parameters['output_directory'], 'logs')
    outfile = os.path.join(logdir, input_parameters['script'])

    inclusion_region = input_parameters['inclusion_region']

    # Only paths that were actually supplied get mounted.
    all_paths = [
        path_i for path_i in (input_parameters['tumor_bam'],
                              input_parameters['normal_bam'],
                              input_parameters['genome_reference'],
                              input_parameters['output_directory'],
                              inclusion_region,
                              input_parameters['dbsnp_gz'])
        if path_i
    ]

    container_line, fileDict = container.container_params(
        input_parameters['lofreq_image'],
        tech=tech,
        files=all_paths,
        extra_args=input_parameters['extra_docker_options'])

    # Mounted paths for all the input files and output directory:
    mounted_genome_reference = fileDict[
        input_parameters['genome_reference']]['mount_path']
    mounted_tumor_bam = fileDict[input_parameters['tumor_bam']]['mount_path']
    mounted_normal_bam = fileDict[input_parameters['normal_bam']]['mount_path']
    mounted_outdir = fileDict[
        input_parameters['output_directory']]['mount_path']
    mounted_dbsnp_gz = fileDict[input_parameters['dbsnp_gz']]['mount_path']

    # An empty inclusion region is never mounted (see the filter above), so
    # looking it up unconditionally would raise KeyError.
    if inclusion_region:
        mounted_inclusion = fileDict[inclusion_region]['mount_path']

    with open(outfile, 'w') as out:

        out.write("#!/bin/bash\n\n")

        out.write(f'#$ -o {logdir}\n')
        out.write(f'#$ -e {logdir}\n')
        out.write('#$ -S /bin/bash\n')
        out.write('#$ -l h_vmem={}\n'.format(input_parameters['MEM']))
        out.write('set -e\n\n')

        out.write('echo -e "Start at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2\n\n')

        out.write(f'{container_line} \\\n')
        out.write('lofreq somatic \\\n')
        out.write('-t {} \\\n'.format(mounted_tumor_bam))
        out.write('-n {} \\\n'.format(mounted_normal_bam))
        out.write('--call-indels \\\n')

        if inclusion_region:
            out.write('-l {} \\\n'.format(mounted_inclusion))

        out.write('-f {} \\\n'.format(mounted_genome_reference))
        out.write('-o {}/{} \\\n'.format(mounted_outdir,
                                         input_parameters['out_prefix']))

        if input_parameters['lofreq_arguments']:
            out.write('{} \\\n'.format(input_parameters['lofreq_arguments']))

        out.write('-d {}\n'.format(mounted_dbsnp_gz))

        out.write('\necho -e "Done at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2\n')

    # "Run" the script that was generated
    command_line = '{} {}'.format(input_parameters['action'], outfile)
    subprocess.call(command_line, shell=True)

    return outfile
Ejemplo n.º 5
0
def tumor_normal(input_parameters=DEFAULT_PARAMS, tech='docker'):
    """Write and launch a paired tumor-normal MuSE (``call`` + ``sump``) script.

    The generated bash script is saved as ``<output_directory>/logs/<script>``
    and then passed to the command named by ``input_parameters['action']``.

    Args:
        input_parameters: Settings dict. Keys missing from it are filled in
            from ``DEFAULT_PARAMS`` (the dict is updated in place). When
            ``inclusion_region`` is set, its first three columns are copied
            to ``<output_directory>/bed_3columns.bed`` and passed to
            ``MuSE call -l``; when empty, no region file is passed.
        tech: Container technology, forwarded to
            ``container.container_params``.

    Returns:
        Path of the generated script.

    Raises:
        AssertionError: If the normal BAM, tumor BAM, genome reference, dbSNP
            file or its ``.tbi`` index does not exist.
    """
    for param_i in DEFAULT_PARAMS:
        if param_i not in input_parameters:
            input_parameters[param_i] = DEFAULT_PARAMS[param_i]

    # The following are required:
    assert os.path.exists(input_parameters['normal_bam'])
    assert os.path.exists(input_parameters['tumor_bam'])
    assert os.path.exists(input_parameters['genome_reference'])
    assert os.path.exists(input_parameters['dbsnp_gz'])
    assert os.path.exists(input_parameters['dbsnp_gz'] + '.tbi')

    logdir = os.path.join(input_parameters['output_directory'], 'logs')
    outfile = os.path.join(logdir, input_parameters['script'])

    inclusion_region = input_parameters['inclusion_region']

    # Only paths that were actually supplied get mounted.
    all_paths = [
        path_i for path_i in (input_parameters['normal_bam'],
                              input_parameters['tumor_bam'],
                              input_parameters['genome_reference'],
                              input_parameters['output_directory'],
                              inclusion_region,
                              input_parameters['dbsnp_gz'])
        if path_i
    ]

    container_line, fileDict = container.container_params(
        input_parameters['muse_image'],
        tech=tech,
        files=all_paths,
        extra_args=input_parameters['extra_docker_options'])

    # Mounted paths for all the input files and output directory:
    mounted_genome_reference = fileDict[
        input_parameters['genome_reference']]['mount_path']
    mounted_tumor_bam = fileDict[input_parameters['tumor_bam']]['mount_path']
    mounted_normal_bam = fileDict[input_parameters['normal_bam']]['mount_path']
    mounted_outdir = fileDict[
        input_parameters['output_directory']]['mount_path']
    mounted_dbsnp_gz = fileDict[input_parameters['dbsnp_gz']]['mount_path']

    with open(outfile, 'w') as out:

        out.write("#!/bin/bash\n\n")

        out.write(f'#$ -o {logdir}\n')
        out.write(f'#$ -e {logdir}\n')
        out.write('#$ -S /bin/bash\n')
        out.write('#$ -l h_vmem={}\n'.format(input_parameters['MEM']))
        out.write('set -e\n\n')

        out.write('echo -e "Start at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2\n\n')

        if inclusion_region:
            # Runs on the host (not in the container), hence host-side paths.
            out.write(
                'cat {} | awk -F "\\t" \'{{print $1 "\\t" $2 "\\t" $3}}\' > {}/bed_3columns.bed\n\n'
                .format(inclusion_region,
                        input_parameters['output_directory']))

        out.write(f'{container_line} \\\n')
        out.write('MuSEv1.0rc_submission_c039ffa call \\\n')
        out.write('-O {}/MuSE \\\n'.format(mounted_outdir))

        if inclusion_region:
            out.write('-l {}/bed_3columns.bed \\\n'.format(mounted_outdir))

        out.write('-f {} \\\n'.format(mounted_genome_reference))
        out.write('{} \\\n'.format(mounted_tumor_bam))
        out.write('{}\n\n'.format(mounted_normal_bam))

        out.write(f'{container_line} \\\n')
        out.write('MuSEv1.0rc_submission_c039ffa sump \\\n')
        out.write('-I {}/MuSE.MuSE.txt \\\n'.format(mounted_outdir))

        if input_parameters['exome']:
            out.write('-E \\\n')
        else:
            out.write('-G \\\n')

        if input_parameters['muse_arguments']:
            # The placeholder is positional; passing the value by keyword
            # raised IndexError as soon as extra arguments were supplied.
            out.write('{} \\\n'.format(input_parameters['muse_arguments']))

        out.write('-O {}/{} \\\n'.format(mounted_outdir,
                                         input_parameters['outfile']))
        out.write('-D {}\n'.format(mounted_dbsnp_gz))

        out.write('\necho -e "Done at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2\n')

    # "Run" the script that was generated
    command_line = '{} {}'.format(input_parameters['action'], outfile)
    subprocess.call(command_line, shell=True)

    return outfile
Ejemplo n.º 6
0
def tumor_normal(input_parameters=DEFAULT_PARAMS, tech='docker'):
    """Generate and launch a paired tumor-normal Mutect2 script.

    The script first captures the ``SM`` sample names from the ``@RG`` header
    lines of both BAM files (via samtools), then runs GATK ``Mutect2``
    followed by ``FilterMutectCalls``. It is stored as
    ``<output_directory>/logs/<script>`` and passed to the executable named
    by ``input_parameters['action']``.

    Args:
        input_parameters: Settings dict; keys it lacks are copied in from
            ``DEFAULT_PARAMS``.
        tech: Container technology, forwarded to
            ``container.container_params``.

    Returns:
        Path of the generated script.
    """
    for key in DEFAULT_PARAMS:
        if key in input_parameters:
            continue
        input_parameters[key] = DEFAULT_PARAMS[key]

    # Required inputs must already exist on disk.
    for required_key in ('normal_bam', 'tumor_bam', 'genome_reference'):
        assert os.path.exists(input_parameters[required_key])

    host_normal_bam = input_parameters['normal_bam']
    host_tumor_bam = input_parameters['tumor_bam']
    host_reference = input_parameters['genome_reference']
    host_outdir = input_parameters['output_directory']
    host_region = input_parameters['inclusion_region']

    log_directory = os.path.join(host_outdir, 'logs')
    script_path = os.path.join(log_directory, input_parameters['script'])

    # Mount only what was actually provided.
    candidates = (host_normal_bam, host_tumor_bam, host_reference,
                  host_outdir, host_region)
    to_mount = [path_i for path_i in candidates if path_i]

    gatk_command, gatk_mounts = container.container_params(
        input_parameters['mutect2_image'],
        tech=tech,
        files=to_mount,
        extra_args=input_parameters['extra_docker_options'])
    tumor_samtools, tumor_mounts = container.container_params(
        'lethalfang/samtools:1.7', tech, (host_tumor_bam, ))
    normal_samtools, normal_mounts = container.container_params(
        'lethalfang/samtools:1.7', tech, (host_normal_bam, ))

    # Container-side locations used by the GATK commands.
    reference_in_gatk = gatk_mounts[host_reference]['mount_path']
    tumor_in_gatk = gatk_mounts[host_tumor_bam]['mount_path']
    normal_in_gatk = gatk_mounts[host_normal_bam]['mount_path']
    outdir_in_gatk = gatk_mounts[host_outdir]['mount_path']

    if host_region:
        region_in_gatk = gatk_mounts[host_region]['mount_path']

    def name_assignment(shell_variable, samtools_command, bam_in_container):
        """Return the shell line that stores a BAM's SM tag in a variable."""
        return (
            f"{shell_variable}=`{samtools_command} samtools view -H {bam_in_container}"
            " | egrep -w '^@RG' | grep -Po 'SM:[^\\t$]+' | sed 's/SM://'"
            " | uniq | sed -e 's/[[:space:]]*$//'`\n")

    with open(script_path, 'w') as script:

        script.write('#!/bin/bash\n\n')
        script.writelines([
            f'#$ -o {log_directory}\n',
            f'#$ -e {log_directory}\n',
            '#$ -S /bin/bash\n',
            f"#$ -l h_vmem={input_parameters['MEM']}\n",
            'set -e\n\n',
        ])

        script.write(
            'echo -e "Start at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2\n\n')

        script.write(
            name_assignment('tumor_name', tumor_samtools,
                            tumor_mounts[host_tumor_bam]['mount_path']))
        script.write(
            name_assignment('normal_name', normal_samtools,
                            normal_mounts[host_normal_bam]['mount_path']))
        script.write('\n')

        # --- Mutect2 ---
        script.write(f'{gatk_command} \\\n')
        script.write(
            f"java -Xmx{input_parameters['MEM']} -jar /gatk/gatk.jar Mutect2 \\\n"
        )
        script.write('--reference {} \\\n'.format(reference_in_gatk))

        if host_region:
            script.write(f'--intervals {region_in_gatk} \\\n')

        script.write(f'--input {tumor_in_gatk} \\\n')
        script.write(f'--input {normal_in_gatk} \\\n')

        # Plain (non-f) strings: the ${...} parts are shell variables.
        script.write('--normal-sample ${normal_name} \\\n')
        script.write('--tumor-sample ${tumor_name} \\\n')
        script.write('--native-pair-hmm-threads 1 \\\n')

        extra_call_arguments = input_parameters['mutect2_arguments']
        if extra_call_arguments:
            script.write(f'{extra_call_arguments} \\\n')

        result_name = input_parameters['outfile']
        script.write(
            f'--output {outdir_in_gatk}/unfiltered.{result_name}\n\n')

        # --- FilterMutectCalls ---
        script.write(f'{gatk_command} \\\n')
        script.write(
            f"java -Xmx{input_parameters['MEM']} -jar /gatk/gatk.jar FilterMutectCalls \\\n"
        )
        script.write(
            f'--variant {outdir_in_gatk}/unfiltered.{result_name} \\\n')

        extra_filter_arguments = input_parameters['mutect2_filter_arguments']
        if extra_filter_arguments:
            script.write(f'{extra_filter_arguments} \\\n')

        script.write(f'--output {outdir_in_gatk}/{result_name}\n')

        script.write(
            '\necho -e "Done at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2\n')

    # "Run" the script that was generated (argv form, no shell involved).
    subprocess.call((input_parameters['action'], script_path))

    return script_path
Ejemplo n.º 7
0
def tumor_normal(input_parameters=DEFAULT_PARAMS, tech='docker' ):
    """Generate and launch a paired tumor-normal VarScan2 script.

    The script creates a samtools mpileup for the normal and the tumor BAM,
    runs VarScan ``somatic``, ``processSomatic`` and ``somaticFilter``, and
    finally removes both pileup files. It is stored as
    ``<output_directory>/logs/<script>`` and passed to the command named by
    ``input_parameters['action']``.

    Args:
        input_parameters: Settings dict; keys it lacks are copied in from
            ``DEFAULT_PARAMS``.
        tech: Container technology, forwarded to
            ``container.container_params``.

    Returns:
        Path of the generated script.
    """
    for key in DEFAULT_PARAMS:
        input_parameters.setdefault(key, DEFAULT_PARAMS[key])

    # Required inputs must already exist on disk.
    for required_key in ('normal_bam', 'tumor_bam', 'genome_reference'):
        assert os.path.exists(input_parameters[required_key])

    host_normal_bam = input_parameters['normal_bam']
    host_tumor_bam = input_parameters['tumor_bam']
    host_reference = input_parameters['genome_reference']
    host_outdir = input_parameters['output_directory']
    host_region = input_parameters['inclusion_region']

    log_directory = os.path.join(host_outdir, 'logs')
    script_path = os.path.join(log_directory, input_parameters['script'])

    # Mount only what was actually provided.
    candidates = (host_normal_bam, host_tumor_bam, host_reference,
                  host_outdir, host_region)
    to_mount = [path_i for path_i in candidates if path_i]

    # One container for VarScan itself, a second one for samtools mpileup.
    varscan_command, varscan_mounts = container.container_params(
        input_parameters['varscan2_image'],
        tech=tech,
        files=to_mount,
        extra_args=input_parameters['extra_docker_options'])
    pileup_command, pileup_mounts = container.container_params(
        'lethalfang/samtools:1.7',
        tech=tech,
        files=to_mount,
        extra_args=input_parameters['extra_docker_options'])

    # Container-side paths for the VarScan container.  The BAM and reference
    # lookups are not used afterwards but are kept so a missing mount fails
    # here.
    varscan_reference = varscan_mounts[host_reference]['mount_path']
    varscan_tumor_bam = varscan_mounts[host_tumor_bam]['mount_path']
    varscan_normal_bam = varscan_mounts[host_normal_bam]['mount_path']
    varscan_outdir = varscan_mounts[host_outdir]['mount_path']

    # Container-side paths for the mpileup container.
    pileup_reference = pileup_mounts[host_reference]['mount_path']
    pileup_tumor_bam = pileup_mounts[host_tumor_bam]['mount_path']
    pileup_normal_bam = pileup_mounts[host_normal_bam]['mount_path']
    pileup_outdir = pileup_mounts[host_outdir]['mount_path']

    region_selector = ('-l {}'.format(pileup_mounts[host_region]['mount_path'])
                       if host_region else '')

    min_vaf = input_parameters['minimum_VAF']

    # Output prefix: the output file name without its trailing extension.
    output_prefix = re.sub(r'\.[a-zA-Z]+$', '', input_parameters['outfile'])

    with open(script_path, 'w') as script:

        script.write('#!/bin/bash\n\n')
        script.writelines([
            f'#$ -o {log_directory}\n',
            f'#$ -e {log_directory}\n',
            '#$ -S /bin/bash\n',
            f"#$ -l h_vmem={input_parameters['MEM']}\n",
            'set -e\n\n',
        ])

        script.write(
            'echo -e "Start at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2\n\n')

        min_mq = input_parameters['min_MQ']
        min_bq = input_parameters['min_BQ']
        pileup_extras = input_parameters['varscan_pileup_arguments']
        pileup_options = (
            f'-B -q {min_mq} -Q {min_bq} {pileup_extras} {region_selector} -f \\\n'
        )

        # Normal first, then tumor: same command, different BAM and target.
        for label, bam_in_container in (('normal', pileup_normal_bam),
                                        ('tumor', pileup_tumor_bam)):
            script.write(f'{pileup_command} bash -c \\\n')
            script.write('"samtools mpileup \\\n')
            script.write(pileup_options)
            script.write(f'{pileup_reference} \\\n')
            script.write(f'{bam_in_container} \\\n')
            script.write(f'> {pileup_outdir}/{label}.pileup"\n\n')

        java_prefix = 'java -Xmx{} -jar /VarScan2.3.7.jar'.format(
            input_parameters['MEM'])
        result_stem = f'{varscan_outdir}/{output_prefix}'

        # --- somatic ---
        script.write(f'{varscan_command} \\\n')
        script.write(f'{java_prefix} somatic \\\n')
        script.write(f'{varscan_outdir}/normal.pileup \\\n')
        script.write(f'{varscan_outdir}/tumor.pileup \\\n')
        script.write('{} {} --output-vcf 1 --min-var-freq {}\n\n'.format(
            result_stem, input_parameters['varscan_arguments'], min_vaf))

        # --- processSomatic ---
        script.write(f'{varscan_command} \\\n')
        script.write(f'{java_prefix} processSomatic \\\n')
        script.write(f'{result_stem}.snp.vcf\n\n')

        # --- somaticFilter ---
        script.write(f'{varscan_command} \\\n')
        script.write(f'{java_prefix} somaticFilter \\\n')
        script.write(f'{result_stem}.snp.Somatic.hc.vcf \\\n')
        script.write(f'-indel-file {result_stem}.indel.vcf \\\n')
        script.write(
            f'-output-file {result_stem}.snp.Somatic.hc.filter.vcf\n\n')

        # Clean-up runs on the host, hence host-side paths.
        script.write(f'rm {host_outdir}/normal.pileup\n')
        script.write(f'rm {host_outdir}/tumor.pileup\n')

        script.write(
            '\necho -e "Done at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2\n')

    # "Run" the script that was generated
    subprocess.call('{} {}'.format(input_parameters['action'], script_path),
                    shell=True)

    return script_path
Ejemplo n.º 8
0
def tumor_normal(input_parameters=DEFAULT_PARAMS, tech='docker'):
    """Generate and launch a paired tumor-normal JointSNVMix2 script.

    The script trains ``joint_snv_mix_two``, writes a VCF header to the
    output file, runs ``classify`` and appends the filtered, reformatted and
    sorted records to that file. When ``input_parameters['threads']`` is
    greater than 1 it also adds a loop that intersects the VCF with
    ``<i>/<i>.bed`` for every ``i`` from 1 to ``threads``. The script is
    stored as ``<output_directory>/logs/<script>`` and passed to the command
    named by ``input_parameters['action']``.

    Args:
        input_parameters: Settings dict; keys it lacks are copied in from
            ``DEFAULT_PARAMS``.
        tech: Container technology, forwarded to
            ``container.container_params``.

    Returns:
        Path of the generated script.
    """
    for key in DEFAULT_PARAMS:
        input_parameters.setdefault(key, DEFAULT_PARAMS[key])

    # Required inputs must already exist on disk.
    for required_key in ('normal_bam', 'tumor_bam', 'genome_reference',
                         'reference_dict'):
        assert os.path.exists(input_parameters[required_key])

    host_normal_bam = input_parameters['normal_bam']
    host_tumor_bam = input_parameters['tumor_bam']
    host_reference = input_parameters['genome_reference']
    host_outdir = input_parameters['output_directory']
    host_dict = input_parameters['reference_dict']

    log_directory = os.path.join(host_outdir, 'logs')
    script_path = os.path.join(log_directory, input_parameters['script'])

    candidates = (host_normal_bam, host_tumor_bam, host_reference,
                  host_outdir, host_dict)
    to_mount = [path_i for path_i in candidates if path_i]

    jsm_command, jsm_mounts = container.container_params(
        input_parameters['jsm2_image'],
        tech=tech,
        files=to_mount,
        extra_args=input_parameters['extra_docker_options'])

    # Container-side locations of the inputs and the output directory.
    reference_in_jsm = jsm_mounts[host_reference]['mount_path']
    tumor_in_jsm = jsm_mounts[host_tumor_bam]['mount_path']
    normal_in_jsm = jsm_mounts[host_normal_bam]['mount_path']
    outdir_in_jsm = jsm_mounts[host_outdir]['mount_path']
    dict_in_jsm = jsm_mounts[host_dict]['mount_path']

    # Reference, normal BAM, tumor BAM: the same three continuation lines
    # are needed by both the train and the classify command.
    positional_inputs = ''.join(
        f'{path_i} \\\n'
        for path_i in (reference_in_jsm, normal_in_jsm, tumor_in_jsm))

    # VCF header records; the first is written with ">" and the rest are
    # appended with ">>".
    vcf_header = (
        '##fileformat=VCFv4.1',
        '##INFO=<ID=AAAB,Number=1,Type=Float,Description="Probability of Joint Genotype AA in Normal and AB in Tumor">',
        '##INFO=<ID=AABB,Number=1,Type=Float,Description="Probability of Joint Genotype AA in Normal and BB in Tumor">',
        '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">',
        '##FORMAT=<ID=RD,Number=1,Type=Integer,Description="Depth of reference-supporting bases (reads1)">',
        '##FORMAT=<ID=AD,Number=1,Type=Integer,Description="Depth of variant-supporting bases (reads2)">',
        '#CHROM\\tPOS\\tID\\tREF\\tALT\\tQUAL\\tFILTER\\tINFO\\tFORMAT\\tNORMAL\\tTUMOR',
    )

    with open(script_path, 'w') as script:

        script.write('#!/bin/bash\n\n')
        script.writelines([
            f'#$ -o {log_directory}\n',
            f'#$ -e {log_directory}\n',
            '#$ -S /bin/bash\n',
            f"#$ -l h_vmem={input_parameters['MEM']}\n",
            'set -e\n\n',
        ])

        script.write(
            'echo -e "Start at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2\n\n')

        # --- train ---
        script.write(f'{jsm_command} \\\n')
        script.write(
            '/opt/JointSNVMix-0.7.5/build/scripts-2.7/jsm.py train joint_snv_mix_two \\\n'
        )
        script.write(
            f"--convergence_threshold {input_parameters['converge_threshold']} \\\n"
        )
        script.write(f"--skip_size {input_parameters['skip_size']} \\\n")

        extra_train_arguments = input_parameters['jsm_train_arguments']
        if extra_train_arguments:
            script.write(f'{extra_train_arguments} \\\n')

        script.write(positional_inputs)
        script.write('/opt/JointSNVMix-0.7.5/config/joint_priors.cfg \\\n')
        script.write('/opt/JointSNVMix-0.7.5/config/joint_params.cfg \\\n')
        script.write(f'{outdir_in_jsm}/jsm.parameter.cfg\n\n')

        # --- VCF header (echo runs on the host, hence the host-side path) ---
        host_vcf = '{}/{}'.format(host_outdir, input_parameters['outfile'])
        for position, header_record in enumerate(vcf_header):
            redirect = '>' if position == 0 else '>>'
            script.write(
                f"echo -e '{header_record}' {redirect} {host_vcf}\n")
        script.write('\n')

        # --- classify ---
        script.write(f'{jsm_command} bash -c \\\n')
        script.write(
            '"/opt/JointSNVMix-0.7.5/build/scripts-2.7/jsm.py classify joint_snv_mix_two \\\n'
        )

        extra_classify_arguments = input_parameters['jsm_classify_arguments']
        if extra_classify_arguments:
            script.write(f'{extra_classify_arguments} \\\n')

        script.write(positional_inputs)
        script.write(f'{outdir_in_jsm}/jsm.parameter.cfg \\\n')
        # Skip the header row, drop rows whose 4th column is "N", and keep
        # rows whose columns 10 + 11 sum to at least 0.95.
        script.write(
            '/dev/stdout | awk -F \'\\t\' \'NR!=1 && \\$4!=\\"N\\" && \\$10+\\$11>=0.95\' | \\\n'
        )
        # Reformat the remaining rows as VCF records.
        script.write(
            'awk -F \'\\t\' \'{print \\$1 \\"\\t\\" \\$2 \\"\\t.\\t\\" \\$3 \\"\\t\\" \\$4 \\"\\t.\\t.\\tAAAB=\\" \\$10 \\";AABB=\\" \\$11 \\"\\tRD:AD\\t\\" \\$5 \\":\\" \\$6 \\"\\t\\" \\$7 \\":\\" \\$8}\' \\\n'
        )
        script.write('| /opt/vcfsorter.pl {} - >> {}/{}"\n\n'.format(
            dict_in_jsm, outdir_in_jsm, input_parameters['outfile']))

        thread_count = input_parameters['threads']
        if thread_count > 1:

            bedtools_command, bedtools_mounts = container.container_params(
                'lethalfang/bedtools:2.26.0', tech, (host_outdir, ))
            outdir_in_bedtools = bedtools_mounts[host_outdir]['mount_path']

            # Shell loop over 1..threads, one bedtools intersect per index.
            script.write('\n\ni=1\n')
            script.write(f'while [[ $i -le {thread_count} ]]\n')
            script.write('do\n')
            script.write(
                '    {DOCKER_LINE} bash -c "bedtools intersect -a {OUTDIR}/{OUTVCF} -b {OUTDIR}/${{i}}/${{i}}.bed -header | uniq > {OUTDIR}/${{i}}/{OUTVCF}"\n'
                .format(DOCKER_LINE=bedtools_command,
                        OUTDIR=outdir_in_bedtools,
                        OUTVCF=input_parameters['outfile']))
            script.write('    i=$(( $i + 1 ))\n')
            script.write('done\n')

        script.write(
            '\necho -e "Done at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2\n')

    # "Run" the script that was generated
    subprocess.call('{} {}'.format(input_parameters['action'], script_path),
                    shell=True)

    return script_path
Ejemplo n.º 9
0
def tumor_normal(input_parameters=DEFAULT_PARAMS, tech='docker'):
    """Write a SomaticSniper tumor-normal job script and pass it to the action command.

    Options absent from ``input_parameters`` are filled in, in place, from
    ``DEFAULT_PARAMS``.  The script is written to
    ``<output_directory>/logs/<script>``; that directory is not created
    here.  When ``threads`` is greater than 1, the script additionally loops
    ``i`` from 1 to ``threads`` and intersects the output VCF with
    ``<i>/<i>.bed``, writing the result to ``<i>/<outfile>``.

    Args:
        input_parameters: Dict of options.  ``normal_bam``, ``tumor_bam`` and
            ``genome_reference`` must name existing paths.
        tech: Container technology, forwarded to
            ``container.container_params``.

    Returns:
        Path of the generated script.

    Raises:
        AssertionError: If one of the required input paths does not exist.
    """
    # Back-fill every option the caller did not supply.
    for key, fallback in DEFAULT_PARAMS.items():
        input_parameters.setdefault(key, fallback)

    # The following are required:
    for required_key in ('normal_bam', 'tumor_bam', 'genome_reference'):
        assert os.path.exists(input_parameters[required_key])

    host_outdir = input_parameters['output_directory']
    logdir = os.path.join(host_outdir, 'logs')
    outfile = os.path.join(logdir, input_parameters['script'])

    # Only truthy paths are handed to the container layer for mounting.
    candidate_paths = (
        input_parameters['normal_bam'],
        input_parameters['tumor_bam'],
        input_parameters['genome_reference'],
        host_outdir,
        input_parameters['inclusion_region'],
    )
    all_paths = [path_i for path_i in candidate_paths if path_i]

    container_line, fileDict = container.container_params(
        input_parameters['somaticsniper_image'],
        tech=tech,
        files=all_paths,
        extra_args=input_parameters['extra_docker_options'])

    def mount_of(key):
        # Container-side path of the host path stored under ``key``.
        return fileDict[input_parameters[key]]['mount_path']

    mounted_genome_reference = mount_of('genome_reference')
    mounted_tumor_bam = mount_of('tumor_bam')
    mounted_normal_bam = mount_of('normal_bam')
    mounted_outdir = mount_of('output_directory')

    if input_parameters['inclusion_region']:
        # Resolved here, but not referenced by the command written below.
        mounted_inclusion = mount_of('inclusion_region')

    with open(outfile, 'w') as out:

        mem = input_parameters['MEM']
        out.write("#!/bin/bash\n\n")
        out.write(f'#$ -o {logdir}\n')
        out.write(f'#$ -e {logdir}\n')
        out.write('#$ -S /bin/bash\n')
        out.write(f'#$ -l h_vmem={mem}\n')
        out.write('set -e\n\n')

        out.write('echo -e "Start at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2\n\n')

        # The SomaticSniper invocation itself, one argument group per line.
        min_mq = input_parameters['min_MQ']
        somatic_score = input_parameters['somatic_score']
        prior = input_parameters['prior']
        extra_arguments = input_parameters['somaticsniper_arguments']
        out_vcf = input_parameters['outfile']

        out.write(f'{container_line} \\\n')
        out.write('/opt/somatic-sniper/build/bin/bam-somaticsniper \\\n')
        out.write(
            f'-q {min_mq} -Q {somatic_score} -s {prior} -F vcf {extra_arguments} \\\n'
        )
        out.write(f'-f {mounted_genome_reference} \\\n')
        out.write(f'{mounted_tumor_bam} \\\n')
        out.write(f'{mounted_normal_bam} \\\n')
        out.write(f'{mounted_outdir}/{out_vcf}\n')

        n_threads = input_parameters['threads']
        if n_threads > 1:

            # Split the result per numbered sub-directory with bedtools.
            bedtool_line, outdir_i = container.container_params(
                'lethalfang/bedtools:2.26.0', tech, (host_outdir, ))
            mounted_bed_outdir = outdir_i[host_outdir]['mount_path']

            out.write('\n\ni=1\n')
            out.write(f'while [[ $i -le {n_threads} ]]\n')
            out.write('do\n')
            out.write(
                '    {DOCKER_LINE} bash -c "bedtools intersect -a {OUTDIR}/{OUTVCF} -b {OUTDIR}/${{i}}/${{i}}.bed -header | uniq > {OUTDIR}/${{i}}/{OUTVCF}"\n'
                .format(DOCKER_LINE=bedtool_line,
                        OUTDIR=mounted_bed_outdir,
                        OUTVCF=out_vcf))
            out.write('    i=$(( $i + 1 ))\n')
            out.write('done\n')

        out.write('\necho -e "Done at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2\n')

    # "Run" the script that was generated; the exit status is not inspected.
    subprocess.call((input_parameters['action'], outfile))

    return outfile
Ejemplo n.º 10
0
def picard(input_parameters, tech='docker'):
    """Write a Picard MarkDuplicatesWithMateCigar job script and pass it to the action command.

    Options absent from ``input_parameters`` are filled in, in place, from
    ``DEFAULT_PARAMS``.  The script is written to
    ``<output_directory>/logs/<script>``.  It creates a uniquely named
    temporary directory under the output directory, runs Picard inside the
    ``picard_image`` container, optionally indexes the output BAM with
    samtools (``index_bam``), and removes the temporary directory.

    Args:
        input_parameters: Dict of options (``in_bam``, ``out_bam``,
            ``output_directory``, ``MEM``, ``threads``, ...).
        tech: Container technology, forwarded to
            ``container.container_params``.

    Returns:
        Path of the generated script.
    """
    # Back-fill every option the caller did not supply.
    for key, fallback in DEFAULT_PARAMS.items():
        input_parameters.setdefault(key, fallback)

    host_outdir = input_parameters['output_directory']
    logdir = os.path.join(host_outdir, 'logs')
    outfile = os.path.join(logdir, input_parameters['script'])
    in_bam = input_parameters['in_bam']

    # Only truthy paths are handed to the container layer for mounting.
    all_paths = [path_i for path_i in (host_outdir, in_bam) if path_i]

    markdup_line, fileDict = container.container_params(
        input_parameters['picard_image'],
        tech=tech,
        files=all_paths,
        extra_args=input_parameters['extra_docker_options'])
    samtools_line, stDict = container.container_params(
        input_parameters['samtools_image'], tech, [
            host_outdir,
        ], input_parameters['extra_docker_options'])

    # Container-side paths of the output directory and the input BAM.
    mounted_outdir = fileDict[host_outdir]['mount_path']
    mounted_inbam = fileDict[in_bam]['mount_path']

    tempdir = uuid.uuid4().hex
    with open(outfile, 'w') as out:

        mem = input_parameters['MEM']
        out.write("#!/bin/bash\n\n")
        out.write(f'#$ -o {logdir}\n')
        out.write(f'#$ -e {logdir}\n')
        out.write('#$ -S /bin/bash\n')
        out.write(f'#$ -l h_vmem={mem}G\n')
        out.write('set -e\n\n')

        # Do not change this: fractional uses this to start the copying.
        out.write('echo -e "Start at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2\n\n')

        out.write(f'mkdir -p {host_outdir}/{tempdir}\n\n')

        out.write(f'{markdup_line} \\\n')
        out.write(
            f'java -Xmx{mem}G -jar /opt/picard.jar MarkDuplicatesWithMateCigar \\\n'
        )
        out.write(f'I={mounted_inbam} \\\n')

        # Metrics file name: the input file name plus ".markdup", with every
        # ".bam" / ".cram" occurrence removed.
        metrics_name = re.sub(r'\.(bam|cram)', '',
                              fileDict[in_bam]['filename'] + '.markdup')
        out.write(f'M={mounted_outdir}/{metrics_name} \\\n')
        out.write('ASSUME_SORT_ORDER=coordinate \\\n')
        out.write(f'TMP_DIR={mounted_outdir}/{tempdir} \\\n')
        out.write('MINIMUM_DISTANCE=1000 \\\n')

        out_bam = input_parameters['out_bam']
        out.write(f'O={mounted_outdir}/{out_bam}\n\n')

        if input_parameters['index_bam']:
            threads = input_parameters['threads']
            samtools_outdir = stDict[host_outdir]['mount_path']
            out.write(f'{samtools_line} \\\n')
            out.write(
                f'samtools index -@{threads} {samtools_outdir}/{out_bam}\n\n')

        # The temporary directory is removed through its host-side path.
        out.write(f'rm -r {host_outdir}/{tempdir}\n')

        # Do not change this: fractional uses this to end the copying.
        out.write('\necho -e "Done at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2\n')

    # "Run" the script that was generated; the exit status is not inspected.
    subprocess.call((input_parameters['action'], outfile))

    return outfile
Ejemplo n.º 11
0
def fractional(bed, input_parameters, tech='docker'):
    """Write a job script that de-duplicates only the reads overlapping one BED file.

    The generated script (``<bed dir>/logs/markdup_fractional.<ts>.cmd``)
    first extracts the reads covered by ``bed`` from ``in_bam`` with
    ``sambamba view -L`` into a uniquely named temporary BAM next to ``bed``,
    then runs the de-duplication commands on that temporary BAM, and finally
    removes the temporary BAM.

    The de-duplication commands are obtained by letting ``picard()`` or
    ``sambamba()`` write a throw-away script (``to_be_deleted.<ts>.cmd``) and
    copying every line from its ``echo -e "Start`` line up to, but excluding,
    its ``echo -e "Done`` line.

    Args:
        bed: Path of the BED file defining the region to process.  Its parent
            directory is used as the output directory.
        input_parameters: Dict of options; missing ones are filled in place
            from ``DEFAULT_PARAMS``.  ``software`` must be ``'picard'`` or
            ``'sambamba'``.
        tech: Container technology, forwarded to
            ``container.container_params``.

    Returns:
        A ``(script_path, deduped_bam_path)`` tuple, where the second item is
        the host path the de-duplicated BAM will be written to.

    Raises:
        ValueError: If ``input_parameters['software']`` is not supported.
    """
    for param_i in DEFAULT_PARAMS:
        if param_i not in input_parameters:
            input_parameters[param_i] = DEFAULT_PARAMS[param_i]

    # Fail fast on an unsupported tool.  Previously this fell through
    # silently and only surfaced later as a missing (or stale) template file.
    software = input_parameters['software']
    if software not in ('picard', 'sambamba'):
        raise ValueError(
            "Unsupported de-duplication software {!r}; expected 'picard' or "
            "'sambamba'".format(software))

    outdir = str(Path(bed).absolute().parent)

    logdir = os.path.join(outdir, 'logs')
    outfile = os.path.join(logdir, 'markdup_fractional.{}.cmd'.format(ts))
    os.makedirs(logdir, exist_ok=True)

    sambam_line, stDict = container.container_params(
        input_parameters['sambamba_image'], tech, [
            input_parameters['in_bam'],
            bed,
        ], input_parameters['extra_docker_options'])

    # Container-side paths of the input BAM, the BED file and its directory.
    mounted_inbam = stDict[input_parameters['in_bam']]['mount_path']
    mounted_bed = stDict[bed]['mount_path']
    mounted_outdir = stDict[bed]['mount_dir']

    # Random names so that concurrent regional jobs do not collide.
    temp_split_bam = uuid.uuid4().hex + '.bam'
    split_deduped_bam = uuid.uuid4().hex + '.bam'
    with open(outfile, 'w') as out:

        out.write("#!/bin/bash\n\n")

        out.write(f'#$ -o {logdir}\n')
        out.write(f'#$ -e {logdir}\n')
        out.write('#$ -S /bin/bash\n')
        out.write('#$ -l h_vmem={}G\n'.format(input_parameters['MEM']))
        out.write('set -e\n\n')

        out.write('echo -e "Start at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2\n\n')

        # Step 1: extract the reads of this region into a temporary BAM.
        out.write(f'{sambam_line} \\\n')
        out.write('sambamba view -L {} -t {} -f bam -o {} {}\n\n'.format(
            mounted_bed, 1, os.path.join(mounted_outdir, temp_split_bam),
            mounted_inbam))

        # Step 2: have the whole-file writer produce a template script for
        # the temporary BAM.  A shallow copy keeps the caller's dict intact.
        fractional_parameters = copy(input_parameters)
        fractional_parameters['output_directory'] = outdir
        fractional_parameters['in_bam'] = os.path.join(outdir, temp_split_bam)
        fractional_parameters['out_bam'] = split_deduped_bam
        fractional_parameters['script'] = 'to_be_deleted.{}.cmd'.format(ts)
        fractional_parameters['index_bam'] = False

        # NOTE(review): picard()/sambamba() also pass the template script to
        # ``action``; confirm that is intended when ``action`` really
        # executes or submits scripts.
        if software == 'picard':
            dedup_script = picard(fractional_parameters, tech)
        else:
            fractional_parameters['threads'] = 2
            dedup_script = sambamba(fractional_parameters, tech)

        # Step 3: copy the template's commands, from its "Start" echo line
        # (inclusive) to its "Done" echo line (exclusive).  Iterating over
        # the file ends cleanly at EOF; the previous readline() loops spun
        # forever when a marker line was missing.
        with open(dedup_script) as dedup:
            copying = False
            for line_i in dedup:
                if not copying:
                    if not line_i.startswith('echo -e "Start'):
                        continue
                    copying = True
                elif line_i.startswith('echo -e "Done'):
                    break
                out.write(line_i)

        # Step 4: the temporary split BAM is no longer needed.
        out.write('rm {}\n'.format(os.path.join(outdir, temp_split_bam)))
        out.write('\necho -e "Done at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2\n')

    # "Run" the script that was generated; the exit status is not inspected.
    command_item = (input_parameters['action'], outfile)
    subprocess.call(command_item)

    return outfile, os.path.join(outdir, split_deduped_bam)
Ejemplo n.º 12
0
def sambamba(input_parameters, tech='docker'):
    """Write a ``sambamba markdup`` job script and pass it to the action command.

    Options absent from ``input_parameters`` are filled in, in place, from
    ``DEFAULT_PARAMS``.  The script is written to
    ``<output_directory>/logs/<script>``.  It creates a uniquely named
    temporary directory under the output directory, runs
    ``sambamba markdup`` inside the ``sambamba_image`` container, and
    removes the temporary directory.

    Args:
        input_parameters: Dict of options (``in_bam``, ``out_bam``,
            ``output_directory``, ``MEM``, ``threads``, ...).
        tech: Container technology, forwarded to
            ``container.container_params``.

    Returns:
        Path of the generated script.
    """
    # Back-fill every option the caller did not supply.
    for key, fallback in DEFAULT_PARAMS.items():
        input_parameters.setdefault(key, fallback)

    host_outdir = input_parameters['output_directory']
    logdir = os.path.join(host_outdir, 'logs')
    outfile = os.path.join(logdir, input_parameters['script'])
    in_bam = input_parameters['in_bam']

    # Only truthy paths are handed to the container layer for mounting.
    all_paths = [path_i for path_i in (host_outdir, in_bam) if path_i]

    markdup_line, fileDict = container.container_params(
        input_parameters['sambamba_image'],
        tech=tech,
        files=all_paths,
        extra_args=input_parameters['extra_docker_options'])

    # Container-side paths of the output directory and the input BAM.
    mounted_outdir = fileDict[host_outdir]['mount_path']
    mounted_inbam = fileDict[in_bam]['mount_path']

    tempdir = uuid.uuid4().hex
    with open(outfile, 'w') as out:

        mem = input_parameters['MEM']
        out.write("#!/bin/bash\n\n")
        out.write(f'#$ -o {logdir}\n')
        out.write(f'#$ -e {logdir}\n')
        out.write('#$ -S /bin/bash\n')
        out.write(f'#$ -l h_vmem={mem}G\n')
        out.write('set -e\n\n')

        # Do not change this: fractional uses this to start the copying.
        out.write('echo -e "Start at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2\n\n')

        out.write(f'mkdir -p {host_outdir}/{tempdir}\n\n')

        out.write(f'{markdup_line} \\\n')
        threads = input_parameters['threads']
        mounted_tmpdir = os.path.join(mounted_outdir, tempdir)
        mounted_outbam = os.path.join(mounted_outdir,
                                      input_parameters['out_bam'])
        out.write(
            f'sambamba markdup -t {threads} --tmpdir {mounted_tmpdir} {mounted_inbam} {mounted_outbam}\n\n'
        )

        # The temporary directory is removed through its host-side path.
        out.write(f'rm -r {host_outdir}/{tempdir}\n')

        # Do not change this: fractional uses this to end the copying.
        out.write('\necho -e "Done at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2\n')

    # "Run" the script that was generated; the exit status is not inspected.
    subprocess.call((input_parameters['action'], outfile))

    return outfile
Ejemplo n.º 13
0
def tumor_normal(input_parameters, tech='docker'):
    """Write a Scalpel tumor-normal job script and pass it to the action command.

    Options absent from ``input_parameters`` are filled in, in place, from
    ``DEFAULT_PARAMS``.  The script is written to
    ``<output_directory>/logs/<script>`` and runs, inside the
    ``scalpel_image`` container:

    1. ``scalpel-discovery --somatic`` followed by ``scalpel-export
       --somatic`` into ``<output_directory>/scalpel/scalpel.vcf``;
    2. ``vcfsorter.pl`` with the reference dictionary, writing the sorted
       VCF to ``<output_directory>/<outfile>``.

    Args:
        input_parameters: Dict of options.  ``normal_bam``, ``tumor_bam``,
            ``genome_reference`` and ``reference_dict`` must name existing
            paths.  ``inclusion_region`` is looked up unconditionally for
            ``--bed``, so it is effectively required as well.
        tech: Container technology, forwarded to
            ``container.container_params``.

    Returns:
        Path of the generated script.

    Raises:
        AssertionError: If one of the required input paths does not exist.
    """
    for param_i in DEFAULT_PARAMS:
        if param_i not in input_parameters:
            input_parameters[param_i] = DEFAULT_PARAMS[param_i]

    # The following are required:
    assert os.path.exists(input_parameters['normal_bam'])
    assert os.path.exists(input_parameters['tumor_bam'])
    assert os.path.exists(input_parameters['genome_reference'])
    assert os.path.exists(input_parameters['reference_dict'])

    logdir = os.path.join(input_parameters['output_directory'], 'logs')
    outfile = os.path.join(logdir, input_parameters['script'])

    # Only truthy paths are handed to the container layer for mounting.
    path_keys = ('normal_bam', 'tumor_bam', 'genome_reference',
                 'output_directory', 'inclusion_region', 'reference_dict')
    all_paths = [
        input_parameters[key_i] for key_i in path_keys
        if input_parameters[key_i]
    ]

    container_line, fileDict = container.container_params(
        input_parameters['scalpel_image'],
        tech=tech,
        files=all_paths,
        extra_args=input_parameters['extra_docker_options'])

    # Mounted paths for all the input files and output directory:
    mounted_genome_reference = fileDict[
        input_parameters['genome_reference']]['mount_path']
    mounted_tumor_bam = fileDict[input_parameters['tumor_bam']]['mount_path']
    mounted_normal_bam = fileDict[input_parameters['normal_bam']]['mount_path']
    mounted_outdir = fileDict[
        input_parameters['output_directory']]['mount_path']
    mounted_reference_dict = fileDict[
        input_parameters['reference_dict']]['mount_path']
    # NOTE(review): presumably fileDict only has entries for the paths passed
    # in ``files``, so a missing inclusion_region fails here - confirm.
    mounted_inclusion = fileDict[
        input_parameters['inclusion_region']]['mount_path']

    with open(outfile, 'w') as out:

        out.write("#!/bin/bash\n\n")

        out.write(f'#$ -o {logdir}\n')
        out.write(f'#$ -e {logdir}\n')
        out.write('#$ -S /bin/bash\n')
        out.write('#$ -l h_vmem={}\n'.format(input_parameters['MEM']))
        out.write('set -e\n\n')

        out.write('echo -e "Start at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2\n\n')

        # Discovery and export run in one ``bash -c "..."`` so that the
        # export only happens when discovery succeeds (``&&``).
        out.write(f'{container_line} bash -c \\\n')
        out.write('"/opt/scalpel/scalpel-discovery --somatic \\\n')
        out.write('--ref {} \\\n'.format(mounted_genome_reference))
        out.write('--bed {} \\\n'.format(mounted_inclusion))
        out.write('--normal {} \\\n'.format(mounted_normal_bam))
        out.write('--tumor {} \\\n'.format(mounted_tumor_bam))
        out.write('--window 600 \\\n')

        if input_parameters['scalpel_two_pass']:
            out.write('--two-pass \\\n')

        if input_parameters['scalpel_discovery_arguments']:
            # Positional argument for the positional ``{}`` field; passing it
            # as a keyword raised IndexError.
            out.write('{} \\\n'.format(
                input_parameters['scalpel_discovery_arguments']))

        out.write('--dir {}/scalpel && \\\n'.format(mounted_outdir))
        out.write('/opt/scalpel/scalpel-export --somatic \\\n')
        out.write(
            '--db {}/scalpel/main/somatic.db.dir \\\n'.format(mounted_outdir))
        out.write('--ref {} \\\n'.format(mounted_genome_reference))
        out.write('--bed {} \\\n'.format(mounted_inclusion))
        out.write('{} \\\n'.format(
            input_parameters['scalpel_export_arguments']))
        out.write('> {}/scalpel/scalpel.vcf"\n\n'.format(mounted_outdir))

        # Sort the exported VCF according to the reference dictionary.
        out.write(f'{container_line} bash -c \\\n')
        out.write(
            '"cat {}/scalpel/scalpel.vcf | /opt/vcfsorter.pl {} - \\\n'.format(
                mounted_outdir, mounted_reference_dict))
        out.write('> {}/{}\"\n'.format(mounted_outdir,
                                       input_parameters['outfile']))

        out.write('\necho -e "Done at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2\n')

    # "Run" the script that was generated; the exit status is not inspected.
    command_item = (input_parameters['action'], outfile)
    subprocess.call(command_item)

    return outfile
Ejemplo n.º 14
0
def alienTrimmer(input_parameters, tech='docker'):
    """Write an AlienTrimmer adapter-trimming job script and pass it to the action command.

    Options absent from ``input_parameters`` are filled in, in place, from
    ``DEFAULT_PARAMS``.  The script is written to
    ``<output_directory>/logs/<script>`` and performs these steps:

    1. If ``in_fastq1`` ends with ``.gz``, decompress the input FASTQ
       file(s) into uniquely named temporary files in the output directory.
    2. Run AlienTrimmer in paired-end mode (``in_fastq2`` set) or
       single-end mode, writing uniquely named temporary trimmed files.
    3. Compress the trimmed files with ``bgzip`` into ``out_fastq1_name``
       (plus ``out_fastq2_name`` and ``out_singleton_name`` when paired).
    4. Remove the temporary files and, if ``remove_untrimmed`` is set, the
       original input FASTQ file(s).

    Args:
        input_parameters: Dict of options (``in_fastq1``, ``in_fastq2``,
            ``output_directory``, ``adapter``, ``minimum_length``, ...).
        tech: Container technology, forwarded to
            ``container.container_params``.

    Returns:
        Path of the generated script.
    """
    for param_i in DEFAULT_PARAMS:
        if param_i not in input_parameters:
            input_parameters[param_i] = DEFAULT_PARAMS[param_i]

    # Decided after the defaults are applied, so that a default for
    # ``in_fastq2`` is honoured instead of raising KeyError.
    paired_end = bool(input_parameters['in_fastq2'])

    logdir = os.path.join(input_parameters['output_directory'], 'logs')
    outfile = os.path.join(logdir, input_parameters['script'])

    # Only truthy paths are handed to the container layer for mounting.
    all_paths = []
    for path_i in input_parameters['output_directory'], input_parameters[
            'in_fastq1'], input_parameters['in_fastq2']:
        if path_i:
            all_paths.append(path_i)

    trim_line, fileDict = container.container_params(
        input_parameters['alienTrimmerImage'],
        tech=tech,
        files=all_paths,
        extra_args=input_parameters['extra_docker_options'])

    # Mounted paths for all the input files and output directory:
    mounted_outdir = fileDict[
        input_parameters['output_directory']]['mount_path']

    temporary_files = []
    with open(outfile, 'w') as out:

        out.write("#!/bin/bash\n\n")

        out.write(f'#$ -o {logdir}\n')
        out.write(f'#$ -e {logdir}\n')
        out.write('#$ -S /bin/bash\n')
        out.write('#$ -l h_vmem={}G\n'.format(input_parameters['MEM']))
        out.write('set -e\n\n')

        out.write('echo -e "Start at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2\n\n')

        # The tabix container runs both gunzip (optional, below) and bgzip
        # (always, further down).  It must therefore be set up whether or not
        # the input is gzipped; previously it only existed in the ``.gz``
        # branch and plain FASTQ input failed with NameError.
        tabix_line, tabixDict = container.container_params(
            'lethalfang/tabix:1.7', tech, tuple(all_paths))
        tabix_outdir = tabixDict[
            input_parameters['output_directory']]['mount_path']

        # AlienTrimmer does not do bgzipped fastq files, unfortunately:
        if input_parameters['in_fastq1'].endswith('.gz'):

            out_fastq_1 = uuid.uuid4().hex + '.fastq'
            tabix_fq1 = tabixDict[input_parameters['in_fastq1']]['mount_path']

            out.write(f'{tabix_line} bash -c \\\n')
            out.write('"gunzip -c {} > {}/{}"\n\n'.format(
                tabix_fq1, tabix_outdir, out_fastq_1))
            mounted_fq1 = os.path.join(mounted_outdir, out_fastq_1)

            temporary_files.append(out_fastq_1)

            if paired_end:
                out_fastq_2 = uuid.uuid4().hex + '.fastq'
                tabix_fq2 = tabixDict[
                    input_parameters['in_fastq2']]['mount_path']
                out.write(f'{tabix_line} bash -c \\\n')
                out.write('"gunzip -c {} > {}/{}"\n\n'.format(
                    tabix_fq2, tabix_outdir, out_fastq_2))
                mounted_fq2 = os.path.join(mounted_outdir, out_fastq_2)

                temporary_files.append(out_fastq_2)

        else:
            mounted_fq1 = fileDict[input_parameters['in_fastq1']]['mount_path']

            if paired_end:
                mounted_fq2 = fileDict[
                    input_parameters['in_fastq2']]['mount_path']

        out.write(f'{trim_line} \\\n')
        out.write('/opt/AlienTrimmer_0.4.0/src/AlienTrimmer \\\n')

        # Trimmed output goes to temporary plain-text files first; they are
        # bgzipped to their final names below.
        trimmed_fq1 = uuid.uuid4().hex + '.fastq'
        if paired_end:
            trimmed_fq2 = uuid.uuid4().hex + '.fastq'
            singleton = uuid.uuid4().hex + '.fastq'

            out.write('-if {} -ir {} \\\n'.format(mounted_fq1, mounted_fq2))
            out.write('-of {}/{} -or {}/{} \\\n'.format(
                mounted_outdir, trimmed_fq1, mounted_outdir, trimmed_fq2))
            out.write('-os {}/{} \\\n'.format(mounted_outdir, singleton))

            temporary_files.extend([trimmed_fq1, trimmed_fq2, singleton])

        else:
            # ``str.format`` here; the original called the non-existent
            # ``str.write`` and raised AttributeError.
            out.write('-i {} \\\n'.format(mounted_fq1))
            out.write('-o {}/{} \\\n'.format(mounted_outdir, trimmed_fq1))

            temporary_files.append(trimmed_fq1)

        out.write('-c {} \\\n'.format(input_parameters['adapter']))
        out.write('-l {}\n\n'.format(input_parameters['minimum_length']))

        out.write(f'{tabix_line} bash -c \\\n')
        out.write('"cat {}/{} | bgzip -@{} > {}/{}"\n'.format(
            tabix_outdir, trimmed_fq1, input_parameters['threads'],
            tabix_outdir, input_parameters['out_fastq1_name']))

        if paired_end:
            out.write(f'{tabix_line} bash -c \\\n')
            out.write('"cat {}/{} | bgzip -@{} > {}/{}"\n'.format(
                tabix_outdir, trimmed_fq2, input_parameters['threads'],
                tabix_outdir, input_parameters['out_fastq2_name']))

            out.write(f'{tabix_line} bash -c \\\n')
            out.write('"cat {}/{} | bgzip -@{} > {}/{}"\n'.format(
                tabix_outdir, singleton, input_parameters['threads'],
                tabix_outdir, input_parameters['out_singleton_name']))

        # Temporary files are removed through their host-side paths.
        out.write('\n')
        for file_i in temporary_files:
            out.write('rm {}\n'.format(
                os.path.join(input_parameters['output_directory'], file_i)))

        # Remove untrimmed files:
        if input_parameters['remove_untrimmed']:
            out.write('\n')
            out.write('rm {}\n'.format(input_parameters['in_fastq1']))

            if paired_end:
                out.write('rm {}\n'.format(input_parameters['in_fastq2']))

        out.write('\necho -e "Done at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2\n')

    # "Run" the script that was generated; the exit status is not inspected.
    command_item = (input_parameters['action'], outfile)
    subprocess.call(command_item)

    return outfile
Ejemplo n.º 15
0
def trimmomatic(input_parameters, tech='docker'):
    """Write a Trimmomatic adapter-trimming job script and pass it to the action command.

    Options absent from ``input_parameters`` are filled in, in place, from
    ``DEFAULT_PARAMS``.  The script is written to
    ``<output_directory>/logs/<script>``.  Trimmomatic runs in ``PE`` mode
    when ``in_fastq2`` is set and in ``SE`` mode otherwise.  In ``PE`` mode
    the unpaired reads go to ``unpaired.<out_fastq1_name>`` and
    ``unpaired.<out_fastq2_name>`` in the output directory.  If
    ``remove_untrimmed`` is set, the script finally deletes the input FASTQ
    file(s).

    Args:
        input_parameters: Dict of options (``in_fastq1``, ``in_fastq2``,
            ``output_directory``, ``adapter``, ``minimum_length``, ...).
        tech: Container technology, forwarded to
            ``container.container_params``.

    Returns:
        Path of the generated script.
    """
    for param_i in DEFAULT_PARAMS:
        if param_i not in input_parameters:
            input_parameters[param_i] = DEFAULT_PARAMS[param_i]

    # Decided after the defaults are applied, so that a default for
    # ``in_fastq2`` is honoured instead of raising KeyError.
    paired_end = bool(input_parameters['in_fastq2'])

    logdir = os.path.join(input_parameters['output_directory'], 'logs')
    outfile = os.path.join(logdir, input_parameters['script'])

    # Only truthy paths are handed to the container layer for mounting.
    all_paths = []
    for path_i in input_parameters['output_directory'], input_parameters[
            'in_fastq1'], input_parameters['in_fastq2']:
        if path_i:
            all_paths.append(path_i)

    trim_line, fileDict = container.container_params(
        input_parameters['trimmomaticImage'],
        tech=tech,
        files=all_paths,
        extra_args=input_parameters['extra_docker_options'])

    # Mounted paths for all the input files and output directory:
    mounted_outdir = fileDict[
        input_parameters['output_directory']]['mount_path']
    mounted_fq1 = fileDict[input_parameters['in_fastq1']]['mount_path']
    # The second FASTQ is only mounted (and only needed) for paired-end
    # data; looking it up unconditionally broke single-end runs.
    mounted_fq2 = None
    if paired_end:
        mounted_fq2 = fileDict[input_parameters['in_fastq2']]['mount_path']

    with open(outfile, 'w') as out:

        out.write("#!/bin/bash\n\n")

        out.write(f'#$ -o {logdir}\n')
        out.write(f'#$ -e {logdir}\n')
        out.write('#$ -S /bin/bash\n')
        out.write('#$ -l h_vmem={}G\n'.format(input_parameters['MEM']))
        out.write('set -e\n\n')

        out.write('echo -e "Start at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2\n\n')

        out.write(f'{trim_line} \\\n')
        out.write(
            'java -Xmx{}G -jar /opt/Trimmomatic/trimmomatic.jar \\\n'.format(
                input_parameters['MEM']))

        if paired_end:
            out.write('PE -threads {} -phred33 \\\n'.format(
                input_parameters['threads']))
            # Argument order: both inputs, then paired/unpaired output for
            # read 1, then paired/unpaired output for read 2.
            out.write(
                '{FQ1} {FQ2} {DIR}/{PAIR1} {DIR}/{UNPAIR1} {DIR}/{PAIR2} {DIR}/{UNPAIR2} \\\n'
                .format(
                    FQ1=mounted_fq1,
                    FQ2=mounted_fq2,
                    DIR=mounted_outdir,
                    PAIR1=input_parameters['out_fastq1_name'],
                    PAIR2=input_parameters['out_fastq2_name'],
                    UNPAIR1='unpaired.' + input_parameters['out_fastq1_name'],
                    UNPAIR2='unpaired.' + input_parameters['out_fastq2_name']))

        else:
            out.write('SE -threads {} -phred33 \\\n'.format(
                input_parameters['threads']))
            out.write('{FQ1} {DIR}/{PAIR1} \\\n'.format(
                FQ1=mounted_fq1,
                DIR=mounted_outdir,
                PAIR1=input_parameters['out_fastq1_name']))

        out.write(
            'ILLUMINACLIP:{ADAPTER}:2:30:10 LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:{MINLEN}\n'
            .format(ADAPTER=input_parameters['adapter'],
                    MINLEN=input_parameters['minimum_length']))

        # Remove untrimmed files:
        if input_parameters['remove_untrimmed']:
            out.write('\n')
            out.write('rm {}\n'.format(input_parameters['in_fastq1']))

            if paired_end:
                out.write('rm {}\n'.format(input_parameters['in_fastq2']))

        out.write('\necho -e "Done at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2\n')

    # "Run" the script that was generated; the exit status is not inspected.
    command_item = (input_parameters['action'], outfile)
    subprocess.call(command_item)

    return outfile
Ejemplo n.º 16
0
def run_SomaticSeq(input_parameters, tech='docker'):
    """Write a single-sample SomaticSeq job script and pass it to the action command.

    Options absent from ``input_parameters`` are filled in, in place, from
    the local ``DEFAULT_PARAMS``.  The script is written to
    ``<output_directory>/<somaticseq_directory>/logs/<script>`` (the ``logs``
    directory is created if needed) and runs ``run_somaticseq.py ... single``
    inside the ``lethalfang/somaticseq:<VERSION>`` container.  Caller VCFs
    are added for every ``run_<caller>`` flag that is truthy.

    Args:
        input_parameters: Dict of options.  Keys without a default here
            (``bam``, ``genome_reference``, ``script``, ``threads``,
            ``extra_docker_options`` and the ``run_*`` flags) must be
            supplied by the caller.
        tech: Container technology, forwarded to
            ``container.container_params``.

    Returns:
        Path of the generated script.
    """
    DEFAULT_PARAMS = {
        'MEM': '4G',
        'inclusion_region': None,
        'exclusion_region': None,
        'output_directory': os.curdir,
        'somaticseq_directory': 'SomaticSeq',
        'action': 'echo',
        'dbsnp': None,
        'cosmic': None,
        'snv_classifier': None,
        'indel_classifier': None,
        'truth_snv': None,
        'truth_indel': None,
        'somaticseq_arguments': '',
        'train_somaticseq': False,
        'somaticseq_algorithm': 'xgboost'
    }

    for param_i in DEFAULT_PARAMS:
        if param_i not in input_parameters:
            input_parameters[param_i] = DEFAULT_PARAMS[param_i]

    # Only truthy paths are handed to the container layer for mounting.
    path_keys = ('bam', 'genome_reference', 'output_directory',
                 'inclusion_region', 'exclusion_region', 'dbsnp', 'cosmic',
                 'snv_classifier', 'indel_classifier', 'truth_snv',
                 'truth_indel')
    all_paths = [
        input_parameters[key_i] for key_i in path_keys
        if input_parameters[key_i]
    ]

    container_line, fileDict = container.container_params(
        f'lethalfang/somaticseq:{VERSION}',
        tech=tech,
        files=all_paths,
        extra_args=input_parameters['extra_docker_options'])

    # Mounted paths for all the input files and output directory:
    mounted_genome_reference = fileDict[
        input_parameters['genome_reference']]['mount_path']
    mounted_tumor_bam = fileDict[input_parameters['bam']]['mount_path']
    mounted_outdir = fileDict[
        input_parameters['output_directory']]['mount_path']

    outdir = os.path.join(input_parameters['output_directory'],
                          input_parameters['somaticseq_directory'])
    logdir = os.path.join(outdir, 'logs')
    outfile = os.path.join(logdir, input_parameters['script'])

    # Container-side locations of the VCF files passed for each caller.
    mutect2 = '{}/MuTect2.vcf'.format(mounted_outdir)
    varscan = '{}/VarScan2.vcf'.format(mounted_outdir)
    vardict = '{}/VarDict.vcf'.format(mounted_outdir)
    lofreq = '{}/LoFreq.vcf'.format(mounted_outdir)
    scalpel = '{}/Scalpel.vcf'.format(mounted_outdir)
    strelka = '{}/Strelka/results/variants/variants.vcf.gz'.format(
        mounted_outdir)

    os.makedirs(logdir, exist_ok=True)
    with open(outfile, 'w') as out:

        out.write("#!/bin/bash\n\n")

        out.write(f'#$ -o {logdir}\n')
        out.write(f'#$ -e {logdir}\n')
        out.write('#$ -S /bin/bash\n')
        out.write('#$ -l h_vmem={}\n'.format(input_parameters['MEM']))
        out.write('set -e\n\n')

        out.write('echo -e "Start at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2\n\n')

        out.write(f'{container_line} \\\n')
        out.write('/opt/somaticseq/somaticseq/run_somaticseq.py \\\n')

        # Training is only requested when the job is not split into threads.
        if input_parameters['train_somaticseq'] and input_parameters[
                'threads'] == 1:
            out.write('--somaticseq-train --algorithm {} \\\n'.format(
                input_parameters['somaticseq_algorithm']))

        out.write('--output-directory {} \\\n'.format(
            os.path.join(mounted_outdir,
                         input_parameters['somaticseq_directory'])))
        out.write(
            '--genome-reference {} \\\n'.format(mounted_genome_reference))

        if input_parameters['inclusion_region']:
            mounted_inclusion = fileDict[
                input_parameters['inclusion_region']]['mount_path']
            out.write('--inclusion-region {} \\\n'.format(mounted_inclusion))

        if input_parameters['exclusion_region']:
            # The command runs inside the container, so it needs the mount
            # path; the host path was written here before.
            mounted_exclusion = fileDict[
                input_parameters['exclusion_region']]['mount_path']
            out.write('--exclusion-region {} \\\n'.format(mounted_exclusion))

        if input_parameters['cosmic']:
            mounted_cosmic = fileDict[input_parameters['cosmic']]['mount_path']
            out.write('--cosmic-vcf {} \\\n'.format(mounted_cosmic))

        if input_parameters['dbsnp']:
            # Use the mount path of ``dbsnp``; the previous code read a
            # separate ``dbsnp_vcf`` key that has no default.
            mounted_dbsnp = fileDict[input_parameters['dbsnp']]['mount_path']
            out.write('--dbsnp-vcf {} \\\n'.format(mounted_dbsnp))

        if input_parameters['snv_classifier'] or input_parameters[
                'indel_classifier']:
            out.write('--algorithm {} \\\n'.format(
                input_parameters['somaticseq_algorithm']))

            if input_parameters['snv_classifier']:
                out.write('--classifier-snv {} \\\n'.format(fileDict[
                    input_parameters['snv_classifier']]['mount_path']))

            if input_parameters['indel_classifier']:
                out.write('--classifier-indel {} \\\n'.format(fileDict[
                    input_parameters['indel_classifier']]['mount_path']))

        if input_parameters['truth_snv']:
            out.write('--truth-snv {} \\\n'.format(
                fileDict[input_parameters['truth_snv']]['mount_path']))

        if input_parameters['truth_indel']:
            out.write('--truth-indel {} \\\n'.format(
                fileDict[input_parameters['truth_indel']]['mount_path']))

        # NOTE(review): ``--algorithm`` may already have been written above
        # with the same value; the repetition is kept so the generated
        # command line is otherwise unchanged.
        if input_parameters['somaticseq_algorithm']:
            out.write('--algorithm {} \\\n'.format(
                input_parameters['somaticseq_algorithm']))

        if input_parameters['somaticseq_arguments']:
            out.write('{} \\\n'.format(
                input_parameters['somaticseq_arguments']))

        # Everything after ``single`` belongs to the single-sample mode.
        out.write('single \\\n')
        out.write('--bam-file  {} \\\n'.format(mounted_tumor_bam))

        if input_parameters['run_mutect2']:
            out.write('--mutect2-vcf {} \\\n'.format(mutect2))

        if input_parameters['run_varscan2']:
            out.write('--varscan-vcf {} \\\n'.format(varscan))

        if input_parameters['run_vardict']:
            out.write('--vardict-vcf {} \\\n'.format(vardict))

        if input_parameters['run_lofreq']:
            out.write('--lofreq-vcf {} \\\n'.format(lofreq))

        if input_parameters['run_scalpel']:
            out.write('--scalpel-vcf {} \\\n'.format(scalpel))

        if input_parameters['run_strelka2']:
            out.write('--strelka-vcf {} \\\n'.format(strelka))

        # The leading blank line ends the backslash-continued command above.
        out.write('\necho -e "Done at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2\n')

    # "Run" the script that was generated; the exit status is not inspected.
    command_line = '{} {}'.format(input_parameters['action'], outfile)
    subprocess.call(command_line, shell=True)

    return outfile
Ejemplo n.º 17
0
def merge_results(input_parameters, tech='docker'):
    """Write, then launch, a script that merges per-thread results.

    The generated bash script issues one containerised ``concat.py`` call per
    enabled caller (and per SomaticSeq output), joining the files found in the
    numbered sub-directories ``<output_directory>/1`` ..
    ``<output_directory>/<threads>`` into a single file placed directly in the
    output directory. When training is requested and a truth file is given,
    a model-building command is appended for the merged ensemble TSV.

    Args:
        input_parameters: dict of settings. Keys missing from the local
            ``DEFAULT_PARAMS`` are filled in *in place*. The keys
            'genome_reference' and 'extra_docker_options', the ``run_*``
            switches ('run_mutect2', 'run_varscan2', 'run_vardict',
            'run_lofreq', 'run_scalpel', 'run_strelka2', 'run_somaticseq')
            and, whenever anything is merged, 'threads' have no default here
            and must be supplied by the caller.
        tech: container technology name forwarded to
            ``container.container_params``.

    Returns:
        Path of the script that was written (inside
        ``<output_directory>/logs``).
    """
    DEFAULT_PARAMS = {
        'MEM': '4G',
        'output_directory': os.curdir,
        'somaticseq_directory': 'SomaticSeq',
        'action': 'echo',
        'script': 'mergeResults.{}.cmd'.format(ts),
        'snv_classifier': None,
        'indel_classifier': None,
        'truth_snv': None,
        'truth_indel': None,
        'somaticseq_arguments': '',
        'train_somaticseq': False,
        'somaticseq_algorithm': 'xgboost'
    }

    # Defaults are written into the caller's dict (existing behaviour).
    for param_i, default_i in DEFAULT_PARAMS.items():
        input_parameters.setdefault(param_i, default_i)

    # Only paths that were actually provided are handed to the container layer.
    path_keys = ('genome_reference', 'output_directory', 'snv_classifier',
                 'indel_classifier', 'truth_snv', 'truth_indel')
    all_paths = [
        input_parameters[key_i] for key_i in path_keys
        if input_parameters[key_i]
    ]

    container_line, fileDict = container.container_params(
        f'lethalfang/somaticseq:{VERSION}',
        tech=tech,
        files=all_paths,
        extra_args=input_parameters['extra_docker_options'])

    # Path of the output directory as seen from inside the container.
    mounted_outdir = fileDict[
        input_parameters['output_directory']]['mount_path']

    prjdir = input_parameters['output_directory']
    logdir = os.path.join(prjdir, 'logs')
    outfile = os.path.join(logdir, input_parameters['script'])

    somaticdir = input_parameters['somaticseq_directory']

    # (switch in input_parameters, file inside each numbered sub-directory,
    #  name of the merged file), in the order the commands are emitted.
    caller_outputs = (
        ('run_mutect2', 'MuTect2.vcf', 'MuTect2.vcf'),
        ('run_varscan2', 'VarScan2.vcf', 'VarScan2.vcf'),
        ('run_vardict', 'VarDict.vcf', 'VarDict.vcf'),
        ('run_lofreq', 'LoFreq.vcf', 'LoFreq.vcf'),
        ('run_scalpel', 'Scalpel.vcf', 'Scalpel.vcf'),
        ('run_strelka2', 'Strelka/results/variants/variants.vcf.gz',
         'Strelka.vcf'),
    )

    def write_concat(out, relative_path, merged_name, bgzip=True):
        """Emit one concat.py command joining the per-thread copies of a file."""
        bgzip_flag = '--bgzip-output ' if bgzip else ''
        out.write(f'{container_line} \\\n')
        out.write(f'concat.py {bgzip_flag}-infiles \\\n')
        # Every input path is followed by a single space, all on one line.
        out.write(''.join(
            '{}/{}/{} '.format(mounted_outdir, i, relative_path)
            for i in range(1, input_parameters['threads'] + 1)))
        out.write('\\\n')
        out.write('-outfile {}/{}\n\n'.format(mounted_outdir, merged_name))

    def write_training(out, tsv_name):
        """Emit the model-building command for one merged ensemble TSV."""
        out.write(f'{container_line} \\\n')
        if input_parameters['somaticseq_algorithm'] == 'ada':
            out.write('ada_model_builder_ntChange.R {}/{}\n\n'.format(
                mounted_outdir, tsv_name))
        else:
            out.write(
                'somatic_xgboost.py train -threads {} -tsvs {}/{}\n\n'.format(
                    input_parameters['threads'], mounted_outdir, tsv_name))

    os.makedirs(logdir, exist_ok=True)
    with open(outfile, 'w') as out:

        out.write("#!/bin/bash\n\n")

        out.write(f'#$ -o {logdir}\n')
        out.write(f'#$ -e {logdir}\n')
        out.write('#$ -S /bin/bash\n')
        out.write('#$ -l h_vmem={}\n'.format(input_parameters['MEM']))
        out.write('set -e\n\n')

        out.write('echo -e "Start at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2\n\n')

        # Individual variant callers
        for switch_i, relative_path, merged_name in caller_outputs:
            if input_parameters[switch_i]:
                write_concat(out, relative_path, merged_name)

        ###### SomaticSeq #####
        if input_parameters['run_somaticseq']:

            # The ensemble TSVs are merged without bgzip compression.
            for variant_i in ('sSNV', 'sINDEL'):
                tsv_name = 'Ensemble.{}.tsv'.format(variant_i)
                write_concat(out,
                             '{}/{}'.format(somaticdir, tsv_name),
                             tsv_name,
                             bgzip=False)

            # If asked to create classifiers, do it here, now that the TSV
            # files are combined. A truth file is required per variant type.
            if input_parameters['train_somaticseq'] and input_parameters[
                    'truth_snv']:
                write_training(out, 'Ensemble.sSNV.tsv')

            if input_parameters['train_somaticseq'] and input_parameters[
                    'truth_indel']:
                write_training(out, 'Ensemble.sINDEL.tsv')

            # Prediction mode (a classifier was given) merges the classified
            # VCF and TSV; otherwise consensus mode merges the consensus VCF.
            for variant_i, classifier_key in (('sSNV', 'snv_classifier'),
                                              ('sINDEL', 'indel_classifier')):
                if input_parameters[classifier_key]:
                    merged_names = ('SSeq.Classified.{}.vcf'.format(variant_i),
                                    'SSeq.Classified.{}.tsv'.format(variant_i))
                else:
                    merged_names = ('Consensus.{}.vcf'.format(variant_i), )

                for merged_name in merged_names:
                    write_concat(out, '{}/{}'.format(somaticdir, merged_name),
                                 merged_name)

        out.write('\necho -e "Done at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2\n')

    # "Run" the script that was generated: the configured action (by default
    # 'echo') is invoked through the shell with the script path as argument.
    command_line = '{} {}'.format(input_parameters['action'], outfile)
    subprocess.call(command_line, shell=True)

    return outfile
Ejemplo n.º 18
0
def bwa(input_parameters, tech='docker'):
    """Write, then launch, a script that aligns FASTQ reads with ``bwa mem``.

    The generated bash script pipes ``bwa mem`` through ``samtools view`` and
    ``samtools sort`` inside the container, then indexes the resulting BAM.

    Args:
        input_parameters: dict of settings; keys missing from the module-level
            ``DEFAULT_PARAMS`` are filled in *in place*. 'in_fastq2' must be
            present (a falsy value selects single-end mode). Other keys read:
            'in_fastq1', 'genome_reference', 'output_directory', 'script',
            'bwa_image', 'extra_docker_options', 'MEM', 'threads',
            'bam_header', 'extra_bwa_arguments', 'out_bam' and 'action'.
            'MEM' and 'threads' are used arithmetically, so they must be
            numbers.
        tech: container technology name forwarded to
            ``container.container_params``.

    Returns:
        Path of the script that was written (inside
        ``<output_directory>/logs``).
    """
    # Checked before defaults are applied, so the key must be given.
    paired_end = bool(input_parameters['in_fastq2'])

    for param_i in DEFAULT_PARAMS:
        if param_i not in input_parameters:
            input_parameters[param_i] = DEFAULT_PARAMS[param_i]

    logdir = os.path.join(input_parameters['output_directory'], 'logs')
    outfile = os.path.join(logdir, input_parameters['script'])

    # Only paths that were actually provided are handed to the container layer.
    path_keys = ('output_directory', 'genome_reference', 'in_fastq1',
                 'in_fastq2')
    all_paths = [
        input_parameters[key_i] for key_i in path_keys
        if input_parameters[key_i]
    ]

    bwa_line, fileDict = container.container_params(
        input_parameters['bwa_image'],
        tech=tech,
        files=all_paths,
        extra_args=input_parameters['extra_docker_options'])

    # Mounted paths for all the input files and output directory:
    mounted_outdir = fileDict[
        input_parameters['output_directory']]['mount_path']
    mounted_reference = fileDict[
        input_parameters['genome_reference']]['mount_path']
    mounted_fq1 = fileDict[input_parameters['in_fastq1']]['mount_path']

    # A falsy second FASTQ was filtered out of all_paths above, so it must not
    # be looked up: doing so unconditionally broke single-end runs.
    mounted_fq2 = None
    if paired_end:
        mounted_fq2 = fileDict[input_parameters['in_fastq2']]['mount_path']

    # Make sure the script's directory exists, as merge_results() does.
    os.makedirs(logdir, exist_ok=True)
    with open(outfile, 'w') as out:

        out.write("#!/bin/bash\n\n")

        out.write(f'#$ -o {logdir}\n')
        out.write(f'#$ -e {logdir}\n')
        out.write('#$ -S /bin/bash\n')
        # Requested memory is the per-thread figure times the thread count.
        out.write('#$ -l h_vmem={}G\n'.format(input_parameters['MEM'] *
                                              input_parameters['threads']))
        out.write('set -e\n\n')

        out.write('echo -e "Start at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2\n\n')

        # The whole pipeline runs in one "bash -c" so the pipes are evaluated
        # inside the container.
        out.write(f'{bwa_line} bash -c \\\n')
        out.write('"bwa mem \\\n')
        out.write('-R \'{}\' \\\n'.format(input_parameters['bam_header']))
        out.write('-M {} -t {} \\\n'.format(
            input_parameters['extra_bwa_arguments'],
            input_parameters['threads']))
        out.write('{} \\\n'.format(mounted_reference))
        out.write('{} \\\n'.format(mounted_fq1))

        if paired_end:
            out.write('{} \\\n'.format(mounted_fq2))

        out.write('| samtools view -Sbh - \\\n')
        # samtools sort gets half of MEM and half of the threads, rounded up.
        out.write(
            '| samtools sort -m {MEM}G --threads {THREADS} -o {DIR}/{OUTFILE}"\n\n'
            .format(MEM=math.ceil(input_parameters['MEM'] / 2),
                    THREADS=math.ceil(input_parameters['threads'] / 2),
                    DIR=mounted_outdir,
                    OUTFILE=input_parameters['out_bam']))

        out.write(f'{bwa_line} \\\n')
        out.write('samtools index -@{} {}\n'.format(
            input_parameters['threads'],
            os.path.join(mounted_outdir, input_parameters['out_bam'])))

        out.write('\necho -e "Done at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2\n')

    # "Run" the script that was generated
    command_line = '{} {}'.format(input_parameters['action'], outfile)
    subprocess.call(command_line, shell=True)

    return outfile
Ejemplo n.º 19
0
def tumor_normal(input_parameters=DEFAULT_PARAMS, tech='docker'):
    """Write, then launch, a script that runs Strelka2 on a tumor/normal pair.

    The generated bash script configures the Strelka somatic workflow inside
    the container and then runs it locally with one job. When an inclusion
    region is given, the BED file is first bgzip-compressed and tabix-indexed
    into the output directory and passed as ``--callRegions``; without one,
    that preparation step and the ``--callRegions`` option are left out.

    Args:
        input_parameters: dict of settings; keys missing from the module-level
            ``DEFAULT_PARAMS`` are filled in *in place*. Keys read:
            'normal_bam', 'tumor_bam', 'genome_reference' (all three must
            exist on disk), 'output_directory', 'inclusion_region', 'script',
            'strelka2_image', 'extra_docker_options', 'MEM' (a string such as
            '4G'), 'exome', 'strelka_config_arguments', 'outdir_name',
            'strelka_run_arguments' and 'action'.
        tech: container technology name forwarded to
            ``container.container_params``.

    Returns:
        Path of the script that was written (inside
        ``<output_directory>/logs``).

    Raises:
        AssertionError: if either BAM file or the reference does not exist.
        ValueError: if 'MEM' (minus a trailing 'G') is not a number.
    """
    for param_i in DEFAULT_PARAMS:
        if param_i not in input_parameters:
            input_parameters[param_i] = DEFAULT_PARAMS[param_i]

    # The following are required:
    assert os.path.exists(input_parameters['normal_bam'])
    assert os.path.exists(input_parameters['tumor_bam'])
    assert os.path.exists(input_parameters['genome_reference'])

    logdir = os.path.join(input_parameters['output_directory'], 'logs')
    outfile = os.path.join(logdir, input_parameters['script'])

    inclusion_region = input_parameters['inclusion_region']

    # Only paths that were actually provided are handed to the container layer.
    path_keys = ('normal_bam', 'tumor_bam', 'genome_reference',
                 'output_directory', 'inclusion_region')
    all_paths = [
        input_parameters[key_i] for key_i in path_keys
        if input_parameters[key_i]
    ]

    container_line, fileDict = container.container_params(
        input_parameters['strelka2_image'],
        tech=tech,
        files=all_paths,
        extra_args=input_parameters['extra_docker_options'])

    # Mounted paths for all the input files and output directory:
    mounted_genome_reference = fileDict[
        input_parameters['genome_reference']]['mount_path']
    mounted_tumor_bam = fileDict[input_parameters['tumor_bam']]['mount_path']
    mounted_normal_bam = fileDict[input_parameters['normal_bam']]['mount_path']
    mounted_outdir = fileDict[
        input_parameters['output_directory']]['mount_path']

    # Name of the compressed BED file; stays None when calling is unrestricted.
    bed_gz = None
    if inclusion_region:
        bed_gz = fileDict[inclusion_region]['filename'] + '.gz'

    # 'MEM' is given in gigabytes, optionally suffixed with 'G'. It used to be
    # passed through eval(); plain number parsing gives the same result for
    # numeric input without executing arbitrary text.
    mem_text = input_parameters['MEM'].rstrip('G')
    try:
        mem_gb = int(mem_text)
    except ValueError:
        mem_gb = float(mem_text)
    call_mem_mb = mem_gb * 1024

    # Make sure the script's directory exists before writing into it.
    os.makedirs(logdir, exist_ok=True)
    with open(outfile, 'w') as out:

        out.write("#!/bin/bash\n\n")

        out.write(f'#$ -o {logdir}\n')
        out.write(f'#$ -e {logdir}\n')
        out.write('#$ -S /bin/bash\n')
        out.write('#$ -l h_vmem={}\n'.format(input_parameters['MEM']))
        out.write('set -e\n\n')

        out.write('echo -e "Start at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2\n\n')

        if bed_gz:
            # Make .bed.gz out of .bed files using tabix. The tabix image gets
            # its own mount mapping, hence the second container_params call.
            tabix_line, tabixDict = container.container_params(
                'lethalfang/tabix:1.7', tech, all_paths)
            tabix_selector = tabixDict[inclusion_region]['mount_path']
            tabix_outdir = tabixDict[
                input_parameters['output_directory']]['mount_path']

            out.write(
                '{DOCKER_LINE} bash -c "cat {SELECTOR} | bgzip > {OUTDIR}/{BEDGZ}"\n'
                .format(DOCKER_LINE=tabix_line,
                        SELECTOR=tabix_selector,
                        OUTDIR=tabix_outdir,
                        BEDGZ=bed_gz))
            out.write('{DOCKER_LINE} tabix -f {OUTDIR}/{BEDGZ}\n\n'.format(
                DOCKER_LINE=tabix_line, OUTDIR=tabix_outdir, BEDGZ=bed_gz))

        out.write(f'{container_line} \\\n')
        out.write('/opt/strelka/bin/configureStrelkaSomaticWorkflow.py \\\n')
        out.write('--tumorBam={} \\\n'.format(mounted_tumor_bam))
        out.write('--normalBam={} \\\n'.format(mounted_normal_bam))
        out.write('--referenceFasta={} \\\n'.format(mounted_genome_reference))
        out.write('--callMemMb={} \\\n'.format(call_mem_mb))

        if bed_gz:
            out.write('--callRegions={}/{} \\\n'.format(
                mounted_outdir, bed_gz))

        if input_parameters['exome']:
            out.write('--exome \\\n')

        if input_parameters['strelka_config_arguments']:
            out.write('{} \\\n'.format(
                input_parameters['strelka_config_arguments']))

        out.write('--runDir={}/{}\n\n'.format(mounted_outdir,
                                              input_parameters['outdir_name']))

        out.write(f'{container_line} \\\n')
        out.write('{}/{}/runWorkflow.py -m local -j 1 {}\n'.format(
            mounted_outdir, input_parameters['outdir_name'],
            input_parameters['strelka_run_arguments']))

        out.write('\necho -e "Done at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2\n')

    # "Run" the script that was generated. The action and the script path are
    # passed as an argument vector (no shell), so 'action' must name a single
    # executable.
    command_item = (input_parameters['action'], outfile)
    subprocess.call(command_item)

    return outfile
Ejemplo n.º 20
0
def picard(inbams,
           outbam,
           tech='docker',
           input_parameters=None,
           remove_inbams=False):
    """Write, then launch, a script that merges BAM files with Picard.

    The generated bash script runs Picard ``MergeSamFiles`` inside the
    container, optionally deletes the inputs, and renames the index Picard
    creates (output name with the final 'm' replaced by 'i') to the
    ``<outbam>.bai`` / ``<outbam>.crai`` name.

    Args:
        inbams: iterable of input alignment file paths.
        outbam: path of the merged output; must end in '.bam' or '.cram'.
        tech: container technology name forwarded to
            ``container.container_params``.
        input_parameters: dict of settings; keys missing from the module-level
            ``DEFAULT_PARAMS`` are filled in *in place*. When omitted, a fresh
            dict is used for each call. Keys read: 'output_directory',
            'script', 'picard_image', 'extra_docker_options', 'MEM',
            'extra_picard_arguments' and 'action'.
        remove_inbams: when true, the script removes the input files after
            merging.

    Returns:
        Path of the script that was written (inside
        ``<output_directory>/logs``).

    Raises:
        Exception: if ``outbam`` ends in neither '.bam' nor '.cram'.
    """
    # A literal {} default would be shared, and filled with defaults, across
    # every call that omits the argument.
    if input_parameters is None:
        input_parameters = {}

    for param_i in DEFAULT_PARAMS:
        if param_i not in input_parameters:
            input_parameters[param_i] = DEFAULT_PARAMS[param_i]

    logdir = os.path.join(input_parameters['output_directory'], 'logs')
    outfile = os.path.join(logdir, input_parameters['script'])

    all_paths = list(inbams) + [
        outbam,
    ]
    merge_line, fileDict = container.container_params(
        input_parameters['picard_image'],
        tech=tech,
        files=all_paths,
        extra_args=input_parameters['extra_docker_options'])

    mounted_outbam = fileDict[outbam]['mount_path']

    # One "I=<path> " argument per input; the trailing space is kept so the
    # generated command line is unchanged.
    infile_string = ''.join(
        'I={} '.format(fileDict[file_i]['mount_path']) for file_i in inbams)

    # Index name as written by Picard: x.bam -> x.bai, x.cram -> x.crai.
    picard_index_file = re.sub(r'm$', 'i', outbam)

    # Index name the script renames it to: the full output name plus suffix.
    if outbam.endswith('.bam'):
        samtools_index_file = outbam + '.bai'
    elif outbam.endswith('.cram'):
        samtools_index_file = outbam + '.crai'
    else:
        raise Exception('Output file {} seems wrong.'.format(outbam))

    # Make sure the script's directory exists before writing into it.
    os.makedirs(logdir, exist_ok=True)
    with open(outfile, 'w') as out:

        out.write("#!/bin/bash\n\n")

        out.write(f'#$ -o {logdir}\n')
        out.write(f'#$ -e {logdir}\n')
        out.write('#$ -S /bin/bash\n')
        out.write('#$ -l h_vmem={}G\n'.format(input_parameters['MEM']))
        out.write('set -e\n\n')

        out.write(
            'echo -e "Start at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2\n\n'
        )  # Do not change this: picard_fractional uses this to end the copying.

        out.write(f'{merge_line} \\\n')
        out.write(
            'java -Xmx{}G -jar /opt/picard.jar MergeSamFiles {} {} ASSUME_SORTED=true CREATE_INDEX=true O={}\n\n'
            .format(input_parameters['MEM'], infile_string,
                    input_parameters['extra_picard_arguments'],
                    mounted_outbam))

        # These two commands use the host paths, not the mounted ones, so
        # they run outside the container.
        if remove_inbams:
            out.write('rm {}\n\n'.format(' '.join(inbams)))

        out.write('mv {} {}\n\n'.format(picard_index_file,
                                        samtools_index_file))

        out.write('\necho -e "Done at `date +"%Y/%m/%d %H:%M:%S"`" 1>&2\n')

    # "Run" the script that was generated
    command_line = '{} {}'.format(input_parameters['action'], outfile)
    subprocess.call(command_line, shell=True)

    return outfile