Esempio n. 1
0
def run_lumpy_preprocess(bamfile,
                         disc_reads,
                         split_reads,
                         tempdir,
                         config,
                         samtools_docker_image=None,
                         lumpy_docker_image=None):
    """Write the sorted discordant and split-read BAMs derived from ``bamfile``.

    Both outputs follow the same three steps: an unsorted intermediate is
    written under ``tempdir``, ``run_samtools_sort`` sorts it into its final
    location, and the intermediate is deleted.

    :param bamfile: input BAM handed to both extraction helpers
    :param disc_reads: destination of the sorted ``run_samtools_view`` output
    :param split_reads: destination of the sorted
        ``run_lumpy_extract_split_reads_bwamem`` output
    :param tempdir: directory that holds the unsorted intermediates
    :param config: forwarded to ``run_lumpy_extract_split_reads_bwamem``
    :param samtools_docker_image: docker image passed to the samtools helpers
    :param lumpy_docker_image: docker image passed to the lumpy helper
    """
    helpers.makedirs(tempdir)

    # discordant reads: extract, sort into place, drop the intermediate
    disc_tmp = os.path.join(tempdir, 'discordants.unsorted.bam')
    run_samtools_view(
        bamfile, disc_tmp, docker_image=samtools_docker_image)
    run_samtools_sort(
        disc_tmp, disc_reads, docker_image=samtools_docker_image)
    os.remove(disc_tmp)

    # split reads: same three steps with the lumpy extractor
    split_tmp = os.path.join(tempdir, 'splitters.unsorted.bam')
    run_lumpy_extract_split_reads_bwamem(
        bamfile, split_tmp, config, docker_image=lumpy_docker_image)
    run_samtools_sort(
        split_tmp, split_reads, docker_image=samtools_docker_image)
    os.remove(split_tmp)
Esempio n. 2
0
def circos(titan_calls,
           sample_id,
           sv_calls,
           circos_plot_remixt,
           circos_plot_titan,
           tempdir,
           remixt_calls="NULL",
           docker_image=None):
    """Prepare the titan (and optionally remixt) calls and run ``circos.R``.

    :param titan_calls: input converted by ``read_titan.make_for_circos``
    :param sample_id: passed to the remixt conversion and to ``circos.R``
    :param sv_calls: passed to ``circos.R`` unchanged
    :param circos_plot_remixt: passed to ``circos.R`` unchanged
    :param circos_plot_titan: passed to ``circos.R`` unchanged
    :param tempdir: directory receiving the prepared CSV files
    :param remixt_calls: remixt input; the sentinel string ``"NULL"`` skips
        the conversion and is handed to ``circos.R`` as-is
    :param docker_image: forwarded to ``pypeliner.commandline.execute``
    """
    helpers.makedirs(tempdir)

    titan_csv = os.path.join(tempdir, 'prepped_titan_calls.csv')
    read_titan.make_for_circos(titan_calls, titan_csv)

    if remixt_calls == "NULL":
        # no remixt data: the script receives the sentinel itself
        remixt_csv = remixt_calls
    else:
        remixt_csv = os.path.join(tempdir, 'prepped_remixt_calls.csv')
        read_remixt.make_for_circos(remixt_calls, sample_id, remixt_csv)

    pypeliner.commandline.execute(
        "circos.R",
        titan_csv,
        remixt_csv,
        sv_calls,
        circos_plot_remixt,
        circos_plot_titan,
        sample_id,
        docker_image=docker_image)
Esempio n. 3
0
def run_mutect_one_job(tempdir, vcf, reference, intervals, normal_bam,
                       tumour_bam):
    """Queue mutect run/filter commands per interval, run them, merge VCFs.

    For the i-th entry of ``intervals`` the directory ``<tempdir>/<i>`` is
    created and two commands are queued, one from ``mutect_run_command`` and
    one from ``mutect_filter_command``. The whole queue is handed to
    ``helpers.run_in_gnu_parallel`` and the per-interval ``mutect.vcf.gz``
    files are then merged into ``vcf`` by ``merge_vcfs``.

    NOTE(review): within an interval the run command writes, and the filter
    command both reads and writes, the same ``mutect.vcf.gz`` path, and both
    commands are separate entries of one batch. Confirm that
    ``helpers.run_in_gnu_parallel`` keeps them ordered and that filtering in
    place is intended.

    :param tempdir: working directory for per-interval and merge files
    :param vcf: destination of the merged VCF
    :param reference: forwarded to both command builders
    :param intervals: iterable of intervals, one pair of commands each
    :param normal_bam: forwarded to ``mutect_run_command``
    :param tumour_bam: forwarded to ``mutect_run_command``
    """
    queued = []
    interval_vcfs = []
    for index, interval in enumerate(intervals):
        workdir = os.path.join(tempdir, str(index))
        helpers.makedirs(workdir)

        # single path shared by the run output and the filter input/output
        interval_vcf = os.path.join(workdir, 'mutect.vcf.gz')
        queued.append(
            mutect_run_command(reference, interval, normal_bam, tumour_bam,
                               interval_vcf))
        queued.append(
            mutect_filter_command(reference, interval_vcf, interval_vcf))
        interval_vcfs.append(interval_vcf)

    helpers.run_in_gnu_parallel(
        queued, os.path.join(tempdir, 'gnu_parallel_temp'))

    merge_dir = os.path.join(tempdir, 'mutect_merge')
    helpers.makedirs(merge_dir)
    merge_vcfs(interval_vcfs, vcf, merge_dir)
Esempio n. 4
0
def concatenate_vcf(
        in_files, out_file, tempdir,
        allow_overlap=False):
    """ Fast concatenation of VCF file using `bcftools`.

    The inputs are concatenated into a compressed intermediate under
    ``tempdir``, which is then sorted into ``out_file`` and indexed.

    :param in_files: files to be concatenated. Either an iterable of paths
        (used in the given order) or a dict whose values are the paths, in
        which case files are concatenated in sorted order of the keys.
    :param out_file: path where the sorted output is written (``-O z``,
        i.e. compressed VCF).
    :param tempdir: directory for the intermediate merged file.
    :param allow_overlap: when true, pass ``-a`` to ``bcftools concat``.
    """
    if isinstance(in_files, dict):
        # honour the documented contract: order by key, not by dict order
        in_files = [in_files[key] for key in sorted(in_files)]

    helpers.makedirs(tempdir)

    # the intermediate is written with '-O z', so name it accordingly
    merged_file = os.path.join(tempdir, 'merged.vcf.gz')

    cmd = ['bcftools', 'concat']
    if allow_overlap:
        cmd.append('-a')
    cmd += ['-O', 'z', '-o', merged_file]
    cmd += list(in_files)

    pypeliner.commandline.execute(*cmd)

    # sort merged vcf file
    cmd = ['bcftools', 'sort', '-O', 'z', '-o', out_file, merged_file]
    pypeliner.commandline.execute(*cmd)

    index_vcf(out_file)
    index_bcf(out_file)
Esempio n. 5
0
def bam_collect_gc_metrics(bam_filename,
                           ref_genome,
                           metrics_filename,
                           summary_filename,
                           chart_filename,
                           tempdir,
                           mem="2G"):
    """Run picard ``CollectGcBiasMetrics`` on ``bam_filename``.

    :param bam_filename: value of picard ``INPUT``
    :param ref_genome: value of picard ``REFERENCE_SEQUENCE``
    :param metrics_filename: value of picard ``OUTPUT``
    :param summary_filename: value of picard ``S``
    :param chart_filename: value of picard ``CHART_OUTPUT``
    :param tempdir: created up front and passed as picard ``TMP_DIR``
    :param mem: used for both the ``-Xmx`` and ``-Xms`` JVM options
    """
    helpers.makedirs(tempdir)

    jvm_options = ['-Xmx' + mem, '-Xms' + mem, '-XX:ParallelGCThreads=1']
    tool_options = [
        'INPUT=' + bam_filename,
        'OUTPUT=' + metrics_filename,
        'REFERENCE_SEQUENCE=' + ref_genome,
        'S=' + summary_filename,
        'CHART_OUTPUT=' + chart_filename,
        'VALIDATION_STRINGENCY=LENIENT',
        'TMP_DIR=' + tempdir,
        'MAX_RECORDS_IN_RAM=150000',
        'QUIET=true',
    ]

    command = ['picard'] + jvm_options + ['CollectGcBiasMetrics']
    command += tool_options
    pypeliner.commandline.execute(*command)
Esempio n. 6
0
def tar_all_data(params, segs, igv_segs, markers, parsed, plots, tar_output,
                 tempdir, chunks):
    """Copy the per-chunk result files into a directory tree and tar it.

    Every chunk is a ``(num_cluster, ploidy)`` pair and gets the directory
    ``<tempdir>/numcluster_<num_cluster>/ploidy_<ploidy>``. The six input
    mappings are indexed by the chunk itself. Once all chunks are copied,
    ``helpers.make_tarfile(tar_output, tempdir)`` is called.

    :param params: chunk -> file copied as ``params.csv``
    :param segs: chunk -> file copied as ``segs.csv``
    :param igv_segs: chunk -> file copied as ``igv_segs.csv``
    :param markers: chunk -> file copied as ``titan_markers.csv``
    :param parsed: chunk -> file copied as ``parsed.csv``
    :param plots: chunk -> file copied as ``plots.pdf``
    :param tar_output: first argument of ``helpers.make_tarfile``
    :param tempdir: root of the directory tree that is archived
    :param chunks: iterable of ``(num_cluster, ploidy)`` pairs
    """
    helpers.makedirs(tempdir)

    # (source mapping, file name inside the chunk directory), in copy order
    layout = (
        (params, 'params.csv'),
        (segs, 'segs.csv'),
        (igv_segs, 'igv_segs.csv'),
        (markers, 'titan_markers.csv'),
        (parsed, 'parsed.csv'),
        (plots, 'plots.pdf'),
    )

    for chunk in chunks:
        num_cluster, ploidy = chunk
        chunk_dir = os.path.join(tempdir,
                                 'numcluster_' + str(num_cluster),
                                 'ploidy_' + str(ploidy))
        helpers.makedirs(chunk_dir)

        for sources, filename in layout:
            shutil.copyfile(sources[chunk],
                            os.path.join(chunk_dir, filename))

    helpers.make_tarfile(tar_output, tempdir)
Esempio n. 7
0
def split_by_rg(infile, read1_output, read2_output, tempdir):
    """Run ``wgs_bamtofastq`` and move the per-read-group fastqs into place.

    ``wgs_bamtofastq`` is invoked with ``infile`` and ``tempdir``. Every
    entry found in ``tempdir`` afterwards is treated as a read group whose
    ``R1.fastq.gz`` / ``R2.fastq.gz`` are renamed to the destinations
    looked up in ``read1_output`` / ``read2_output``.

    The leftover debug printing was removed: its ``os.listdir`` call after
    the command ran outside the ``try`` block, so an ``OSError`` there
    bypassed the retry below.

    :param infile: input passed to ``wgs_bamtofastq``
    :param read1_output: mapping read group -> destination of R1
    :param read2_output: mapping read group -> destination of R2
    :param tempdir: directory the command is pointed at
    :raises OSError: if listing ``tempdir`` fails twice or a rename fails
    :raises KeyError: if a read group has no entry in an output mapping
    """
    helpers.makedirs(tempdir)

    cmd = ['wgs_bamtofastq', infile, tempdir]
    pypeliner.commandline.execute(*cmd)

    try:
        readgroups = os.listdir(tempdir)
    except OSError:
        # one retry after a minute; presumably works around filesystem
        # latency -- TODO confirm
        time.sleep(60)
        readgroups = os.listdir(tempdir)

    for readgroup in readgroups:
        os.rename(
            os.path.join(tempdir, readgroup, 'R1.fastq.gz'),
            read1_output[readgroup]
        )

        os.rename(
            os.path.join(tempdir, readgroup, 'R2.fastq.gz'),
            read2_output[readgroup]
        )
Esempio n. 8
0
def bam_collect_wgs_metrics(bam_filename,
                            ref_genome,
                            metrics_filename,
                            config,
                            tempdir,
                            mem="2G",
                            docker_image=None):
    """Run picard ``CollectWgsMetrics`` on ``bam_filename``.

    :param bam_filename: value of picard ``INPUT``
    :param ref_genome: value of picard ``REFERENCE_SEQUENCE``
    :param metrics_filename: value of picard ``OUTPUT``
    :param config: mapping read for ``min_bqual``, ``min_mqual`` and
        ``count_unpaired``
    :param tempdir: created up front and passed as picard ``TMP_DIR``
    :param mem: used for both the ``-Xmx`` and ``-Xms`` JVM options
    :param docker_image: forwarded to ``pypeliner.commandline.execute``
    """
    helpers.makedirs(tempdir)

    if config['count_unpaired']:
        count_unpaired = 'True'
    else:
        count_unpaired = 'False'

    command = [
        'picard',
        '-Xmx' + mem,
        '-Xms' + mem,
        '-XX:ParallelGCThreads=1',
        'CollectWgsMetrics',
        'INPUT=' + bam_filename,
        'OUTPUT=' + metrics_filename,
        'REFERENCE_SEQUENCE=' + ref_genome,
        'MINIMUM_BASE_QUALITY=' + str(config['min_bqual']),
        'MINIMUM_MAPPING_QUALITY=' + str(config['min_mqual']),
        'COVERAGE_CAP=500',
        'VALIDATION_STRINGENCY=LENIENT',
        'COUNT_UNPAIRED=' + count_unpaired,
        'TMP_DIR=' + tempdir,
        'MAX_RECORDS_IN_RAM=150000',
    ]

    pypeliner.commandline.execute(*command, docker_image=docker_image)
Esempio n. 9
0
def run_samtools_germline_one_job(tempdir,
                                  vcf,
                                  reference,
                                  intervals,
                                  bam_file,
                                  samtools_docker_image=None,
                                  vcftools_docker_image=None):
    """Queue one samtools germline command per interval, run and merge them.

    For the i-th entry of ``intervals`` the directory ``<tempdir>/<i>`` is
    created and a command from ``samtools_germline_command`` writing
    ``germline.vcf.gz`` there is queued. The queue is executed through
    ``helpers.run_in_gnu_parallel`` and the per-interval files are merged
    into ``vcf`` by ``merge_vcfs``.

    :param tempdir: working directory for per-interval and merge files
    :param vcf: destination of the merged VCF
    :param reference: forwarded to ``samtools_germline_command``
    :param intervals: iterable of intervals, one command each
    :param bam_file: forwarded to ``samtools_germline_command``
    :param samtools_docker_image: third positional argument of
        ``helpers.run_in_gnu_parallel``
    :param vcftools_docker_image: ``docker_image`` of ``merge_vcfs``
    """
    queued = []
    interval_vcfs = []
    for index, interval in enumerate(intervals):
        workdir = os.path.join(tempdir, str(index))
        helpers.makedirs(workdir)

        interval_vcf = os.path.join(workdir, 'germline.vcf.gz')
        queued.append(
            samtools_germline_command(interval_vcf, reference, interval,
                                      bam_file))
        interval_vcfs.append(interval_vcf)

    helpers.run_in_gnu_parallel(
        queued,
        os.path.join(tempdir, 'gnu_parallel_temp'),
        samtools_docker_image)

    merge_dir = os.path.join(tempdir, 'germline_merge')
    helpers.makedirs(merge_dir)
    merge_vcfs(interval_vcfs, vcf, merge_dir,
               docker_image=vcftools_docker_image)
Esempio n. 10
0
def produce_fastqc_report(fastq_filename, output_html, output_plots, temp_dir,
                          **kwargs):
    """Run ``fastqc`` on one fastq and move its report files into place.

    ``fastqc`` writes into ``<temp_dir>/out`` and uses ``<temp_dir>/tmp``
    as its ``--dir``. Afterwards ``<basename>_fastqc.zip`` is moved to
    ``output_plots`` and ``<basename>_fastqc.html`` to ``output_html``,
    where ``<basename>`` is the fastq file name without its extension.

    :param fastq_filename: input; must end in ``.fastq.gz``, ``.fq.gz``,
        ``.fq`` or ``.fastq``
    :param output_html: destination of the html report
    :param output_plots: destination of the zip archive
    :param temp_dir: parent of the ``out`` and ``tmp`` working directories
    :param kwargs: forwarded to ``pypeliner.commandline.execute``
    :raises Exception: ``"Unknown file type"`` for any other extension
        (raised only after ``fastqc`` has already run)
    """
    report_dir = os.path.join(temp_dir, 'out')
    scratch_dir = os.path.join(temp_dir, 'tmp')
    for directory in (report_dir, scratch_dir):
        helpers.makedirs(directory)

    pypeliner.commandline.execute(
        'fastqc',
        '--outdir=' + report_dir,
        '--dir=' + scratch_dir,
        fastq_filename,
        **kwargs)

    # strip the first matching extension to get the stem of the report names
    stem = os.path.basename(fastq_filename)
    for extension in (".fastq.gz", ".fq.gz", ".fq", ".fastq"):
        if stem.endswith(extension):
            stem = stem[:-len(extension)]
            break
    else:
        raise Exception("Unknown file type")

    report_prefix = os.path.join(report_dir, stem)

    shutil.move(report_prefix + '_fastqc.zip', output_plots)
    shutil.move(report_prefix + '_fastqc.html', output_html)
Esempio n. 11
0
def generate_pipeline_config(args):
    """Write the pipeline config yaml and record its path in ``args``.

    When ``args['which']`` is ``'generate_config'`` the file is written to
    the absolute form of ``args['pipeline_config']``. Otherwise it is named
    ``config.yaml`` and placed in ``args['pipelinedir']``, else
    ``args['tmpdir']``, else the current working directory (with a
    warning), and the name is passed through
    ``helpers.get_incrementing_filename``.

    The config comes from ``get_config`` called with the defaults
    ``{'cluster': 'azure', 'reference': 'grch37'}`` updated by
    ``args['config_override']`` when that is present and non-empty.

    :param args: dict of parsed arguments; gains the key ``config_file``
    :returns: the same ``args`` dict
    """
    if args['which'] == 'generate_config':
        config_yaml = os.path.abspath(args['pipeline_config'])
    else:
        config_yaml = "config.yaml"
        tmpdir = args.get("tmpdir", None)
        pipelinedir = args.get("pipelinedir", None)

        # use pypeliner tmpdir to store yaml
        if pipelinedir:
            config_yaml = os.path.join(pipelinedir, config_yaml)
        elif tmpdir:
            config_yaml = os.path.join(tmpdir, config_yaml)
        else:
            warnings.warn("no tmpdir specified, generating configs in working dir")
            config_yaml = os.path.join(os.getcwd(), config_yaml)

        config_yaml = helpers.get_incrementing_filename(config_yaml)
    # single-argument call form: same output on Python 2, valid on Python 3
    print(config_yaml)

    params_override = {'cluster': 'azure', 'reference': 'grch37'}
    # a missing key is treated like an empty override instead of KeyError
    config_override = args.get('config_override')
    if config_override:
        params_override.update(config_override)

    helpers.makedirs(config_yaml, isfile=True)

    config = get_config(params_override)
    write_config(config, config_yaml)

    args["config_file"] = config_yaml

    print(config_yaml)
    return args
Esempio n. 12
0
def run_samtools_germline_one_job(tempdir, vcf, reference, intervals,
                                  bam_file):
    """Run samtools germline calling per interval, merge, fix sample ids.

    For the i-th entry of ``intervals`` the directory ``<tempdir>/<i>`` is
    created and a command from ``samtools_germline_command`` writing
    ``germline.vcf.gz`` there is queued. The queue is executed through
    ``helpers.run_in_gnu_parallel``, the results are merged into a
    temporary VCF, and ``vcfutils.update_germline_header_sample_ids``
    produces ``vcf`` from it using the sample id read from ``bam_file``.

    :param tempdir: working directory for per-interval and merge files
    :param vcf: final output path
    :param reference: forwarded to ``samtools_germline_command``
    :param intervals: iterable of intervals, one command each
    :param bam_file: input BAM; also the source of the sample id
    """
    queued = []
    interval_vcfs = []
    for index, interval in enumerate(intervals):
        workdir = os.path.join(tempdir, str(index))
        helpers.makedirs(workdir)

        interval_vcf = os.path.join(workdir, 'germline.vcf.gz')
        queued.append(
            samtools_germline_command(interval_vcf, reference, interval,
                                      bam_file))
        interval_vcfs.append(interval_vcf)

    helpers.run_in_gnu_parallel(
        queued, os.path.join(tempdir, 'gnu_parallel_temp'))

    merge_dir = os.path.join(tempdir, 'germline_merge')
    helpers.makedirs(merge_dir)

    merged_vcf = os.path.join(merge_dir, 'merged_rtg.vcf')
    merge_vcfs(interval_vcfs, merged_vcf, merge_dir)

    sample_id = bamutils.get_sample_id(bam_file)
    vcfutils.update_germline_header_sample_ids(merged_vcf, vcf, sample_id)
Esempio n. 13
0
def bam_sort(bam_filename, sorted_bam_filename, tempdir, threads=1, mem="2G"):
    """Sort ``bam_filename`` into ``sorted_bam_filename`` with samtools.

    :param bam_filename: input BAM
    :param sorted_bam_filename: value of ``samtools sort -o``
    :param tempdir: created up front; ``<tempdir>/samtools_sort`` is passed
        as the ``-T`` prefix
    :param threads: value of ``-@``
    :param mem: value of ``-m``
    """
    helpers.makedirs(tempdir)

    command = ['samtools', 'sort']
    command += ['-@', threads]
    command += ['-m', mem]
    command += [bam_filename, '-o', sorted_bam_filename]
    command += ['-T', os.path.join(tempdir, 'samtools_sort')]

    pypeliner.commandline.execute(*command)
Esempio n. 14
0
def run_mutect(vcf, reference, interval, normal_bam, tumour_bam, tempdir):
    """Run the mutect command for one interval, then the filter command.

    The command built by ``mutect_run_command`` writes to
    ``<tempdir>/temp.vcf``; the command built by ``mutect_filter_command``
    takes that file and ``vcf`` as its arguments.

    :param vcf: output path given to the filter command
    :param reference: forwarded to both command builders
    :param interval: forwarded to ``mutect_run_command``
    :param normal_bam: forwarded to ``mutect_run_command``
    :param tumour_bam: forwarded to ``mutect_run_command``
    :param tempdir: directory holding the unfiltered intermediate
    """
    helpers.makedirs(tempdir)
    raw_calls = os.path.join(tempdir, 'temp.vcf')

    # step 1: calling
    call_cmd = mutect_run_command(
        reference, interval, normal_bam, tumour_bam, raw_calls)
    pypeliner.commandline.execute(*call_cmd)

    # step 2: filtering (built only after the calling step has finished)
    filter_cmd = mutect_filter_command(reference, raw_calls, vcf)
    pypeliner.commandline.execute(*filter_cmd)
Esempio n. 15
0
def run_samtools_germline(vcf, reference, interval, bam_file, tempdir):
    """Run the samtools germline command for one interval and fix sample ids.

    The command from ``samtools_germline_command`` writes to
    ``<tempdir>/samtools_snps.vcf.gz``;
    ``vcfutils.update_germline_header_sample_ids`` then produces ``vcf``
    from it using the sample id read from ``bam_file``.

    :param vcf: final output path
    :param reference: forwarded to ``samtools_germline_command``
    :param interval: forwarded to ``samtools_germline_command``
    :param bam_file: input BAM; also the source of the sample id
    :param tempdir: directory holding the intermediate VCF
    """
    helpers.makedirs(tempdir)
    raw_vcf = os.path.join(tempdir, 'samtools_snps.vcf.gz')

    call_cmd = samtools_germline_command(
        raw_vcf, reference, interval, bam_file)
    pypeliner.commandline.execute(*call_cmd)

    sample_id = bamutils.get_sample_id(bam_file)
    vcfutils.update_germline_header_sample_ids(raw_vcf, vcf, sample_id)
Esempio n. 16
0
def run_freebayes_germline(vcf, reference, interval, bam_file, tempdir):
    """Run the freebayes germline command for one interval and fix sample ids.

    The command from ``freebayes_germline_command`` writes to
    ``<tempdir>/temp.vcf``; ``vcfutils.update_germline_header_sample_ids``
    then produces ``vcf`` from it using the sample id read from
    ``bam_file``.

    :param vcf: final output path
    :param reference: forwarded to ``freebayes_germline_command``
    :param interval: forwarded to ``freebayes_germline_command``
    :param bam_file: input BAM; also the source of the sample id
    :param tempdir: directory holding the intermediate VCF
    """
    helpers.makedirs(tempdir)
    raw_vcf = os.path.join(tempdir, 'temp.vcf')

    call_cmd = freebayes_germline_command(
        raw_vcf, reference, interval, bam_file)
    pypeliner.commandline.execute(*call_cmd)

    sample_id = bamutils.get_sample_id(bam_file)
    vcfutils.update_germline_header_sample_ids(raw_vcf, vcf, sample_id)
Esempio n. 17
0
def get_outfiles(outdir, readgroups):
    """Create one directory per read group and return its fastq paths.

    :param outdir: parent directory of the per-read-group directories
    :param readgroups: iterable of read group names
    :returns: dict mapping each read group to the tuple
        ``(<outdir>/<readgroup>/R1.fastq.gz, <outdir>/<readgroup>/R2.fastq.gz)``
    """
    paths_by_readgroup = {}

    for readgroup in readgroups:
        rg_dir = os.path.join(outdir, readgroup)
        helpers.makedirs(rg_dir)
        paths_by_readgroup[readgroup] = tuple(
            os.path.join(rg_dir, filename)
            for filename in ('R1.fastq.gz', 'R2.fastq.gz'))

    return paths_by_readgroup
Esempio n. 18
0
def run_museq_one_job(tempdir,
                      museq_vcf,
                      reference,
                      intervals,
                      museq_params,
                      tumour_bam=None,
                      normal_bam=None,
                      titan_mode=False):
    '''
    Run museq for every interval, merge the VCF files and fix sample ids

    For the i-th entry of ``intervals`` the directory ``<tempdir>/<i>`` is
    created and ``run_museq`` is called with ``return_cmd=True`` to obtain
    a command writing ``museq.vcf`` and ``museq.log`` there. The commands
    run through ``helpers.run_in_gnu_parallel``, the results are merged
    into a temporary VCF, and ``update_header_sample_ids`` produces
    ``museq_vcf`` from it.

    :param tempdir: working directory for per-interval and merge files
    :param museq_vcf: path of the final VCF
    :param reference: forwarded to ``run_museq``
    :param intervals: iterable of intervals, one command each
    :param museq_params: forwarded to ``run_museq``
    :param tumour_bam: tumour bam, forwarded to ``run_museq`` and
        ``get_sample_id``
    :param normal_bam: normal bam, forwarded to ``run_museq`` and
        ``get_sample_id``
    :param titan_mode: forwarded to ``run_museq``
    '''
    queued = []
    interval_vcfs = []
    for index, interval in enumerate(intervals):
        workdir = os.path.join(tempdir, str(index))
        helpers.makedirs(workdir)

        interval_vcf = os.path.join(workdir, 'museq.vcf')
        interval_log = os.path.join(workdir, 'museq.log')

        queued.append(
            run_museq(interval_vcf,
                      interval_log,
                      reference,
                      interval,
                      museq_params,
                      workdir,
                      tumour_bam=tumour_bam,
                      normal_bam=normal_bam,
                      return_cmd=True,
                      titan_mode=titan_mode))
        interval_vcfs.append(interval_vcf)

    helpers.run_in_gnu_parallel(
        queued, os.path.join(tempdir, 'gnu_parallel_temp'))

    merge_dir = os.path.join(tempdir, 'museq_merge')
    helpers.makedirs(merge_dir)
    merged_vcf = os.path.join(merge_dir, 'temp_museq_merge.vcf')
    merge_vcfs(interval_vcfs, merged_vcf, merge_dir)

    tumour_id = get_sample_id(tumour_bam)
    normal_id = get_sample_id(normal_bam)
    update_header_sample_ids(merged_vcf, museq_vcf, tumour_id, normal_id)
Esempio n. 19
0
def roh_calling(samtools_germlines, roh_output, tempdir):
    """Run ``bcftools roh`` and parse its output into ``roh_output``.

    The command is ``bcftools roh -G30 --AF-dflt 0.4 <samtools_germlines>``
    followed by the tokens ``>`` and ``<tempdir>/output.csv``, all passed as
    arguments to ``pypeliner.commandline.execute`` (presumably it treats
    ``>`` as a stdout redirect -- confirm against pypeliner).
    ``parse_roh_output`` then reads that file and writes ``roh_output``.

    :param samtools_germlines: input given to ``bcftools roh``
    :param roh_output: second argument of ``parse_roh_output``
    :param tempdir: directory holding the raw ``output.csv``
    """
    helpers.makedirs(tempdir)

    raw_output = os.path.join(tempdir, 'output.csv')

    pypeliner.commandline.execute(
        'bcftools', 'roh',
        '-G30',
        '--AF-dflt', 0.4,
        samtools_germlines,
        '>', raw_output)

    parse_roh_output(raw_output, roh_output)
Esempio n. 20
0
def annotate_maf_with_oncokb(maf, api_key, tmpspace, annotated_maf):
    '''
    Annotate a maf with oncokb by delegating to ``ma.annotate``

    Parameters
    ----------
    maf : maf path to annotate (first argument of ``ma.annotate``)
    api_key : passed to ``ma.annotate`` as its third argument
        (presumably the oncokb API token -- confirm)
    tmpspace : directory passed to ``helpers.makedirs``; not used for
        anything else inside this function
    annotated_maf : second argument of ``ma.annotate``
        (presumably the annotated output path -- confirm)

    Returns
    -------
    None
    '''
    helpers.makedirs(tmpspace)
    ma.annotate(maf, annotated_maf, api_key)
Esempio n. 21
0
def circos(titan_calls, remixt_calls, sample_id, sv_calls, circos_plot_remixt,
           circos_plot_titan, tempdir):
    """Run the bundled ``scripts/circos.R`` through ``Rscript``.

    The script is located relative to the real path of this module and
    receives, in order: ``titan_calls``, ``remixt_calls``, ``sv_calls``,
    ``circos_plot_remixt``, ``circos_plot_titan`` and ``sample_id``.

    :param tempdir: created up front; not passed to the script
    """
    helpers.makedirs(tempdir)

    module_dir = os.path.dirname(os.path.realpath(__file__))
    script_path = os.path.join(module_dir, 'scripts', 'circos.R')

    pypeliner.commandline.execute(
        'Rscript',
        script_path,
        titan_calls,
        remixt_calls,
        sv_calls,
        circos_plot_remixt,
        circos_plot_titan,
        sample_id)
Esempio n. 22
0
def bam_collect_insert_metrics(bam_filename,
                               flagstat_metrics_filename,
                               metrics_filename,
                               histogram_filename,
                               tempdir,
                               mem="2G"):
    """Run picard ``CollectInsertSizeMetrics`` unless no reads are paired.

    ``bam_flagstat`` is run first and its output scanned for lines that
    contain ``properly paired``; the last such line decides. If it starts
    with ``'0 '`` a placeholder metrics file and an empty histogram file
    are written and picard is skipped.

    :param bam_filename: input BAM
    :param flagstat_metrics_filename: written by ``bam_flagstat``, then read
    :param metrics_filename: value of picard ``OUTPUT``
    :param histogram_filename: value of picard ``HISTOGRAM_FILE``
    :param tempdir: created before picard runs, passed as ``TMP_DIR``
    :param mem: used for both the ``-Xmx`` and ``-Xms`` JVM options
    :raises Exception: when no ``properly paired`` line is found
    """
    bam_flagstat(
        bam_filename,
        flagstat_metrics_filename,
    )

    # None = no matching line seen; otherwise the last matching line wins
    has_paired = None
    with open(flagstat_metrics_filename) as flagstat:
        for line in flagstat:
            if 'properly paired' not in line:
                continue
            has_paired = not line.startswith('0 ')

    if has_paired is None:
        message = 'Unable to determine number of properly paired reads from {}'
        raise Exception(message.format(flagstat_metrics_filename))

    if not has_paired:
        with open(metrics_filename, 'w') as metrics:
            metrics.write('## FAILED: No properly paired reads\n')
        # create (or truncate to) an empty histogram file
        open(histogram_filename, 'w').close()
        return

    helpers.makedirs(tempdir)

    command = [
        'picard',
        '-Xmx' + mem,
        '-Xms' + mem,
        '-XX:ParallelGCThreads=1',
        'CollectInsertSizeMetrics',
        'INPUT=' + bam_filename,
        'OUTPUT=' + metrics_filename,
        'HISTOGRAM_FILE=' + histogram_filename,
        'ASSUME_SORTED=True',
        'VALIDATION_STRINGENCY=LENIENT',
        'TMP_DIR=' + tempdir,
        'MAX_RECORDS_IN_RAM=150000',
        'QUIET=true',
    ]
    pypeliner.commandline.execute(*command)
Esempio n. 23
0
def merge_pdfs(infiles, outfile):
    """Merge the non-empty PDFs in ``infiles`` into ``outfile``.

    :param infiles: iterable of PDF paths, or a dict whose values are the
        paths (used in the dict's own value order)
    :param outfile: path of the merged PDF; ``helpers.makedirs`` is called
        on it with ``isfile=True`` before it is written
    :raises OSError: if an input path cannot be stat'ed or opened
    """
    if isinstance(infiles, dict):
        infiles = infiles.values()

    merger = PdfFileMerger()

    # inputs must stay open until the merged file is written, so collect
    # the handles and close every one of them afterwards, even on error
    handles = []
    try:
        for infile in infiles:
            # add it to list if not empty. skip empty files to avoid errors later
            if os.path.getsize(infile):
                handle = open(infile, 'rb')
                handles.append(handle)
                merger.append(handle)

        helpers.makedirs(outfile, isfile=True)

        with open(outfile, 'wb') as fout:
            merger.write(fout)
    finally:
        for handle in handles:
            handle.close()
Esempio n. 24
0
def parse_remixt_file(input, outputs, tables, tempdir):
    """Export tables of an HDF store to CSV files.

    ``outputs`` and ``tables`` are walked in lockstep. Each table is read
    from the store, written without index to
    ``<tempdir>/<table with '/' replaced by '_'>.csv`` and passed through
    ``csvutils.finalize_csv`` to produce the matching output.

    :param input: path opened with ``pd.HDFStore``
    :param outputs: output paths, one per table
    :param tables: keys looked up in the store
    :param tempdir: directory holding the intermediate CSV files
    """
    helpers.makedirs(tempdir)

    with pd.HDFStore(input) as store:
        for destination, key in zip(outputs, tables):
            csv_name = '{}.csv'.format(key.replace('/', '_'))
            staged_csv = os.path.join(tempdir, csv_name)

            table = store[key]
            if isinstance(table, pd.Series):
                # wrap a Series in a one-column frame named after the key
                table = pd.DataFrame({key: table})

            table.to_csv(staged_csv, index=False)
            csvutils.finalize_csv(staged_csv, destination, sep=',')
Esempio n. 25
0
def run_vcf2maf(
    vcf_file,
    maf_output,
    tempdir,
    reference,
    tumour_id=None,
    normal_id=None,
):
    """Stage ``vcf_file`` in a fresh ``tempdir`` and run ``vcf2maf`` on it.

    ``tempdir`` is removed if it exists and recreated. The VCF is copied
    in; when its name ends in ``.gz`` it is gunzipped to
    ``unzipped_vcf.vcf``. A stale ``*.vep.vcf`` next to the staged file is
    deleted before the command runs.

    :param vcf_file: input VCF, plain or ``.gz``
    :param maf_output: second argument of the ``vcf2maf`` command
    :param tempdir: working directory, wiped on entry
    :param reference: directory the fasta and ExAC paths are built from;
        also passed to the command itself
    :param tumour_id: if truthy, passed as ``--tumor-id``
    :param normal_id: if truthy, passed as ``--normal-id``
    :raises AssertionError: if the staged uncompressed file does not end
        in ``.vcf``
    """
    if os.path.exists(tempdir):
        helpers.rmdirs(tempdir)
    helpers.makedirs(tempdir)

    staged_vcf = os.path.join(tempdir, os.path.basename(vcf_file))
    shutil.copyfile(vcf_file, staged_vcf)

    plain_vcf = staged_vcf
    if vcf_file.endswith('.gz'):
        plain_vcf = os.path.join(tempdir, 'unzipped_vcf.vcf')
        gunzip_file(staged_vcf, plain_vcf)

    assert plain_vcf.endswith('.vcf')
    vep_vcf = plain_vcf[:-len('.vcf')] + '.vep.vcf'

    if os.path.exists(vep_vcf):
        os.remove(vep_vcf)

    fasta_path = os.path.join(
        reference, 'homo_sapiens', '99_GRCh37',
        'Homo_sapiens.GRCh37.75.dna.primary_assembly.fa.gz')
    exac_path = os.path.join(
        reference, 'ExAC_nonTCGA.r0.3.1.sites.vep.vcf.gz')

    command = [
        'vcf2maf', plain_vcf, maf_output, fasta_path, exac_path, reference
    ]

    # optional sample ids, tumour first
    for flag, sample in (('--tumor-id', tumour_id),
                         ('--normal-id', normal_id)):
        if sample:
            command.extend([flag, sample])

    pypeliner.commandline.execute(*command)
Esempio n. 26
0
def parse_vcf(infile, primary_table, snpeff_table, ma_table, id_table,
              parse_config, chromosomes, tempdir):
    '''
    parses a vcf containing variant calls
    to CSV tables.

    The parser writes four intermediate csv files under ``tempdir``; each
    is then passed through ``csvutils.finalize_csv`` to produce the
    matching output table.

    :param infile: vcf containing calls and annotations
    :param primary_table: csv output filepath containing base calls
    :param snpeff_table: csv output filepath containing snpeff annotations
    :param ma_table: csv output filepath containing ma annotations
    :param id_table: csv output filepath containing id annotations
    :param parse_config: mapping; a truthy ``filter_low_mappability`` adds
        the filter ``('LOW_MAPPABILITY', 'eq', True)`` and a truthy
        ``pr_threshold`` adds ``('PR', 'lt', <threshold>)``
    :param chromosomes: if truthy, adds ``('CHROM', 'notin', chromosomes)``
    :param tempdir: directory for the intermediate csv files
    '''

    helpers.makedirs(tempdir)

    staged = {
        name: os.path.join(tempdir, name + '.csv')
        for name in ('primary', 'snpeff', 'ma', 'ids')
    }

    # filters are assembled in a fixed order: mappability, chromosome, PR
    filters = []
    if parse_config.get('filter_low_mappability'):
        filters.append(('LOW_MAPPABILITY', 'eq', True))

    if chromosomes:
        filters.append(('CHROM', 'notin', chromosomes))

    pr_threshold = parse_config.get('pr_threshold')
    if pr_threshold:
        filters.append(('PR', 'lt', pr_threshold))

    with vcfparser.VcfParser(infile, staged['primary'], staged['snpeff'],
                             staged['ma'], staged['ids'],
                             filters) as parser:
        parser.write()

    csvutils.finalize_csv(staged['primary'], primary_table)
    csvutils.finalize_csv(staged['snpeff'], snpeff_table)
    csvutils.finalize_csv(staged['ma'], ma_table)
    csvutils.finalize_csv(staged['ids'], id_table)
Esempio n. 27
0
def svaba_cmd(tumor,
              normal,
              reference,
              tempdir,
              region=None,
              ncores=None,
              sample_id='sample'):
    """Build (but do not run) the ``svaba run`` command line.

    :param tumor: value of ``-t``
    :param normal: value of ``-n``
    :param reference: value of ``-G``
    :param tempdir: created up front; ``<tempdir>/<sample_id>`` is the
        value of ``-a``
    :param region: if truthy, appended as ``-k <region>``
    :param ncores: if truthy, appended as ``-p <ncores>``
    :param sample_id: last component of the ``-a`` value
    :returns: the command as a list of arguments
    """
    helpers.makedirs(tempdir)
    output_prefix = os.path.join(tempdir, sample_id)

    command = ['svaba', 'run']
    command.extend(['-t', tumor])
    command.extend(['-n', normal])
    command.extend(['-G', reference])
    command.append('-z')
    command.extend(['-a', output_prefix])

    if region:
        command.extend(['-k', region])

    if ncores:
        command.extend(['-p', ncores])

    return command
Esempio n. 28
0
def split_by_rg(infile, read1_output, read2_output, tempdir,
                ignore_bamtofastq_exception):
    """Run ``wgs_bamtofastq`` and move the per-read-group fastqs into place.

    Every entry found in ``tempdir`` after the command has run is treated
    as a read group whose ``R1.fastq.gz`` / ``R2.fastq.gz`` are renamed to
    the destinations looked up in ``read1_output`` / ``read2_output``.

    :param infile: input passed to ``wgs_bamtofastq``
    :param read1_output: mapping read group -> destination of R1
    :param read2_output: mapping read group -> destination of R2
    :param tempdir: directory the command is pointed at
    :param ignore_bamtofastq_exception: if truthy, the flag
        ``--ignore_bamtofastq_exception`` is added to the command
    """
    helpers.makedirs(tempdir)

    command = ['wgs_bamtofastq', infile, tempdir]
    if ignore_bamtofastq_exception:
        command += ['--ignore_bamtofastq_exception']
    pypeliner.commandline.execute(*command)

    try:
        readgroups = os.listdir(tempdir)
    except OSError:
        # a failed listing is retried exactly once, one minute later
        time.sleep(60)
        readgroups = os.listdir(tempdir)

    for readgroup in readgroups:
        # R1 is moved before R2 for every read group
        for filename, destinations in (('R1.fastq.gz', read1_output),
                                       ('R2.fastq.gz', read2_output)):
            os.rename(os.path.join(tempdir, readgroup, filename),
                      destinations[readgroup])
Esempio n. 29
0
def plot_hmm(
        tumour_copy,
        hmmcopy_res,
        correction_plots_dir,
        hmmcopy_plots_dir,
        bias_pdf,
        correction_pdf,
        hmmcopy_pdf,
        docker_image=None
):
    """Run ``plot_hmmcopy.R`` and merge the PDFs it leaves in each directory.

    Fixes over the previous version:

    * the chromosome list no longer relies on ``map`` returning a list
      (``map(...) + [...]`` is a TypeError on Python 3);
    * the ordered list of existing PDFs is what gets merged -- previously
      it was computed and discarded, and the fixed ``chr_1..22,X`` list was
      merged even when some of those files did not exist;
    * PDFs that are not in the ``chr_1..22,X`` list are appended after
      them (the old set difference was always empty).

    :param tumour_copy: first argument of ``plot_hmmcopy.R``
    :param hmmcopy_res: second argument of ``plot_hmmcopy.R``
    :param correction_plots_dir: directory passed to the script; its
        ``.pdf`` files are merged into ``correction_pdf``
    :param hmmcopy_plots_dir: directory passed to the script; its ``.pdf``
        files are merged into ``hmmcopy_pdf``
    :param bias_pdf: passed to the script
    :param correction_pdf: merged correction plots
    :param hmmcopy_pdf: merged hmmcopy plots
    :param docker_image: forwarded to ``pypeliner.commandline.execute``
    """
    helpers.makedirs(correction_plots_dir)
    helpers.makedirs(hmmcopy_plots_dir)

    cmd = [
        'plot_hmmcopy.R',
        tumour_copy,
        hmmcopy_res,
        correction_plots_dir,
        bias_pdf,
        hmmcopy_plots_dir,
    ]

    pypeliner.commandline.execute(*cmd, docker_image=docker_image)

    correction_pdfs = [os.path.join(correction_plots_dir, f)
                       for f in os.listdir(correction_plots_dir)
                       if f.endswith('.pdf')]
    pdfutils.merge_pdfs(correction_pdfs, correction_pdf)

    # only PDFs can be merged; ignore anything else left in the directory
    existing_pdfs = set(os.path.join(hmmcopy_plots_dir, f)
                        for f in os.listdir(hmmcopy_plots_dir)
                        if f.endswith('.pdf'))

    # chr_1 .. chr_22, chr_X first and in that order, then everything else
    chromosomes = [str(chrom) for chrom in range(1, 23)] + ['X']
    human_pdfs = [os.path.join(hmmcopy_plots_dir, 'chr_{}.pdf'.format(chrom))
                  for chrom in chromosomes]

    ordered_pdfs = [pdf for pdf in human_pdfs if pdf in existing_pdfs]
    ordered_pdfs += sorted(existing_pdfs - set(human_pdfs))

    pdfutils.merge_pdfs(ordered_pdfs, hmmcopy_pdf)
Esempio n. 30
0
def generate_submit_config_in_temp(args):
    """Write a ``batch.yaml`` submit config for azure batch runs.

    Nothing is done (``args`` is returned untouched) unless
    ``args['submit']`` is ``'azurebatch'`` or
    ``'pypeliner.contrib.azure.batchqueue.AzureJobQueue'``, or when
    ``args['which']`` is ``'generate_config'``.

    Otherwise ``batch.yaml`` is placed in ``args['pipelinedir']``, else
    ``args['tmpdir']``, else the current working directory (with a logged
    warning); the name is passed through
    ``helpers.get_incrementing_filename``, the config is built with
    ``get_batch_params`` / ``get_batch_config`` and written with
    ``write_config``.

    :param args: dict of parsed arguments; gains the key ``submit_config``
        when a file is written
    :returns: the same ``args`` dict
    """
    azure_submit = ['azurebatch',
                    'pypeliner.contrib.azure.batchqueue.AzureJobQueue']
    if args.get("submit", None) not in azure_submit:
        return args

    if args['which'] == 'generate_config':
        return args

    batch_yaml = "batch.yaml"
    tmpdir = args.get("tmpdir", None)
    pipelinedir = args.get("pipelinedir", None)

    # use pypeliner tmpdir to store yaml
    if pipelinedir:
        batch_yaml = os.path.join(pipelinedir, batch_yaml)
    elif tmpdir:
        batch_yaml = os.path.join(tmpdir, batch_yaml)
    else:
        # Logger.warn is a deprecated alias that newer Pythons removed
        logging.getLogger("wgs.generate_batch_config").warning(
            "no tmpdir specified, generating configs in working dir"
        )
        batch_yaml = os.path.join(os.getcwd(), batch_yaml)

    helpers.makedirs(batch_yaml, isfile=True)

    batch_yaml = helpers.get_incrementing_filename(batch_yaml)

    params_override = args.get("config_override", {})

    config_params = get_batch_params(override=params_override)
    config = get_batch_config(config_params, override=params_override)
    write_config(config, batch_yaml)

    args["submit_config"] = batch_yaml

    return args