def process_sample_coverage(job, addresses, keyspace, auth, sample, program, samples):
    """Store every row of a library's sambamba coverage table as coverage records.

    Reads ``<library_name>.sambamba_coverage.bed`` (relative to the current
    working directory) as a tab-delimited table.  The first line is the
    header: every column whose name starts with ``percentage`` is treated as
    a threshold column, and the remainder of the name is parsed as the
    integer threshold.  Each following row is saved twice with identical
    field values, once through ``SampleCoverage.create`` and once through
    ``AmpliconCoverage.create``.

    :param job: Not used by this function.
    :param addresses: Forwarded to ``connection.setup``.
    :param keyspace: Forwarded to ``connection.setup``.
    :param auth: Forwarded to ``connection.setup`` as ``auth_provider``.
    :param sample: Key into ``samples`` selecting the library to load.
    :param program: Stored as ``program_name`` on every record.
    :param samples: Mapping of sample key to a metadata dict.  The keys read
        here are ``library_name``, ``sample_name``, ``run_id``,
        ``num_libraries_in_run``, ``sequencer``, ``extraction``, ``panel``
        and ``target_pool``.
    :raises StopIteration: If the coverage file has no header line.
    :raises IndexError: If a row has fewer columns than the code indexes
        (columns 3-5 and every threshold column).
    """
    connection.setup(addresses, keyspace, auth_provider=auth)

    meta = samples[sample]
    coverage_path = "{}.sambamba_coverage.bed".format(meta['library_name'])

    # Fields that are identical for every row of this library; built once
    # instead of being looked up again for each row and each model.
    common_fields = dict(sample=meta['sample_name'],
                         library_name=meta['library_name'],
                         run_id=meta['run_id'],
                         num_libraries_in_run=meta['num_libraries_in_run'],
                         sequencer_id=meta['sequencer'],
                         program_name=program,
                         extraction=meta['extraction'],
                         panel=meta['panel'],
                         target_pool=meta['target_pool'])

    # Binary mode is the documented way to feed the Python 2 csv module.
    with open(coverage_path, 'rb') as coverage:
        reader = csv.reader(coverage, delimiter='\t')
        # Built-in next() instead of the Python-2-only reader.next() method.
        header = next(reader)

        # (column position, integer threshold) for each "percentage<N>" column.
        threshold_columns = [(column, int(title.replace('percentage', '')))
                             for column, title in enumerate(header)
                             if title.startswith("percentage")]
        thresholds = [threshold for _, threshold in threshold_columns]

        for row in reader:
            # Values are stored exactly as read from the file (strings).
            threshold_data = defaultdict(float)
            for column, threshold in threshold_columns:
                threshold_data[threshold] = row[column]

            record = dict(common_fields,
                          amplicon=row[3],
                          num_reads=row[4],
                          mean_coverage=row[5],
                          thresholds=thresholds,
                          perc_bp_cov_at_thresholds=threshold_data)

            # Both models receive the same set of fields.
            SampleCoverage.create(**record)
            AmpliconCoverage.create(**record)
Exemple #2
0
def subsample_bam(job, addresses, keyspace, auth, name, samples, config, seed, fraction, iteration):
    """Subsample a library's BAM, compute its region coverage and store the result.

    Steps, in order:

    1. ``samtools view -s`` writes a subsampled copy of
       ``<library_name>.recalibrated.sorted.bam``.
    2. ``samtools index`` indexes the subsampled BAM.
    3. ``sambamba depth region`` writes a coverage table for it.
    4. Every data row of that table is saved through ``SampleCoverage.create``
       and ``AmpliconCoverage.create`` with identical field values.

    Each command is logged via ``job.fileStore.logToMaster`` and executed with
    ``pipeline.run_and_log_command``.

    :param job: Job object; only ``job.fileStore.logToMaster`` is used.
    :param addresses: Forwarded to ``connection.setup``.
    :param keyspace: Forwarded to ``connection.setup``.
    :param auth: Forwarded to ``connection.setup`` as ``auth_provider``.
    :param name: Key into ``samples`` selecting the library to subsample.
    :param samples: Mapping of sample key to a metadata dict.  The keys read
        here are ``library_name``, ``regions``, ``num_libraries_in_run``,
        ``sample_name``, ``sequencer``, ``extraction``, ``panel`` and
        ``target_pool``.
    :param config: Configuration mapping; reads ``config['sambamba']['bin']``,
        ``config['sambamba']['num_cores']``, ``config['coverage_threshold']``
        and ``config['coverage_threshold2']``.
    :param seed: Placed before the decimal point of the ``samtools view -s``
        argument.
    :param fraction: Percentage of reads to keep.  It is divided by 100 when
        scaling ``num_libraries_in_run`` and supplies the digits after the
        decimal point of the ``-s`` argument.
    :param iteration: Only used to make the output names unique.
    :raises StopIteration: If the coverage table has no header line.
    :raises IndexError: If a row has fewer columns than the code indexes
        (columns 3-5 and every threshold column).
    """
    meta = samples[name]
    log_to_master = job.fileStore.logToMaster

    library_name = "subsample-{}-{}-{}".format(meta['library_name'], fraction, iteration)
    sublog = "subsample-{}-{}-{}.log".format(name, fraction, iteration)
    input_bam = "{}.recalibrated.sorted.bam".format(meta['library_name'])
    subsampled_bam = "{}.bam".format(library_name)

    # samtools reads "-s SEED.FRACTION": the digits after the point are the
    # decimal fraction of reads to keep.  `fraction` is a percentage (see
    # num_libs below), so a one-digit value must be zero-padded: 5 has to
    # become ".05", not ".5" (which would keep 50% of the reads).
    # NOTE(review): a value of 100 still renders as ".100" (10%) - confirm
    # callers never pass it.
    fraction_digits = str(fraction).zfill(2)
    samcommand = "samtools view -s {seed}.{fraction} -b {input} > {output}".format(
        seed=seed,
        fraction=fraction_digits,
        input=input_bam,
        output=subsampled_bam)

    index_command = "samtools index {}".format(subsampled_bam)
    index_log = "{}.index.log".format(subsampled_bam)

    output = "{}.sambamba_coverage.bed".format(subsampled_bam)
    logfile = "{}.sambamba_coverage.log".format(subsampled_bam)

    coverage_command = ("{sambamba} depth region -L {regions} -t {cores} "
                        "-T {first} -T {second} {bam} > {out}").format(
        sambamba=config['sambamba']['bin'],
        regions=meta['regions'],
        cores=config['sambamba']['num_cores'],
        first=config['coverage_threshold'],
        second=config['coverage_threshold2'],
        bam=subsampled_bam,
        out=output)

    log_to_master("Samtools ViewCommand: {}\n".format(samcommand))
    pipeline.run_and_log_command(samcommand, sublog)

    log_to_master("Samtools Index Command: {}\n".format(index_command))
    pipeline.run_and_log_command(index_command, index_log)

    # Log the exact string that is executed rather than a tuple repr.
    log_to_master("SamBamba Coverage Command: {}\n".format(coverage_command))
    pipeline.run_and_log_command(coverage_command, logfile)

    connection.setup(addresses, keyspace, auth_provider=auth)

    # Report the file being loaded (this used to print the samtools command).
    log_to_master("Adding coverage data: {}\n".format(output))

    # The stored library count is scaled by the inverse of the kept
    # fraction, e.g. keeping 50% doubles it.
    num_libs = (float(meta['num_libraries_in_run']) * (1 / (float(fraction) / 100.00)))

    # Fields that are identical for every row of this subsample.
    common_fields = dict(sample=meta['sample_name'],
                         library_name=library_name,
                         run_id="subsample-{}".format(fraction),
                         num_libraries_in_run=num_libs,
                         sequencer_id=meta['sequencer'],
                         program_name="sambamba",
                         extraction=meta['extraction'],
                         panel=meta['panel'],
                         target_pool=meta['target_pool'])

    # Binary mode is the documented way to feed the Python 2 csv module.
    with open(output, 'rb') as coverage:
        reader = csv.reader(coverage, delimiter='\t')
        # Built-in next() instead of the Python-2-only reader.next() method.
        header = next(reader)

        # (column position, integer threshold) for each "percentage<N>" column.
        threshold_columns = [(column, int(title.replace('percentage', '')))
                             for column, title in enumerate(header)
                             if title.startswith("percentage")]
        thresholds = [threshold for _, threshold in threshold_columns]

        for row in reader:
            # Values are stored exactly as read from the file (strings).
            threshold_data = defaultdict(float)
            for column, threshold in threshold_columns:
                threshold_data[threshold] = row[column]

            record = dict(common_fields,
                          amplicon=row[3],
                          num_reads=row[4],
                          mean_coverage=row[5],
                          thresholds=thresholds,
                          perc_bp_cov_at_thresholds=threshold_data)

            # Both models receive the same set of fields.
            SampleCoverage.create(**record)
            AmpliconCoverage.create(**record)