Example #1
def write_coverage(cnf,
                   output_dir,
                   chrom,
                   depths_by_pos,
                   cov_thresholds,
                   sample_index=None):
    coverage_data_fpath = join(output_dir, chrom + '.txt')
    if not cnf.reuse_intermediate or (
            not verify_file(coverage_data_fpath, silent=True)
            and not verify_file(coverage_data_fpath + '.gz', silent=True)):
        chrom_num = chrom.replace('chr', '')
        with file_transaction(cnf.work_dir, coverage_data_fpath) as tx:
            with open(tx, 'w') as f:
                fs = ['#chrom', 'pos', 'mean', 'median'] + [str(t) for t in cov_thresholds]
                f.write('\t'.join(fs) + '\n')
                sorted_positions = sorted(depths_by_pos.keys())
                for pos in sorted_positions:
                    depths = depths_by_pos[pos] if sample_index is None \
                        else [depths_by_pos[pos][sample_index]]
                    mean_coverage = mean(depths)
                    median_coverage = median(depths)
                    pcnt_samples_ge_threshold = [
                        mean([1 if d >= t else 0 for d in depths])
                        for t in cov_thresholds
                    ]
                    res_line = '\t'.join(
                        [chrom_num, str(pos), str(mean_coverage), str(median_coverage)]
                        + [str(p) for p in pcnt_samples_ge_threshold])
                    f.write(res_line + '\n')
    bgzip_and_tabix(cnf, coverage_data_fpath, tabix_parameters='-p bed')
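The last columns written per position hold, for each threshold, the fraction of samples covered at depth >= that threshold. A minimal, self-contained sketch of that computation (plain floats stand in for the mean() helper the snippet assumes is imported):

depths = [12, 3, 40, 0]   # depth of each sample at one position
cov_thresholds = [1, 10, 30]
fractions = [sum(1.0 for d in depths if d >= t) / len(depths) for t in cov_thresholds]
print(fractions)  # [0.75, 0.5, 0.25]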
Example #2
def get_regions_coverage(cnf, samples):
    cov_thresholds = [1, 5, 10, 15, 20, 25, 30, 50, 100]
    depths_by_pos = defaultdict(lambda: [0] * len(samples))
    info()
    info('Coverage to bedgraph for ' + cnf.chrom)
    coverage_fpaths = []
    for index, sample in enumerate(samples):
        coverage_fpath = join(cnf.work_dir,
                              sample.name + '_' + cnf.chrom + '.bedgraph')
        coverage_fpath = get_bedgraph_coverage(cnf,
                                               sample.bam,
                                               chr_len_fpath=cnf.chr_len_fpath,
                                               bed_fpath=cnf.bed,
                                               output_fpath=coverage_fpath,
                                               exit_on_error=False)
        if coverage_fpath and verify_file(coverage_fpath):
            coverage_fpaths.append(coverage_fpath)
            for line in open(coverage_fpath):
                if line.startswith('#'):
                    continue
                chrom, start, end, depth = line.split('\t')
                start, end, depth = map(int, (start, end, depth))
                for pos in xrange(start, end):
                    depths_by_pos[pos][index] = depth

    info()
    if not coverage_fpaths:
        warn(cnf.chrom + ' is not covered in any sample')
        return None

    info()
    info('Writing coverage for ' + cnf.chrom)
    write_coverage(cnf, cnf.output_dir, cnf.chrom, depths_by_pos,
                   cov_thresholds)
    for index, sample in enumerate(samples):
        info('Writing coverage for ' + sample.name + ', ' + cnf.chrom)
        sample_output_dirpath = join(cnf.output_dir, sample.name)
        output_fpath = join(sample_output_dirpath, cnf.chrom + '.txt.gz')
        if cnf.reuse_intermediate and verify_file(output_fpath, silent=True):
            info(output_fpath + ' exists, reusing')
            continue
        write_coverage(cnf,
                       sample_output_dirpath,
                       cnf.chrom,
                       depths_by_pos,
                       cov_thresholds,
                       sample_index=index)
        if not verify_file(output_fpath, silent=True):
            warn(sample.name + ' has no coverage at chromosome ' + cnf.chrom)
    return depths_by_pos
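The depths_by_pos mapping built above keeps one depth slot per sample for every covered position. A small runnable illustration of that accumulation over bedgraph-style half-open intervals (the sample data is made up):

from collections import defaultdict

n_samples = 2
depths_by_pos = defaultdict(lambda: [0] * n_samples)
bedgraph_rows = {0: [(100, 103, 7)], 1: [(101, 104, 9)]}  # sample index -> (start, end, depth)
for sample_index, rows in bedgraph_rows.items():
    for start, end, depth in rows:
        for pos in range(start, end):  # bedgraph intervals are half-open
            depths_by_pos[pos][sample_index] = depth
print(dict(depths_by_pos))
# {100: [7, 0], 101: [7, 9], 102: [7, 9], 103: [0, 9]}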
Example #3
def main():
    info(' '.join(sys.argv))
    info()

    description = 'This script runs preprocessing.'

    parser = OptionParser(description=description)
    parser.add_option('-1', dest='left_reads_fpath', help='Left reads fpath')
    parser.add_option('-2', dest='right_reads_fpath', help='Right reads fpath')
    parser.add_option('--sample', dest='sample_name', help='Sample name')
    parser.add_option('-o', dest='output_dir', help='Output directory path')
    parser.add_option('--downsample-to', dest='downsample_to', default=None, type='int',
                      help='Downsample reads to avoid excessive processing times with large files. '
                           'Default is 1 million. Set to 0 to turn off downsampling.')
    add_cnf_t_reuse_prjname_donemarker_workdir_genome_debug(parser, threads=1)
    (opts, args) = parser.parse_args()
    logger.is_debug = opts.debug

    cnf = Config(opts.__dict__, determine_sys_cnf(opts),
                 determine_run_cnf(opts))
    left_reads_fpath = verify_file(opts.left_reads_fpath, is_critical=True)
    right_reads_fpath = verify_file(opts.right_reads_fpath, is_critical=True)
    output_dirpath = adjust_path(opts.output_dir) if opts.output_dir \
        else critical('Please specify output directory with -o')
    verify_dir(dirname(output_dirpath),
               description='output_dir',
               is_critical=True)

    with workdir(cnf):
        sample_name = cnf.sample_name
        if not sample_name:
            sample_name = _get_sample_name(left_reads_fpath, right_reads_fpath)
        results_dirpath = run_fastq(cnf,
                                    sample_name,
                                    left_reads_fpath,
                                    right_reads_fpath,
                                    output_dirpath,
                                    downsample_to=cnf.downsample_to)

    verify_dir(results_dirpath, is_critical=True)
    info()
    info('*' * 70)
    info('Fastqc results:')
    info('  ' + results_dirpath)
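The _get_sample_name() helper is not shown in this example. A hypothetical stand-in that derives a sample name from paired FASTQ file names could look like this (the suffix markers are assumptions, not the project's actual logic):

import os

def _get_sample_name(left_fpath, right_fpath):
    # right_fpath is unused here; kept only for signature parity
    base = os.path.basename(left_fpath)
    for suffix in ('_R1', '_1.', '.R1'):  # hypothetical read-pair markers
        if suffix in base:
            base = base.split(suffix)[0]
            break
    return base.split('.')[0]

print(_get_sample_name('data/S1_R1.fastq.gz', 'data/S1_R2.fastq.gz'))  # S1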
Example #4
def main():
    info(' '.join(sys.argv))
    info()

    description = 'This script runs preprocessing.'

    parser = OptionParser(description=description)
    parser.add_option('-1', dest='left_reads_fpath', help='Left reads fpath')
    parser.add_option('-2', dest='right_reads_fpath', help='Right reads fpath')
    parser.add_option('--sample', dest='sample_name', help='Sample name')
    parser.add_option('-o', dest='output_dir', help='Output directory path')
    parser.add_option('--downsample-to', dest='downsample_to', default=None, type='int',
        help='Downsample reads to avoid excessive processing times with large files. '
            'Default is 1 million. Set to 0 to turn off downsampling.')
    add_cnf_t_reuse_prjname_donemarker_workdir_genome_debug(parser, threads=1)
    (opts, args) = parser.parse_args()

    if not opts.left_reads_fpath or not opts.right_reads_fpath or not opts.output_dir:
        parser.print_usage()
        sys.exit(1)

    verify_file(opts.left_reads_fpath, is_critical=False)
    left_reads_fpath = adjust_path(opts.left_reads_fpath)
    verify_file(opts.right_reads_fpath, is_critical=False)
    right_reads_fpath = adjust_path(opts.right_reads_fpath)
    output_dirpath = adjust_path(opts.output_dir) if opts.output_dir else critical('Please specify output directory with -o')
    verify_dir(dirname(output_dirpath), description='output_dir', is_critical=True)

    left_reads_fpath, right_reads_fpath, output_dirpath =\
        map(_proc_path, [left_reads_fpath, right_reads_fpath, output_dirpath])

    ssh = connect_to_server(server_url='blue.usbod.astrazeneca.net', username='******', password='******')
    fastqc_py = get_script_cmdline(None, 'python', 'scripts/pre/fastqc.py')
    fastqc_py = fastqc_py.replace(REPORTING_SUITE_PATH_CLARITY, REPORTING_SUITE_PATH_WALTHAM)
    fastqc_py = fastqc_py.replace(PYTHON_PATH_CLARITY, PYTHON_PATH_WALTHAM)

    cmdl = '{fastqc_py} -1 {left_reads_fpath} -2 {right_reads_fpath} -o {output_dirpath}'
    if opts.sample_name:
        cmdl += ' --sample {opts.sample_name}'
    if opts.downsample_to:
        cmdl += ' --downsample-to ' + str(int(opts.downsample_to))
    cmdl = cmdl.format(**locals())
    cmdl += ' 2>&1'
    info(cmdl)
    stdin, stdout, stderr = ssh.exec_command(cmdl)
    for l in stdout:
        err(l, ending='')
    info()
    ssh.close()
Example #5
def save_all_mutations_depth(cnf, infos_by_key):
    mut_bed_fpath = join(cnf.work_dir, 'mutations.bed')

    if not cnf.reuse_intermediate or not verify_file(mut_bed_fpath):
        all_mutations_pos = defaultdict(set)
        for e in infos_by_key.values():
            for mut in e.mutations:
                all_mutations_pos[mut.chrom].add(mut.pos)
        with file_transaction(cnf.work_dir, mut_bed_fpath) as tx:
            with open(tx, 'w') as out_f:
                for chrom, positions in all_mutations_pos.iteritems():
                    for pos in positions:
                        out_f.write('\t'.join([chrom, str(pos - 1), str(pos)]) + '\n')

    sambamba_output_by_experiment = run_sambamba_use_grid(
        cnf, infos_by_key, mut_bed_fpath)

    for e, sambamba_output_fpath in sambamba_output_by_experiment.iteritems():
        regions = parse_sambamba_depth_output(e.sample.name,
                                              sambamba_output_fpath)
        depth_dict = {}
        for region in regions:
            depth_dict[region.end] = region.avg_depth
        e.mutations_depth = depth_dict
Example #6
def extract_graphs(samples):  # Sample(name, fastq_fpath)
    parsed_data = OrderedDict((h, list()) for h in _header)

    for s in samples:
        if verify_file(s.fastqc_html_fpath,
                       's.fastqc_html_fpath for ' + s.name):
            with open(s.fastqc_html_fpath) as source_file_obj:
                html = source_file_obj.read()
                parts = [
                    p.split('</div>')[0]
                    for p in html.split('<div class="module">')[1:]
                ]
                # <h2><img/></h2><table></table></div>  OR  <h2><img/></h2><p><img/></p></div>
                for i, part in enumerate(parts):
                    # info('Parsing ' + _header[i])
                    # info(str(part))
                    table, graph = '', ''
                    ok_img = '<img ' + part.split('"><img')[1].split('>')[0] + '>'
                    if '<table>' in part:
                        table = '<table>' + part.split('<table>')[1]
                    if '<p><img ' in part:
                        graph = '<img ' + part.split('<p><img')[1].split('>')[0] + '>'
                    parsed_data[_header[i]].append([s.name, ok_img, graph, table])

                # module_divs = soup.find_all("div", class_="module")
                # _sort_graph_by_type(parsed_data, module_divs, s.name)
                # soup.decompose()
        else:
            err('Could not find fastqc html fpath for sample ' + s.name +
                ': ' + str(s.fastqc_html_fpath))

    return parsed_data
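A toy, runnable illustration of the string-splitting extraction above, assuming FastQC-like module markup (the sample HTML is made up):

html = '<div class="module"><h2 id="M0"><img src="ok.png"></h2><table><tr></tr></table></div>'
parts = [p.split('</div>')[0] for p in html.split('<div class="module">')[1:]]
part = parts[0]
ok_img = '<img ' + part.split('"><img')[1].split('>')[0] + '>'
table = ('<table>' + part.split('<table>')[1]) if '<table>' in part else ''
print(ok_img)  # <img  src="ok.png">
print(table)   # <table><tr></tr></table>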
Example #7
def get_key_genes(key_genes_fpath):
    key_genes_fpath = verify_file(key_genes_fpath, is_critical=True,
                                  description='820 AZ key genes')
    with open(key_genes_fpath) as f:
        key_gene_names = set(l.strip() for l in f.readlines() if l.strip() != '')

    return key_gene_names
Example #8
def run_fastq(cnf,
              sample_name,
              l_r_fpath,
              r_r_fpath,
              output_dirpath,
              downsample_to=1e7):
    fastqc = get_system_path(cnf, 'fastqc', is_critical=True)
    java = get_system_path(cnf, 'java', is_critical=True)

    if downsample_to:
        info('Downsampling to ' + str(downsample_to))
        # use the downsampled files for the combined fastq below
        l_r_fpath, r_r_fpath = downsample(cnf,
                                          sample_name,
                                          l_r_fpath,
                                          r_r_fpath,
                                          downsample_to,
                                          output_dir=cnf.work_dir)

    # Joining fastq files to run on a combination
    fastqc_fpath = join(cnf.work_dir, sample_name + '.fq')
    info('Combining fastqs, writing to ' + fastqc_fpath)
    with open(fastqc_fpath, 'w') as out:
        out.write(open_gzipsafe(l_r_fpath).read())
        out.write(open_gzipsafe(r_r_fpath).read())

    # Running FastQC
    info('Running FastQC')
    tmp_dirpath = join(cnf.work_dir, 'FastQC_' + sample_name + '_tmp')
    safe_mkdir(tmp_dirpath)
    cmdline = '{fastqc} --dir {tmp_dirpath} --extract -o {output_dirpath} -f fastq -j {java} {fastqc_fpath}'.format(
        **locals())
    call(cnf, cmdline)

    # Cleaning and getting report
    sample_fastqc_dirpath = join(output_dirpath, sample_name + '.fq_fastqc')
    if isfile(sample_fastqc_dirpath + '.zip'):
        os.remove(sample_fastqc_dirpath + '.zip')
    fastqc_html_fpath = join(sample_fastqc_dirpath, 'fastqc_report.html')
    verify_file(fastqc_html_fpath, is_critical=True)

    return sample_fastqc_dirpath
Example #9
def parse_gene_counts(counts_fpath, key_gene_names, report_name,
                      keep_gene_names):
    gene_counts = defaultdict(list)
    info('Preparing ' + report_name + ' stats for expression heatmaps')
    info('Checking ' + counts_fpath)
    if not verify_file(counts_fpath):
        err('Cannot find ' + report_name + ' fpath')
        return gene_counts, []

    info('Reading ' + report_name + ' from ' + counts_fpath)
    samples_cols = dict()
    samples = []
    gene_col = None

    with open(counts_fpath) as f:
        for i, l in enumerate(f):
            if i == 0:
                header = l.strip().split('\t')
                gene_col = header.index('HUGO')
                samples = header[1:gene_col]
                samples_cols = {
                    sample: col + 1
                    for col, sample in enumerate(samples)
                }
                continue
            fs = l.replace('\n', '').split('\t')
            gene_name = fs[gene_col]
            if key_gene_names and gene_name not in key_gene_names:
                continue
            gene_expression_dict = {
                sample: int(float(fs[col])) if float(fs[col]).is_integer() else float(fs[col])
                for sample, col in samples_cols.iteritems()}
            if all(v < HEATMAPS_MIN_COUNT
                   for v in gene_expression_dict.values()):
                continue
            is_hidden_row = False
            name = gene_name
            if ':' in fs[0]:  ## exon number
                is_hidden_row = True
                exon_number = fs[0].split(':')[1]
                name += ':' + exon_number
            if keep_gene_names:
                is_hidden_row = True
                name = fs[0]  # use id
            gene = Counts(name,
                          gene_name=gene_name,
                          counts=gene_expression_dict,
                          is_hidden_row=is_hidden_row)
            gene_counts[gene_name].append(gene)

    return gene_counts, samples
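The parser above expects a tab-separated layout with an id column first, one column per sample, and a HUGO gene-name column after them. A minimal sketch of the column lookup on such a header (sample names are made up):

header = 'id\tsampleA\tsampleB\tHUGO'.split('\t')
gene_col = header.index('HUGO')  # 3
samples = header[1:gene_col]     # ['sampleA', 'sampleB']
samples_cols = dict((s, c + 1) for c, s in enumerate(samples))  # {'sampleA': 1, 'sampleB': 2}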
Example #10
def annotate_gene_counts(cnf, counts_fpath, ann_counts_fpath, genes_dict):
    unannotated_fpath = counts_fpath
    if not verify_file(unannotated_fpath):
        critical('Not found counts ' + unannotated_fpath)
    with file_transaction(cnf.work_dir, ann_counts_fpath) as tx:
        with open(tx, 'w') as annotated_f:
            with open(unannotated_fpath) as f:
                for i, l in enumerate(f):
                    if i == 0:
                        header = l.replace('\n', '').split('\t')
                        l = '\t'.join(header + ['HUGO'])
                        annotated_f.write(l + '\n')
                        continue
                    fs = l.replace('\n', '').split('\t')
                    gene_and_exon = fs[0].split(':')
                    gene_id = gene_and_exon[0]
                    if gene_id not in genes_dict:
                        continue
                    gene_symbol = genes_dict[gene_id]
                    l = '\t'.join(fs + [gene_symbol])
                    annotated_f.write(l + '\n')
    if not verify_file(ann_counts_fpath):
        critical('Could not annotate counts ' + unannotated_fpath)
Example #11
def run_sambamba_use_grid(cnf, infos_by_key, mut_bed_fpath):
    sambamba_output_by_experiment = dict()
    not_submitted_experiments = infos_by_key.values()
    while not_submitted_experiments:
        jobs_to_wait = []
        submitted_experiments = []
        reused_experiments = []

        for (group, uniq_key), e in infos_by_key.iteritems():
            if e not in not_submitted_experiments:
                continue
            sambamba_output_fpath = join(cnf.work_dir,
                                         uniq_key + '__mutations.bed')
            sambamba_output_by_experiment[e] = sambamba_output_fpath

            if cnf.reuse_intermediate and verify_file(sambamba_output_fpath,
                                                      silent=True):
                info(sambamba_output_fpath + ' exists, reusing')
                reused_experiments.append(e)
                continue
            else:
                if not e.sample.bam:
                    err('Sample ' + e.sample.name + ' in ' + str(group) +
                        ', ' + str(uniq_key) + ' has no BAM')
                    continue
                j = sambamba_depth(cnf,
                                   mut_bed_fpath,
                                   e.sample.bam,
                                   output_fpath=sambamba_output_fpath,
                                   only_depth=True,
                                   silent=True,
                                   use_grid=True)
                submitted_experiments.append(e)

                if not j.is_done:
                    jobs_to_wait.append(j)
                if len(jobs_to_wait) >= cnf.threads:
                    break
        if jobs_to_wait:
            info('Submitted ' + str(len(jobs_to_wait)) + ' jobs, waiting...')
            jobs_to_wait = wait_for_jobs(cnf, jobs_to_wait)
        else:
            info('No jobs to submit.')
        not_submitted_experiments = [
            e for e in not_submitted_experiments
            if e not in submitted_experiments and e not in reused_experiments
        ]

    return sambamba_output_by_experiment
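run_sambamba_use_grid() submits at most cnf.threads grid jobs at a time, waits for that batch, and loops until every experiment is either submitted or reused. The same control flow in a generic, runnable form (run_task stands in for sambamba_depth plus the wait step):

def run_in_batches(tasks, batch_size, run_task):
    results = {}
    pending = list(tasks)
    while pending:
        batch, pending = pending[:batch_size], pending[batch_size:]
        for t in batch:                # 'submit' one batch...
            results[t] = run_task(t)   # ...and 'wait' by running it synchronously
    return results

print(run_in_batches([1, 2, 3], 2, lambda t: t * 10))  # {1: 10, 2: 20, 3: 30}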
Example #12
def process_all(cnf, bcbio_structure):
    samples = bcbio_structure.samples
    key_gene_by_name, use_custom_panel = get_key_or_target_bed_genes(
        cnf.bed, verify_file(adjust_system_path(cnf.key_genes), 'key genes'))
    key_or_target_genes = 'target' if use_custom_panel else 'key'
    mutations = {}
    for sample in samples:
        mutations[sample.name] = parse_mutations(cnf,
                                                 sample,
                                                 key_gene_by_name,
                                                 cnf.mutations_fpath,
                                                 key_or_target_genes,
                                                 for_flagged_report=True)
    _generate_summary_flagged_regions_report(cnf, bcbio_structure, samples,
                                             mutations, key_or_target_genes)
Example #13
def main():
    cnf, vcf2txt_res_fpath = get_args()

    info('-' * 70)
    info('Writing to ' + cnf.output_file)
    if cnf.all_transcripts_output_file:
        info('Writing info for all transcripts to ' +
             cnf.all_transcripts_output_file)
    if cnf.fm_output_file:
        info('Writing in FM format to ' + cnf.fm_output_file)
    if cnf.rejected_output_file:
        info('Writing rejected mutations to ' + cnf.rejected_output_file)

    f = Filtration(cnf)

    input_f = open(verify_file(vcf2txt_res_fpath))
    output_f = open(adjust_path(cnf.output_file), 'w')
    rejected_output_f = open(adjust_path(cnf.rejected_output_file), 'w') \
        if cnf.rejected_output_file else None
    fm_output_f = open(adjust_path(cnf.fm_output_file), 'w') \
        if cnf.fm_output_file else None
    all_transcripts_output_f = open(adjust_path(cnf.all_transcripts_output_file), 'w') \
        if cnf.all_transcripts_output_file else None

    info()
    info('-' * 70)
    info('Running filtering...')
    f.do_filtering(input_f, output_f, fm_output_f, all_transcripts_output_f,
                   rejected_output_f)

    input_f.close()
    output_f.close()
    if fm_output_f:
        fm_output_f.close()
    if all_transcripts_output_f:
        all_transcripts_output_f.close()

    info()
    if cnf.rejected_output_file:
        info('Rejected mutations saved to ' + cnf.rejected_output_file)
    info('Saved to ' + cnf.output_file)
Example #14
def _links_show_hide(out, samples):
    out.write(
        '<form name="tcol" onsubmit="return false">  Show columns <br/>\n')
    out.write('<table>\n')
    i = 0

    list_of_chunks = list(_chunks(
        [s.name for s in samples if verify_file(s.fastqc_html_fpath)], 6))

    for names_chunk in list_of_chunks:
        out.write('<tr>\n')
        for sample_name in names_chunk:
            out.write('<td><input type=checkbox name="col' + str(i) +
                      '" onclick="toggleVis(' + str(i) + ')" checked> ' +
                      sample_name + '</td>\n')
            i += 1
        out.write('</tr>\n')
    out.write('</table>\n')
    out.write('</form> \n')
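_chunks() is not defined in this example; a conventional implementation, matching how it is called above with a list of names and a chunk size of 6, would be (an assumption):

def _chunks(lst, n):
    # yield consecutive pieces of lst with at most n items each
    for i in range(0, len(lst), n):
        yield lst[i:i + n]

print(list(_chunks(['a', 'b', 'c', 'd', 'e'], 2)))  # [['a', 'b'], ['c', 'd'], ['e']]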
Example #15
def get_rejected_mutations(cnf, bs, key_gene_by_name_chrom,
                           genes_collection_type):
    rejected_mutations = defaultdict(dict)
    rejected_mutations_by_sample = defaultdict(list)

    pass_mutations_fpath, _ = get_mutations_fpath_from_bs(bs)

    for reject_mutations_fpath in get_rejected_mutations_fpaths(
            pass_mutations_fpath):
        if verify_file(reject_mutations_fpath, silent=True):
            info('Parsing rejected mutations from ' +
                 str(reject_mutations_fpath))
            parse_mutations(cnf,
                            None,
                            key_gene_by_name_chrom,
                            reject_mutations_fpath,
                            genes_collection_type,
                            mutations_dict=rejected_mutations_by_sample)
            for sample, mutations in rejected_mutations_by_sample.iteritems():
                for mut in mutations:
                    rejected_mutations[sample][(mut.gene.name, mut.pos)] = mut
    return rejected_mutations
Example #16
def draw_seq2c_plot(cnf,
                    seq2c_tsv_fpath,
                    sample_name,
                    output_dir,
                    key_gene_names=None,
                    chr_lens=None):
    info('Seq2C plot builder')
    plot_fpath = join(output_dir, sample_name + cnv_plot_ending)
    if cnf.reuse_intermediate and verify_file(plot_fpath, silent=True):
        info('Seq2C plot ' + plot_fpath + ' exists, reusing...')
        return plot_fpath

    if not verify_file(seq2c_tsv_fpath, 'Seq2C.tsv'):
        return None

    chr_names_lengths = OrderedDict(
        (chr_, l) for chr_, l in (chr_lens or get_chr_lengths(cnf))
        if '_' not in chr_)  # not drawing extra chromosomes chr1_blablabla
    chr_names = chr_names_lengths.keys()
    chr_short_names = [chrom[3:] for chrom in chr_names]
    chr_lengths = chr_names_lengths.values()

    fig = matplotlib.pyplot.figure(figsize=(25, 5))
    matplotlib.pyplot.xlim([0, len(chr_lengths) + 1])
    chr_cum_lens = [sum(chr_lengths[:i]) for i in range(len(chr_lengths) + 1)]
    matplotlib.pyplot.xticks(chr_cum_lens, [])

    ax = matplotlib.pyplot.gca()
    chr_names_coords = [
        chr_cum_lens[i + 1] - chr_lengths[i] / 2
        for i in range(len(chr_lengths))
    ]
    ax.xaxis.set_minor_locator(ticker.FixedLocator(chr_names_coords))
    ax.xaxis.set_minor_formatter(ticker.FixedFormatter(chr_short_names))

    # def add_rec_to_plot(chr_, start, end, log2r, max_y, min_y, marker, color, label=None):
    #     x_vals = [chr_cum_lengths[chr_names.index(chr_)] + (int(start) + int(end))/2]
    #     point_y = float(log2r)
    #     y_vals = [point_y]
    #     max_y = max(max_y, point_y)
    #     min_y = min(min_y, point_y)
    #     if label:
    #         matplotlib.pyplot.plot(x_vals, y_vals, marker, markersize=2, label=label)
    #     else:
    #         matplotlib.pyplot.plot(x_vals, y_vals, marker, markersize=2)
    #     return max_y, min_y

    chr_cum_len_by_chrom = dict(zip(chr_names, chr_cum_lens))
    nrm_xs = []
    nrm_ys = []
    amp_xs = []
    amp_ys = []
    amp_gs = []
    del_xs = []
    del_ys = []
    del_gs = []
    with open(seq2c_tsv_fpath) as f:
        for i, l in enumerate(f):
            if i == 0: continue
            fs = l.replace('\n', '').split('\t')
            sname, gname = fs[0], fs[1]
            if key_gene_names and gname not in key_gene_names: continue
            if sname != sample_name: continue

            sname, gname, chrom, start, end, length, log2r, sig, type_, amp_del, ab_seg, total_seg, \
                ab_log2r, log2r_diff, ab_seg_loc, ab_samples, ab_samples_pcnt = fs[:17]
            x = chr_cum_len_by_chrom[chrom] + (int(start) + int(end)) / 2

            if not ab_log2r or type_ == 'BP':  # breakpoint, meaning part of exon is not amplified
                nrm_xs.append(x)
                nrm_ys.append(float(log2r))
                # add_rec_to_plot(chrom, start, end, log2r, max_y, min_y, marker='b.')

            if ab_log2r:
                y = float(ab_log2r)
                if amp_del == 'Amp':
                    amp_xs.append(x)
                    amp_ys.append(y)
                    amp_gs.append(gname)
                elif amp_del == 'Del':
                    del_xs.append(x)
                    del_ys.append(y)
                    del_gs.append(gname)
                else:
                    warn('Event is not Amp or Del, it\'s ' + amp_del)

                # max_y, min_y = add_rec_to_plot(chrom, start, end, log2r, max_y, min_y, marker=color + 'o', label=gname)

                # log2r = float(log2r)
                # if -0.5 < log2r < 0.5:
                #     color = 'k'
                # elif -1.5 < log2r < 1.5:
                #     color = 'g'
                # else:
                #     color = 'r'

    matplotlib.pyplot.scatter(nrm_xs, nrm_ys, marker='.', color='k', s=1)
    matplotlib.pyplot.scatter(amp_xs, amp_ys, marker='o', color='b', s=2)
    matplotlib.pyplot.scatter(del_xs, del_ys, marker='o', color='r', s=2)
    if len(amp_xs) <= 10 or len(amp_xs) + len(del_xs) < 40:
        for x, y, g in zip(amp_xs, amp_ys, amp_gs):
            ax.text(x,
                    y,
                    g,
                    fontsize=9,
                    color='g',
                    verticalalignment='center',
                    horizontalalignment='center')
    if len(del_xs) <= 10 or len(amp_xs) + len(del_xs) < 40:
        for x, y, g in zip(del_xs, del_ys, del_gs):
            ax.text(x,
                    y,
                    g,
                    fontsize=9,
                    color='r',
                    verticalalignment='center',
                    horizontalalignment='center')

    matplotlib.pyplot.ylim(ymax=max(chain(nrm_ys, amp_ys, del_ys, [2])) * 1.05,
                           ymin=min(chain(nrm_ys, amp_ys, del_ys, [-2])) * 1.05)
    matplotlib.pyplot.tick_params(axis='x',
                                  which='minor',
                                  bottom='off',
                                  top='off',
                                  labelbottom='on')
    info('Saving plot to ' + plot_fpath)
    matplotlib.pyplot.tight_layout()
    fig.savefig(plot_fpath, bbox_inches='tight')
    matplotlib.pyplot.close(fig)

    info('Done')
    info('-' * 70)
    return plot_fpath
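The plot concatenates chromosomes along x, so each event's genome-wide x coordinate is its chromosome's cumulative offset plus the event midpoint. A runnable sketch with hypothetical chromosome lengths:

chr_lengths = [248956422, 242193529, 198295559]  # hypothetical chr1..chr3 lengths
chr_cum_lens = [sum(chr_lengths[:i]) for i in range(len(chr_lengths) + 1)]
print(chr_cum_lens)  # [0, 248956422, 491149951, 689445510]
start, end = 1000, 3000                          # an event on the 2nd chromosome
x = chr_cum_lens[1] + (start + end) // 2
print(x)  # 248958422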
Example #17
def get_args():
    info(' '.join(sys.argv))
    info()
    description = (
        'The program will filter the VarDict output after vcf2txt.pl to '
        'candidate interpretable mutations, somatic or germline.')
    parser = OptionParser(description=description)
    add_cnf_t_reuse_prjname_donemarker_workdir_genome_debug(parser, threads=1)

    parser.add_option('-o', dest='output_file')
    parser.add_option('--o-all-transcripts',
                      dest='all_transcripts_output_file')
    parser.add_option('--o-fm', dest='fm_output_file')
    parser.add_option('--o-reject', dest='rejected_output_file')

    parser.add_option('--cohort-freqs', dest='cohort_freqs_fpath')
    parser.add_option('--transcripts', dest='transcripts_fpath')

    parser.add_option('-D',
                      '--min-depth',
                      dest='filt_depth',
                      type='int',
                      help='The minimum total depth')
    parser.add_option('-V',
                      '--min-vd',
                      dest='min_vd',
                      type='int',
                      help='The minimum reads supporting variant')
    parser.add_option(
        '--gmaf', dest='min_gmaf', type='float',
        help='When the GMAF is greater than specified, it\'s considered a common SNP '
             'and filtered out.')
    parser.add_option(
        '-f',
        '--min-freq',
        dest='min_freq',
        type='float',
        help='The minimum allele frequency for regular variants.')
    parser.add_option(
        '-F', '--min-freq-hs', '--act-min-freq', dest='act_min_freq', type='float',
        help='The minimum allele frequency for hotspot somatic mutations, typically lower than -f. '
             'Default: 0.01 or half of -f, whichever is less')
    parser.add_option(
        '-N', '--keep-utr-intronic', dest='keep_utr_intronic', action='store_true',
        help='Keep all intronic and UTR variants in the output; they will be marked as "unknown".')

    parser.add_option(
        '-p', '--platform', dest='platform',
        help='The platform, such as WXS, WGS, RNA-Seq, VALIDATION, etc. No default. '
             'Used for output in FM\'s format')

    parser.set_usage('Usage: ' + __file__ +
                     ' vcf2txt_res_fpath [opts] -o output_fpath')

    (opts, args) = parser.parse_args()
    if len(args) < 1:
        critical('Provide the first argument - output from vcf2txt.pl')
    logger.is_debug = opts.debug

    vcf2txt_res_fpath = verify_file(args[0], is_critical=True)

    run_cnf = determine_run_cnf(opts)
    cnf = Config(opts.__dict__, determine_sys_cnf(opts), run_cnf)
    if not cnf.genome:
        critical('Please specify the --genome option (e.g. --genome hg19)')

    check_genome_resources(cnf)

    if not cnf.output_file:
        critical('Please specify the output fpath with -o')

    info()

    return cnf, vcf2txt_res_fpath
Example #18
def sync_with_ngs_server(cnf,
                         jira_url,
                         project_name,
                         sample_names,
                         summary_report_fpath,
                         dataset_dirpath=None,
                         bcbio_final_dirpath=None,
                         jira_case=None):

    if is_us(): loc = us
    elif is_uk(): loc = uk
    elif is_local(): loc = local
    elif is_sweden(): loc = sweden
    else:
        return None

    html_report_url = None
    if any(p in realpath((bcbio_final_dirpath or dataset_dirpath))
           for p in loc.proper_path_should_contain):
        info('Location is ' + loc.loc_id + ', exposing reports to ' +
             loc.reports_dirpath)

        if jira_case is None and is_az() and jira_url:
            info()
            info('Getting info from JIRA...')
            jira_case = retrieve_jira_info(jira_url)

        proj_dirpath_on_server = _symlink_dirs(
            cnf=cnf,
            loc=loc,
            project_name=project_name,
            final_dirpath=bcbio_final_dirpath,
            dataset_dirpath=dataset_dirpath)
        # html_report_fpath=summary_report_fpath,
        # html_report_url=html_report_url)

        if bcbio_final_dirpath:
            html_report_url = join(
                loc.report_url_base,  # http://ngs.usbod.astrazeneca.net/reports/
                relpath(proj_dirpath_on_server, loc.reports_dirpath),  # project_name/dataset/project_name
                relpath(summary_report_fpath, dirname(bcbio_final_dirpath)))  # final/2015_01_01_project/project.html
        elif dataset_dirpath:
            html_report_url = join(
                loc.report_url_base,
                relpath(proj_dirpath_on_server, loc.reports_dirpath),
                relpath(summary_report_fpath, dataset_dirpath))

        # html_report_full_url = join(loc.website_url_base, 'samples.php?project_name=' + project_name + '&file=' + html_report_url)
        # info('HTML url: ' + html_report_full_url)

        if verify_file(loc.csv_fpath, 'Project list'):
            write_to_csv_file(work_dir=cnf.work_dir,
                              jira_case=jira_case,
                              project_list_fpath=loc.csv_fpath,
                              country_id=loc.loc_id,
                              project_name=project_name,
                              samples_num=len(sample_names),
                              analysis_dirpath=dirname(bcbio_final_dirpath)
                              if bcbio_final_dirpath else None,
                              html_report_url=html_report_url)
    return html_report_url
Example #19
def main(args):
    if len(args) < 2:
        sys.exit('Usage ' + __file__ +
                 ' input.tsv bcbio.csv [dir_with_bams] [bina_dir]')

    inp_fpath = args[0]
    verify_file(args[0], is_critical=True)

    out_fpath = args[1]
    verify_dir(dirname(adjust_path(out_fpath)), is_critical=True)

    bam_dirpath = None
    if len(args) > 2:
        bam_dirpath = args[2]
        verify_dir(adjust_path(bam_dirpath), is_critical=True)

    # bam_opt = args[2]
    # try:
    #     bam_col = int(bam_opt)
    #     bam_dirpath = None
    # except ValueError:
    #     bam_col = None
    #     verify_dir(bam_opt, is_critical=True)
    #     bam_dirpath = args[2]

    bina_dirpath = None
    if len(args) > 3:
        bina_dirpath = args[3]
        verify_dir(dirname(adjust_path(bina_dirpath)), is_critical=True)

    # filtered_bams_dirpath = adjust_path(sys.argv[3])
    # verify_dir(join(filtered_bams_dirpath, os.pardir), is_critical=True)

    columns_names = 'study	barcode	disease	disease_name	sample_type	sample_type_name	analyte_type	library_type	center	center_name	platform	platform_name	assembly	filename	 files_size 	checksum	analysis_id	aliquot_id	participant_id	sample_id	tss_id	sample_accession	published	uploaded	modified	state	reason'

    samples_by_patient = defaultdict(list)

    delim = '\t'
    barcode_col = 1
    bam_col = 13
    is_tcga_tsv = True

    with open(inp_fpath) as fh:
        for i, l in enumerate(fh):
            if not l.strip():
                continue

            if i == 0:
                if len(l.split('\t')) == 27:
                    err('Interpreting as TCGA tsv')
                    if l.split('\t')[0] != 'TCGA': continue  # skipping header
                else:
                    delim = None
                    for j, f in enumerate(l.split()):
                        if f.startswith('TCGA'):
                            barcode_col = j
                            err('barcode col is ' + str(j))
                        if f.endswith('bam'):
                            bam_col = j
                            err('bam col is ' + str(j))
                    is_tcga_tsv = False

            fs = l.split(delim)

            barcode = fs[barcode_col].split('-')  # TCGA-05-4244-01A-01D-1105-08

            sample = Sample()
            sample.bam = fs[bam_col]
            sample.bam_base_name = basename(os.path.splitext(fs[bam_col])[0])
            sample.description = fs[barcode_col]
            sample.patient = '-'.join(barcode[:3])
            if is_tcga_tsv:
                sample.reason = fs[26]

            sample_type = int(barcode[3][:2])
            if sample_type >= 20 or sample_type <= 0:
                continue
            sample.is_normal = 10 <= sample_type < 20
            sample.is_blood = sample_type in [3, 4, 9, 10]  # https://tcga-data.nci.nih.gov/datareports/codeTablesReport.htm

            if any(s.description == sample.description
                   for s in samples_by_patient[sample.patient]):
                prev_sample = next(s
                                   for s in samples_by_patient[sample.patient]
                                   if s.description == sample.description)

                # comp reason
                # if 'Fileset modified' not in prev_sample.reason and 'Fileset modified' in sample.reason:
                #     err('Duplicated sample: ' + sample.description + '  Fileset modified not in old ' + prev_sample.name + ' over ' + sample.name)
                #     pass
                # elif 'Fileset modified' in prev_sample.reason and 'Fileset modified' not in sample.reason:
                #     samples_by_patient[sample.patient].remove(prev_sample)
                #     samples_by_patient[sample.patient].append(sample)
                #     err('Duplicated sample: ' + sample.description + '  Fileset modified not in new ' + sample.name + ' over ' + prev_sample.name)
                # else:
                # comp version
                prev_version = get_bam_version(prev_sample.bam_base_name)
                version = get_bam_version(sample.bam_base_name)
                err('Duplicated sample: ' + sample.description +
                    '  Resolving by version (' +
                    ' over '.join(map(str, sorted([prev_version, version])[::-1])) + ')')
                if version > prev_version:
                    samples_by_patient[sample.patient].remove(prev_sample)
                    samples_by_patient[sample.patient].append(sample)
            else:
                samples_by_patient[sample.patient].append(sample)

    batches = []
    final_samples = set()

    if bina_dirpath:
        safe_mkdir(bina_dirpath)

    for patient, patient_samples in samples_by_patient.iteritems():
        tumours = [s for s in patient_samples if not s.is_normal]
        normals = [s for s in patient_samples if s.is_normal]

        main_normal = None
        if len(normals) >= 1:
            if any(n.is_blood for n in normals):
                main_normal = next(n for n in normals if n.is_blood)
            else:
                main_normal = normals[0]
                if tumours:
                    for n in normals[1:]:
                        b = Batch(n.description + '-batch')
                        b.tumour = n
                        batches.append(b)

        for t in tumours:
            b = Batch(t.description + '-batch')
            b.tumour = t
            t.batches.add(b)
            final_samples.add(t)
            if main_normal:
                b.normal = main_normal
                main_normal.batches.add(b)
                final_samples.add(main_normal)
            batches.append(b)

        ##################
        ###### Bina ######
        if bina_dirpath:
            bina_patient_dirpath = join(bina_dirpath, patient)
            safe_mkdir(bina_patient_dirpath)
            normals_csv_fpath = join(bina_patient_dirpath, 'normals.csv')
            tumours_csv_fpath = join(bina_patient_dirpath, 'tumors.csv')

            if main_normal:
                with open(normals_csv_fpath, 'w') as f:
                    f.write('name,bam\n')
                    bam_fpath = join(bam_dirpath, main_normal.bam) if bam_dirpath else main_normal.bam
                    f.write(main_normal.description + ',' + bam_fpath + '\n')

            with open(tumours_csv_fpath, 'w') as f:
                f.write('name,bam\n')
                for t in tumours:
                    bam_fpath = join(bam_dirpath, t.bam) if bam_dirpath else t.bam
                    f.write(t.description + ',' + bam_fpath + '\n')

    if bina_dirpath:
        err('Saved bina CSVs to ' + bina_dirpath)

    ###########################
    ######## Bcbio CSV ########
    print 'bcbio_nextgen.py -w template bcbio.yaml', out_fpath,
    with open(out_fpath, 'w') as out:
        out.write('sample,description,batch,phenotype\n')
        for s in sorted(final_samples, key=lambda s: s.bam_base_name):
            out.write(','.join([
                s.bam_base_name,
                s.description,
                ';'.join(sorted(b.name for b in s.batches)),
                'normal' if s.is_normal else 'tumor']) + '\n')
            bam_fpath = join(bam_dirpath, s.bam) if bam_dirpath else s.bam

            if verify_bam(bam_fpath, is_critical=False):
                try:
                    bam = pysam.Samfile(bam_fpath, "rb")
                except ValueError:
                    err(traceback.format_exc())
                    err('Cannot read ' + bam_fpath)
                    err()
                    # n_rgs = max(1, len(bam.header.get("RG", [])))
                else:
                    print bam_fpath,
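For reference, the batching logic above keys everything off TCGA barcodes; a runnable recap of the decoding, using the barcode cited in the code's own comment:

barcode = 'TCGA-05-4244-01A-01D-1105-08'.split('-')
patient = '-'.join(barcode[:3])    # 'TCGA-05-4244'
sample_type = int(barcode[3][:2])  # 1 -> tumor; 10..19 would mean normal
is_normal = 10 <= sample_type < 20
print(patient)      # TCGA-05-4244
print(sample_type)  # 1
print(is_normal)    # False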