def make_key_genes_cov_report(experiment_by_key):
    """Build a per-gene coverage report spanning several experiments.

    For every experiment three columns are declared (average depth, percent
    covered at the experiment's depth cutoff, and CNV).  One row is then
    emitted per gene name, sorted alphabetically, holding the values of each
    experiment in which the gene occurs.

    :param experiment_by_key: mapping of column-label prefix -> experiment.
        Each experiment is read for ``ave_depth``, ``depth_cutoff``, ``key``,
        ``sample`` and ``key_gene_by_name``.
    :return: the populated ``PerRegionSampleReport``.
    """
    info('Making key genes coverage report...')

    metrics = [
        Metric('Gene'),
        Metric('Chr', with_heatmap=False, max_width=20, align='right'),
    ]
    for position, (key, experiment) in enumerate(experiment_by_key.items()):
        depth_metric = Metric(
            key + ' Ave depth',
            short_name=key + '\nave depth',
            med=experiment.ave_depth,
            class_='shifted_column' if position == 0 else '')
        cov_metric = Metric(
            key + ' % cov at {}x'.format(experiment.depth_cutoff),
            short_name='% at {}x'.format(experiment.depth_cutoff),
            unit='%',
            med=1,
            low_inner_fence=0.5,
            low_outer_fence=0.1)
        # The leading spaces in the short name are a hack for IE9, which lacks
        # "text-align: left" and would stick "CNV" to the previous column
        # header otherwise.
        cnv_metric = Metric(key + ' CNV', short_name='  CNV')
        metrics += [depth_metric, cov_metric, cnv_metric]

    metric_storage = MetricStorage(sections=[ReportSection(metrics=metrics)])
    first_experiment = list(experiment_by_key.values())[0]
    key_genes_report = PerRegionSampleReport(
        sample=first_experiment.sample,
        metric_storage=metric_storage)

    # Group the per-experiment gene objects by gene name.
    hits_by_gene = OrderedDefaultDict(OrderedDict)
    for experiment in experiment_by_key.values():
        for gene in experiment.key_gene_by_name.values():
            hits_by_gene[gene.name][experiment] = gene

    # Writing records, one row per gene name in alphabetical order.
    for _, hit_by_experiment in sorted(hits_by_gene.items(),
                                       key=lambda pair: pair[0]):
        # Any non-empty hit provides the gene-level columns.
        gene = next(
            (h for h in hit_by_experiment.values() if h is not None), None)

        row = key_genes_report.add_row()
        row.add_record('Gene', gene.name)
        row.add_record('Chr', gene.chrom.replace('chr', ''))

        for experiment, hit in hit_by_experiment.items():
            row.add_record(experiment.key + ' Ave depth', hit.ave_depth)

            cov_metric = metric_storage.find_metric(
                experiment.key +
                ' % cov at {}x'.format(experiment.depth_cutoff))
            row.add_record(
                cov_metric.name,
                next((covered
                      for thresh, covered in hit.cov_by_threshs.items()
                      if thresh == experiment.depth_cutoff), None))

            cnv = hit.seq2c_event
            if cnv and (cnv.is_amp() or cnv.is_del()):
                row.add_record(experiment.key + ' CNV',
                               cnv.amp_del + ', ' + cnv.fragment)

    return key_genes_report
def main():
    """Command-line entry point: draw the Seq2C plot for one sample.

    Reads options and configs, checks system and genome resources, optionally
    loads a set of key gene names (one per line, blank lines ignored) and
    delegates the plotting to ``draw_seq2c_plot``.
    """
    extra_opts = [
        (['--seq2c-results'], dict(dest='seq2c_tsv_fpath')),
        (['--key-genes'], dict(dest='key_genes_fpath')),
    ]
    # NOTE(review): file_keys names 'key_genes' while the option's dest is
    # 'key_genes_fpath' -- confirm which key read_opts_and_cnfs expects.
    cnf = read_opts_and_cnfs(
        description='Plotting Seq2C results.',
        extra_opts=extra_opts,
        required_keys=['seq2c_tsv_fpath', 'output_dir'],
        file_keys=['seq2c_tsv_fpath', 'key_genes'],
        key_for_sample_name=None,
    )
    check_system_resources(cnf)
    check_genome_resources(cnf)

    key_gene_names = None
    if cnf.key_genes_fpath:
        with open(cnf.key_genes_fpath) as genes_file:
            stripped_names = (line.strip() for line in genes_file)
            key_gene_names = set(name for name in stripped_names if name != '')

    plot_fpath = draw_seq2c_plot(
        cnf, cnf.seq2c_tsv_fpath, cnf.sample, cnf.output_dir, key_gene_names)
    if plot_fpath:
        info('Saved plot to ' + plot_fpath)
def add_data_query_properties(cnf, study_name, properties_fpath, data_fpath, info_fpath):
    """Register a study in a data-query properties file, rewriting it in place.

    The file is processed line by line (each line is stripped):

    * a line containing ``studies=`` gets ``;<study_name>`` appended unless
      the study is already listed;
    * inside the ``study2desc`` / ``study2data`` / ``study2info`` sections a
      ``<study_name> => "..."`` entry is inserted just before the closing
      ``}``;
    * existing lines that already contain ``<study_name> => `` are dropped,
      so the entries are effectively replaced.

    :param cnf: config object; only ``cnf.work_dir`` is read here.
    :param study_name: study identifier to register.
    :param properties_fpath: path of the properties file to modify.
    :param data_fpath: value written into the ``study2data`` entry.
    :param info_fpath: value written into the ``study2info`` entry.
    """
    # Read through a context manager so the handle is always closed.
    with open(properties_fpath) as properties_file:
        lines = properties_file.read().split('\n')
    # A file ending with a newline yields one trailing empty element; keeping
    # it would add one more blank line to the file on every run.
    if lines and lines[-1] == '':
        lines.pop()

    properties_lines = []
    text_to_add = None

    for l in lines:
        l = l.strip()
        if 'studies=' in l:
            studies = l.split('=')[1].split(';')
            if study_name not in studies:
                l += ';' + study_name
        # Remember which entry must be inserted before this section's "}".
        if 'study2desc' in l:
            text_to_add = '{study_name} => "{study_name}", \\'.format(
                study_name=study_name)
        if 'study2data' in l:
            text_to_add = '{study_name} => "{data_fpath}", \\'.format(
                study_name=study_name, data_fpath=data_fpath)
        if 'study2info' in l:
            text_to_add = '{study_name} => "{info_fpath}", \\'.format(
                study_name=study_name, info_fpath=info_fpath)
        if study_name + ' => ' in l:
            info(l.strip() + ' already present in properties, removing it.')
            continue
        if l == '}' and text_to_add and text_to_add not in properties_lines:
            properties_lines.append(text_to_add)
            text_to_add = None
        properties_lines.append(l)

    with file_transaction(cnf.work_dir, properties_fpath) as tx:
        with open(tx, 'w') as out:
            for l in properties_lines:
                out.write(l + '\n')
Example #4
0
 def function_timer(*args, **kwargs):
     """Call ``function`` with the given arguments and log the elapsed time.

     ``function`` is a free variable taken from the enclosing scope (not
     visible in this chunk).  Returns whatever ``function`` returns.
     """
     started = time.time()
     outcome = function(*args, **kwargs)
     finished = time.time()
     elapsed = str(finished - started)
     info('Total time running %s: %s seconds' % (function.func_name, elapsed))
     return outcome
def proc_args():
    """Log the command line and parse the script parameters.

    Adds a ``--bed`` option (aliases ``--capture`` and ``--amplicons``) on top
    of the options handled by ``bcbio_summary_script_proc_params``.

    :return: ``(cnf, bcbio_structure)`` as produced by that helper.
    """
    info(' '.join(sys.argv))
    info()

    bed_option = (['--bed', '--capture', '--amplicons'], dict(dest='bed'))
    cnf, bcbio_structure = bcbio_summary_script_proc_params(
        BCBioStructure.seq2c_name,
        extra_opts=[bed_option],
    )
    return cnf, bcbio_structure
def parse_seq2c(seq2c_tsv_fpath, altered_genes, key_gene_by_name_chrom):
    """Parse a Seq2C TSV into per-sample CNV events for the key genes.

    The first line is treated as a header and skipped.  Every other line must
    carry at least 17 tab-separated columns, unpacked below in order.  Rows
    whose ``(gene, chrom)`` pair is not in ``key_gene_by_name_chrom`` or whose
    ``amp_del`` column is empty are ignored.

    :param seq2c_tsv_fpath: path to the Seq2C results; may be empty/None.
    :param altered_genes: set that receives the name of every gene with an
        event (mutated in place and also returned).
    :param key_gene_by_name_chrom: container queried with ``(gene, chrom)``.
    :return: ``(events_by_sample, altered_genes)`` where ``events_by_sample``
        is a ``defaultdict(list)`` of ``OncoprintSeq2CEvent`` keyed by sample
        name.  It is empty when the path is missing or fails verification.
    """
    seq2c_events_by_sample = defaultdict(list)

    if not seq2c_tsv_fpath or not verify_file(seq2c_tsv_fpath):
        return seq2c_events_by_sample, altered_genes

    info('Parsing Seq2C from ' + seq2c_tsv_fpath)

    # The path was already checked by the guard above, so it is not verified
    # a second time here.
    with open(seq2c_tsv_fpath) as f_inp:
        for i, l in enumerate(f_inp):
            if i == 0:  # header
                continue
            l = l.replace('\n', '')
            if not l.strip():
                # A blank (e.g. trailing) line used to raise IndexError.
                continue
            fs = l.split('\t')
            sname, gname, chrom = fs[0], fs[1], fs[2]
            if (gname, chrom) not in key_gene_by_name_chrom:
                continue

            sname, gname, chrom, start, end, length, log2r, sig, fragment, amp_del, ab_seg, total_seg, \
                ab_log2r, log2r_diff, ab_seg_loc, ab_samples, ab_samples_pcnt = fs[:17]
            if not amp_del:
                continue

            if fragment == 'BP':
                # Partial event: report the affected segments and a copy
                # number derived from the log2 ratio.
                exons = str(ab_seg) + ' of ' + total_seg
                copy_number = round(2 ** float(ab_log2r) * 2, 2)
                if copy_number > 2:
                    copy_number = round(copy_number, 1)
            else:
                exons = 'Whole'
                if amp_del == 'Amp':
                    ab_log2r = copy_number = 'AMP'
                else:
                    ab_log2r = copy_number = 'HOMDEL'

            cnv_type = 'amplification' if amp_del == 'Amp' else 'loss'
            event = OncoprintSeq2CEvent(
                gene=gname,
                copy_number=str(copy_number),
                exons=exons,
                ratio=ab_log2r,
                cnv_type=cnv_type)
            seq2c_events_by_sample[sname].append(event)
            altered_genes.add(gname)

    return seq2c_events_by_sample, altered_genes
def get_rejected_mutations(cnf, bs, key_gene_by_name_chrom,
                           genes_collection_type):
    """Collect rejected mutations per sample, indexed by gene name and position.

    Every existing file returned by ``get_rejected_mutations_fpaths`` for the
    project's passed-mutations file is parsed with ``parse_mutations``; the
    parsed records are then re-indexed.

    :return: ``defaultdict(dict)`` mapping sample ->
        ``{(gene name, position): mutation}``.
    """
    indexed_by_sample = defaultdict(dict)
    parsed_by_sample = defaultdict(list)

    pass_mutations_fpath, _unused = get_mutations_fpath_from_bs(bs)

    for fpath in get_rejected_mutations_fpaths(pass_mutations_fpath):
        if not verify_file(fpath, silent=True):
            continue
        info('Parsing rejected mutations from ' + str(fpath))
        parse_mutations(cnf,
                        None,
                        key_gene_by_name_chrom,
                        fpath,
                        genes_collection_type,
                        mutations_dict=parsed_by_sample)
        # parsed_by_sample accumulates across files; re-indexing it after each
        # file simply overwrites the same keys with the same records.
        for sample_name, sample_mutations in parsed_by_sample.iteritems():
            for mutation in sample_mutations:
                locus = (mutation.gene.name, mutation.pos)
                indexed_by_sample[sample_name][locus] = mutation
    return indexed_by_sample
def main():
    """Command-line entry point: run the FastQC preprocessing script remotely.

    Parses the read paths and output directory, rewrites the local command
    line of ``scripts/pre/fastqc.py`` for the remote installation, executes it
    over SSH and echoes the remote output (stdout merged with stderr).

    Exits with status 1 after printing the usage when the left reads, right
    reads or output directory are not given.
    """
    info(' '.join(sys.argv))
    info()

    description = 'This script runs preprocessing.'

    parser = OptionParser(description=description)
    parser.add_option('-1', dest='left_reads_fpath', help='Left reads fpath')
    parser.add_option('-2', dest='right_reads_fpath', help='Right reads fpath')
    parser.add_option('--sample', dest='sample_name', help='Sample name')
    parser.add_option('-o', dest='output_dir', help='Output directory path')
    parser.add_option('--downsample-to', dest='downsample_to', default=None, type='int',
        help='Downsample reads to avoid excessive processing times with large files. '
            'Default is 1 million. Set to 0 to turn off downsampling.')
    add_cnf_t_reuse_prjname_donemarker_workdir_genome_debug(parser, threads=1)
    (opts, args) = parser.parse_args()

    if not opts.left_reads_fpath or not opts.right_reads_fpath or not opts.output_dir:
        parser.print_usage()
        if not opts.output_dir:
            critical('Please, specify output directory with -o')
        # Previously execution continued with missing paths and built a
        # remote command containing "None".
        sys.exit(1)

    verify_file(opts.left_reads_fpath, is_critical=False)
    left_reads_fpath = adjust_path(opts.left_reads_fpath)
    verify_file(opts.right_reads_fpath, is_critical=False)
    right_reads_fpath = adjust_path(opts.right_reads_fpath)
    output_dirpath = adjust_path(opts.output_dir)
    verify_dir(dirname(output_dirpath), description='output_dir', is_critical=True)

    left_reads_fpath, right_reads_fpath, output_dirpath =\
        map(_proc_path, [left_reads_fpath, right_reads_fpath, output_dirpath])

    # NOTE(review): host and credentials are hard-coded (masked in this copy);
    # consider loading them from configuration instead.
    ssh = connect_to_server(server_url='blue.usbod.astrazeneca.net', username='******', password='******')
    try:
        fastqc_py = get_script_cmdline(None, 'python', 'scripts/pre/fastqc.py')
        # Point the command line at the remote installation.
        fastqc_py = fastqc_py.replace(REPORTING_SUITE_PATH_CLARITY, REPORTING_SUITE_PATH_WALTHAM)
        fastqc_py = fastqc_py.replace(PYTHON_PATH_CLARITY, PYTHON_PATH_WALTHAM)

        cmdl = '{fastqc_py} -1 {left_reads_fpath} -2 {right_reads_fpath} -o {output_dirpath}'
        if opts.sample_name:
            cmdl += ' --sample {opts.sample_name}'
        if opts.downsample_to:
            cmdl += ' --downsample-to ' + str(int(opts.downsample_to))
        cmdl = cmdl.format(
            fastqc_py=fastqc_py,
            left_reads_fpath=left_reads_fpath,
            right_reads_fpath=right_reads_fpath,
            output_dirpath=output_dirpath,
            opts=opts)
        cmdl += ' 2>&1'  # merge remote stderr into stdout
        info(cmdl)
        stdin, stdout, stderr = ssh.exec_command(cmdl)
        for l in stdout:
            err(l, ending='')
        info()
    finally:
        # Close the connection even when the remote call fails.
        ssh.close()
Example #9
0
def make_gene_expression_heatmaps(cnf,
                                  bcbio_structure,
                                  counts_fpath,
                                  genes_dict,
                                  report_fpath,
                                  report_name,
                                  keep_gene_names=False):
    """Render an expression heatmap report for a counts file as HTML.

    Parses the counts with ``parse_gene_counts`` (no key-gene filter), builds
    the report with ``make_expression_heatmap`` and saves it to
    ``report_fpath`` together with a download link to the counts file.

    ``genes_dict`` is accepted but not used by this function.
    """
    # The stats name is the report name with its first letter lower-cased.
    stats_name = report_name[0].lower() + report_name[1:]
    gene_counts, samples = parse_gene_counts(counts_fpath, None, stats_name,
                                             keep_gene_names)
    report = make_expression_heatmap(bcbio_structure, gene_counts)

    counts_fname = basename(counts_fpath)
    counts_url = relpath(counts_fpath, report_fpath)
    link_template = (
        'Showing genes with count >=' + str(HEATMAPS_MIN_COUNT) +
        ' at least in one sample. ' +
        'The full results can be downloaded from here: <a href="{counts_url}" target="_blank">{counts_fname}</a>'
    )
    counts_link = link_template.format(counts_url=counts_url,
                                       counts_fname=counts_fname)

    module_dirpath = dirname(abspath(__file__))
    static_dirpath = join(module_dirpath, 'static')
    BaseReport.save_html(
        report,
        cnf,
        report_fpath,
        caption=report_name,
        extra_js_fpaths=[join(static_dirpath, 'rnaseq_heatmaps.js')],
        extra_css_fpaths=[join(static_dirpath, 'rnaseq.css')],
        tmpl_fpath=join(module_dirpath, 'template.html'),
        data_dict={'file_link': counts_link})
    info(report_name + ' heatmap saved in ' + report_fpath)
Example #10
0
def parse_gene_counts(counts_fpath, key_gene_names, report_name,
                      keep_gene_names):
    """Read a tab-separated counts table into ``Counts`` records per gene.

    The header must contain a ``HUGO`` column; the columns between the first
    one and ``HUGO`` are taken as sample names.  Rows where every sample value
    is below ``HEATMAPS_MIN_COUNT`` are dropped.

    :param counts_fpath: path to the counts table.
    :param key_gene_names: optional container of gene names to keep; when
        falsy, all genes are kept.
    :param report_name: label used in log messages only.
    :param keep_gene_names: when true, records are named after the first
        column (the id) and marked as hidden rows.
    :return: ``(gene_counts, samples)`` -- a ``defaultdict(list)`` of
        ``Counts`` keyed by gene name, and the list of sample names.  Both
        are empty when the file cannot be verified.
    """
    gene_counts = defaultdict(list)
    samples = []
    info('Preparing ' + report_name + ' stats for expression heatmaps')
    info('Checking ' + counts_fpath)
    if not verify_file(counts_fpath):
        err('Cannot find ' + report_name + ' fpath')
        # Callers unpack two values; the previous bare ``[]`` made that
        # unpacking fail.
        return gene_counts, samples

    info('Reading ' + report_name + ' from ' + counts_fpath)
    samples_cols = dict()
    gene_col = None

    with open(counts_fpath) as f:
        for i, l in enumerate(f):
            if i == 0:
                header = l.strip().split('\t')
                gene_col = header.index('HUGO')
                samples = header[1:gene_col]
                # Sample columns start right after the first (id) column.
                samples_cols = dict(
                    (sample, col + 1) for col, sample in enumerate(samples))
                continue
            if not l.strip():
                # A blank (e.g. trailing) line used to raise IndexError.
                continue
            fs = l.replace('\n', '').split('\t')
            gene_name = fs[gene_col]
            if key_gene_names and gene_name not in key_gene_names:
                continue

            gene_expression_dict = dict()
            for sample, col in samples_cols.items():
                value = float(fs[col])
                # Keep whole numbers as ints, everything else as floats.
                gene_expression_dict[sample] = int(value) if value.is_integer() else value

            if all(v < HEATMAPS_MIN_COUNT
                   for v in gene_expression_dict.values()):
                continue

            is_hidden_row = False
            name = gene_name
            if ':' in fs[0]:  # exon number
                is_hidden_row = True
                exon_number = fs[0].split(':')[1]
                name += ':' + exon_number
            if keep_gene_names:
                is_hidden_row = True
                name = fs[0]  # use id
            gene = Counts(name,
                          gene_name=gene_name,
                          counts=gene_expression_dict,
                          is_hidden_row=is_hidden_row)
            gene_counts[gene_name].append(gene)

    return gene_counts, samples
def main():
    """Command-line entry point: downsample a pair of FASTQ files.

    Parses the options, builds the config, verifies the inputs, creates the
    output directory and runs ``downsample`` inside the working directory.
    """
    info(' '.join(sys.argv))
    info()

    description = 'This script runs preprocessing.'

    parser = OptionParser(description=description)
    parser.add_option('-1', dest='left_reads_fpath', help='Left reads fpath')
    parser.add_option('-2', dest='right_reads_fpath', help='Right reads fpath')
    parser.add_option('--sample', dest='sample_name', help='Sample name')
    parser.add_option('--suffix',
                      dest='suffix',
                      default='subset',
                      help='Output files suffix')
    parser.add_option('-o', dest='output_dir', help='Output directory path')
    # optparse applies ``type`` only to string defaults, so the former 5e5
    # stayed a float (logged as "500000.0"); the help also claimed 1 million.
    # NOTE(review): downsample() is called unconditionally below, so confirm
    # that 0 really turns downsampling off as the help text says.
    parser.add_option(
        '--downsample-to',
        dest='downsample_to',
        default=500000,
        type='int',
        help=
        'Downsample reads to avoid excessive processing times with large files. '
        'Default is 500000. Set to 0 to turn off downsampling.')
    add_cnf_t_reuse_prjname_donemarker_workdir_genome_debug(parser, threads=1)
    (opts, args) = parser.parse_args()
    logger.is_debug = opts.debug

    cnf = Config(opts.__dict__, determine_sys_cnf(opts),
                 determine_run_cnf(opts))
    left_reads_fpath = verify_file(opts.left_reads_fpath, is_critical=True)
    # Right reads are optional.
    right_reads_fpath = verify_file(
        opts.right_reads_fpath,
        is_critical=True) if opts.right_reads_fpath else None
    output_dirpath = adjust_path(
        opts.output_dir) if opts.output_dir else critical(
            'Please, specify output directory with -o')
    safe_mkdir(output_dirpath)
    verify_dir(dirname(output_dirpath),
               description='output_dir',
               is_critical=True)

    with workdir(cnf):
        info('Downsampling to ' + str(cnf.downsample_to))
        downsample(cnf,
                   cnf.sample_name,
                   left_reads_fpath,
                   right_reads_fpath,
                   cnf.downsample_to,
                   output_dir=cnf.output_dir,
                   suffix=cnf.suffix)
def run_sambamba_use_grid(cnf, infos_by_key, mut_bed_fpath):
    """Run ``sambamba_depth`` on the grid for every experiment, in batches.

    Experiments are submitted until ``cnf.threads`` unfinished jobs are
    pending, then the batch is awaited and the next one is started, until no
    experiment is left.  Existing outputs are reused when
    ``cnf.reuse_intermediate`` is set.  Experiments whose sample has no BAM
    are reported with ``err`` and skipped.

    :param cnf: config; ``work_dir``, ``reuse_intermediate`` and ``threads``
        are read.
    :param infos_by_key: mapping ``(group, uniq_key) -> experiment``.
    :param mut_bed_fpath: BED file passed to ``sambamba_depth``.
    :return: dict mapping experiment -> expected output path.  The path is
        recorded for skipped (BAM-less) experiments as well, even though no
        job is run for them.
    """
    sambamba_output_by_experiment = dict()
    not_submitted_experiments = list(infos_by_key.values())
    while not_submitted_experiments:
        jobs_to_wait = []
        # Everything dealt with in this pass: submitted, reused or skipped.
        handled_experiments = []

        for (group, uniq_key), e in infos_by_key.items():
            if e not in not_submitted_experiments:
                continue
            sambamba_output_fpath = join(cnf.work_dir,
                                         uniq_key + '__mutations.bed')
            sambamba_output_by_experiment[e] = sambamba_output_fpath

            if cnf.reuse_intermediate and verify_file(sambamba_output_fpath,
                                                      silent=True):
                info(sambamba_output_fpath + ' exists, reusing')
                handled_experiments.append(e)
                continue

            if not e.sample.bam:
                err('Sample ' + e.sample.name + ' in ' + str(group) +
                    ', ' + str(uniq_key) + ' has no BAM')
                # Must count as handled: otherwise the experiment stays
                # pending forever and the outer loop never terminates.
                handled_experiments.append(e)
                continue

            j = sambamba_depth(cnf,
                               mut_bed_fpath,
                               e.sample.bam,
                               output_fpath=sambamba_output_fpath,
                               only_depth=True,
                               silent=True,
                               use_grid=True)
            handled_experiments.append(e)

            if not j.is_done:
                jobs_to_wait.append(j)
            if len(jobs_to_wait) >= cnf.threads:
                break  # batch is full; wait before submitting more

        if jobs_to_wait:
            info('Submitted ' + str(len(jobs_to_wait)) + ' jobs, waiting...')
            jobs_to_wait = wait_for_jobs(cnf, jobs_to_wait)
        else:
            info('No jobs to submit.')
        not_submitted_experiments = [
            e for e in not_submitted_experiments
            if e not in handled_experiments
        ]

    return sambamba_output_by_experiment
Example #13
0
def run_fastq(cnf,
              sample_name,
              l_r_fpath,
              r_r_fpath,
              output_dirpath,
              downsample_to=1e7):
    """Run FastQC on the combined (optionally downsampled) reads of a sample.

    The left and right FASTQ files are concatenated into
    ``<work_dir>/<sample_name>.fq`` and FastQC is run on that file with
    ``--extract``; the zipped report is removed afterwards.

    :param cnf: config; ``work_dir`` is read and the object is passed on to
        the helpers.
    :param sample_name: used to name the combined file and the report folder.
    :param l_r_fpath: left reads FASTQ (possibly gzipped).
    :param r_r_fpath: right reads FASTQ (possibly gzipped).
    :param output_dirpath: directory FastQC writes its output into.
    :param downsample_to: number of reads to downsample to; a falsy value
        disables downsampling.
    :return: path of the extracted FastQC report directory.
    """
    fastqc = get_system_path(cnf, 'fastqc', is_critical=True)
    java = get_system_path(cnf, 'java', is_critical=True)

    l_fpath, r_fpath = l_r_fpath, r_r_fpath
    if downsample_to:
        info('Downsampling to ' + str(downsample_to))
        l_fpath, r_fpath = downsample(cnf,
                                      sample_name,
                                      l_r_fpath,
                                      r_r_fpath,
                                      downsample_to,
                                      output_dir=cnf.work_dir)

    # Joining fastq files to run on a combination.  The downsampled files are
    # used when downsampling ran (previously its result was ignored and the
    # full inputs were combined).  Lines are streamed so that large inputs are
    # not loaded into memory, and every input handle is closed.
    fastqc_fpath = join(cnf.work_dir, sample_name + '.fq')
    info('Combining fastqs, writing to ' + fastqc_fpath)
    with open(fastqc_fpath, 'w') as out:
        for reads_fpath in (l_fpath, r_fpath):
            reads = open_gzipsafe(reads_fpath)
            try:
                for line in reads:
                    out.write(line)
            finally:
                reads.close()

    # Running FastQC
    info('Running FastQC')
    tmp_dirpath = join(cnf.work_dir, 'FastQC_' + sample_name + '_tmp')
    safe_mkdir(tmp_dirpath)
    cmdline = '{fastqc} --dir {tmp_dirpath} --extract -o {output_dirpath} -f fastq -j {java} {fastqc_fpath}'.format(
        fastqc=fastqc,
        tmp_dirpath=tmp_dirpath,
        output_dirpath=output_dirpath,
        java=java,
        fastqc_fpath=fastqc_fpath)
    call(cnf, cmdline)

    # Cleaning and getting report
    sample_fastqc_dirpath = join(output_dirpath, sample_name + '.fq_fastqc')
    if isfile(sample_fastqc_dirpath + '.zip'):
        os.remove(sample_fastqc_dirpath + '.zip')
    fastqc_html_fpath = join(sample_fastqc_dirpath, 'fastqc_report.html')
    verify_file(fastqc_html_fpath, is_critical=True)

    return sample_fastqc_dirpath
def run_clinical_target2wgs(cnf, wgs_bs, trg_bs, shared_sample_names,
                            output_dirpath):
    """Produce a combined target-vs-WGS clinical report for each shared sample.

    For every name in ``shared_sample_names`` the matching sample is looked up
    in both structures (``StopIteration`` propagates when one is missing),
    clinical info is gathered for each, and the pair is handed to
    ``run_sample_combine_clinreport`` under the keys 'Target' and 'WGS'.
    """
    dashes = '-' * 70
    stars = '*' * 70

    def sample_named(bs, name):
        return next(s for s in bs.samples if s.name == name)

    info('Running clinical reporting comparison')

    for sname in shared_sample_names:
        info('Preparing ' + sname + '...')
        trg_sample = sample_named(trg_bs, sname)
        wgs_sample = sample_named(wgs_bs, sname)

        info(dashes)
        clin_trg_info = clinical_sample_info_from_bcbio_structure(
            cnf, trg_bs, trg_sample, is_target2wgs_comparison=True)
        info('')
        info(dashes)
        clin_wgs_info = clinical_sample_info_from_bcbio_structure(
            cnf, wgs_bs, wgs_sample, is_target2wgs_comparison=True)

        info('')
        info(stars)
        run_sample_combine_clinreport(
            cnf,
            {'Target': clin_trg_info, 'WGS': clin_wgs_info},
            output_dirpath,
            is_target2wgs=True)
        info(stars)
        info('Successfully finished.')
def parse_sv_files(cnf, samples, altered_genes, key_gene_by_name_chrom):
    """Parse the prioritized SV files of the samples into key-gene annotations.

    :param cnf: config; ``transcripts_fpath`` is read here and the object is
        passed to ``get_chrom_order``.
    :param samples: objects providing ``find_sv_fpath()``; samples for which
        it returns a falsy value are ignored.
    :param altered_genes: set that receives every key gene touched by a kept
        annotation (mutated in place and also returned).
    :param key_gene_by_name_chrom: container queried with ``(gene, chrom)``.
    :return: ``(annotations_by_sample, altered_genes)``.  With no SV files the
        first element is an empty ``defaultdict(set)``; otherwise it is a dict
        mapping the value of the ``sample`` column to the merged annotations
        of that sample.
    """
    sv_events_by_samples = defaultdict(set)
    sv_fpaths = [sample.find_sv_fpath() for sample in samples]
    sv_fpaths = [f for f in sv_fpaths if f]  # keep only samples that have an SV file

    if not sv_fpaths:
        return sv_events_by_samples, altered_genes

    chr_order = get_chrom_order(cnf)
    all_events = dict()  # (sample, event.id) -> event; used below to look up mates

    with open(cnf.transcripts_fpath) as f:
        transcripts = [tr.strip() for tr in f]

    for sv_fpath in sv_fpaths:
        info('Parsing prioritized SV from ' + sv_fpath)
        sample_col = None
        known_col = None  # only referenced by the disabled code below
        with open(sv_fpath) as f:
            header_rows = []
            for i, l in enumerate(f):
                fs = l.strip().split('\t')
                if i == 0:
                    # Older header layout, kept for reference:
                    #   caller  sample  chrom  start  end  svtype  known  end_gene  lof  annotation  split_read_support  paired_end_support
                    # Layout noted by the original author for the current files:
                    #   caller  sample  chrom  start  end  svtype  lof  annotation  split_read_support  paired_support_PE  paired_support_PR
                    header_rows = fs
                    sample_col = header_rows.index('sample')
                    # known_col = header_rows.index('known')
                else:
                    # The row is handed over as keyword arguments named after the header columns.
                    event = SVEvent.parse_sv_event(chr_order, key_gene_by_name_chrom, transcripts,  **dict(zip(header_rows, fs)))
                    sample = fs[sample_col]
                    if event:
                        all_events[(sample, event.id)] = event
                        for annotation in event.annotations:
                            # Only fusions, known fusions and exon deletions are considered.
                            if event.is_fusion() or event.is_known_fusion(annotation) or annotation.effect == 'EXON_DEL':
                                if event.end:
                                    # An event with an end coordinate is given the same second chromosome.
                                    event.chrom2 = event.chrom
                                if event.is_known_fusion(annotation):
                                    annotation.known = True
                                key_altered_genes = [g for g in annotation.genes if (g, event.chrom) in key_gene_by_name_chrom]
                                # Keep the annotation only when it touches at least one key gene.
                                if (annotation.effect == 'FUSION' or annotation.effect == 'EXON_DEL' or annotation.known) \
                                        and key_altered_genes:
                                    annotation.event = event
                                    event.key_annotations.add(annotation)
                                    # event.supplementary = '-with-' in fs[known_col]
                                    sv_events_by_samples[sample].add(event)
                                    for g in key_altered_genes:
                                        altered_genes.add(g)

    sv_anns_by_samples = dict()
    for sample, events in sv_events_by_samples.iteritems():  # combine two annotations of fusion in one
        # Disabled earlier approach that paired main and supplementary events:
        # suppl_events = {e.mate_id: e for e in events if e.supplementary}
        # main_events = [e for e in events if not e.supplementary]
        # for event in main_events:
            # if event.id not in suppl_events:
            #     continue
            # suppl_event = suppl_events[event.id]
            # event.end = suppl_event.chrom + ':' + str(suppl_event.start)
        sv_anns_by_key = OrderedDefaultDict(SVEvent.Annotation)
        for event in events:
            # An event without an end takes its second breakpoint from its mate.
            if not event.end and event.mate_id:
                event_mate = all_events[(sample, event.mate_id)]
                event.end = event_mate.start
                event.chrom2 = event_mate.chrom
            # Annotations sharing a key are merged into a single annotation.
            for an in event.key_annotations:
                sv_anns_by_key[an.get_key()].update_annotation(an)
        sv_anns_by_samples[sample] = sv_anns_by_key.values()

    return sv_anns_by_samples, altered_genes
def parse_mutations(mutations_fpath, altered_genes, key_gene_by_name_chrom):
    """Parse a tab-separated mutations table into per-sample mutations.

    The first non-empty line is the header.  Required columns: ``Sample``,
    ``Chr``, ``Start``, ``Type``, ``AlleleFreq``, ``Gene``,
    ``Amino_Acid_Change``, ``cDNA_Change`` and ``Depth``.  Optional columns:
    ``Status``, ``Significance`` (falls back to the last ``Status`` column)
    and ``Incidentalome``.

    Rows are skipped when ``(gene, chrom)`` is not in
    ``key_gene_by_name_chrom`` or when the ``Incidentalome`` value is
    non-empty.

    :param mutations_fpath: path to the table; may be empty/None.
    :param altered_genes: set that receives every gene with a kept mutation
        (mutated in place and also returned).
    :param key_gene_by_name_chrom: container queried with ``(gene, chrom)``.
    :return: ``(mut_by_samples, altered_genes)`` where ``mut_by_samples`` is a
        ``defaultdict(list)`` of ``OncoprintMutation`` keyed by sample.
    :raises ValueError: when a required column is missing from the header.
    """
    mut_by_samples = defaultdict(list)

    if not mutations_fpath or not verify_file(mutations_fpath):
        return mut_by_samples, altered_genes

    info('Parsing mutations from ' + mutations_fpath)

    header = None
    sample_col = None
    chr_col = None
    pos_col = None
    type_col = None
    allele_freq_col = None
    gene_col = None
    depth_col = None
    aa_chg_col = None
    cdna_chg_col = None
    status_col = None
    signif_col = None
    incidentalome_col = None

    stop_gain_pattern = re.compile(r'^[A-Z]+\d+\*')
    fs_pattern = re.compile(r'^[A-Z]+(\d+)fs')

    with open(mutations_fpath) as txt:
        for l in txt:
            l = l.replace('\n', '')
            if not l:
                continue
            if header is None:
                # The header is the first non-empty line; keying on the line
                # index broke when the file started with a blank line.
                header = l.split('\t')
                sample_col = header.index('Sample')
                chr_col = header.index('Chr')
                pos_col = header.index('Start')
                type_col = header.index('Type')
                allele_freq_col = header.index('AlleleFreq')
                gene_col = header.index('Gene')
                aa_chg_col = header.index('Amino_Acid_Change')
                cdna_chg_col = header.index('cDNA_Change')
                depth_col = header.index('Depth')
                if 'Status' in header:
                    status_col = header.index('Status')
                if 'Significance' in header:
                    signif_col = header.index('Significance')
                elif 'Status' in header:
                    # Fall back to the last "Status" column.  Guarded because
                    # "Status" is optional and its absence used to raise here.
                    signif_col = len(header) - header[::-1].index('Status') - 1
                if 'Incidentalome' in header:
                    incidentalome_col = header.index('Incidentalome')
                continue

            fs = l.split('\t')
            sample, gene, chrom, pos, type_ = fs[sample_col], fs[gene_col], fs[chr_col], fs[pos_col], fs[type_col]
            if (gene, chrom) not in key_gene_by_name_chrom:
                continue

            mut = OncoprintMutation(chrom, pos, gene)
            mut.aa_change, mut.cdna_change, mut.depth, mut.freq = fs[aa_chg_col], fs[cdna_chg_col], fs[depth_col], float(fs[allele_freq_col])
            mut.status = fs[status_col] if status_col is not None else None
            mut.signif = fs[signif_col] if signif_col is not None else None
            incidentalome_reason = fs[incidentalome_col] if incidentalome_col is not None else None
            if incidentalome_reason:
                continue

            # The type is "Splice", "Trunc/FS", or a Known/Unknown prefix plus
            # a "-Missense" / "-Indel" / "-Other" suffix.
            mut.type = 'Known' if mut.signif != 'unknown' else 'Unknown'
            if 'splice' in type_:
                mut.type = 'Splice'
            elif (stop_gain_pattern.match(mut.aa_change)
                  or fs_pattern.match(mut.aa_change)
                  or mut.aa_change.startswith('-')):
                mut.type = 'Trunc/FS'
            elif 'missense' in type_:
                mut.type += '-Missense'
            elif 'ins' in mut.aa_change or 'del' in mut.aa_change:
                mut.type += '-Indel'
            else:
                mut.type += '-Other'
            mut_by_samples[sample].append(mut)
            altered_genes.add(gene)

    return mut_by_samples, altered_genes
Example #17
0
def downsample(cnf,
               sample_name,
               fastq_L_fpath,
               fastq_R_fpath,
               N,
               output_dir,
               suffix=None,
               quick=False):
    """Write N randomly chosen reads of a FASTQ file (or pair) to new files.

    Picks N random record indices and copies those records without reading
    the whole input into memory.
    modified from: http://www.biostars.org/p/6544/

    :param cnf: config; ``reuse_intermediate`` and ``work_dir`` are read.
    :param sample_name: label for log messages; when falsy it is derived from
        the characters the two paths share position by position.
    :param fastq_L_fpath: left (or only) FASTQ file, possibly gzipped.
    :param fastq_R_fpath: right FASTQ file, or None for single-end data.
    :param N: number of reads to keep (converted with ``int``).
    :param output_dir: directory for the output files.
    :param suffix: suffix added to the output file names ('subset' if falsy).
    :param quick: when true, take the first N reads instead of a random
        sample.
    :return: ``(left_path, right_path)``.  These are the input paths when the
        input holds fewer than N reads; ``right_path`` is None for single-end
        input.
    """
    if not sample_name:
        # Keep the characters that both paths share position by position.
        mate_fpath = fastq_R_fpath or fastq_L_fpath
        sample_name = splitext(''.join(
            lc if lc == rc else ''
            for lc, rc in izip(fastq_L_fpath, mate_fpath)))[0]

    suffix = suffix or 'subset'
    l_out_fpath = join(output_dir,
                       add_suffix(basename(fastq_L_fpath), suffix))
    r_out_fpath = join(output_dir,
                       add_suffix(basename(fastq_R_fpath), suffix)) \
        if fastq_R_fpath else None
    out_fpaths = [l_out_fpath] + ([r_out_fpath] if r_out_fpath else [])
    outputs_desc = ' and '.join(out_fpaths)

    if cnf.reuse_intermediate and all(
            verify_file(fpath, silent=True) for fpath in out_fpaths):
        info(outputs_desc + ' exist, reusing.')
        return l_out_fpath, r_out_fpath

    info('Processing ' + sample_name)
    N = int(N)
    records_num = N
    if quick:
        rand_records = range(N)
    else:
        info(sample_name + ': getting number of reads in fastq...')
        counting_fh = open_gzipsafe(fastq_L_fpath)
        try:
            records_num = sum(1 for _ in counting_fh) // 4  # 4 lines per read
        finally:
            counting_fh.close()
        if records_num > LIMIT:
            info(sample_name + ' the number of reads is higher than ' +
                 str(LIMIT) + ', sampling from only first ' + str(LIMIT))
            records_num = LIMIT
        info(sample_name + ': ' + str(records_num) + ' reads')
        if records_num < N:
            info(sample_name + ': and it is less than ' + str(N) +
                 ', so no downsampling.')
            return fastq_L_fpath, fastq_R_fpath
        info(sample_name + ': downsampling to ' + str(N))
        rand_records = sorted(random.sample(xrange(records_num), N))

    info('Opening ' + fastq_L_fpath)
    fh1 = open_gzipsafe(fastq_L_fpath)
    fh2 = None
    if fastq_R_fpath:
        info('Opening ' + fastq_R_fpath)
        fh2 = open_gzipsafe(fastq_R_fpath)

    out_files = (l_out_fpath, r_out_fpath) if r_out_fpath else l_out_fpath

    written_records = 0
    try:
        with file_transaction(cnf.work_dir, out_files) as tx_out_files:
            if isinstance(tx_out_files, basestring):
                tx_out_f1, tx_out_f2 = tx_out_files, None
            else:
                tx_out_f1, tx_out_f2 = tx_out_files
            info('Opening ' + str(tx_out_f1) + ' to write')
            sub1 = open_gzipsafe(tx_out_f1, "w")
            sub2 = None
            if tx_out_f2:
                info('Opening ' + str(tx_out_f2) + ' to write')
                sub2 = open_gzipsafe(tx_out_f2, "w")
            try:
                # rec_no is the index of the next unread record.  The earlier
                # version started at -1 and skipped one record too many, so
                # it wrote record rr + 1 instead of rr.
                rec_no = 0
                for rr in rand_records:
                    while rec_no < rr:
                        for i in range(4):
                            fh1.readline()
                        if fh2 is not None:
                            for i in range(4):
                                fh2.readline()
                        rec_no += 1

                    first_line = fh1.readline()
                    if not first_line:
                        # Input exhausted (possible with quick=True when the
                        # file holds fewer than N reads).
                        break
                    sub1.write(first_line)
                    for i in range(3):
                        sub1.write(fh1.readline())
                    if sub2 is not None:
                        for i in range(4):
                            sub2.write(fh2.readline())
                    written_records += 1
                    rec_no += 1

                    if written_records % 10000 == 0:
                        info(sample_name + ': written ' +
                             str(written_records) + ', rec_no ' + str(rec_no))
                    if rec_no > records_num:
                        info(sample_name + ' reached the limit of ' +
                             str(records_num) + ' read lines, stopping.')
                        break
                info(sample_name + ': done, written ' + str(written_records) +
                     ', rec_no ' + str(rec_no))
            finally:
                # Close the outputs inside the transaction so the data is
                # flushed before the transaction finalizes the files.
                sub1.close()
                if sub2 is not None:
                    sub2.close()
    finally:
        fh1.close()
        if fh2 is not None:
            fh2.close()

    info(sample_name + ': done downsampling, saved to ' + outputs_desc +
         ', total ' + str(written_records) +
         (' paired reads written' if r_out_fpath else ' reads written'))
    return l_out_fpath, r_out_fpath
def run_combine_clinical_reports(cnf, bcbio_structures, parameters_info,
                                 samples_data):
    """Collect clinical info for the selected samples and build the combined report.

    For every project in ``bcbio_structures`` the clinical info of each
    selected sample is built with
    ``clinical_sample_info_from_bcbio_structure`` and stored under the key
    ``(group, unique sample key)``.  The project's rejected mutations are then
    attached to each of those infos.  Finally the collected infos are handed
    to ``save_all_mutations_depth`` and ``run_sample_combine_clinreport``.

    A sample is selected when ``cnf.sample_names`` is empty or contains the
    sample name.

    :param cnf: run configuration; ``sample_names`` and ``output_dir`` are
        read here.
    :param bcbio_structures: sequence of project structures (iterated twice);
        each must expose ``samples``, ``project_name`` and
        ``bcbio_project_dirpath``.
    :param parameters_info: forwarded unchanged to
        ``run_sample_combine_clinreport``.
    :param samples_data: indexed as
        ``samples_data[project_dirpath][sample_name]``; the ``group``
        attribute of each entry becomes the first part of the report key.
    """
    info('Running clinical reporting comparison')

    infos_by_key = OrderedDict()
    sample_names = [s.name for bs in bcbio_structures for s in bs.samples]
    for bs in bcbio_structures:
        info()
        info('Preparing ' + bs.project_name + '...')
        info('-' * 70)

        # (sample, key) pairs selected from this project, so that the key is
        # computed once and reused when attaching the rejected mutations.
        selected = []
        clin_info = None
        for sample in bs.samples:
            if cnf.sample_names and sample.name not in cnf.sample_names:
                continue
            info('Preparing ' + sample.name + '...')
            info('-' * 70)
            clin_info = clinical_sample_info_from_bcbio_structure(
                cnf, bs, sample)
            group = samples_data[bs.bcbio_project_dirpath][sample.name].group
            uniq_key = get_uniq_sample_key(bs.project_name, sample,
                                           sample_names)
            key = (group, uniq_key)
            infos_by_key[key] = clin_info
            selected.append((sample, key))
            info('')

        if not selected:
            # No sample of this project passed the filter: there is no
            # clinical info to take the key genes from and nothing to attach
            # rejected mutations to.  Previously this either raised a
            # NameError or silently reused the info of the previous project.
            info('No samples selected in ' + bs.project_name + ', skipping.')
            continue

        # The key genes are taken from the last selected sample of the project.
        rejected_mutations = get_rejected_mutations(
            cnf, bs, clin_info.key_gene_by_name_chrom,
            clin_info.genes_collection_type)
        for sample, key in selected:
            infos_by_key[key].rejected_mutations = \
                rejected_mutations[sample.name]

    save_all_mutations_depth(cnf, infos_by_key)

    info('*' * 70)
    run_sample_combine_clinreport(cnf, infos_by_key, cnf.output_dir,
                                  parameters_info, samples_data)
    info('*' * 70)
    info('Successfully finished.')
    def write_report(self,
                     output_fpath,
                     is_target2wgs=False,
                     sample_names=None):
        """Render the combined clinical report into a static HTML file.

        Assembles the template data (per-experiment sample sections,
        variants, coverage, optional Seq2C sections and the minimal allele
        frequency as a percentage string) and passes it to
        ``write_static_html_report`` together with the template, JS and CSS
        files located next to this module.

        :param output_fpath: path of the HTML file to write.
        :param is_target2wgs: when true, ``template_target2wgs.html`` is used
            instead of ``template_combine.html``.
        :param sample_names: optional mapping from experiment to an extra
            label; when an experiment is present, the label is appended to
            its sample name after ``', '``.
        :return: ``output_fpath``.
        """
        info('')

        first_experiment = self.experiment_by_key.values()[0]

        def section_title(e):
            # Sample name, optionally followed by the caller-supplied label.
            title = e.sample.name
            if sample_names and e in sample_names:
                title += ', ' + sample_names[e]
            return title

        data = {
            'key_or_target': first_experiment.genes_collection_type,
            'genes_description': first_experiment.genes_description,
            'sample': {
                'experiments': [
                    self.sample_section(e, sample_name=section_title(e))
                    for e in self.experiment_by_key.values()
                ],
            },
            'variants':
            self.__mutations_section(self.mutations_report,
                                     self.experiment_by_key),
            'coverage':
            self.__coverage_section(self, self.key_genes_report,
                                    self.cov_plot_data),
        }

        min_af = self.cnf.min_af or 0
        data['min_af'] = str(float(min_af) * 100)
        if self.seq2c_report:
            data['seq2c'] = {'amp_del': self.seq2c_section()}
        if self.seq2c_plot_data:
            # The plot data may be available without the Amp/Del table, so
            # the 'seq2c' entry is created on demand instead of assuming the
            # branch above has run (which used to raise a KeyError).
            data.setdefault('seq2c', {})['plot'] = {
                'plot_data': self.seq2c_plot_data
            }
        if data['variants']:
            data['variants']['venn_diagram'] = {
                'diagram_data': self.venn_plot_data
            }
            data['variants']['mut_parameters'] = self.mutations_parameters

        # Templates live next to this module, scripts and styles in 'static'.
        module_dirpath = dirname(abspath(__file__))
        static_dirpath = join(module_dirpath, 'static')
        if is_target2wgs:
            tmpl_fname = 'template_target2wgs.html'
        else:
            tmpl_fname = 'template_combine.html'
        js_fnames = [
            'clinical_report.js',
            'combined_clinical_report.js',
            'draw_genes_coverage_plot.js',
            'draw_mutations_plot.js',
            'd3.min.js',
            'venn.js',
            'draw_venn_diagram.js',
            'draw_substitutions_plot.js',
            'draw_seq2c_plot.js',
        ]
        css_fnames = [
            'clinical_report.css',
            'header_picture.css',
        ]
        write_static_html_report(
            self.cnf,
            data,
            output_fpath,
            tmpl_fpath=join(module_dirpath, tmpl_fname),
            extra_js_fpaths=[
                join(static_dirpath, fname) for fname in js_fnames
            ],
            extra_css_fpaths=[
                join(static_dirpath, fname) for fname in css_fnames
            ])

        info('Saved clinical report to ' + output_fpath)
        info('-' * 70)
        info()
        return output_fpath
def draw_seq2c_plot(cnf,
                    seq2c_tsv_fpath,
                    sample_name,
                    output_dir,
                    key_gene_names=None,
                    chr_lens=None):
    """Draw a genome-wide scatter plot of Seq2C log2 ratios for one sample.

    Chromosomes are laid out side by side along the x axis; every record of
    ``sample_name`` in the Seq2C file is placed at the midpoint of its
    region.  Records without an aberrant log2 ratio, and records of type
    ``BP``, are drawn as small black dots at their ``log2r``.  Records with
    an aberrant log2 ratio are drawn at that value, blue for ``Amp`` and red
    for ``Del``, and are labelled with the gene name when there are few
    enough of them.

    :param cnf: run configuration; ``reuse_intermediate`` is read here and
        ``cnf`` is passed to ``get_chr_lengths`` when ``chr_lens`` is empty.
    :param seq2c_tsv_fpath: tab-separated Seq2C file; the first line is
        skipped and at least 17 columns are expected per used record.
    :param sample_name: only records whose first column equals this name are
        plotted; it is also used to build the output file name.
    :param output_dir: directory where the plot is saved.
    :param key_gene_names: optional collection of gene names; when given,
        records of other genes are ignored.
    :param chr_lens: optional iterable of ``(chromosome, length)`` pairs.
    :return: path of the saved (or reused) plot, or ``None`` when the Seq2C
        file does not pass ``verify_file``.
    """
    info('Seq2C plot builder')
    plot_fpath = join(output_dir, sample_name + cnv_plot_ending)
    if cnf.reuse_intermediate and verify_file(plot_fpath, silent=True):
        info('Seq2C plot ' + plot_fpath + ' exists, reusing...')
        return plot_fpath

    if not verify_file(seq2c_tsv_fpath, 'Seq2C.tsv'):
        return None

    # Extra chromosomes (chr1_blablabla) are not drawn.
    chr_names_lengths = OrderedDict(
        (chr_, l) for chr_, l in (chr_lens or get_chr_lengths(cnf))
        if '_' not in chr_)
    chr_names = list(chr_names_lengths.keys())
    chr_short_names = [chrom[3:] for chrom in chr_names]
    chr_lengths = list(chr_names_lengths.values())

    # chr_cum_lens[i] is the x offset where the i-th chromosome starts; the
    # extra last item is the total length of all drawn chromosomes.
    chr_cum_lens = [0]
    for chr_len in chr_lengths:
        chr_cum_lens.append(chr_cum_lens[-1] + chr_len)

    plt = matplotlib.pyplot
    fig = plt.figure(figsize=(25, 5))
    # NOTE(review): this limit is in chromosome counts while the points use
    # cumulative base-pair offsets; presumably the xticks call below widens
    # the view to the full range - confirm before changing.
    plt.xlim([0, len(chr_lengths) + 1])
    plt.xticks(chr_cum_lens, [])

    ax = plt.gca()
    # Chromosome names are put on minor ticks in the middle of each chromosome.
    chr_names_coords = [
        chr_end - chr_len / 2
        for chr_end, chr_len in zip(chr_cum_lens[1:], chr_lengths)
    ]
    ax.xaxis.set_minor_locator(ticker.FixedLocator(chr_names_coords))
    ax.xaxis.set_minor_formatter(ticker.FixedFormatter(chr_short_names))

    chr_cum_len_by_chrom = dict(zip(chr_names, chr_cum_lens))
    nrm_xs, nrm_ys = [], []
    amp_xs, amp_ys, amp_gs = [], [], []
    del_xs, del_ys, del_gs = [], [], []
    with open(seq2c_tsv_fpath) as f:
        for i, l in enumerate(f):
            if i == 0:  # the first line is not a record
                continue
            fs = l.replace('\n', '').split('\t')
            sname, gname = fs[0], fs[1]
            if key_gene_names and gname not in key_gene_names:
                continue
            if sname != sample_name:
                continue

            # Column layout of a record; only some of the fields are used.
            sname, gname, chrom, start, end, length, log2r, sig, type_, amp_del, ab_seg, total_seg, \
                ab_log2r, log2r_diff, ab_seg_loc, ab_samples, ab_samples_pcnt = fs[:17]
            if chrom not in chr_cum_len_by_chrom:
                # The record lies on a contig that was excluded from the plot
                # above, so it has no x offset; skip it instead of failing
                # with a KeyError.
                continue
            x = chr_cum_len_by_chrom[chrom] + (int(start) + int(end)) / 2

            if not ab_log2r or type_ == 'BP':  # breakpoint, meaning part of exon is not amplified
                nrm_xs.append(x)
                nrm_ys.append(float(log2r))

            if ab_log2r:
                y = float(ab_log2r)
                if amp_del == 'Amp':
                    amp_xs.append(x)
                    amp_ys.append(y)
                    amp_gs.append(gname)
                elif amp_del == 'Del':
                    del_xs.append(x)
                    del_ys.append(y)
                    del_gs.append(gname)
                else:
                    warn('Event is not Amp or Del, it\'s ' + amp_del)

    plt.scatter(nrm_xs, nrm_ys, marker='.', color='k', s=1)
    plt.scatter(amp_xs, amp_ys, marker='o', color='b', s=2)
    plt.scatter(del_xs, del_ys, marker='o', color='r', s=2)

    # Gene labels are drawn for an event type only when it has at most 10
    # events or there are fewer than 40 events in total.
    total_events = len(amp_xs) + len(del_xs)
    labelled_events = (
        (amp_xs, amp_ys, amp_gs, 'g'),
        (del_xs, del_ys, del_gs, 'r'),
    )
    for xs, ys, gnames, color in labelled_events:
        if len(xs) <= 10 or total_events < 40:
            for x, y, g in zip(xs, ys, gnames):
                ax.text(x,
                        y,
                        g,
                        fontsize=9,
                        color=color,
                        verticalalignment='center',
                        horizontalalignment='center')

    # The y range always covers at least [-2, 2], with a 5% margin.
    plt.ylim(ymax=max(chain(nrm_ys, amp_ys, del_ys, [2])) * 1.05,
             ymin=min(chain(nrm_ys, amp_ys, del_ys, [-2])) * 1.05)
    plt.tick_params(axis='x',
                    which='minor',
                    bottom='off',
                    top='off',
                    labelbottom='on')
    info('Saving plot to ' + plot_fpath)
    plt.tight_layout()
    fig.savefig(plot_fpath, bbox_inches='tight')
    plt.close(fig)

    info('Done')
    info('-' * 70)
    return plot_fpath
def create_oncoprints_link(cnf, bcbio_structure, project_name=None):
    """Register the project in the Data Query Tool and return its Oncoprint URL.

    Writes the ``<study>.data.txt`` and ``<study>.info.txt`` files into the
    Data Query directory, symlinks them into the ``DataQueryTool`` directory
    of the exposing location, adds the study to ``DataQuery.properties`` and
    builds the query URL for the altered genes.

    Side effects on ``cnf``: ``mutations_fpath``, ``seq2c_tsv_fpath`` and
    ``project_name`` are set.

    :param cnf: run configuration.
    :param bcbio_structure: project structure; ``variant_callers``,
        ``var_dirpath``, ``seq2c_fpath``, ``samples`` and ``project_name``
        are read.
    :param project_name: optional project name; falls back to the name from
        ``bcbio_structure`` and then to the base name of ``cnf.output_dir``.
    :return: the Data Query URL, or ``None`` when not running in the US
        location, when no vardict caller is available, when the Data Query
        directory does not exist, or when there are no altered genes.
    """
    # Only the US location is supported; anywhere else no link is generated.
    if not is_us():
        return None
    loc = exposing.us

    if not bcbio_structure.variant_callers:
        info('No varianting calling performed, not generating Oncoprints')
        return None
    clinical_report_caller = \
        bcbio_structure.variant_callers.get('vardict') or \
        bcbio_structure.variant_callers.get('vardict-java')
    if not clinical_report_caller:
        err('Warning: vardict is not in the variant callers list, this not generating Oncoprints')
        return None

    step_greetings('Creating Oncoprints link')
    zhongwu_data_query_dirpath = '/home/kdld047/public_html/cgi-bin/TS'
    if not isdir(zhongwu_data_query_dirpath):
        warn('Data Query directory ' + zhongwu_data_query_dirpath + ' does not exists.')
        return None

    vardict_txt_fname = variant_filtering.mut_fname_template.format(caller_name=clinical_report_caller.name)
    vardict_txt_fpath = join(bcbio_structure.var_dirpath, vardict_txt_fname)
    cnf.mutations_fpath = add_suffix(vardict_txt_fpath, variant_filtering.mut_pass_suffix)

    cnf.seq2c_tsv_fpath = bcbio_structure.seq2c_fpath

    samples = sorted(bcbio_structure.samples)
    cnf.project_name = project_name or bcbio_structure.project_name or basename(cnf.output_dir)
    # '.', '-', ':' and '&' are replaced with '_' in the study name.
    study_name = re.sub(r'[\.\-:&]', '_', cnf.project_name)

    check_genome_resources(cnf)

    data_query_dirpath = join(loc.dirpath, 'DataQueryTool')

    data_fpath = join(zhongwu_data_query_dirpath, study_name + '.data.txt')
    info_fpath = join(zhongwu_data_query_dirpath, study_name + '.info.txt')
    altered_genes = print_data_txt(cnf, cnf.mutations_fpath, cnf.seq2c_tsv_fpath, samples, data_fpath)
    if not altered_genes:
        err('No altered genes in ' + cnf.mutations_fpath + ' or ' + cnf.seq2c_tsv_fpath + ', not generating Oncoptints.')
        return None

    print_info_txt(cnf, samples, info_fpath)

    # The same files are referred to under '/users/' in the symlinks and in
    # the properties file.
    data_ext_fpath = data_fpath.replace('/home/', '/users/')
    info_ext_fpath = info_fpath.replace('/home/', '/users/')

    # optional:
    make_symlink = symlink_to_ngs if is_us() else local_symlink
    make_symlink(data_ext_fpath, join(data_query_dirpath, study_name + '.data.txt'))
    make_symlink(info_ext_fpath, join(data_query_dirpath, study_name + '.info.txt'))

    properties_fpath = join(zhongwu_data_query_dirpath, 'DataQuery.properties')
    add_data_query_properties(cnf, study_name, properties_fpath, data_ext_fpath, info_ext_fpath)

    # Gene names are separated with an URL-encoded CRLF.
    genes = '%0D%0A'.join(altered_genes)
    query = (
        'DataQuery.pl?'
        'analysis=oncoprint&'
        'study={study_name}&'
        'gene={genes}&'
        'order=on&'
        'freq=50&'
        'nocheckgenes=true&'
        'submit=Submit'
    ).format(study_name=study_name, genes=genes)
    data_query_url = join(loc.website_url_base, 'DataQueryTool', query)

    info()
    info('Information about study was added in Data Query Tool, URL is ' + data_query_url)
    return data_query_url
    def __init__(self,
                 cnf,
                 experiment_by_key,
                 parameters_info=None,
                 samples_data=None,
                 *args):
        BaseClinicalReporting.__init__(self, cnf, *args)

        self.experiment_by_key = experiment_by_key

        self.mutations_report = None
        self.mutations_plot_data = None
        self.venn_plot_data = None
        self.substitutions_plot_data = None
        self.sv_report = None
        self.actionable_genes_report = None
        self.seq2c_plot_data = None
        self.seq2c_report = None
        self.key_genes_report = None
        self.cov_plot_data = None
        self.mutations_by_experiment = OrderedDict()
        self.mutations_reports = dict()
        self.mutations_parameters = None
        self.seq2c_reports = defaultdict()

        self.sample_names = []
        for k, e in experiment_by_key.items():
            e.key = k
            import re
            e.sample.clinical_html = abspath(
                join(cnf.output_dir,
                     'report_' + str(get_group_num(k)) + '.html'))
            e.cnf.work_dir = cnf.work_dir
            if is_us:
                e.sample.clinical_html = re.sub('^/ngs/', '/gpfs/ngs/',
                                                e.sample.clinical_html)
                e.project_report_path = re.sub('^/ngs/', '/gpfs/ngs/',
                                               e.project_report_path)
            self.sample_names.append(e.sample.name)
        sample_infos = OrderedDict({
            k: get_sample_info(e.sample.name, e.sample.dirpath, samples_data)
            for k, e in experiment_by_key.iteritems()
        })
        sorted_sample_infos = sorted(
            sample_infos.items(),
            key=lambda x: (x[0][1], [x[1][j] for j in range(len(x[1]))]))
        sorted_experiments = OrderedDict()
        for k, v in sorted_sample_infos:
            sorted_experiments[k] = experiment_by_key[k]

        self.experiment_by_key = sorted_experiments
        # self.patient = self.merge_patients(self.infos)
        # bed_fpaths = set(experiment.target.bed_fpath for experiment in experiment_by_key.values() if experiment.target.bed_fpath)
        # bed_fnames = [basename(bed_fpath).split('.')[0] + '.bed' for bed_fpath in bed_fpaths]
        jbrowser_link = get_jbrowser_link(self.cnf.genome.name,
                                          self.sample_names)

        info('Preparing data...')
        # self.mut_by_key_by_exper = self.arrange_mutations({k: i.mutations for k, i in experiment_by_key.items()})
        for e in sorted_experiments.values():
            if e.mutations:
                self.mutations_by_experiment[e] = e.mutations
        group_nums = set(
            get_group_num(key) for key in self.experiment_by_key.keys())
        if self.mutations_by_experiment:
            self.mutations_report, self.venn_plot_data = self.make_mutations_report(
                self.mutations_by_experiment,
                jbrowser_link,
                samples_data=samples_data,
                parameters_info=parameters_info,
                create_venn_diagrams=True)
            info('Preparing data for each sample...')
            for num in group_nums:
                sample_mut_report, venn_plot_data = self.make_mutations_report(
                    self.mutations_by_experiment,
                    jbrowser_link,
                    samples_data=samples_data,
                    parameters_info=parameters_info,
                    create_venn_diagrams=True,
                    cur_group_num=num)
                self.mutations_reports[num] = (sample_mut_report,
                                               venn_plot_data)
            # self.mutations_plot_data = self.make_mutations_json(mutations_by_experiment)
            # self.substitutions_plot_data = self.make_substitutions_json(mutations_by_experiment)
        #self.actionable_genes_report = self.make_actionable_genes_report(experiment_by_key.values()[0].actionable_genes_dict)
        seq2c_events_by_experiment = {
            e: e.seq2c_events_by_gene
            for e in experiment_by_key.values() if e.seq2c_events_by_gene
        }
        if seq2c_events_by_experiment:
            for num in group_nums:
                seq2c_report = self.make_seq2c_report(
                    seq2c_events_by_experiment,
                    samples_data=samples_data,
                    cur_group_num=num)
                self.seq2c_reports[num] = seq2c_report