Example #1
0
def _symlink_vcfs(callers, datestamp_var_dirpath):
    errors = []
    for caller in callers:
        info(caller.name)
        for sample in caller.samples:
            info(sample.name)

            filt_vcf_fpath = sample.find_filt_vcf_by_callername(caller.name)
            if not verify_file(filt_vcf_fpath):
                errors.append([sample.name, caller.name, filt_vcf_fpath])
            else:
                base_filt_fpath = filt_vcf_fpath[:-3] if filt_vcf_fpath.endswith('.gz') else filt_vcf_fpath
                for fpath in [
                        base_filt_fpath + '.gz', base_filt_fpath + '.idx',
                        base_filt_fpath + '.gz.tbi'
                ]:
                    if verify_file(fpath, silent=True):
                        _symlink_to_dir(fpath, sample.dirpath)
                        # _symlink_to_dir(fpath, datestamp_var_dirpath)

            BCBioStructure.move_vcfs_to_var(sample)

    return errors
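# Hedged usage sketch: _symlink_vcfs returns [sample_name, caller_name, fpath]
# triples for filtered VCFs that failed verification. The call site below is
# hypothetical; `callers`, `datestamp_var_dirpath` and err() come from the pipeline.
errors = _symlink_vcfs(callers, datestamp_var_dirpath)
for sample_name, caller_name, fpath in errors:
    err('Filtered VCF for ' + sample_name + ' (' + caller_name + ') not found: ' + str(fpath))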
Example #2
0
def _correct_qualimap_insert_size_histogram(cnf, samples):
    """ replacing Qualimap insert size histogram with Picard one.
    """
    for s in samples:
        qualimap1_dirname = dirname(s.qualimap_ins_size_hist_fpath).replace(
            'raw_data_qualimapReport', 'raw_data')
        qualimap2_dirname = dirname(s.qualimap_ins_size_hist_fpath)
        if exists(qualimap1_dirname):
            if not exists(qualimap2_dirname):
                shutil.move(qualimap1_dirname, qualimap2_dirname)
            else:
                shutil.rmtree(qualimap1_dirname)
        elif not exists(qualimap2_dirname):
            continue  # no data from either Qualimap v.1 or Qualimap v.2

        # if the qualimap histogram already exists and reuse_intermediate is set, skip
        if verify_file(s.qualimap_ins_size_hist_fpath,
                       silent=True) and cnf.reuse_intermediate:
            pass
        else:
            if verify_file(s.picard_ins_size_hist_txt_fpath):
                with open(s.picard_ins_size_hist_txt_fpath, 'r') as picard_f:
                    one_line_to_stop = False
                    for line in picard_f:
                        if one_line_to_stop:
                            break
                        if line.startswith('## HISTOGRAM'):
                            one_line_to_stop = True

                    with file_transaction(
                            cnf.work_dir,
                            s.qualimap_ins_size_hist_fpath) as tx:
                        with open(tx, 'w') as qualimap_f:
                            for line in picard_f:
                                qualimap_f.write(line)
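# The copy step above relies on a Python file-iterator property: the first loop
# consumes lines up to and including the one after '## HISTOGRAM', and the
# writing loop resumes from that position in the same handle. A minimal
# self-contained sketch of the idiom (file names are hypothetical):
with open('picard_insert_size_metrics.txt') as picard_f:
    for line in picard_f:
        if line.startswith('## HISTOGRAM'):
            next(picard_f, None)  # consume one more line, as one_line_to_stop does
            break
    with open('histogram_only.txt', 'w') as out_f:
        for line in picard_f:  # iteration resumes where the first loop stopped
            out_f.write(line)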
Example #3
0
def main():
    info(' '.join(sys.argv))
    info()
    cnf, bcbio_structure = bcbio_summary_script_proc_params(
            'expression', BCBioStructure.expression_dir)

    step_greetings('Gene expression heatmaps summary for all samples')
    report_caption_names = ['Gene counts', 'Exon counts', 'Gene TPM', 'Isoform TPM']
    genes_dict, transcripts_dict = _get_gene_transcripts_id(cnf)
    for counts_fname, report_caption_name in zip(bcbio_structure.counts_names, report_caption_names):
        counts_fpath = join(bcbio_structure.expression_dirpath, counts_fname)
        if not verify_file(counts_fpath, silent=True):
            raw_counts_fpath = join(bcbio_structure.expression_dirpath, 'raw', 'combined.' + counts_fname.replace('.tsv', ''))
            info('Annotating ' + report_caption_name + ' from ' + raw_counts_fpath)
            annotate_gene_counts(cnf, raw_counts_fpath, counts_fpath, genes_dict)
        verify_file(counts_fpath, is_critical=True, description=counts_fname)

        isoforms_found = counts_fname == 'isoform.sf.tpm'
        used_dict = transcripts_dict if isoforms_found else genes_dict
        report_fpath = join(safe_mkdir(join(bcbio_structure.expression_dirpath, 'html')),
                            counts_fname.replace('.tsv', '') + '.html')

        make_gene_expression_heatmaps(cnf, bcbio_structure, counts_fpath, used_dict, report_fpath,
                                      report_caption_name, keep_gene_names=isoforms_found)
    info('Done')
Example #4
0
def index_vcf(cnf, sample_name, filt_vcf_fpath, caller_name=None):
    if cnf is None:
        global glob_cnf
        cnf = glob_cnf

    info()
    info(sample_name + ((', ' + caller_name) if caller_name else '') +
         ': indexing')

    # for fpath in [pass_vcf_fpath, filt_vcf_fpath]:
    #     if not cnf.reuse_intermediate and not verify_file(fpath, silent=True):
    #         err(fpath + ' does not exist - cannot IGV index')
    #     else:
    #         if cnf.reuse_intermediate and verify_file(fpath + '.idx', silent=True):
    #             info('Reusing existing ' + fpath + '.idx')
    #         else:
    #             igvtools_index(cnf, fpath)

    if not cnf.reuse_intermediate and not verify_file(filt_vcf_fpath,
                                                      silent=True):
        err(filt_vcf_fpath + ' does not exist - cannot gzip and tabix')
    else:
        if cnf.reuse_intermediate and verify_file(filt_vcf_fpath + '.gz', silent=True) \
                and verify_file(filt_vcf_fpath + '.gz.tbi', silent=True):
            info(filt_vcf_fpath + '.gz and .gz.tbi exist; reusing')
        else:
            bgzip_and_tabix(cnf, filt_vcf_fpath)
Example #5
0
def get_chr_lengths_from_seq(seq_fpath):
    chr_lengths = []

    if seq_fpath.endswith('.fai'):
        seq_fpath = splitext(seq_fpath)[0]

    if verify_file(seq_fpath + '.fai', silent=True):
        info('Reading genome index file (.fai) to get chromosome lengths')
        with open(adjust_path(seq_fpath + '.fai'), 'r') as handle:
            for line in handle:
                line = line.strip()
                if line:
                    chrom, length = line.split()[0], int(line.split()[1])
                    chr_lengths.append((chrom, length))
    elif verify_file(seq_fpath, silent=True):
        info('Reading genome sequence (.fa) to get chromosome lengths')
        with open(adjust_path(seq_fpath), 'r') as handle:
            from Bio import SeqIO
            reference_records = SeqIO.parse(handle, 'fasta')
            for record in reference_records:
                chrom = record.id
                chr_lengths.append((chrom, len(record.seq)))
    else:
        critical('Can\'t find ' + seq_fpath + ' and ' + seq_fpath + '.fai')
    return chr_lengths
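# The .fai read above is the samtools faidx index: tab-separated columns
# NAME, LENGTH, OFFSET, LINEBASES, LINEWIDTH, of which only the first two are
# used. A worked example of the parsing (the line content is illustrative):
line = 'chr1\t248956422\t112\t70\t71'
chrom, length = line.split()[0], int(line.split()[1])
assert (chrom, length) == ('chr1', 248956422)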
Example #6
0
def __final_seq2c_scripts(cnf, read_stats_fpath, combined_gene_depths_fpath, output_fpath):
    cov2lr = get_script_cmdline(cnf, 'perl', join('Seq2C', 'cov2lr.pl'), is_critical=True)
    cov2lr_output = join(cnf.work_dir, splitext(basename(output_fpath))[0] + '.cov2lr.tsv')

    controls = ''
    lr2gene_opt = ''
    if cnf.controls:
        controls = '-c ' + cnf.controls  # ':'.join([adjust_path(fpath) for fpath in cnf.controls.split(':')])
        lr2gene_opt = '-c'

    cmdline = '{cov2lr} -a {controls} {read_stats_fpath} {combined_gene_depths_fpath}'.format(**locals())
    call(cnf, cmdline, cov2lr_output, exit_on_error=False)
    info()

    if not verify_file(cov2lr_output):
        return None

    seq2c_opts = cnf.seq2c_opts or ''

    lr2gene = get_script_cmdline(cnf, 'perl', join('Seq2C', 'lr2gene.pl'), is_critical=True)
    cmdline = '{lr2gene} {lr2gene_opt} {seq2c_opts} {cov2lr_output}'.format(**locals())
    res = call(cnf, cmdline, output_fpath, exit_on_error=False)
    info()

    if not verify_file(output_fpath):
        return None

    return res
Example #7
0
def extract_variant_from_bams(cnf, out_dirpath, transcripts, chr_length, samples, chrom, variant, bams_created_before):
    padding = 500
    sambamba = get_system_path(cnf, join(get_ext_tools_dirname(), 'sambamba'), is_critical=True)
    pos, ref, alt, variant_transcripts = variant['pos'], variant['ref'], variant['alt'], variant['transcripts']
    bam_prefix = None
    transcript_name = sorted(variant_transcripts)[0]
    for transcript in variant_transcripts:
        transcript_exons = transcripts[(transcript, chrom)]
        for idx, exon in enumerate(transcript_exons):
            if exon['start'] <= pos <= exon['stop']:
                start, end = exon['start'], exon['stop']
                bam_prefix = '{chrom}-{transcript_name}-{idx}-'.format(**locals())
        if bam_prefix:
            break
    if not bam_prefix:
        start, end = max(1, pos - padding), min(chr_length, pos + padding)
        ref_ = ref[:20]
        alt_ = alt[:20]
        bam_prefix = '{chrom}-{pos}-{ref_}-{alt_}-'.format(**locals())
    bams_by_sample = dict()
    for sample in samples:
        sample_name = sample.name.replace('-', '_')
        output_bam_fpath = join(out_dirpath, bam_prefix + '{sample_name}.bam'.format(**locals()))
        if output_bam_fpath in bams_created_before:
            continue
        if cnf.reuse_intermediate and verify_file(output_bam_fpath, silent=True):
            bams_by_sample[sample.name] = output_bam_fpath
        else:
            cmdline = '{sambamba} slice {sample.bam} {chrom}:{start}-{end} -o {output_bam_fpath}'.format(**locals())
            call(cnf, cmdline, silent=not cnf.verbose)
            if verify_file(output_bam_fpath, silent=True):
                cmdline = '{sambamba} index {output_bam_fpath}'.format(**locals())
                call(cnf, cmdline, silent=not cnf.verbose)
                bams_by_sample[sample.name] = output_bam_fpath
    return bams_by_sample
Example #8
0
def launch_bedcoverage_hist(work_dir,
                            bed,
                            bam,
                            chr_lengths_fpath,
                            bedcov_output_fpath=None,
                            bedtools='bedtools'):
    if not bedcov_output_fpath:
        bedcov_output_fpath = join(
            work_dir,
            splitext_plus(basename(bed))[0] + '__' +
            splitext_plus(basename(bam))[0] + '_bedcov_output.txt')

    if bam.endswith('.bam'):
        bam = bam_to_bed_nocnf(bam, bedtools)
    verify_file(bam,
                is_critical=True,
                description='BAM to BED conversion result')

    v = bedtools_version(bedtools)
    if v and v >= 24:
        cmdline = '{bedtools} coverage -sorted -g {chr_lengths_fpath} -a {bed} -b {bam} -hist'.format(
            **locals())
    else:
        cmdline = '{bedtools} coverage -a {bam} -b {bed} -hist'.format(
            **locals())
    cmdline += ' > ' + bedcov_output_fpath
    info(cmdline)
    os.system(cmdline)
    res = verify_file(bedcov_output_fpath)
    if res:
        info('Done, saved to ' + bedcov_output_fpath)
    else:
        err('Error, result is non-existent or empty')
Example #9
0
def find_fastq_pairs_by_sample_names(fastq_fpaths, sample_names):
    fastq_by_sn = OrderedDict()

    for sn in sample_names:
        sn_fastq_fpaths = sorted(
            [f for f in fastq_fpaths if basename(f).startswith(sn + '_R')])
        if len(sn_fastq_fpaths) == 0:
            err('Error: no fastq found for ' + sn)
            fastq_by_sn[sn] = None
        elif len(sn_fastq_fpaths) > 2:
            critical('Error: more than 2 fastq files starting with ' + sn +
                     '_R: ' + ', '.join(sn_fastq_fpaths))
        elif len(sn_fastq_fpaths) == 1:
            warn('Warning: only single fastq file is found for ' + sn +
                 '. Treating as single reads.')
            fastq_by_sn[sn] = [
                verify_file(sn_fastq_fpaths[0],
                            description='sn_fastq_fpaths[0] for ' + str(sn)),
                None
            ]
        else:
            fastq_by_sn[sn] = [
                verify_file(fpath,
                            description='fpath from sn_fastq_fpaths for ' +
                            str(sn)) for fpath in sn_fastq_fpaths
            ]

    return fastq_by_sn
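# Hedged usage sketch of the pairing convention expected above: files must start
# with '<sample>_R'. Paths are hypothetical; verify_file would additionally
# check that they exist and are non-empty.
fastqs = ['/data/s1_R1.fastq.gz', '/data/s1_R2.fastq.gz', '/data/s2_R1.fastq.gz']
pairs = find_fastq_pairs_by_sample_names(fastqs, ['s1', 's2'])
# pairs['s1'] -> [s1_R1, s1_R2];  pairs['s2'] -> [s2_R1, None] (single reads)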
Example #10
0
def check_genome_resources(cnf):
    if cnf.genome is None:
        critical('Please specify the genome build (one of those available in ' +
                 cnf.sys_cnf +
                 ') using the --genome option (e.g., --genome hg38).')

    if not cnf.genomes:
        critical('"genomes" section is not specified in system config ' +
                 cnf.sys_cnf)

    info('Genome: ' + str(cnf.genome.name))

    for key in cnf.genome.keys():
        if key != 'name' and isinstance(cnf.genome[key], basestring):
            cnf.genome[key] = adjust_system_path(cnf.genome[key])

            if not verify_obj_by_path(cnf.genome[key], key, silent=True):
                gz_fpath = cnf.genome[key] + '.gz'
                if not cnf.genome[key].endswith('.gz') and verify_file(gz_fpath, silent=True):
                    cnf.genome[key] = gz_fpath

    if not cnf.genome.features or not cnf.genome.bed_annotation_features or not cnf.genome.cds:
        warn('Warning: features, bed_annotation_features, and cds must be specified '
             'in the system config (' + cnf.sys_cnf + ').')

    if not cnf.transcripts_fpath:
        cnf.transcripts_fpath = get_canonical_transcripts(cnf.genome.name, ensembl=True)
Example #11
0
def submit_job(cnf, cmdline, job_name, wait_for_steps=None, threads=1,
               output_fpath=None, stdout_to_outputfile=True, run_on_chara=False, **kwargs):

    prefix = str(cnf.project_name) + '_'
    if job_name: prefix += job_name + '_'
    prefix += datetime.now().strftime("%Y_%m_%d_%H_%M_%S") + '_'
    f, done_marker_fpath = make_tmpfile(cnf, prefix=prefix, suffix='.done')
    f, error_marker_fpath = make_tmpfile(cnf, prefix=prefix, suffix='.error')
    if isfile(done_marker_fpath): os.remove(done_marker_fpath)
    if isfile(error_marker_fpath): os.remove(error_marker_fpath)
    job_id = basename(splitext(done_marker_fpath)[0])

    tx_output_fpath = None
    if output_fpath:
        if cnf.reuse_intermediate and verify_file(output_fpath, silent=True):
            info(output_fpath + ' exists, reusing')
            j = JobRunning(None, None, None, None, None, output_fpath=output_fpath, **kwargs)
            j.is_done = True
            return j
        if stdout_to_outputfile:
            tx_output_fpath = output_fpath + '.tx'
            if isfile(tx_output_fpath):
                os.remove(tx_output_fpath)
            cmdline += ' > ' + tx_output_fpath
        else:
            if isfile(output_fpath):
                os.remove(output_fpath)

    qsub = get_system_path(cnf, 'qsub', is_critical=True)
    bash = get_system_path(cnf, 'bash', is_critical=True)

    if cnf.log_dir:
        err_fpath = log_fpath = join(cnf.log_dir, job_id + '.log')
    else:
        fd, fpath = make_tmpfile(cnf, suffix=job_id + '.log', text=True)
        err_fpath = log_fpath = fpath

    queue = cnf.queue
    runner_script = adjust_system_path(cnf.qsub_runner)
    verify_file(runner_script, is_critical=True, description='qsub_runner')
    hold_jid_line = '-hold_jid ' + ','.join(wait_for_steps or ['_'])
    mem = threads * 15
    priority = 0
    if cnf.qsub_priority:
        priority = cnf.qsub_priority
    extra_qsub_opts = ''
    if run_on_chara and is_us():
        extra_qsub_opts += '-l h="chara|rask"'
    cmdline = cmdline.replace('"', '\\"').replace('\\\\"', '\\"')
    qsub_cmdline = (
        '{qsub} -pe smp {threads} {extra_qsub_opts} -S {bash} -q {queue} -p {priority} '
        '-j n -o {log_fpath} -e {err_fpath} {hold_jid_line} '
        '-N {job_id} {runner_script} {done_marker_fpath} {error_marker_fpath} "{cmdline}"'.format(**locals()))
    info('Submitting job ' + job_id)
    info(qsub_cmdline)
    job = JobRunning(job_id, log_fpath, qsub_cmdline, done_marker_fpath, error_marker_fpath,
                     output_fpath=output_fpath, tx_output_fpath=tx_output_fpath,
                     stdout_to_outputfile=stdout_to_outputfile, **kwargs)
    call(cnf, qsub_cmdline, silent=True)
    return job
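# The quote-escaping in submit_job is easy to misread: the first replace escapes
# every double quote so the command survives being wrapped in "{cmdline}", and
# the second collapses quotes that were already escaped so their backslash is
# not doubled. A worked example (the command is hypothetical):
cmd = 'grep "foo" file'
escaped = cmd.replace('"', '\\"').replace('\\\\"', '\\"')
assert escaped == 'grep \\"foo\\" file'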
Example #12
0
def proc_args(argv):
    info(' '.join(sys.argv))
    info()

    description = 'This script generates target QC reports for each BAM provided as an input. ' \
                  'Usage: ' + basename(__file__) + ' sample2bam.tsv --bed target.bed --controls sample1:sample2 -o results_dir'
    parser = OptionParser(description=description, usage=description)
    add_cnf_t_reuse_prjname_donemarker_workdir_genome_debug(parser)
    parser.add_option('-o', dest='output_dir', metavar='DIR', default=join(os.getcwd(), 'seq2c'))
    parser.add_option('--bed', dest='bed', help='BED file to run Seq2C analysis')
    parser.add_option('-c', '--controls', dest='controls', help='Optional control sample names for Seq2C. For multiple controls, separate them using :')
    parser.add_option('--seq2c-opts', dest='seq2c_opts', help='Options for the final lr2gene.pl script.')
    parser.add_option('--no-prep-bed', dest='prep_bed', help=SUPPRESS_HELP, action='store_false', default=True)

    (opts, args) = parser.parse_args()
    logger.is_debug = opts.debug

    if len(args) == 0:
        parser.print_usage()
        sys.exit(1)
    if len(args) == 1 and not args[0].endswith('.bam'):
        sample_names, bam_fpaths = read_samples(verify_file(args[0], is_critical=True, description='Input sample2bam.tsv'))
        bam_by_sample = OrderedDict()
        for s, b in zip(sample_names, bam_fpaths):
            bam_by_sample[s] = b
    else:
        bam_by_sample = find_bams(args)

    run_cnf = determine_run_cnf(opts, is_wgs=not opts.__dict__.get('bed'))
    cnf = Config(opts.__dict__, determine_sys_cnf(opts), run_cnf)
    check_genome_resources(cnf)

    cnf.output_dir = adjust_path(cnf.output_dir)
    verify_dir(dirname(cnf.output_dir), is_critical=True)
    safe_mkdir(cnf.output_dir)

    if not cnf.project_name:
        cnf.project_name = basename(cnf.output_dir)
    info('Project name: ' + cnf.project_name)

    cnf.proc_name = 'Seq2C'
    set_up_dirs(cnf)

    samples = [
        source.TargQC_Sample(name=s_name, dirpath=join(cnf.output_dir, s_name), bam=bam_fpath)
            for s_name, bam_fpath in bam_by_sample.items()]
    info('Samples: ')
    for s in samples:
        info('  ' + s.name)
    samples.sort(key=lambda _s: _s.key_to_sort())

    target_bed = verify_bed(cnf.bed, is_critical=True) if cnf.bed else None

    if not cnf.only_summary:
        cnf.qsub_runner = adjust_system_path(cnf.qsub_runner)
        if not cnf.qsub_runner: critical('Error: qsub-runner is not specified in the sys-config.')
        verify_file(cnf.qsub_runner, is_critical=True)

    return cnf, samples, target_bed, cnf.output_dir
Example #13
0
def calculate_coverage_use_grid(cnf, samples, output_dirpath):
    assert len(samples) > 0

    sambamba = get_system_path(cnf,
                               join(get_ext_tools_dirname(), 'sambamba'),
                               is_critical=True)

    chr_len_fpath = get_chr_len_fpath(cnf)
    jobs_to_wait = []

    for sample in samples:
        sample_output_dirpath = join(output_dirpath, sample.name)
        safe_mkdir(sample_output_dirpath)

    for chrom in chromosomes:
        info('Processing chromosome ' + chrom)
        avg_cov_output_fpath = join(output_dirpath, chrom + '.txt.gz')
        sample_output_fpaths = [
            join(output_dirpath, sample.name, chrom + '.txt.gz')
            for sample in samples
        ]

        sample_names = ','.join(sample.name for sample in samples)
        chrom_bams = []

        for sample in samples:
            if not verify_file(sample.bam):
                err('BAM for ' + sample.name + ' does not exist!')
                continue
            output_bam_fpath = join(
                cnf.work_dir,
                basename(sample.name) + '_' + str(chrom) + '.bam')
            cmdline = '{sambamba} slice {sample.bam} {chrom}'.format(
                **locals())
            call(cnf, cmdline, output_fpath=output_bam_fpath)
            if verify_file(output_bam_fpath):
                chrom_bams.append(output_bam_fpath)

        bam_fpaths = ','.join(chrom_bams)

        if cnf.reuse_intermediate and verify_file(avg_cov_output_fpath, silent=True) and \
                all(verify_file(output_fpath, silent=True) for output_fpath in sample_output_fpaths):
            info(avg_cov_output_fpath + ' exists, reusing')
        else:
            j = _submit_region_cov(cnf, cnf.work_dir, chrom, bam_fpaths,
                                   sample_names, output_dirpath, chr_len_fpath)
            if j and not j.is_done:
                jobs_to_wait.append(j)
            info()

        if len(jobs_to_wait) >= cnf.threads:
            info('Submitted ' + str(len(jobs_to_wait)) + ' jobs, waiting...')
            jobs_to_wait = wait_for_jobs(cnf, jobs_to_wait)
            jobs_to_wait = []
        elif not jobs_to_wait:
            info('No jobs to submit.')
    if jobs_to_wait:
        wait_for_jobs(cnf, jobs_to_wait)
Example #14
0
def determine_sys_cnf(opts):
    if 'sys_cnf' in opts.__dict__ and opts.sys_cnf:
        return verify_file(opts.sys_cnf, is_critical=True)
    else:
        opts.__dict__['sys_cnf'] = verify_file(detect_sys_cnf_by_location(),
                                               is_critical=True)

    debug('Using system configuration ' + opts.sys_cnf)
    return opts.sys_cnf
Example #15
0
def load_yaml_config(fpath):
    verify_file(fpath, is_critical=True)
    try:
        with open(fpath) as f:
            dic = load_yaml(f, Loader=Loader)
    except Exception:
        err(format_exc())
        critical('Could not parse bcbio YAML ' + fpath)
    else:
        return dic
Example #16
0
def add_project_files_to_jbrowse(cnf, bcbio_structure):
    genome = cnf.genome.name
    jbrowse_data_path, _, _ = set_folders(genome)

    jbrowse_dirpath = join(jbrowse_data_path, 'tracks')
    jbrowse_project_dirpath = join(jbrowse_dirpath,
                                   bcbio_structure.project_name)

    safe_mkdir(jbrowse_project_dirpath)
    jbrowse_tracks_fpath = join(jbrowse_data_path, 'tracks.conf')

    vcf_fpath_by_sample = None
    caller = bcbio_structure.variant_callers.get('vardict') or \
             bcbio_structure.variant_callers.get('vardict-java')
    if caller:
        vcf_fpath_by_sample = caller.get_filt_vcf_by_sample()

    for sample in bcbio_structure.samples:
        if sample.bam:
            index_bam(cnf, sample.bam, use_grid=True)

    for sample in bcbio_structure.samples:
        if all(isfile(join(jbrowse_project_dirpath, sample.name + ext)) for ext in ['.bam', '.bam.bai', '.vcf.gz', '.vcf.gz.tbi', '.bigwig'])\
                and check_tracks_in_configs(sample.name, bcbio_structure.project_name, jbrowse_tracks_fpath, vcf_fpath_by_sample):
            info(sample.name + ' was exported to jBrowse previously.')
            continue
        vcf_link = None
        if vcf_fpath_by_sample:
            vcf_fpath = vcf_fpath_by_sample.get(sample.name)
            if vcf_fpath and verify_file(vcf_fpath):
                vcf_link = create_jbrowse_symlink(genome,
                                                  bcbio_structure.project_name,
                                                  sample.name, vcf_fpath)
                if not verify_file(vcf_fpath + '.tbi'):
                    tabix = get_system_path(cnf, 'tabix')  # `tabix` was undefined here; resolve it like bedtools elsewhere
                    cmdline = '{tabix} {vcf_fpath}'.format(**locals())
                    call(cnf, cmdline, exit_on_error=False)
                create_jbrowse_symlink(genome, bcbio_structure.project_name,
                                       sample.name, vcf_fpath + '.tbi')

        if sample.bam:
            bam_link = create_jbrowse_symlink(genome,
                                              bcbio_structure.project_name,
                                              sample.name, sample.bam)
            create_jbrowse_symlink(genome, bcbio_structure.project_name,
                                   sample.name, sample.bam + '.bai')
            bigwig_link = create_jbrowse_symlink(
                genome, bcbio_structure.project_name, sample.name,
                splitext(sample.bam)[0] + '.bigwig')
            print_sample_tracks_info(sample.name, bcbio_structure.project_name,
                                     trunc_symlink(bam_link),
                                     trunc_symlink(bigwig_link),
                                     trunc_symlink(vcf_link),
                                     jbrowse_tracks_fpath)
Example #17
0
def save_regions_to_bed(cnf, regions, bed_fpath, save_original_fields=False):
    if isfile(bed_fpath):
        if cnf.reuse_intermediate:
            verify_file(bed_fpath, is_critical=True)
            return bed_fpath
        else:
            os.remove(bed_fpath)

    with file_transaction(cnf.work_dir, bed_fpath) as tx_fpath:
        save_regions_to_bed_nocnf(regions, tx_fpath, save_original_fields)
    return bed_fpath
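# save_regions_to_bed, like several functions above, writes through
# file_transaction so a crash mid-write cannot leave a truncated output behind.
# A minimal sketch of that contract, assuming the usual write-to-temp-then-rename
# behaviour (an illustrative stand-in, not the pipeline's own implementation):
import os
import contextlib

@contextlib.contextmanager
def _file_transaction_sketch(work_dir, final_fpath):  # hypothetical stand-in
    tx_fpath = os.path.join(work_dir, os.path.basename(final_fpath) + '.tx')
    yield tx_fpath                     # caller writes to the temporary path
    os.rename(tx_fpath, final_fpath)   # atomic on POSIX within one filesystem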
Example #18
0
def main(args):
    if len(args) < 2:
        critical('Usage: ' + __file__ +
                 ' InputRootDirectory OutputRootDirectory [Build=hg38]')
        sys.exit(1)

    inp_root = adjust_path(args[0])
    out_root = adjust_path(args[1])

    build = 'hg38'
    if len(args) >= 3:
        build = args[2]

    chain_fpath = chains[build.lower()]

    for inp_dirpath, subdirs, files in os.walk(inp_root):
        for fname in files:
            if fname.endswith('.bed'):
                inp_fpath = adjust_path(join(inp_dirpath, fname))
                print inp_fpath + ': ' + str(
                    count_bed_cols(inp_fpath)) + ' columns'

                out_dirpath = adjust_path(
                    join(out_root, relpath(inp_dirpath, inp_root)))
                safe_mkdir(out_dirpath)
                out_fpath = adjust_path(join(out_dirpath, fname))
                unlifted_fpath = adjust_path(
                    join(out_dirpath, fname + '.unlifted'))

                cmdline = ''
                lift_inp_fpath = inp_fpath

                with open(inp_fpath) as f:
                    fs = f.readline().split('\t')
                try:
                    int(fs[6])
                    int(fs[7])
                except (IndexError, ValueError):
                    info('Cutting ' + inp_fpath)
                    cmdline += 'cut -f1,2,3,4 "{inp_fpath}" > __cut; '
                    lift_inp_fpath = '__cut'

                cmdline += liftover_fpath + ' "{lift_inp_fpath}" {chain_fpath} "{out_fpath}" "{unlifted_fpath}"'
                cmdline = cmdline.format(**locals())
                info(cmdline)
                os.system(cmdline)
                verify_file(out_fpath)
                if isfile(unlifted_fpath):
                    if getsize(unlifted_fpath) <= 0:
                        os.remove(unlifted_fpath)
                    else:
                        err('Some records were unlifted and saved to ' +
                            unlifted_fpath)
Example #19
0
def determine_run_cnf(opts, is_wgs=False, is_targetseq=False):
    if opts.run_cnf:
        opts.run_cnf = adjust_path(opts.run_cnf)
    elif is_wgs:
        opts.run_cnf = defaults['run_cnf_wgs']
    elif is_targetseq:
        opts.run_cnf = defaults['run_cnf_deep_seq']
    else:
        opts.run_cnf = defaults['run_cnf_exome_seq']

    verify_file(opts.run_cnf, is_critical=True)
    debug('Using run configuration ' + opts.run_cnf)
    return opts.run_cnf
Example #20
0
def process_one(cnf, output_dir, bam_fpath, features_bed,
                features_no_genes_bed):
    sample = TargQC_Sample(cnf.sample, output_dir, bed=cnf.bed, bam=cnf.bam)
    sample.l_fpath = cnf.l_fpath
    sample.r_fpath = cnf.r_fpath

    # if not sample.bam and sample.l_fpath and sample.r_fpath:
    #     sample.bam = proc_fastq(cnf, sample, verify_file(cnf.l_fpath), verify_file(cnf.r_fpath))

    if not bam_fpath:
        critical(sample.name + ': BAM file is required.')

    info('Using alignment ' + sample.bam)

    target_bed = verify_file(cnf.bed, is_critical=True) if cnf.bed else None
    bam_fpath = verify_file(sample.bam, is_critical=True)
    index_bam(cnf, bam_fpath)

    gene_keys_list = None
    if cnf.prep_bed is not False:
        info('Preparing the BED file.')
        features_bed, features_no_genes_bed, target_bed, seq2c_bed = prepare_beds(
            cnf, features_bed, target_bed)

        gene_keys_set, gene_keys_list, target_bed, features_bed, features_no_genes_bed = \
            extract_gene_names_and_filter_exons(cnf, target_bed, features_bed, features_no_genes_bed)
    else:
        info('The BED file is ready, skipping preparing.')
        gene_keys_set, gene_keys_list, _, _, _ = \
            extract_gene_names_and_filter_exons(cnf, target_bed, features_bed, features_no_genes_bed)

    picard_ins_size_hist(cnf, sample, bam_fpath, output_dir)

    avg_depth, gene_by_name_and_chrom, reports = make_targqc_reports(
        cnf, output_dir, sample, bam_fpath, features_bed,
        features_no_genes_bed, target_bed, gene_keys_list)

    # #if cnf.extended:
    # try:
    #     info('Generating flagged regions report...')
    #     flagged_report = generate_flagged_regions_report(cnf, cnf.output_dir, sample, avg_depth, gene_by_name_and_chrom)
    #     if not flagged_report:
    #         err('Flagged regions report was not generated')
    #         err()
    # except:
    #     err(format_exc())

    return reports
Example #21
0
def _make_tarqc_html_report(cnf,
                            output_dir,
                            samples,
                            bed_fpath=None,
                            tag_by_sample=None):
    header_storage = get_header_metric_storage(
        cnf.coverage_reports.depth_thresholds,
        is_wgs=bed_fpath is None,
        padding=cnf.coverage_reports.padding)

    jsons_by_sample = {
        s.name: s.targetcov_json_fpath
        for s in samples if verify_file(s.targetcov_json_fpath)
    }
    htmls_by_sample = dict()  # {s.name: s.targetcov_html_fpath for s in samples if verify_file(s.targetcov_html_fpath)}

    if not jsons_by_sample:
        return None, None, None

    targqc_full_report = FullReport.construct_from_sample_report_jsons(
        samples, output_dir, jsons_by_sample, htmls_by_sample)

    for sample_report in targqc_full_report.sample_reports:
        if tag_by_sample:
            sample_report.set_project_tag(
                tag_by_sample[sample_report.sample.name])
        if verify_file(sample_report.sample.qualimap_html_fpath):
            url = relpath(sample_report.sample.qualimap_html_fpath, output_dir)
            r = sample_report.find_record(sample_report.records, 'Qualimap')
            if r:
                r.url = url
            else:
                sample_report.add_record(metric_name='Qualimap',
                                         value='Qualimap',
                                         url=url,
                                         silent=True)

    _run_multisample_qualimap(cnf, output_dir, samples, targqc_full_report)

    txt_fpath = targqc_full_report.save_txt(
        join(output_dir, BCBioStructure.targqc_name + '.txt'))
    tsv_fpath = targqc_full_report.save_tsv(
        join(output_dir, BCBioStructure.targqc_name + '.tsv'))
    html_fpath = targqc_full_report.save_html(
        cnf, join(output_dir, BCBioStructure.targqc_name + '.html'), 'TargQC')

    return txt_fpath, tsv_fpath, html_fpath
Example #22
0
def _intersect_with_tricky_regions(cnf, selected_bed_fpath, sample):
    info()
    info('Detecting problematic regions for ' + sample)

    bed_filenames = [fn + '.bed.gz' for fn in tricky_regions_fnames_d.keys()]

    merged_bed_fpaths = [
        join(cnf.genome.tricky_regions, 'merged', bed_filename)
        for bed_filename in bed_filenames
    ]

    info('Intersecting BED ' + selected_bed_fpath +
         ' using BED files with tricky regions')

    intersection_fpath = join(
        cnf.work_dir,
        splitext_plus(basename(selected_bed_fpath))[0] +
        '_tricky_vcf_bed.intersect')
    if not cnf.reuse_intermediate or not verify_file(
            intersection_fpath, silent=True, is_critical=False):
        bedtools = get_system_path(cnf, 'bedtools')
        cmdline = bedtools + ' intersect -header -a ' + selected_bed_fpath + ' -b ' + ' '.join(
            merged_bed_fpaths) + ' -wo -filenames'
        call(cnf,
             cmdline,
             output_fpath=intersection_fpath,
             exit_on_error=False)

    return intersection_fpath
Example #23
0
def _get_gene_transcripts_id(cnf):
    genes_dict = dict()
    transcripts_dict = dict()

    if not cnf.genome.all_transcripts:
        critical('File with transcript and gene IDs for ' + cnf.genome.name + ' is not specified! Heatmaps cannot be created.')
    if not verify_file(cnf.genome.all_transcripts):
        critical('File with transcript and gene IDs for ' + cnf.genome.name + ' was not found at ' + cnf.genome.all_transcripts + '! Heatmaps cannot be created.')

    info('Getting transcripts ID and genes ID from ' + cnf.genome.all_transcripts)

    with open_gzipsafe(cnf.genome.all_transcripts) as f:
        for i, l in enumerate(f):
            if l.startswith('#'):
                continue
            chrom, _, feature, start, end, _, strand, _, props_line = l.replace('\n', '').split('\t')
            if feature != 'transcript':
                continue
            try:
                _prop_dict = dict((t.strip().split(' ')[0], ' '.join(t.strip().split(' ')[1:]))
                                  for t in props_line.split(';') if t.strip())
            except ValueError:
                sys.stderr.write(format_exc())
                sys.stderr.write(l)
                continue  # malformed attributes; skip the line instead of reusing stale values

            gene_symbol = _rm_quotes(_prop_dict['gene_name'])
            gene_id = _rm_quotes(_prop_dict['gene_id'])
            transcript_id = _rm_quotes(_prop_dict['transcript_id'])
            #gene = Gene(gene_symbol, chrom=chrom, gene_id=gene_id, transcript_id=transcript_id)
            genes_dict[gene_id] = gene_symbol
            transcripts_dict[transcript_id] = gene_symbol
    return genes_dict, transcripts_dict
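# The props_line parsing above turns the ninth GTF column into a dict; a worked
# example on a typical attributes string (real TP53 IDs, illustrative line):
props_line = 'gene_id "ENSG00000141510"; gene_name "TP53"; transcript_id "ENST00000269305";'
_prop_dict = dict((t.strip().split(' ')[0], ' '.join(t.strip().split(' ')[1:]))
                  for t in props_line.split(';') if t.strip())
assert _prop_dict['gene_name'] == '"TP53"'  # _rm_quotes() then strips the quotes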
Example #24
0
def leave_main_sample(cnf, vcf_fpath, samplename):
    index = get_sample_column_index(vcf_fpath, samplename)
    if index is None:
        return vcf_fpath

    # def _f1(rec):
    #     rec.samples = [sample_name]
    #     return rec
    #
    info('Keeping the SAMPLE column only for ' + samplename)
    # vcf_fpath = iterate_vcf(cnf, vcf_fpath, _f1, suffix=sample_name)
    # out_fpath = extract_sample(cnf, vcf_fpath, sample_name)
    # info()

    def _f(line, i):
        if line and (line.startswith('#CHROM') or line[0] != '#'):
            ts = line.split('\t')
            return '\t'.join(ts[:9] + [ts[9 + index]])
        return line
    vcf_fpath = iterate_file(cnf, vcf_fpath, _f, suffix='1sm')

    if not verify_file(vcf_fpath):
        err('Error: leave_main_sample did not generate an output file.')
        return None

    return vcf_fpath
Example #25
0
def join_vcf2txt_results(cnf, vcf_fpath_by_sample, vcf2txt_out_fpath):
    info('WGS; running vcf2txt separately for each sample to save memory.')
    vcf2txt_outputs_by_vcf_fpath = OrderedDict()
    for vcf_fpath in vcf_fpath_by_sample.values():
        sample_output_fpath = add_suffix(vcf2txt_out_fpath,
                                         splitext(basename(vcf_fpath))[0])
        vcf2txt_outputs_by_vcf_fpath[vcf_fpath] = sample_output_fpath
        info()

    info('Joining vcf2txt outputs... (' +
         str(len(vcf2txt_outputs_by_vcf_fpath)) + ' out of ' +
         str(len(vcf_fpath_by_sample)) + ' successful), ' + 'writing to ' +
         vcf2txt_out_fpath)
    with file_transaction(cnf.work_dir, vcf2txt_out_fpath) as tx:
        with open(tx, 'w') as out:
            for i, (vcf_fpath, sample_output_fpath) in enumerate(
                    vcf2txt_outputs_by_vcf_fpath.items()):
                info('   Reading ' + sample_output_fpath)
                with open(sample_output_fpath) as inp:
                    for j, l in enumerate(inp):
                        if j == 0 and i != 0:
                            continue
                        out.write(l)
    if verify_file(vcf2txt_out_fpath):
        info('Saved ' + vcf2txt_out_fpath)
        return vcf2txt_out_fpath
    else:
        return None
Example #26
0
def verify_vcf(vcf_fpath, silent=False, is_critical=False):
    if not verify_file(vcf_fpath, silent=silent, is_critical=is_critical):
        return None
    debug('File ' + vcf_fpath + ' exists and is not empty')
    vcf = open_gzipsafe(vcf_fpath)
    debug('File ' + vcf_fpath + ' opened')
    l = next(vcf, None)
    if l is None:
        (critical if is_critical else err)('Error: cannot read the VCF file ' + vcf_fpath)
        return None
    if not l.startswith('##fileformat=VCF'):
        (critical if is_critical else err)('Error: VCF must start with ##fileformat=VCF ' + vcf_fpath)
        return None

    try:
        reader = vcf_parser.Reader(vcf)
    except:
        err('Error: cannot open the VCF file ' + vcf_fpath)
        if is_critical: raise
    else:
        debug('File ' + vcf_fpath + ' opened as VCF')
        try:
            rec = next(reader)
        except IndexError:
            err('Error: cannot parse records in the VCF file ' + vcf_fpath)
            debug('IndexError parsing VCF file ' + vcf_fpath)
            if is_critical: raise
        except ValueError:
            err('Error: cannot parse records in the VCF file ' + vcf_fpath)
            debug('ValueError parsing VCF file ' + vcf_fpath)
            if is_critical: raise
        except StopIteration:
            debug('No records in the VCF file ' + vcf_fpath)
            if not silent:
                warn('VCF file ' + vcf_fpath + ' has no records.')
            return vcf_fpath
        except:
            err('Error: cannot parse records in the VCF file ' + vcf_fpath)
            debug('Other error parsing VCF file ' + vcf_fpath)
            if is_critical: raise
        else:
            debug('A record was read from the VCF file ' + vcf_fpath)
            return vcf_fpath
        # f = open_gzipsafe(output_fpath)
        # l = f.readline()
        # if 'Cannot allocate memory' in l:
        #     f.close()
        #     f = open_gzipsafe(output_fpath)
        #     contents = f.read()
        #     if not silent:
        #         if is_critical:
        #             critical('SnpSift failed with memory issue:\n' + contents)
        #         else:
        #             err('SnpSift failed with memory issue:\n' + contents)
        #             return None
        #     f.close()
        #     return None
        # return output_fpath
    finally:
        vcf.close()
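# verify_vcf returns the path on success (including the no-records case, which
# only warns) and None on failure unless is_critical is set, so it chains
# naturally at call sites. Hedged usage sketch (the path is hypothetical):
vcf_fpath = verify_vcf('/data/sample1-vardict.anno.filt.vcf.gz')
if vcf_fpath is None:
    err('Skipping sample1: the VCF failed verification')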
Example #27
0
def _correct_qualimap_genome_results(cnf, samples):
    """ fixing java.lang.Double.parseDouble error on entries like "6,082.49"
    """
    for s in samples:
        if verify_file(s.qualimap_genome_results_fpath):
            correction_is_needed = False
            with open(s.qualimap_genome_results_fpath, 'r') as f:
                content = f.readlines()
                metrics_started = False
                for line in content:
                    if ">> Reference" in line:
                        metrics_started = True
                    if metrics_started:
                        if line.find(',') != -1:
                            correction_is_needed = True
                            break
            if correction_is_needed:
                with open(s.qualimap_genome_results_fpath, 'w') as f:
                    metrics_started = False
                    for line in content:
                        if ">> Reference" in line:
                            metrics_started = True
                        if metrics_started:
                            if line.find(',') != -1:
                                line = line.replace(',', '')
                        f.write(line)
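# A one-line illustration of the parseDouble failure the docstring refers to:
# Qualimap writes locale-formatted numbers with thousands separators, and
# stripping the commas makes them parseable again.
line = 'mean coverageData = 6,082.49\n'
assert line.replace(',', '') == 'mean coverageData = 6082.49\n'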
Example #28
0
def split_bams(cnf, samples, vcf_fpath):
    variants_by_chrom = parse_variants(vcf_fpath)
    temp_output_dirpath = join(cnf.work_dir, 'temp')
    safe_mkdir(temp_output_dirpath)
    info('Splitting BAM files...')
    chr_lengths_dict = dict(get_chr_lengths_from_seq(cnf.genome.seq))
    for chrom, variants in variants_by_chrom.iteritems():
        chr_length = chr_lengths_dict[chrom]
        transcripts = get_transcipts_with_exons_from_features(verify_file(cnf.features, is_critical=True), cur_chrom=chrom)
        bams_created_before = []
        bams_by_sample = defaultdict(list)
        info('Extracting variant coverage for all samples for ' + chrom + ', ' + str(len(variants)) + ' variants')
        for variant in variants:
            variant_bams_by_sample = extract_variant_from_bams(cnf, temp_output_dirpath,
                 transcripts, chr_length, samples, chrom, variant, bams_created_before)
            bams_created_before.extend(variant_bams_by_sample.values())
            for sample_name, bam_fpath in variant_bams_by_sample.iteritems():
                bams_by_sample[sample_name].append(bam_fpath)
        chrom = chrom.replace('chr', '')
        info()
        for sample_name, bam_fpaths in bams_by_sample.iteritems():
            info('Making combined BAMs for chr' + chrom + ' for sample ' + sample_name)
            bam_fname = '{chrom}-{sample_name}.bam'.format(**locals())
            temp_combined_bam_fpath = join(temp_output_dirpath, bam_fname)
            combined_bam_fpath = join(cnf.output_dir, bam_fname)
            generate_combined_bam(cnf, bam_fpaths, temp_combined_bam_fpath, combined_bam_fpath)
            info()
    info('Removing BAM files...')
    shutil.rmtree(temp_output_dirpath, ignore_errors=True)
Example #29
0
def make_vcf2txt_cmdl_params(cnf, vcf_fpath_by_sample):
    c = cnf.variant_filtering
    min_freq = c.act_min_freq

    cmdline = \
        '-r 1.0 -R 1.0 -P {c.filt_p_mean} -Q {c.filt_q_mean} -D {c.filt_depth} -V {c.min_vd} ' \
        '-f {min_freq} -p {c.min_p_mean} -q {c.min_q_mean} ' \
        '-M {c.min_mq} -o {c.signal_noise} -L'.format(**locals())

    if c.bias:
        cmdline += ' -b '

    dbsnp_multi_mafs = cnf.genome.dbsnp_multi_mafs
    if dbsnp_multi_mafs and verify_file(dbsnp_multi_mafs):
        cmdline += ' -A ' + dbsnp_multi_mafs
    else:
        cmdline += ' -A ""'

    if c.amplicon_based:
        cmdline += ' -a '

    # corr_vcf_fpath_by_sample = dict()
    # for sn, vcf_fpath in vcf_fpath_by_sample.items():
    #     ungz = vcf_fpath
    #     if vcf_fpath.endswith('.gz'):
    #         ungz = splitext(vcf_fpath)[0]
    #         call(cnf, 'gunzip ' + vcf_fpath, output_fpath=ungz)
    #     corr_vcf_fpath_by_sample[sn] = ungz

    cmdline += ' ' + ' '.join(vcf_fpath_by_sample.values())
    return cmdline
Example #30
0
def _get_depth_for_each_variant(cnf, var_by_site, clipped_gz_vcf_fpath,
                                bed_fpath, bam_fpath):
    # http://www.1000genomes.org/faq/what-depth-coverage-your-phase1-variants
    # bedtools intersect -a oncomine.vcf -b Exons.az_key.bed -header > oncomine.az_key.vcf
    # /opt/az/local/tabix/tabix-0.2.6/bgzip oncomine.az_key.vcf
    # /opt/az/local/tabix/tabix-0.2.6/tabix -h -p vcf oncomine.az_key.vcf.gz
    # samtools view -b TRF004223.sorted.bam -L Exons.az_key.bed | bedtools genomecov -ibam stdin -bg > coverage.bg
    # bedtools intersect -a oncomine.az_key.vcf.gz -b coverage.bg -wa | cut -f1,2,4,5,8,11,12,13,14 > oncomine.az_key.depth_numbers.vcf

    sambamba = get_system_path(cnf,
                               join(get_ext_tools_dirname(), 'sambamba'),
                               is_critical=True)
    bedtools = get_system_path(cnf, 'bedtools')

    info()
    info('Depth of coverage for regions in BED ' + bed_fpath)
    cov_bg = join(cnf.work_dir, 'coverage.bg')
    cmdline = '{sambamba} view -f bam -t {cnf.threads} -L {bed_fpath} {bam_fpath} | {bedtools} genomecov -ibam stdin -bg'.format(
        **locals())
    call(cnf, cmdline, output_fpath=cov_bg, exit_on_error=False)

    info()
    info('Intersecting depth regions with VCF ' + clipped_gz_vcf_fpath)
    vcf_depth_numbers_fpath = join(cnf.work_dir, 'vcf_bg.intersect')
    if not cnf.reuse_intermediate or not verify_file(
            vcf_depth_numbers_fpath, silent=True, is_critical=False):
        cmdline = '{bedtools} intersect -a {clipped_gz_vcf_fpath} -b {cov_bg} -wao'.format(
            **locals())
        res = call(cnf,
                   cmdline,
                   output_fpath=vcf_depth_numbers_fpath,
                   exit_on_error=False)
    # if res != oncomine_depth_numbers_fpath:
    #     info()
    #     info('Trying with uncompressed VCF')
    #     cmdline = 'gunzip {vcf_fpath} -c | {bedtools} intersect -a - -b {cov_bg} -wao | cut -f1,2,4,5,8,11,12,13,14,15'.format(**locals())
    #     call(cnf, cmdline, output_fpath=oncomine_depth_numbers_fpath)

    depths_per_var = defaultdict(list)
    with open(vcf_depth_numbers_fpath) as f:
        for l in f:
            # 1,2,4,5,8,11,12,13,14,15,16,17,18,19,20
            # c,p,r,a,f,ch,st,en,ge,ex,st,ft,bt,de,ov
            fs = l.replace('\n', '').split('\t')
            chrom, pos, _, ref, alt = fs[:5]
            depth, overlap = fs[-2:]
            var = var_by_site.get((chrom, pos, ref, alt))
            if var and depth != '.':
                depth, overlap = int(depth), int(overlap)
                for i in range(overlap):
                    depths_per_var[(chrom, pos, ref, alt)].append(depth)

    # Getting average depth of coverage of each variant (exactly for those parts that were in BED)
    depth_by_var = {
        var: (sum(depths) / len(depths)) if len(depths) != 0 else None
        for var, depths in depths_per_var.iteritems()
    }

    return depth_by_var
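# Since the intersection loop appends one copy of `depth` per overlapping base,
# depth_by_var holds a per-variant mean weighted by overlap length. A worked
# mini-example (under Python 2, the integer division truncates the mean):
depths_per_var = {('chr1', '100', 'A', 'T'): [10, 10, 30]}  # 2 bp at 10x + 1 bp at 30x
depth_by_var = dict((var, (sum(ds) / len(ds)) if ds else None)
                    for var, ds in depths_per_var.iteritems())
assert depth_by_var[('chr1', '100', 'A', 'T')] == 16  # 50 / 3 truncated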