Exemple #1
0
    def find_germline_vcf(self, silent=False, caller=None):
        """
        Looks for the germline VCF of the first normal sample and records it in `self.germline_vcf`.

        Two file names are probed in the project date directory, in this order:
        `<normal-name>-germline-<caller>.vcf.gz` and
        `<normal-name>-germline-<caller>-annotated.vcf.gz` (the naming the log messages
        attribute to bcbio < v1.1.6).

        :param silent: when true, no info or warning messages are emitted
        :param caller: germline caller name; defaults to `self.germline_caller`
        :return: None. `self.germline_vcf` is assigned only when one of the files exists
        """
        caller = caller or self.germline_caller
        if not caller:
            if not silent:
                warn(f'Batch {self.name} has no variant caller info assigned, skipping finding germline VCF')
            return

        date_dir = self.parent_project.date_dir
        normal_name = self.normals[0].name
        # Both candidates live in the date directory; only the file name differs.
        candidates = [
            # naming used starting from bcbio 1.1.6, ~ Dec 2019
            (adjust_path(join(date_dir, f'{normal_name}-germline-{caller}.vcf.gz')),
             f'<date-dir>/<normal-name>-germline-{caller}.vcf.gz'),
            # naming used by bcbio before 1.1.6
            (adjust_path(join(date_dir, f'{normal_name}-germline-{caller}-annotated.vcf.gz')),
             f'<date-dir>/<normal-name>-germline-{caller}-annotated.vcf.gz (bcbio < v1.1.6)'),
        ]

        for vcf_fpath, label in candidates:
            if isfile(vcf_fpath):
                # The file exists; verify_file is still called in critical mode, as before.
                verify_file(vcf_fpath, is_critical=True)
                if not silent:
                    info(f'Found germline VCF in {label}: ' + vcf_fpath)
                self.germline_vcf = vcf_fpath
                return

        if not silent:
            warn(f'Could not find germline variants files for batch {self.name}, caller {caller} neither as '
                 f'<date-dir>/<normal-name>-germline-{caller}.vcf.gz, nor as '
                 f'<date-dir>/<normal-name>-germline-{caller}-annotated.vcf.gz (bcbio < v1.1.6)')
Exemple #2
0
    def find_bam(self, silent=False):
        """
        Locates the alignment file of this sample.

        Candidates `<name>-ready.cram`, `<name>-ready.bam` and `<name>-sort.bam` in the sample
        directory are tried in that order. If none passes `verify_file`, the first entry of
        `self.sample_info['files']` is used when it is a `.bam` path (a relative path is
        resolved against the project work directory).

        :param silent: when true, the final "not found" warning is suppressed
        :return: path of the file found, or None
        """
        prefix = self.get_name_for_files()

        for suffix in ('-ready.cram', '-ready.bam', '-sort.bam'):
            candidate = adjust_path(join(self.dirpath, prefix + suffix))
            if verify_file(candidate):
                return candidate

        # Fall back to the input recorded in the sample info (a single path or a list of paths).
        source = self.sample_info['files']
        if not isinstance(source, str):
            source = source[0]
        if isinstance(source, str) and source.endswith('.bam'):
            debug('Bcbio was run from BAM input')
            if not source.startswith('/'):
                source = abspath(join(self.bcbio_project.work_dir, source))
            if verify_file(source):
                debug('Using BAM file from input YAML ' + source)
                return source
            debug('Input BAM file for sample ' + self.name + ' in YAML ' + source + ' does not exist')

        if not silent:
            warn('No BAM or CRAM file found for ' + self.name)
        return None
Exemple #3
0
def get_canonical_transcripts_ids(genome):
    """
    Builds a mapping from the first column to the second column of the canonical
    transcripts table (named gene name -> transcript ID in the code) for a genome build.

    The build name is cut at the first '-', and GRCh37*/GRCh38* are mapped to hg19/hg38.
    Entries from `canon_cancer_replacement.txt`, when that file passes `verify_file`,
    override entries of the main table.

    :param genome: genome build name, e.g. 'hg19' or 'GRCh37-...'
    :return: dict, or None when the main canonical transcripts file is not verified
    """
    build = genome.split('-')[0]
    if build.startswith('GRCh37'):
        build = 'hg19'
    elif build.startswith('GRCh38'):
        build = 'hg38'
    check_genome(build)

    canon_fpath = _get(join('{genome}', 'canon_transcripts_{genome}_ensembl.txt'), build)
    replacement_fpath = _get('canon_cancer_replacement.txt')

    canon_fpath = verify_file(canon_fpath, description='Canonical transcripts path')
    replacement_fpath = verify_file(replacement_fpath, description='Canonical cancer transcripts replacement path')
    if not canon_fpath:
        return None

    def _read_pairs(fpath):
        # Each line is expected to hold exactly two tab-separated fields.
        with open(fpath) as table:
            return [row.strip('\n').split('\t') for row in table]

    canon_tx_by_gname = dict(_read_pairs(canon_fpath))
    if replacement_fpath:
        canon_tx_by_gname.update(_read_pairs(replacement_fpath))
    return canon_tx_by_gname
Exemple #4
0
def sort_bed_gsort(input_bed_fpath,
                   output_bed_fpath=None,
                   work_dir=None,
                   fai_fpath=None,
                   genome=None):
    """
    Sorts a BED file by running the external `gsort` command against a .fai index.

    :param input_bed_fpath: BED file to sort; checked with `verify_bed` in critical mode
    :param output_bed_fpath: result path; when omitted, derived with `intermediate_fname`
                             from `work_dir` and the input path using the 'sorted' tag
    :param work_dir: directory handed to `intermediate_fname` and `file_transaction`
    :param fai_fpath: index file passed to `gsort`; takes precedence over `genome`
    :param genome: genome build name used to look up the index via `ref.get_fai`
    :return: path to the sorted BED file
    """
    input_bed_fpath = verify_bed(input_bed_fpath, is_critical=True)
    if output_bed_fpath:
        output_bed_fpath = adjust_path(output_bed_fpath)
    else:
        output_bed_fpath = intermediate_fname(work_dir, input_bed_fpath, 'sorted')

    debug('Sorting regions in ' + str(input_bed_fpath))
    if can_reuse(output_bed_fpath, input_bed_fpath):
        debug(output_bed_fpath + ' exists, reusing')
        return output_bed_fpath

    if not fai_fpath and not genome:
        critical('Either of fai_fpath or genome build name must be specified')
    elif fai_fpath:
        fai_fpath = verify_file(fai_fpath)
    else:
        fai_fpath = verify_file(ref.get_fai(genome))

    gsort_cmd = f'gsort {input_bed_fpath} {fai_fpath}'
    with file_transaction(work_dir, output_bed_fpath) as tx:
        run(gsort_cmd, output_fpath=tx)

    return output_bed_fpath
Exemple #5
0
    def find_bam(self, silent=False):
        """
        Returns the path to the BAM/CRAM file of this sample, or None.

        Looks in the sample directory for `<name>-ready.bam`, `<name>-ready.cram` and
        `<name>-sort.bam` (in that order), then falls back to the first path listed under
        `self.sample_info['files']` if it ends with `.bam`.

        :param silent: when true, do not warn if nothing is found
        :return: path of the first candidate accepted by `verify_file`, otherwise None
        """
        stem = self.get_name_for_files()
        # Lazy on purpose: each candidate path is built right before it is checked.
        candidates = (adjust_path(join(self.dirpath, stem + suffix))
                      for suffix in ('-ready.bam', '-ready.cram', '-sort.bam'))
        for candidate in candidates:
            if verify_file(candidate):
                return candidate

        yaml_input = self.sample_info['files']
        if not isinstance(yaml_input, str):
            # Not a plain string: take the first element.
            yaml_input = yaml_input[0]
        if isinstance(yaml_input, str) and yaml_input.endswith('.bam'):
            debug('Bcbio was run from BAM input')
            if not yaml_input.startswith('/'):
                # Relative input paths are resolved against the project work directory.
                yaml_input = abspath(join(self.parent_project.work_dir, yaml_input))
            if verify_file(yaml_input):
                debug('Using BAM file from input YAML ' + yaml_input)
                return yaml_input
            debug('Input BAM file for sample ' + self.name + ' in YAML ' + yaml_input + ' does not exist')

        if not silent:
            warn('No BAM or CRAM file found for ' + self.name)
        return None
Exemple #6
0
def load_yaml_config(fpath):
    """
    Loads a bcbio YAML configuration file.

    :param fpath: path to the YAML file; checked with `verify_file` in critical mode
    :return: whatever `_load_yaml` returns for the file. On any exception the traceback
             is logged with `err` and `critical` is called with a parse-failure message
    """
    verify_file(fpath, is_critical=True)
    try:
        return _load_yaml(fpath)
    except Exception:
        err(format_exc())
        critical('Could not parse bcbio YAML ' + fpath)
Exemple #7
0
def load_yaml_config(fpath):
    """
    Loads a bcbio YAML configuration file.

    :param fpath: path to the YAML file; checked with `verify_file` in critical mode
    :return: the object produced by `load_yaml`. On any exception the traceback is
             logged with `err` and `critical` is called with a parse-failure message
    """
    verify_file(fpath, is_critical=True)
    try:
        # Open through a context manager so the handle is closed even if parsing fails.
        with open(fpath) as yaml_file:
            dic = load_yaml(yaml_file)
    except Exception:
        err(format_exc())
        critical('Could not parse bcbio YAML ' + fpath)
    else:
        return dic
Exemple #8
0
    def find_qc_files(self, dst_dir, exclude_files=None, include_files=None):
        """
        Parses bcbio MultiQC file list and collects all QC files belonging to this batch

        :param dst_dir: destination directory, passed on to `_extract_qc_file`
        :param exclude_files: regex pattern or list of patterns; listed files matching any
                              of them are skipped
        :param include_files: regex pattern or list of patterns; when set, only listed files
                              matching at least one of them are kept
        :return: list of the values returned by `_extract_qc_file` for the files it located
                 (presumably paths of the copies under `dst_dir` - confirm against the helper)
        """

        mq_dir = join(self.parent_project.date_dir, 'multiqc')
        mq_filelist = join(mq_dir, 'list_files_final.txt')
        verify_file(mq_filelist, is_critical=True)

        # A single pattern may be given as a plain string; normalize once rather than per line.
        # Empty values are left untouched so that they keep meaning "no filter".
        if exclude_files and isinstance(exclude_files, str):
            exclude_files = [exclude_files]
        if include_files and isinstance(include_files, str):
            include_files = [include_files]

        qc_files_not_found = []
        qc_files_found = []
        tar_f_by_fp = dict()
        tar = None
        try:
            # Cromwell?
            cwl_targz = join(mq_dir, 'multiqc-inputs.tar.gz')
            if isfile(cwl_targz):
                info(f'Found CWL MultiQC output {cwl_targz}, extracting required QC files from the archive')
                tar = tarfile.open(cwl_targz)
                for member in tar.getmembers():
                    rel_fp = member.name
                    # Key the members by their path relative to the MultiQC folder inside the archive.
                    if 'call-multiqc_summary/execution/qc/multiqc/' in rel_fp:
                        rel_fp = rel_fp.split('call-multiqc_summary/execution/qc/multiqc/')[1]
                    tar_f_by_fp[rel_fp] = tar.extractfile(member)

            with open(mq_filelist) as inp:
                for fp in [l.strip() for l in inp if l.strip()]:
                    if fp == 'trimmed' or fp.endswith('/trimmed'):
                        continue  # back-compatibility with bcbio
                    if exclude_files and any(re.search(ptn, fp) for ptn in exclude_files):
                        continue
                    if include_files and not any(re.search(ptn, fp) for ptn in include_files):
                        continue

                    new_fp = _extract_qc_file(fp, dst_dir, self.parent_project.final_dir, tar_f_by_fp)
                    if new_fp:
                        qc_files_found.append(new_fp)
                    else:
                        qc_files_not_found.append(fp)
        finally:
            # The member handles are only consumed by _extract_qc_file inside the loop above,
            # so the archive can be released here.
            if tar is not None:
                tar.close()

        if qc_files_not_found:
            warn('-')
            warn(f'Some QC files from list {mq_filelist} were not found:' +
                ''.join('\n  ' + fpath for fpath in qc_files_not_found))
        return qc_files_found
Exemple #9
0
def sort_bed(input_bed_fpath,
             output_bed_fpath=None,
             work_dir=None,
             fai_fpath=None,
             chr_order=None,
             genome=None):
    """
    Sorts the regions of a BED file in Python and writes them to a new file.

    Lines starting with '#' are copied to the output first, in input order; blank lines are
    dropped. The remaining lines are parsed into `Region` objects and written sorted by
    `Region.get_key()`.

    :param input_bed_fpath: BED file to sort; checked with `verify_bed` in critical mode
    :param output_bed_fpath: result path; when omitted, derived with `intermediate_fname`
    :param work_dir: directory handed to `intermediate_fname` and `file_transaction`
    :param fai_fpath: .fai index used to derive the chromosome order when `chr_order` is not given
    :param chr_order: mapping chromosome name -> order; chromosomes missing from it get -1
    :param genome: genome build name used to find the index when `fai_fpath` is not given
    :return: path to the sorted BED file
    """
    input_bed_fpath = verify_bed(input_bed_fpath, is_critical=True)
    if output_bed_fpath:
        output_bed_fpath = adjust_path(output_bed_fpath)
    else:
        output_bed_fpath = intermediate_fname(work_dir, input_bed_fpath, 'sorted')

    debug('Sorting regions in ' + str(input_bed_fpath))
    if can_reuse(output_bed_fpath, input_bed_fpath):
        debug(output_bed_fpath + ' exists, reusing')
        return output_bed_fpath

    if not chr_order:
        if fai_fpath:
            fai_fpath = verify_file(fai_fpath)
        elif genome:
            fai_fpath = verify_file(ref.get_fai(genome))
        else:
            critical(
                'Either of chr_order, fai_fpath, or genome build name must be specified'
            )
        chr_order = get_chrom_order(fai_fpath=fai_fpath)

    regions = []
    with open(input_bed_fpath) as bed_in, \
            file_transaction(work_dir, output_bed_fpath) as tx:
        with open(tx, 'w') as bed_out:
            for raw_line in bed_in:
                stripped = raw_line.strip()
                if not stripped:
                    continue
                if stripped.startswith('#'):
                    # Header/comment lines are passed through untouched.
                    bed_out.write(raw_line)
                    continue

                cols = stripped.split('\t')
                regions.append(Region(
                    cols[0], int(cols[1]), int(cols[2]), cols[3:],
                    chr_order.get(cols[0], -1)))

            regions.sort(key=lambda r: r.get_key())
            for region in regions:
                fields = [region.chrom, str(region.start), str(region.end),
                          *region.other_fields]
                bed_out.write('\t'.join(fields) + '\n')

    debug('Sorted ' + str(len(regions)) + ' regions, saved to ' +
          output_bed_fpath)
    return output_bed_fpath
Exemple #10
0
def run(cmd, output_fpath=None, input_fpaths=None, checks=None, stdout_to_outputfile=True,
        stdout_tx=True, reuse=False, env_vars=None):
    """Run the provided command, logging details and checking for errors.

    :param cmd: command to run; must be a string when `output_fpath` and `stdout_tx` are set,
                because the transactional output path is spliced into it
    :param output_fpath: expected output file. An existing file at this path is removed
                         before the command runs (unless it is reused, see `reuse`)
    :param input_fpaths: a path or a list of paths checked with `verify_file` in critical mode
    :param checks: checks forwarded to `_do_run`; defaults to `[file_nonempty_check]`
    :param stdout_to_outputfile: when true, ` > <tx file>` is appended to the command; otherwise
                                 occurrences of `output_fpath` in the command (bare, double- or
                                 single-quoted, delimited by spaces or the end of the command)
                                 are replaced with the transactional path
    :param stdout_tx: when true, the output is written through `file_transaction`
    :param reuse: when true and `output_fpath` (or `output_fpath + '.gz'`) already passes
                  `verify_file`, nothing is run and `output_fpath` is returned
    :param env_vars: forwarded to `_get_env` to build the environment for `_do_run`
    :return: `output_fpath` when an existing output is reused, otherwise None
    """
    if output_fpath and reuse:
        if verify_file(output_fpath, silent=True):
            info(output_fpath + ' exists, reusing')
            return output_fpath
        if not output_fpath.endswith('.gz') and verify_file(output_fpath + '.gz', silent=True):
            info(output_fpath + '.gz exists, reusing')
            return output_fpath

    if input_fpaths is not None:
        if isinstance(input_fpaths, str):
            input_fpaths = [input_fpaths]
        for fpath in input_fpaths:
            verify_file(fpath, is_critical=True)

    env = _get_env(env_vars)

    if checks is None:
        checks = [file_nonempty_check]

    def _try_run(_cmd, _output_fpath, _input_fpaths):
        # Log the command line, then execute; exceptions propagate to the caller.
        info(_cmd if isinstance(_cmd, str) else ' '.join(str(x) for x in _cmd))
        _do_run(_cmd, checks, env, _output_fpath, _input_fpaths)

    if not output_fpath:
        _try_run(cmd, None, input_fpaths)
        return

    if isfile(output_fpath):
        os.remove(output_fpath)

    if not stdout_tx:
        _try_run(cmd, output_fpath, input_fpaths)
        return

    with file_transaction(None, output_fpath) as tx_out_file:
        if stdout_to_outputfile:
            cmd += ' > ' + tx_out_file
        else:
            # Temporary trailing newline lets a path at the very end of the command be matched.
            cmd += '\n'
            quotes = ('', '"', '\'')
            # Keep BOTH quotes around the substituted path: the previous implementation
            # dropped the opening one and produced an unbalanced quote in the command.
            for q in quotes:
                cmd = cmd.replace(' ' + q + output_fpath + q + ' ',
                                  ' ' + q + tx_out_file + q + ' ')
            for q in quotes:
                cmd = cmd.replace(' ' + q + output_fpath + q + '\n',
                                  ' ' + q + tx_out_file + q)
            cmd = cmd.replace('\n', '')
        _try_run(cmd, tx_out_file, input_fpaths)
Exemple #11
0
    def find_sv_vcf(self, silent=False, caller=False):
        """
        Looks for the structural variants VCF of this batch and records it in `self.sv_vcf`.

        Candidates are probed in this order, the first existing file wins:
          1. `<tumor-dir>/<batch>-sv-prioritize-<caller>.vcf.gz`
          2. `<tumor-dir>/<batch>-<caller>.vcf.gz`
          3. `<date-dir>/<tumor-name>-<caller>-prioritized.vcf.gz` (CWL layout)
          4. `<date-dir>/<tumor-name>-<caller>.vcf.gz` (CWL layout)

        :param silent: when true, no info or warning messages are emitted
        :param caller: SV caller name; a falsy value falls back to `self.sv_caller`
        :return: None. `self.sv_vcf` is assigned only when one of the files exists
        """
        caller = caller or self.sv_caller

        tumor = self.tumors[0]
        date_dir = self.parent_project.date_dir
        candidates = [
            (join(tumor.dirpath, f'{self.name}-sv-prioritize-{caller}.vcf.gz'),
             f'<tumor>/<batch>-sv-prioritize-{caller}.vcf.gz'),
            (join(tumor.dirpath, f'{self.name}-{caller}.vcf.gz'),
             f'<tumor>/<batch>-{caller}.vcf.gz'),
            # CWL?
            (join(date_dir, f'{tumor.name}-{caller}-prioritized.vcf.gz'),
             f'<date-dir>/<tumor-name>-{caller}-prioritized.vcf.gz'),
            (join(date_dir, f'{tumor.name}-{caller}.vcf.gz'),
             f'<date-dir>/<tumor-name>-{caller}.vcf.gz'),
        ]

        for sv_fpath, label in candidates:
            if isfile(sv_fpath):
                verify_file(sv_fpath, is_critical=True)
                if not silent:
                    # Always report the path that was actually found.
                    info(f'Found SV VCF in {label}: ' + sv_fpath)
                # Every branch stores into sv_vcf; the CWL prioritized branch used to write
                # to an unrelated `sv_cwl_prio` attribute, leaving sv_vcf unset.
                self.sv_vcf = sv_fpath
                return

        if not silent:
            warn(f'Could not find SV VCF file for batch {self.name}, caller {caller} neither under sample folder as '
                 f'<tumor>/<batch>(-sv-prioritize)-{caller}.vcf.gz (conventional bcbio), '
                 f'nor in the project folder as project/<tumor>-{caller}(-prioritized).vcf.gz (CWL bcbio).')
Exemple #12
0
def get_chrom_lengths(genome=None, fai_fpath=None):
    """
    Returns chromosome names with their lengths, in file order.

    :param genome: genome build name; used to look up the index via `get_fai` when
                   `fai_fpath` is not given
    :param fai_fpath: path to a `.fai` index or a `.fa` sequence file. For `.fa` the lengths
                      are computed from the sequences with Biopython's `SeqIO`; otherwise the
                      first two whitespace-separated columns of each non-blank line are read
    :return: list of `(chrom, length)` tuples
    :raises AssertionError: when neither `genome` nor `fai_fpath` is provided
    """
    assert genome or fai_fpath, f'One of genome or fai_fpath should be not None: genome={genome}, fai_fpath={fai_fpath}'

    if fai_fpath:
        fai_fpath = verify_file(fai_fpath, is_critical=True)
        if not fai_fpath.endswith(('.fai', '.fa')):
            critical('Error: .fai or .fa is accepted.')
    else:
        check_genome(genome)
        fai_fpath = get_fai(genome)

    if fai_fpath.endswith('.fa'):
        debug('Reading genome sequence (.fa) to get chromosome lengths')
        with open(fai_fpath, 'r') as handle:
            from Bio import SeqIO
            return [(record.id, len(record.seq))
                    for record in SeqIO.parse(handle, 'fasta')]

    debug('Reading genome index file (.fai) to get chromosome lengths')
    chr_lengths = []
    with open(fai_fpath, 'r') as handle:
        for row in handle:
            cols = row.split()
            if cols:  # skips blank and whitespace-only lines
                chr_lengths.append((cols[0], int(cols[1])))
    return chr_lengths
Exemple #13
0
 def find_multiqc_report(self):
     """
     Returns the path of the MultiQC report in the date directory, or None.

     Two locations are tried in order: `<date_dir>/<BcbioProject.multiqc_report_name>` and
     `<date_dir>/multiqc_postproc/multiqc_report.html`. The first one accepted by
     `verify_file` (called silently) is returned.
     """
     candidates = (
         join(self.date_dir, BcbioProject.multiqc_report_name),
         join(self.date_dir, 'multiqc_postproc', 'multiqc_report.html'),
     )
     return next((path for path in candidates if verify_file(path, silent=True)), None)
Exemple #14
0
def canon_transcript_per_gene(genome, only_principal=False, use_gene_id=False):
    """
    Reads the APPRIS table and returns the most confident transcripts per gene.

    By default the result is a dict of lists: the PRINCIPAL transcript first, followed by
    the ALTERNATIVE ones. With only_principal=True the result is a dict of str holding
    just the PRINCIPAL transcript of each gene.

    :param genome: genome build name; cut at the first '-', GRCh37*/GRCh38* map to hg19/hg38
    :param only_principal: return a single transcript per gene instead of a list
    :param use_gene_id: key the result by the second column (gene ID) instead of the
                        first one (gene name)
    """
    build = genome.split('-')[0]
    if build.startswith('GRCh37'):
        build = 'hg19'
    elif build.startswith('GRCh38'):
        build = 'hg38'
    check_genome(build)

    appris_fpath = _get_ensembl_file('appris_data.principal.txt', build)
    appris_fpath = verify_file(appris_fpath, is_critical=True, description='APPRIS file path')

    principal = {}
    alternatives = defaultdict(list)
    with open(appris_fpath) as appris:
        for row in appris:
            # Exactly five tab-separated columns are expected per line.
            gene, geneid, enst, _ccds, label = row.strip().split('\t')
            key = geneid if use_gene_id else gene
            if 'PRINCIPAL' in label:
                # A later PRINCIPAL line for the same key overwrites the earlier one.
                principal[key] = enst
            elif not only_principal and 'ALTERNATIVE' in label:
                alternatives[key].append(enst)

    if only_principal:
        return principal
    return {key: [tx] + alternatives[key] for key, tx in principal.items()}
Exemple #15
0
    def _set_name_and_paths(self, name, variantcallers_data, ensemble=False, silent=False):
        """
        Initializes the sample name and the paths derived from it.

        Sets `raw_name`, `name` (dots replaced with underscores), `dirpath`, `var_dirpath` and
        `bam`. For RNA-seq samples `counts_file` is set when `<name>-ready.counts` is found;
        otherwise variant files are set up through `_set_variant_files`.

        :param name: sample name as given by the caller
        :param variantcallers_data: variant callers configuration; when falsy, VCF lookup is skipped
        :param ensemble: forwarded to `_set_variant_files`
        :param silent: suppresses warnings; also turns a missing sample directory into a
                       `False` return instead of a `critical` call
        :return: False when the sample directory is missing and `silent` is set, True otherwise
        """
        self.raw_name = name
        self.name = self.raw_name.replace('.', '_')
        # NOTE(review): the directory is verified twice and the first call ignores `silent` -
        # kept as is to preserve the logging behavior.
        self.dirpath = verify_dir(join(self.bcbio_project.final_dir, self.name))
        if not verify_dir(self.dirpath, silent=silent):
            if silent:
                return False
            critical(f'Sample "{self.name}" specified in bcbio YAML is not found in the final directory '
                     f'{self.bcbio_project.final_dir}. Please check consistency between the YAML '
                     f'{self.bcbio_project.bcbio_yaml_fpath} and the directories in `final`: '
                     f'to every "description" value in YAML, there should be a corresponding folder with the '
                     f'same name in `final`. You can use `-e` option to exclude samples (comma-separated) '
                     f'from consideration, if you are sure that missing folders are expected.')
        self.var_dirpath = join(self.dirpath, BcbioProject.var_dir)

        self.bam = self.find_bam(silent=silent)

        if not self.is_rnaseq:
            if variantcallers_data:
                self._set_variant_files(variantcallers_data, ensemble=ensemble)
            elif not silent:
                warn('No variant callers set in config, skipping finding VCF files')
            return True

        gene_counts = adjust_path(join(self.dirpath, self.get_name_for_files() + '-ready.counts'))
        if isfile(gene_counts) and verify_file(gene_counts):
            self.counts_file = gene_counts
        elif not silent:
            warn('Counts for ' + self.name + ' not found')
        return True
Exemple #16
0
 def find_mutation_file(self, passed=True, caller=None):
     """
     Returns the verified path of the mutations file for a caller.

     The file is looked up as `<dirpath>/<BcbioProject.varfilter_dir>/<caller>.<vf.mut_file_ext>`;
     with `passed` set, `vf.mut_pass_suffix` is added to the name through `add_suffix`.

     :param passed: look for the file carrying the "pass" suffix
     :param caller: caller name; defaults to the project's somatic caller
     :return: result of `verify_file` called silently on the resulting path
     """
     if not caller:
         caller = self.bcbio_project.somatic_caller
     fname = caller + '.' + vf.mut_file_ext
     fpath = join(self.dirpath, BcbioProject.varfilter_dir, fname)
     if passed:
         fpath = add_suffix(fpath, vf.mut_pass_suffix)
     return verify_file(fpath, silent=True)
Exemple #17
0
 def find_coverage_stats(self):
     """
     Returns the verified path to `<dirpath>/qc/coverage/<name>_coverage.bed`.

     For samples whose phenotype is 'germline', a trailing `-germline` is stripped from
     both the sample name and the directory path before the file path is built.

     :return: result of `verify_file` called silently on that path
     """
     sample, folder = self.name, self.dirpath
     if self.phenotype == 'germline':
         germline_suffix = re.compile(r'-germline$')
         sample = germline_suffix.sub('', sample)
         folder = germline_suffix.sub('', folder)
     bed_fpath = join(folder, 'qc', 'coverage', sample + '_coverage.bed')
     return verify_file(bed_fpath, silent=True)
Exemple #18
0
def read_samples(args):
    """
    Splits input paths into BAM inputs and FastQ pairs, both grouped by sample.

    :param args: iterable of input paths
    :return: tuple `(fastqs_by_sample, bam_by_sample)`; `critical` is called when no usable
             input is found or when a sample has both BAM and FastQ inputs
    """
    bam_by_sample = find_bams(args)
    if bam_by_sample:
        n_bams = len(bam_by_sample)
        info('Found ' + str(n_bams) + ' BAM file' +
             ('s' if n_bams > 1 else ''))

    # NOTE(review): membership is tested against the KEYS of bam_by_sample - confirm that
    # find_bams keys its result by path rather than by sample name.
    input_not_bam = []
    for fpath in args:
        if adjust_path(fpath) in bam_by_sample:
            continue
        verified = verify_file(fpath)
        if verified:
            input_not_bam.append(verified)

    if not input_not_bam and not bam_by_sample:
        critical('No correct input files')

    fastqs_by_sample = dict()
    if input_not_bam:
        info('Input ' + str(len(input_not_bam)) +
             ' correct input non-BAM files')
        fastqs_by_sample = find_fastq_pairs(input_not_bam)
        if fastqs_by_sample:
            info('Found ' + str(len(fastqs_by_sample)) + ' FastQ pairs')
        both_kinds = set(fastqs_by_sample) & set(bam_by_sample)
        if both_kinds:
            critical('The following samples both had input BAMs and FastQ: ' +
                     ', '.join(both_kinds))

    return fastqs_by_sample, bam_by_sample
Exemple #19
0
    def _set_name_and_paths(self, name, variantcallers_data, ensemble=False, silent=False):
        """
        Initializes the sample name and the paths derived from it.

        Sets `raw_name`, `name` (dots replaced with underscores), `rgid`, `dirpath` and `bam`.
        For RNA-seq samples `counts_file` is set when `<name>-ready.counts` is found; otherwise
        variant callers are set up through `_set_variant_callers`.

        :param name: sample name as given by the caller
        :param variantcallers_data: variant callers configuration; when falsy, VCF lookup is skipped
        :param ensemble: forwarded to `_set_variant_callers`
        :param silent: suppresses warnings (a missing sample directory is still critical)
        """
        project = self.parent_project

        self.raw_name = name
        self.name = self.raw_name.replace('.', '_')
        self.rgid = self.name
        # NOTE(review): the directory is verified twice and the first call ignores `silent` -
        # kept as is to preserve the logging behavior.
        self.dirpath = verify_dir(join(project.final_dir, self.name))
        if not verify_dir(self.dirpath, silent=silent):
            critical(f'Sample "{self.name}" specified in bcbio YAML is not found in the final directory '
                     f'{project.final_dir}. Please check consistency between the YAML '
                     f'{project.bcbio_yaml_fpath} and the directories in `final`: '
                     f'to every "description" value in YAML, there should be a corresponding folder with the '
                     f'same name in `final`. You can use `-e` option to exclude samples (comma-separated) '
                     f'from consideration, if you are sure that missing folders are expected.')

        self.bam = self.find_bam(silent=silent)

        if not self.is_rnaseq:
            if variantcallers_data:
                self._set_variant_callers(variantcallers_data, ensemble=ensemble)
            elif not silent:
                warn('No variant callers set in config, skipping finding VCF files')
            return

        gene_counts = adjust_path(join(self.dirpath, self.get_name_for_files() + '-ready.counts'))
        if isfile(gene_counts) and verify_file(gene_counts):
            self.counts_file = gene_counts
        elif not silent:
            warn('Counts for ' + self.name + ' not found')
Exemple #20
0
def main(bcbio_dir, bed, depth, threads=None, isdebug=True):
    """
    Loads a bcbio project and runs `genotype` on its samples.

    :param bcbio_dir: bcbio project directory, loaded with `BcbioProject.load_from_bcbio_dir`
    :param bed: SNP file; passed through `verify_file` and handed to `genotype`
    :param depth: accepted for interface compatibility; not used by this function
    :param threads: number of threads; when the optional `az` module is importable, its
                    system config supplies the scheduler settings and a thread count fallback
    :param isdebug: passed to `log.init`
    """
    snp_file = verify_file(bed)

    log.init(isdebug)

    try:
        import az
    except ImportError:
        # No site-specific configuration available: only the thread count is set.
        parallel_cfg = ParallelCfg(threads=threads)
    else:
        site_cfg = az.init_sys_cfg()
        parallel_cfg = ParallelCfg(
            scheduler=site_cfg.get('scheduler'),
            queue=site_cfg.get('queue'),
            resources=site_cfg.get('resources'),
            threads=threads or site_cfg.get('threads'),
            tag='clearup')

    log.info('Loading bcbio project from ' + bcbio_dir)
    log.info('-' * 70)
    proj = BcbioProject()
    proj.load_from_bcbio_dir(bcbio_dir, proc_name='clearup')
    log.info('Loaded ' + proj.final_dir)

    subdir = 'clearup'
    log_dir = safe_mkdir(join(proj.log_dir, subdir))
    work_dir = safe_mkdir(join(proj.work_dir, subdir))
    out_dir = safe_mkdir(join(proj.date_dir, subdir))

    n_samples = len(proj.samples)
    with parallel_view(n_samples, parallel_cfg, log_dir) as parall_view:
        genotype(proj.samples, snp_file, parall_view, work_dir, out_dir, proj.genome_build)
def canon_transcript_per_gene(genome, only_principal=False, use_gene_id=False):
    """
    Returns the most confident transcripts per gene according to the APPRIS table.

    Default result: dict of lists, the PRINCIPAL transcript first and the ALTERNATIVE ones
    after it. With only_principal=True: dict of str with just the PRINCIPAL transcript.

    :param genome: genome build name; cut at the first '-', GRCh37*/GRCh38* map to hg19/hg38
    :param only_principal: return one transcript per gene instead of a list
    :param use_gene_id: key the result by gene ID (second column) rather than gene name (first)
    """
    short_genome = genome.split('-')[0]
    for prefix, ucsc_name in (('GRCh37', 'hg19'), ('GRCh38', 'hg38')):
        if short_genome.startswith(prefix):
            short_genome = ucsc_name
    check_genome(short_genome)

    fpath = verify_file(
        _get_ensembl_file('appris_data.principal.txt', short_genome),
        is_critical=True,
        description='APPRIS file path')

    key_column = 1 if use_gene_id else 0
    princ_per_gene = dict()
    alt_per_gene = defaultdict(list)
    with open(fpath) as appris_file:
        for record in appris_file:
            # Unpacking enforces exactly five tab-separated columns per line.
            gene, geneid, enst, _ccds, label = record.strip().split('\t')
            gene_key = (gene, geneid)[key_column]
            if 'PRINCIPAL' in label:
                princ_per_gene[gene_key] = enst
            elif not only_principal and 'ALTERNATIVE' in label:
                alt_per_gene[gene_key].append(enst)

    if only_principal:
        return princ_per_gene

    tx_per_gene = {}
    for gene_key, principal_tx in princ_per_gene.items():
        tx_per_gene[gene_key] = [principal_tx] + alt_per_gene[gene_key]
    return tx_per_gene
Exemple #22
0
def file_nonempty_check(output_fpath=None, input_fpaths=None):
    """
    Check that passes when no output is expected or when the output passes `verify_file`.

    :param output_fpath: output file to check; None means there is nothing to check
    :param input_fpaths: unused; present so that the check has the common check signature
    :return: True when `output_fpath` is None, otherwise the result of `verify_file`
             (an error is logged when that result is falsy)
    """
    if output_fpath is None:
        return True
    found = verify_file(output_fpath)
    if found:
        return found
    err(f'Did not find non-empty output file {output_fpath}')
    return found
Exemple #23
0
def lift_over(fpath, from_genome, to_genome):
    """
    Converts a file between genome builds by calling the external `liftOver` tool.

    The chain file is looked up next to this module as
    `over.chain/<from_genome>To<To_genome>.over.chain.gz` (target name title-cased).

    :param fpath: input file
    :param from_genome: source genome build name
    :param to_genome: target genome build name; also used as the suffix of the output file
    :return: output path, i.e. `add_suffix(fpath, to_genome)`; unmapped records go to
             `<output path>.unMapped`
    """
    chain_fname = f'{from_genome}To{to_genome.title()}.over.chain.gz'
    chain_file = join(dirname(__file__), 'over.chain', chain_fname)
    if not verify_file(chain_file):
        log.critical(f'Error: conversion from {from_genome} to {to_genome} is not supported!')

    out_fpath = add_suffix(fpath, to_genome)
    liftover_cmd = f'liftOver {fpath} {chain_file} {out_fpath} {out_fpath}.unMapped'
    call_process.run(liftover_cmd)
    return out_fpath
Exemple #24
0
def get_chrom_lengths(genome=None, fai_fpath=None):
    """
    Collects `(chromosome, length)` pairs in the order they appear in the reference.

    :param genome: genome build name, used to locate the index via `get_fai` when no
                   `fai_fpath` is passed
    :param fai_fpath: `.fai` index or `.fa` sequence file. A `.fa` file is parsed with
                      Biopython's `SeqIO`; any other file is read as an index where the
                      first two whitespace-separated columns are name and length
    :return: list of `(chrom, length)` tuples
    :raises AssertionError: when both `genome` and `fai_fpath` are falsy
    """
    assert genome or fai_fpath, f'One of genome or fai_fpath should be not None: genome={genome}, fai_fpath={fai_fpath}'

    if not fai_fpath:
        check_genome(genome)
        fai_fpath = get_fai(genome)
    else:
        fai_fpath = verify_file(fai_fpath, is_critical=True)
        accepted = fai_fpath.endswith('.fai') or fai_fpath.endswith('.fa')
        if not accepted:
            critical('Error: .fai or .fa is accepted.')

    is_fasta = fai_fpath.endswith('.fa')
    if is_fasta:
        debug('Reading genome sequence (.fa) to get chromosome lengths')
    else:
        debug('Reading genome index file (.fai) to get chromosome lengths')

    chr_lengths = []
    with open(fai_fpath, 'r') as handle:
        if is_fasta:
            from Bio import SeqIO
            for record in SeqIO.parse(handle, 'fasta'):
                chr_lengths.append((record.id, len(record.seq)))
        else:
            for raw in handle:
                fields = raw.strip().split()
                if not fields:
                    continue  # blank line
                chr_lengths.append((fields[0], int(fields[1])))

    return chr_lengths
Exemple #25
0
 def find_coverage_stats(self):
     """
     Looks up the coverage BED of this sample: `<dirpath>/qc/coverage/<name>_coverage.bed`.

     When the phenotype is 'germline', a trailing `-germline` is removed from the sample
     name and from the directory path first.

     :return: result of `verify_file` called silently on that path
     """
     sname, dirpath = self.name, self.dirpath
     if self.phenotype == 'germline':
         sname, dirpath = (re.sub(r'-germline$', '', value) for value in (sname, dirpath))
     coverage_bed = join(dirpath, 'qc', 'coverage', sname + '_coverage.bed')
     return verify_file(coverage_bed, silent=True)
Exemple #26
0
def run(cmd, output_fpath=None, input_fpath=None, checks=None, stdout_to_outputfile=True,
        stdout_tx=True, reuse=False, env_vars=None):
    """Run the provided command, logging details and checking for errors.

    :param cmd: command to run; must be a string when `output_fpath` and `stdout_tx` are set,
                because the transactional output path is spliced into it
    :param output_fpath: expected output file. An existing file at this path is removed
                         before the command runs (unless it is reused, see `reuse`)
    :param input_fpath: forwarded to `_do_run` unchanged
    :param checks: checks forwarded to `_do_run`; defaults to `[file_nonempty_check]`
    :param stdout_to_outputfile: when true, ` > <tx file>` is appended to the command; otherwise
                                 occurrences of `output_fpath` in the command (bare, double- or
                                 single-quoted, delimited by spaces or the end of the command)
                                 are replaced with the transactional path
    :param stdout_tx: when true, the output is written through `file_transaction`
    :param reuse: when true and `output_fpath` (or `output_fpath + '.gz'`) already passes
                  `verify_file`, nothing is run and `output_fpath` is returned
    :param env_vars: forwarded to `_get_env` to build the environment for `_do_run`
    :return: `output_fpath` when an existing output is reused, otherwise None
    """
    if output_fpath and reuse:
        if verify_file(output_fpath, silent=True):
            info(output_fpath + ' exists, reusing')
            return output_fpath
        if not output_fpath.endswith('.gz') and verify_file(output_fpath + '.gz', silent=True):
            info(output_fpath + '.gz exists, reusing')
            return output_fpath

    env = _get_env(env_vars)

    if checks is None:
        checks = [file_nonempty_check]

    def _try_run(_cmd, _output_fpath, _input_fpath):
        # Log the command line, then execute; exceptions propagate to the caller.
        info(_cmd if isinstance(_cmd, str) else ' '.join(str(x) for x in _cmd))
        _do_run(_cmd, checks, env, _output_fpath, _input_fpath)

    if not output_fpath:
        _try_run(cmd, None, input_fpath)
        return

    if isfile(output_fpath):
        os.remove(output_fpath)

    if not stdout_tx:
        _try_run(cmd, output_fpath, input_fpath)
        return

    with file_transaction(None, output_fpath) as tx_out_file:
        if stdout_to_outputfile:
            cmd += ' > ' + tx_out_file
        else:
            # Temporary trailing newline lets a path at the very end of the command be matched.
            cmd += '\n'
            quotes = ('', '"', '\'')
            # Keep BOTH quotes around the substituted path: the previous implementation
            # dropped the opening one and produced an unbalanced quote in the command.
            for q in quotes:
                cmd = cmd.replace(' ' + q + output_fpath + q + ' ',
                                  ' ' + q + tx_out_file + q + ' ')
            for q in quotes:
                cmd = cmd.replace(' ' + q + output_fpath + q + '\n',
                                  ' ' + q + tx_out_file + q)
            cmd = cmd.replace('\n', '')
        _try_run(cmd, tx_out_file, input_fpath)
Exemple #27
0
def main(input_bed, output_file, output_features=False, genome=None,
         only_canonical=False, short=False, extended=False, high_confidence=False,
         ambiguities_method=False, coding_only=False, collapse_exons=False, work_dir=False, is_debug=False):
    """ Annotating BED file based on reference features annotations.

    Validates the options, cleans the input BED with `clean_bed`, and delegates to `annotate`.

    :param input_bed: BED file to annotate
    :param output_file: path of the annotated file
    :param genome: genome build name (required)
    :param short: mutually exclusive with `extended` and `output_features`
    :param extended: forced on when `output_features` is set
    :param work_dir: base directory for intermediate files. A sub-directory named after the
                     input file is created in it. When not given, a temporary directory is
                     created and removed at the end
    :param is_debug: passed to the logger and to `annotate`
    The remaining options are forwarded to `annotate` unchanged.
    :raises click.BadParameter: on a missing genome, conflicting options, or an unreadable input BED
    """
    logger.init(is_debug_=is_debug)

    # `param_hint` takes the parameter name as a string; `param` expects a click.Parameter
    # object, so passing a string there would break the formatting of the error.
    if not genome:
        raise click.BadParameter('Error: please, specify genome build name with -g (e.g. `-g hg19`)',
                                 param_hint='genome')

    if short:
        if extended:
            raise click.BadParameter('--short and --extended can\'t be set both', param_hint='extended')
        if output_features:
            raise click.BadParameter('--short and --output-features can\'t be set both',
                                     param_hint='output_features')
    elif output_features or extended:
        extended = True
        short    = False

    if not verify_file(input_bed):
        # The exception must be raised to have any effect.
        raise click.BadParameter(f'Usage: {__file__} Input_BED_file -g hg19 -o Annotated_BED_file [options]',
                                 param_hint='input_bed')
    input_bed = verify_file(input_bed, is_critical=True, description=f'Input BED file for {__file__}')

    # Remember who owns the work directory before the variable gets reassigned below.
    is_temp_work_dir = not work_dir
    if work_dir:
        work_dir = join(adjust_path(work_dir), os.path.splitext(basename(input_bed))[0])
        safe_mkdir(work_dir)
        info(f'Created work directory {work_dir}')
    else:
        work_dir = mkdtemp('bed_annotate')
        debug(f'Created temporary work directory {work_dir}')

    input_bed = clean_bed(input_bed, work_dir)
    input_bed = verify_bed(input_bed, is_critical=True, description=f'Input BED file for {__file__} after cleaning')

    output_file = adjust_path(output_file)

    output_file = annotate(
        input_bed, output_file, work_dir, genome=genome,
        only_canonical=only_canonical, short=short, extended=extended,
        high_confidence=high_confidence, collapse_exons=collapse_exons,
        output_features=output_features,
        ambiguities_method=ambiguities_method, coding_only=coding_only,
        is_debug=is_debug)

    if is_temp_work_dir:
        # Only clean up a directory created here, and never delete the result with it.
        tmp_root = os.path.realpath(work_dir) + os.sep
        if output_file and os.path.realpath(output_file).startswith(tmp_root):
            debug(f'Keeping work directory {work_dir}: the output file is located inside it')
        else:
            debug(f'Removing work directory {work_dir}')
            shutil.rmtree(work_dir)

    info(f'Done, saved to {output_file}')
Exemple #28
0
 def _check_dir_not_empty(dirpath, description=None):
     """
     Asserts that a directory exists, has at least one non-hidden entry, and that every
     regular file among those entries (after resolving symlinks) passes `verify_file`.

     :param dirpath: directory to check
     :param description: forwarded to `verify_dir`
     :raises AssertionError: when any of the conditions does not hold
     """
     assert verify_dir(dirpath, description=description), dirpath

     visible = [join(dirpath, entry)
                for entry in os.listdir(dirpath)
                if not entry.startswith('.')]
     assert len(visible) >= 1, dirpath + ': ' + str(visible)

     # Entries that are not regular files (e.g. sub-directories) are not verified.
     resolved_files = (target for target in map(realpath, visible) if isfile(target))
     assert all(verify_file(target, is_critical=True)
                for target in resolved_files), dirpath + ': ' + str(visible)
Exemple #29
0
def sort_bed(input_bed_fpath, output_bed_fpath=None, work_dir=None, fai_fpath=None, chr_order=None, genome=None):
    """Write a copy of a BED file with its regions sorted by `Region.get_key()`.

    The chromosome order comes from `chr_order` if given, otherwise it is
    derived from `fai_fpath`, otherwise from the index of `genome`. Comment
    lines (starting with '#') are copied to the output as they are met, i.e.
    ahead of all regions; blank lines are dropped. Chromosomes missing from
    the order mapping get order -1.

    Returns the output path. When `can_reuse` accepts an existing output,
    that file is returned untouched.
    """
    input_bed_fpath = verify_bed(input_bed_fpath, is_critical=True)
    if output_bed_fpath:
        output_bed_fpath = adjust_path(output_bed_fpath)
    else:
        output_bed_fpath = intermediate_fname(work_dir, input_bed_fpath, 'sorted')

    debug('Sorting regions in ' + str(input_bed_fpath))
    if can_reuse(output_bed_fpath, input_bed_fpath):
        debug(output_bed_fpath + ' exists, reusing')
        return output_bed_fpath

    if not chr_order:
        if not fai_fpath and not genome:
            critical('Either of chr_order, fai_fpath, or genome build name must be specified')
        else:
            fai_fpath = verify_file(fai_fpath if fai_fpath else ref.get_fai(genome))
        chr_order = get_chrom_order(fai_fpath=fai_fpath)

    regions = []
    with open(input_bed_fpath) as inp, file_transaction(work_dir, output_bed_fpath) as tx:
        with open(tx, 'w') as out:
            for raw_line in inp:
                stripped = raw_line.strip()
                if not stripped:
                    continue
                if stripped.startswith('#'):
                    # Header/comment lines go straight through, unsorted.
                    out.write(raw_line)
                    continue

                fields = stripped.split('\t')
                chrom = fields[0]
                regions.append(Region(
                    chrom, int(fields[1]), int(fields[2]), fields[3:],
                    chr_order.get(chrom, -1)))

            regions.sort(key=lambda r: r.get_key())
            for region in regions:
                columns = [region.chrom, str(region.start), str(region.end)]
                columns.extend(region.other_fields)
                out.write('\t'.join(columns) + '\n')

    debug('Sorted ' + str(len(regions)) + ' regions, saved to ' + output_bed_fpath)
    return output_bed_fpath
Exemple #30
0
    def set_project_level_dirs(self, bcbio_cnf, config_dir, project_name=None, final_dir=None, date_dir=None,
                               create_dirs=False, proc_name='postproc'):
        """Populate the project-wide directory and file attributes on `self`.

        Sets `final_dir`, `project_name`, `work_dir` (a `work` folder next to
        the final dir), `date_dir`, `log_dir`, `postproc_log_dir`, and the
        `versions` / `programs` file paths as returned by `verify_file`.
        With `create_dirs` set, the final, work and post-processing log
        directories are created through `safe_mkdir`.
        """
        self.final_dir = self.set_final_dir(bcbio_cnf, config_dir, final_dir)
        if create_dirs:
            safe_mkdir(self.final_dir)

        self.project_name = self._set_project_name(self.final_dir, project_name)

        # The work dir is a sibling of the final dir.
        self.work_dir = abspath(join(self.final_dir, pardir, 'work'))
        if create_dirs:
            safe_mkdir(self.work_dir)

        self.date_dir = self._set_date_dir(
            bcbio_cnf, self.final_dir, date_dir,
            create_dir=create_dirs, silent=self.silent)

        self.log_dir = join(self.date_dir, 'log')
        self.postproc_log_dir = join(self.log_dir, proc_name)
        if create_dirs:
            safe_mkdir(self.postproc_log_dir)

        # Looked up quietly (silent=True).
        versions_fpath = join(self.date_dir, 'data_versions.txt')
        self.versions = verify_file(versions_fpath, silent=True)
        programs_fpath = join(self.date_dir, 'programs.txt')
        self.programs = verify_file(programs_fpath, silent=True)
Exemple #31
0
def _find_mutation_files(base_dir, passed=True, caller=None, is_germline=False):
    """Return the existing mutation files of `caller` under `base_dir`.

    Three candidates are considered: the base file `<caller>.<ext>`, and the
    same path with the single and the paired suffix added. When `passed` is
    true, the pass suffix is added to all three. Only candidates accepted by
    `verify_file` are returned, in that order.

    `caller` is required (asserted). `is_germline` is accepted but not used
    by this function.
    """
    assert caller
    base_fpath = join(base_dir, caller + '.' + vf.mut_file_ext)

    candidates = [base_fpath]
    for suffix in (vf.mut_single_suffix, vf.mut_paired_suffix):
        candidates.append(add_suffix(base_fpath, suffix))

    if passed:
        candidates = [add_suffix(fpath, vf.mut_pass_suffix) for fpath in candidates]

    found = []
    for fpath in candidates:
        if verify_file(fpath, silent=True):
            found.append(fpath)
    return found
Exemple #32
0
def get_dbsnp_multi_mafs(genome_cfg):
    """Return the verified `dbsnp_multi_mafs` path from a genome configuration.

    If the key is absent, a warning is logged and None is returned. If it is
    present, the path is checked with `verify_file` in critical mode.
    """
    if 'dbsnp_multi_mafs' in genome_cfg:
        return verify_file(
            genome_cfg['dbsnp_multi_mafs'],
            is_critical=True,
            description='dbSNP multi mafs file in system configuration file')

    warn(
        'Warning: dbsnp_multi_mafs not provided in the system configuration file for the genome.'
    )
    return None
Exemple #33
0
def _get(relative_path, genome=None, is_critical=False):
    if genome:
        check_genome(genome)
    else:
        genome = ''
    relative_path = relative_path.format(genome=genome)

    path = abspath(join(dirname(__file__), relative_path))
    if is_critical:
        return verify_file(path, is_critical=True)
    return path
Exemple #34
0
def _get(relative_path, genome=None, is_critical=False):
    if genome:
        check_genome(genome)
    else:
        genome = ''
    relative_path = relative_path.format(genome=genome)

    path = abspath(join(dirname(__file__), relative_path))
    if is_critical:
        return verify_file(path, is_critical=True)
    return path
Exemple #35
0
def _read_list(reason, fpath):
    """Read a tab-separated gene list into a `{gene_name: meta_info}` dict.

    The file is checked with a critical `verify_file` first; `reason` only
    feeds the description used for that check. The first column of each line
    is the gene name. The second column is kept as meta info only when the
    line has exactly two columns; in every other case the meta info is ''.
    """
    fpath = verify_file(fpath,
                        description=reason + ' blacklist genes file',
                        is_critical=True)
    gene_d = {}
    for line in iter_lines(fpath):
        columns = line.split('\t')
        gene_d[columns[0]] = columns[1] if len(columns) == 2 else ''
    return gene_d
Exemple #36
0
 def _check_dir_not_empty(dirpath, description=None):
     """Assert that `dirpath` passes `verify_dir` and contains visible entries.

     Dot-prefixed names are skipped. At least one entry must remain, and each
     remaining entry whose `realpath` is a regular file must be accepted by
     `verify_file`.
     """
     assert verify_dir(dirpath, description=description), dirpath

     visible_names = [fname for fname in os.listdir(dirpath)
                      if not fname.startswith('.')]
     contents = [join(dirpath, fname) for fname in visible_names]
     assert len(contents) >= 1, dirpath + ': ' + str(contents)

     # Generator: paths are only resolved if the assertion below is evaluated.
     real_paths = (realpath(fpath) for fpath in contents)
     assert all(
         verify_file(real_fpath, is_critical=True)
         for real_fpath in real_paths
         if isfile(real_fpath)), dirpath + ': ' + str(contents)
Exemple #37
0
def bam_to_bed_nocnf(bam_fpath, bedtools='bedtools', gzip='gzip'):
    """Convert a BAM file into a gzipped BED file written next to it.

    Runs `<bedtools> bamtobed -i <bam> | <gzip> > <bam-basename>.bed.gz`
    through the shell. The input and output paths are shell-quoted so that
    paths containing spaces or shell metacharacters are passed intact.

    :param bam_fpath: path to the input BAM file
    :param bedtools: bedtools command to run (inserted verbatim)
    :param gzip: gzip command to run (inserted verbatim)
    :return: the result of `verify_file` on the output path
    """
    from shlex import quote

    info('Converting the BAM to BED to save some memory.')  # from here: http://davetang.org/muse/2015/08/05/creating-a-coverage-plot-using-bedtools-and-r/
    bam_bed_fpath = splitext_plus(bam_fpath)[0] + '.bed.gz'
    # Only the file paths are quoted: `bedtools`/`gzip` are left verbatim in
    # case a caller passes a command with arguments.
    cmdline = '{bedtools} bamtobed -i {bam} | {gzip} > {bed}'.format(
        bedtools=bedtools, gzip=gzip,
        bam=quote(bam_fpath), bed=quote(bam_bed_fpath))
    info(cmdline)
    os.system(cmdline)
    bam_bed_fpath = verify_file(bam_bed_fpath)
    if bam_bed_fpath:
        info('Done, saved to ' + bam_bed_fpath)
    else:
        err('Error, result is non-existent or empty')
    return bam_bed_fpath
Exemple #38
0
def sort_bed_gsort(input_bed_fpath, output_bed_fpath=None, work_dir=None, fai_fpath=None, genome=None):
    """Sort a BED file by running `gsort` against a FASTA index.

    The index is `fai_fpath` if given, otherwise the one returned by
    `ref.get_fai(genome)`; `critical` is called when neither is available.
    An existing output accepted by `can_reuse` is returned without
    re-running. Returns the output path.
    """
    input_bed_fpath = verify_bed(input_bed_fpath, is_critical=True)
    if output_bed_fpath:
        output_bed_fpath = adjust_path(output_bed_fpath)
    else:
        output_bed_fpath = intermediate_fname(work_dir, input_bed_fpath, 'sorted')

    debug('Sorting regions in ' + str(input_bed_fpath))
    if can_reuse(output_bed_fpath, input_bed_fpath):
        debug(output_bed_fpath + ' exists, reusing')
        return output_bed_fpath

    if not fai_fpath and not genome:
        critical('Either of fai_fpath or genome build name must be specified')
    else:
        fai_fpath = verify_file(fai_fpath if fai_fpath else ref.get_fai(genome))

    with file_transaction(work_dir, output_bed_fpath) as tx:
        gsort_cmd = 'gsort {bed} {fai}'.format(bed=input_bed_fpath, fai=fai_fpath)
        run(gsort_cmd, output_fpath=tx)

    return output_bed_fpath
Exemple #39
0
    def find_somatic_vcf(self, silent=False, caller=None):
        """Locate the somatic VCF of this batch and store it in `self.somatic_vcf`.

        `caller` defaults to `self.somatic_caller`. If no caller is known the
        search is skipped (with a warning unless `silent`), regardless of
        `silent`; previously the silent case fell through and crashed while
        building file names from a None caller.

        Candidate locations, in priority order:
          1. <tumor-dir>/<tumor-name>-<caller>.vcf.gz (bcbio >= 1.1.6)
          2. <date-dir>/<batch>-<caller>-annotated.vcf.gz (bcbio < 1.1.6)
          3. <date-dir>/<batch>-<caller>.vcf.gz (CWL bcbio)

        The first existing file is checked with a critical `verify_file` and
        assigned to `self.somatic_vcf`. Nothing is assigned when no candidate
        exists. Always returns None.
        """
        caller = caller or self.somatic_caller
        if not caller:
            if not silent:
                warn(f'Batch {self.name} have no variant caler info assigned, skipping finding somatic VCF')
            return

        date_dir = self.parent_project.date_dir
        tumor = self.tumors[0]

        # (candidate path, log-message prefix), highest priority first.
        candidates = [
            # in sample dir. starting from bcbio 1.1.6, ~ Dec 2019
            (adjust_path(join(tumor.dirpath, tumor.name + '-' + caller + '.vcf.gz')),
             f'Found somatic VCF in <final-dir>/<tumor-name>/<tumor-name>-{caller}.vcf.gz (conventional bcbio): '),
            # in datestamp. bcbio before 1.1.6
            (adjust_path(join(date_dir, self.name + '-' + caller + '-annotated.vcf.gz')),
             f'Found somatic VCF in <date-dir>/<batch>-{caller}-annotated.vcf.gz (bcbio < v1.1.6)): '),
            # in datestamp. cwl-bcbio writes there
            (adjust_path(join(date_dir, self.name + '-' + caller + '.vcf.gz')),
             f'Found somatic VCF in project/<batch>-{caller}.vcf.gz (CWL bcbio): '),
        ]

        for vcf_fpath, found_msg in candidates:
            if isfile(vcf_fpath):
                verify_file(vcf_fpath, is_critical=True)
                if not silent:
                    info(found_msg + vcf_fpath)
                self.somatic_vcf = vcf_fpath
                return

        if not silent:
            warn(f'Could not find somatic variants files for batch {self.name}, caller {caller} neither as '
                 f'{self.parent_project.final_dir}/<tumor-name>/<tumor-name>-{caller}.vcf.gz (conventional bcbio), nor as '
                 f'{self.parent_project.date_dir}/<batch>-{caller}-annotated.vcf.gz (bcbio < v1.1.6), nor as '
                 f'project/<batch>-{caller}.vcf.gz (CWL bcbio).')
Exemple #40
0
def merge_overlaps(work_dir, bed_fpath, distance=None):
    """Merge bed file intervals to avoid overlapping regions.

    Overlapping regions (1:1-100, 1:90-100) cause issues with callers like FreeBayes
    that don't collapse BEDs prior to using them.

    An existing merged file accepted by `verify_file(..., cmp_f=bed_fpath)` is
    reused. `distance`, when truthy, is passed to the merge as `d`. Returns
    the path of the merged file.
    """
    output_fpath = intermediate_fname(work_dir, bed_fpath, 'merged')
    already_merged = isfile(output_fpath) and verify_file(output_fpath, cmp_f=bed_fpath)
    if already_merged:
        return output_fpath

    with file_transaction(work_dir, output_fpath) as tx:
        merge_kwargs = {}
        if distance:
            merge_kwargs['d'] = distance
        merged = BedTool(bed_fpath).merge(**merge_kwargs)
        merged.saveas(tx)
    return output_fpath
Exemple #41
0
    def set_project_level_dirs(self, bcbio_cnf, config_dir, project_name=None, final_dir=None, date_dir=None,
                               create_dirs=False, proc_name='postproc'):
        """Populate the project-wide directory and file attributes on `self`.

        Sets `final_dir`, `project_name`, `work_dir` (a `work` folder next to
        the final dir), `date_dir`, `log_dir`, `postproc_log_dir`, `var_dir`,
        `raw_var_dir`, `expression_dir`, and the `versions` / `programs` file
        paths as returned by `verify_file`. With `create_dirs` set, the final,
        work and post-processing log directories are created via `safe_mkdir`.
        """
        self.final_dir = self.set_final_dir(bcbio_cnf, config_dir, final_dir)
        if create_dirs:
            safe_mkdir(self.final_dir)

        self.project_name = self._set_project_name(self.final_dir, project_name)

        # The work dir is a sibling of the final dir.
        self.work_dir = abspath(join(self.final_dir, pardir, 'work'))
        if create_dirs:
            safe_mkdir(self.work_dir)

        self.date_dir = self._set_date_dir(
            bcbio_cnf, self.final_dir, date_dir,
            create_dir=create_dirs, silent=self.silent)

        self.log_dir = join(self.date_dir, 'log')
        self.postproc_log_dir = join(self.log_dir, proc_name)
        if create_dirs:
            safe_mkdir(self.postproc_log_dir)

        # Result sub-folders of the date dir; these are not created here.
        self.var_dir = join(self.date_dir, BcbioProject.var_dir)
        self.raw_var_dir = join(self.var_dir, 'raw')
        self.expression_dir = join(self.date_dir, BcbioProject.expression_dir)

        # Looked up quietly (silent=True).
        versions_fpath = join(self.date_dir, 'data_versions.txt')
        self.versions = verify_file(versions_fpath, silent=True)
        programs_fpath = join(self.date_dir, 'programs.txt')
        self.programs = verify_file(programs_fpath, silent=True)
Exemple #42
0
def main(paths, output_dir, genome, depth):
    """Build a SNP panel from a mix of BED files and bcbio project directories.

    Every entry of `paths` that is a file is verified and used as a BED
    file; every entry that is a directory is verified and loaded as a
    `BcbioProject`. Both collections are passed to `build_snps_panel`
    together with the (created) output directory and `genome`.
    `depth` is accepted but not used here.
    """
    log.init(True)

    bed_files = []
    for path in paths:
        if isfile(path):
            bed_files.append(verify_file(path, is_critical=True))

    # All directories are verified first, then loaded.
    project_dirs = []
    for path in paths:
        if isdir(path):
            project_dirs.append(verify_dir(path, is_critical=True))

    bcbio_projs = []
    for project_dir in project_dirs:
        proj = BcbioProject()
        proj.load_from_bcbio_dir(project_dir, proc_name='clearup')
        bcbio_projs.append(proj)

    build_snps_panel(bcbio_projs, bed_files, safe_mkdir(output_dir), genome)
Exemple #43
0
def merge_overlaps(work_dir, bed_fpath, distance=None):
    """Merge bed file intervals to avoid overlapping regions.

    Overlapping regions (1:1-100, 1:90-100) cause issues with callers like FreeBayes
    that don't collapse BEDs prior to using them.

    An existing merged file accepted by `verify_file(..., cmp_f=bed_fpath)` is
    reused. `distance`, when truthy, is passed to the merge as `d`. Returns
    the path of the merged file.
    """
    output_fpath = intermediate_fname(work_dir, bed_fpath, 'merged')
    up_to_date = isfile(output_fpath) and verify_file(output_fpath, cmp_f=bed_fpath)
    if up_to_date:
        return output_fpath

    with file_transaction(work_dir, output_fpath) as tx:
        # Imported only when a merge is actually performed.
        import pybedtools
        merge_kwargs = {}
        if distance:
            merge_kwargs['d'] = distance
        merged = pybedtools.BedTool(bed_fpath).merge(**merge_kwargs)
        merged.saveas(tx)
    return output_fpath
Exemple #44
0
def verify_bed(bed, description='', is_critical=False, silent=False):
    """Validate a BED input and return it, or None when validation fails.

    A `BedTool` instance is returned as is. Anything else is treated as a
    path: it is normalised with `adjust_path`, checked with `verify_file`,
    and its format is checked with `BedFile.checkformat()`. A format error is
    reported through `critical` when `is_critical` is set, else through `err`.
    """
    if isinstance(bed, BedTool):
        return bed

    fpath = adjust_path(bed)
    file_ok = verify_file(fpath, description, is_critical=is_critical, silent=silent)
    if not file_ok:
        return None

    error = BedFile(fpath).checkformat()
    if not error:
        return fpath

    if is_critical:
        report = critical
    else:
        report = err
    report('Error: incorrect bed file format (' + fpath + '): ' + str(error) + '\n')
    return None
Exemple #45
0
def parse_mut_tp53(mut_fpath):
    """Collect protein changes from a tab-separated TP53 mutations file.

    For each non-blank line, the 20th column (index 19) is read; if it
    contains 'p.', the value with every 'p.' removed is added to the result.
    Lines with fewer than 20 columns are skipped instead of raising
    IndexError.

    :param mut_fpath: path to the mutations file
    :return: set of protein change strings; empty if the file fails
             `verify_file` or has no matching lines
    """
    prot_col = 19  # zero-based index of the protein change column

    mut_tp53 = set()
    if not verify_file(mut_fpath):
        return mut_tp53

    with open(mut_fpath) as f:
        for l in f:
            l = l.strip()
            if not l:
                continue
            fields = l.split('\t')
            # Short rows (e.g. truncated or with trailing empty columns
            # stripped away) carry no protein change.
            if len(fields) <= prot_col:
                continue
            prot_change = fields[prot_col]
            if not prot_change or 'p.' not in prot_change:
                continue
            mut_tp53.add(prot_change.replace('p.', ''))

    return mut_tp53
Exemple #46
0
def bam_to_bed_nocnf(bam_fpath, bedtools='bedtools', gzip='gzip'):
    """Convert a BAM file into a gzipped BED file written next to it.

    Runs `<bedtools> bamtobed -i <bam> | <gzip> > <bam-basename>.bed.gz`
    through the shell. The input and output paths are shell-quoted so that
    paths containing spaces or shell metacharacters are passed intact.

    :param bam_fpath: path to the input BAM file
    :param bedtools: bedtools command to run (inserted verbatim)
    :param gzip: gzip command to run (inserted verbatim)
    :return: the result of `verify_file` on the output path
    """
    from shlex import quote

    info(
        'Converting the BAM to BED to save some memory.'
    )  # from here: http://davetang.org/muse/2015/08/05/creating-a-coverage-plot-using-bedtools-and-r/
    bam_bed_fpath = splitext_plus(bam_fpath)[0] + '.bed.gz'
    # Only the file paths are quoted: `bedtools`/`gzip` are left verbatim in
    # case a caller passes a command with arguments.
    cmdline = '{bedtools} bamtobed -i {bam} | {gzip} > {bed}'.format(
        bedtools=bedtools, gzip=gzip,
        bam=quote(bam_fpath), bed=quote(bam_bed_fpath))
    info(cmdline)
    os.system(cmdline)
    bam_bed_fpath = verify_file(bam_bed_fpath)
    if bam_bed_fpath:
        info('Done, saved to ' + bam_bed_fpath)
    else:
        err('Error, result is non-existent or empty')
    return bam_bed_fpath
Exemple #47
0
def verify_bam(fpath, description='', is_critical=False, silent=False):
    """Check that `fpath` is a usable file with a `.bam` extension.

    Returns the path normalised by `adjust_path`, or None when `verify_file`
    rejects it or the extension is not `.bam`. A wrong extension is reported
    through `critical` when `is_critical` is set, else through `err`.
    """
    file_ok = verify_file(fpath, description, is_critical=is_critical, silent=silent)
    if not file_ok:
        return None

    fpath = adjust_path(fpath)

    logfn = critical if is_critical else err
    if fpath.endswith('.bam'):
        # TODO: check if binary
        return fpath

    logfn('The file ' + fpath + ' is supposed to be BAM but does not have the .bam '
        'extension. Please, make sure you pass proper file.')
    return None
Exemple #48
0
def verify_bam(fpath, description='', is_critical=False, silent=False):
    """Validate a BAM path: the file must verify and end with `.bam`.

    On success the `adjust_path`-normalised path is returned. On failure
    None is returned; an extension mismatch is logged with `critical`
    (if `is_critical`) or `err`.
    """
    verified = verify_file(
        fpath, description, is_critical=is_critical, silent=silent)
    if not verified:
        return None

    fpath = adjust_path(fpath)

    if is_critical:
        logfn = critical
    else:
        logfn = err

    has_bam_ext = fpath.endswith('.bam')
    if not has_bam_ext:
        logfn('The file ' + fpath +
              ' is supposed to be BAM but does not have the .bam '
              'extension. Please, make sure you pass proper file.')
        return None

    # TODO: check if binary

    return fpath
Exemple #49
0
def verify_bed(bed, description='', is_critical=False, silent=False):
    """Validate a BED input and return it, or None when validation fails.

    A `pybedtools.BedTool` instance is returned as is. Anything else is
    treated as a path: it is normalised with `adjust_path`, checked with
    `verify_file`, and its format is checked with `BedFile.checkformat()`.
    A format error is reported through `critical` when `is_critical` is set,
    else through `err`.
    """
    import pybedtools
    if isinstance(bed, pybedtools.BedTool):
        return bed

    fpath = adjust_path(bed)
    file_ok = verify_file(
        fpath, description, is_critical=is_critical, silent=silent)
    if not file_ok:
        return None

    error = BedFile(fpath).checkformat()
    if not error:
        return fpath

    if is_critical:
        report = critical
    else:
        report = err
    report('Error: incorrect bed file format (' + fpath + '): ' + str(error) +
           '\n')
    return None
Exemple #50
0
def detect_run_info_in_config_dir(config_dir):
    """Find the run-info YAML file inside a config directory.

    A file matches when its name starts with 'run_info' and ends with
    '.yaml'. Returns None when nothing matches. More than one match is
    reported through `critical`. Otherwise the first match is checked with a
    critical `verify_file`, logged, and returned.
    """
    run_info_fpaths_in_config = []
    for fname in os.listdir(config_dir):
        if fname.startswith('run_info') and fname.endswith('.yaml'):
            run_info_fpaths_in_config.append(abspath(join(config_dir, fname)))

    if not run_info_fpaths_in_config:
        return None

    if len(run_info_fpaths_in_config) > 1:
        critical(
            'More than one YAML file containing run_info in name found in the config '
            'directory ' + config_dir + ': ' +
            ' '.join(run_info_fpaths_in_config))

    run_cnf = verify_file(run_info_fpaths_in_config[0], is_critical=True)
    info('Using run configuration from the config directory ' + run_cnf)
    return run_cnf
Exemple #51
0
def read_samples(args):
    """Split the input paths into BAM files and FastQ pairs, grouped by sample.

    BAMs are detected by `find_bams`. The remaining inputs that pass
    `verify_file` are paired up by `find_fastq_pairs`. `critical` is called
    when no usable input is left, and when a sample has both BAM and FastQ
    input.

    Returns a tuple `(fastqs_by_sample, bam_by_sample)`.
    """
    bam_by_sample = find_bams(args)
    if bam_by_sample:
        n_bams = len(bam_by_sample)
        plural = 's' if n_bams > 1 else ''
        info('Found ' + str(n_bams) + ' BAM file' + plural)

    # NOTE(review): this tests a *path* for membership among the keys of
    # bam_by_sample, while the keys are compared with sample names further
    # down - confirm against find_bams() whether the values were intended.
    input_not_bam = []
    for fpath in args:
        if adjust_path(fpath) in bam_by_sample:
            continue
        verified = verify_file(fpath)
        if verified:
            input_not_bam.append(verified)

    if not input_not_bam and not bam_by_sample:
        critical('No correct input files')

    fastqs_by_sample = dict()
    if input_not_bam:
        info('Input ' + str(len(input_not_bam)) + ' correct input non-BAM files')
        fastqs_by_sample = find_fastq_pairs(input_not_bam)
        if fastqs_by_sample:
            info('Found ' + str(len(fastqs_by_sample)) + ' FastQ pairs')

        in_both = set(fastqs_by_sample.keys()) & set(bam_by_sample.keys())
        if in_both:
            critical('The following samples both had input BAMs and FastQ: ' + ', '.join(list(in_both)))

    return fastqs_by_sample, bam_by_sample
Exemple #52
0
 def find_cnvkit_filt_file(self):
     """Look up the 'filt'-suffixed CNVkit file in the CNV folder of the date dir.

     Returns whatever `verify_file` returns for that path (checked quietly).
     """
     filt_fname = add_suffix(BcbioProject.cnvkit_fname, 'filt')
     filt_fpath = join(self.date_dir, BcbioProject.cnv_dir, filt_fname)
     return verify_file(filt_fpath, silent=True)
Exemple #53
0
 def find_ngs_report(self, silent=False):
     """Look up the sample's NGS HTML report.

     Tries `<date-dir>/<reports-dir>/<sample-name>.html` first and falls
     back to `<sample-dir>/<ngs-report-name>/<ngs-report-name>.html`.
     Returns the first truthy `verify_file` result, else the result of the
     second check.
     """
     in_date_dir = join(self.bcbio_project.date_dir, BcbioProject.reports_dir,
                        self.name + '.html')
     report = verify_file(in_date_dir, silent=silent)
     if report:
         return report

     in_sample_dir = join(self.dirpath, BcbioProject.ngs_report_name,
                          BcbioProject.ngs_report_name + '.html')
     return verify_file(in_sample_dir, silent=silent)
Exemple #54
0
 def find_cnvkit_file(self):
     """Look up the CNVkit file in the CNV folder of the date dir.

     Returns whatever `verify_file` returns for that path (checked quietly).
     """
     cnvkit_fpath = join(self.date_dir, BcbioProject.cnv_dir, BcbioProject.cnvkit_fname)
     return verify_file(cnvkit_fpath, silent=True)
Exemple #55
0
 def find_seq2c_coverage(self):
     """Look up `seq2c-cov.tsv` in the CNV folder of the date dir.

     Returns whatever `verify_file` returns for that path (checked quietly).
     """
     coverage_fpath = join(self.date_dir, BcbioProject.cnv_dir, 'seq2c-cov.tsv')
     return verify_file(coverage_fpath, silent=True)
Exemple #56
0
    def find_vcf_file(self, batch_name, silent=False, caller=None):
        """Return the first existing VCF of `batch_name` for `caller`, or None.

        `caller` defaults to `self.somatic_caller`. The candidates are probed
        in the order of the table below (annotated before plain, datestamp
        dir before its sub-folders). The first existing file is checked with
        a critical `verify_file` and returned. Each miss is logged at debug
        level; if nothing is found, a warning is logged unless `silent`.
        """
        caller = caller or self.somatic_caller
        vcf_fname = batch_name + '-' + caller + '.vcf'
        annot_vcf_fname = batch_name + '-' + caller + '-annotated.vcf'

        def _candidate(dirpath, fname):
            return adjust_path(join(dirpath, fname))

        # (description used in log messages, path), in lookup priority order.
        candidates = [
            ('annotated VCF in the datestamp dir ', _candidate(self.date_dir, annot_vcf_fname + '.gz')),
            ('annotated VCF in the datestamp/var/raw dir ', _candidate(self.raw_var_dir, annot_vcf_fname + '.gz')),
            ('VCF in the datestamp dir ', _candidate(self.date_dir, vcf_fname + '.gz')),
            ('VCF in the datestamp/var/raw dir ', _candidate(self.raw_var_dir, vcf_fname + '.gz')),
            ('uncompressed VCF in the datestamp dir ', _candidate(self.date_dir, vcf_fname)),
            ('uncompressed VCF in the datestamp/var/raw dir ', _candidate(self.raw_var_dir, vcf_fname)),
            ('VCF in the datestamp/var dir ', _candidate(self.var_dir, vcf_fname + '.gz')),
            ('uncompressed VCF in the datestamp/var dir ', _candidate(self.var_dir, vcf_fname)),
        ]

        for what, fpath in candidates:
            if isfile(fpath):
                verify_file(fpath, is_critical=True)
                if not silent:
                    info('Found ' + what + fpath)
                return fpath
            debug('Not found ' + what + fpath)

        if not silent:
            warn('Warning: no VCF found for batch ' + batch_name + ', ' + caller + ', gzip or '
                'uncompressed version in the datestamp directory.')
        return None
Exemple #57
0
 def find_seq2c_file(self):
     """Look up the Seq2C results file in the CNV folder of the date dir.

     Tries `BcbioProject.seq2c_fname` first, then `Seq2C.tsv`. Both are
     checked quietly. Returns the first truthy `verify_file` result, else
     the result of the second check.
     """
     cnv_dirpath = join(self.date_dir, BcbioProject.cnv_dir)
     seq2c_fpath = verify_file(join(cnv_dirpath, BcbioProject.seq2c_fname), silent=True)
     if seq2c_fpath:
         return seq2c_fpath
     return verify_file(join(self.date_dir, BcbioProject.cnv_dir, 'Seq2C.tsv'), silent=True)
def main():
    """Build the feature BED files for a genome from a gene annotation file.

    Command line: `<script> <genome> [annotation-file]`. The annotation file
    defaults to `ba.get_refseq_gene(genome)`; the parser is chosen by file
    extension (.gtf -> Ensembl GTF, .gff3 -> RefSeq GFF3, anything else ->
    UCSC table). Writes the all-features file, the canonical-only features
    file (both bgzipped and tabix-indexed) and the canonical CDS file.

    Fixes over the previous revision:
      * the final log line no longer references `seq2c_output_fpath`, which
        is never assigned (its producer is commented out) and raised
        NameError at the very end of a successful run;
      * the usage text now shows the real script path instead of the literal
        text "' + __file__ + '";
      * a user-supplied input path is verified critically, rather than
        letting an unverified (falsy) path reach `open_gzipsafe`.
    """
    description = '''
The script writes all RefSeq features for requested genome build, and generates 3 files:
    all_features.{genome}.bed:
        Gene (protein_coding)
        Transcript (protein_coding and ncRNA)
        Exon (ncRNA)
        CDS (protein_coding)
    all_features.{genome}.canon.bed:
        The same, but taking canonical (or longest) transcripts only
    CDS.{genome}.bed
        CDS, canonical (or longest) transcripts only

Usage:
    ''' + __file__ + ''' hg19 [db.gtf]

     And db.gtf is either of the following:

     Ensembl GTF ftp://ftp.ensembl.org/pub/release-75/gtf/homo_sapiens/Homo_sapiens.GRCh37.75.gtf.gz
     1  pseudogene            gene        11869  14412  .  +  .  gene_id "ENSG00000223972"; gene_name "DDX11L1"; gene_source "ensembl_havana"; gene_biotype "pseudogene";
     1  processed_transcript  transcript  11869  14409  .  +  .  gene_id "ENSG00000223972"; transcript_id "ENST00000456328"; gene_name "DDX11L1"; gene_source "ensembl_havana"; gene_biotype "pseudogene"; transcript_name "DDX11L1-002"; transcript_source "havana";
     ...

     RefSeq GTF ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/H_sapiens/GFF/ref_GRCh38.p2_top_level.gff3.gz
     NC_000001.10    RefSeq          region       1       249250621       .       +       .       ID=id0;Name=1;Dbxref=taxon:9606;chromosome=1;gbkey=Src;genome=chromosome;mol_type=genomic DNA
     NC_000001.10    BestRefSeq      gene         11874   14409           .       +       .       ID=gene0;Name=DDX11L1;Dbxref=GeneID:100287102,HGNC:37102;description=DEAD%2FH %28Asp-Glu-Ala-Asp%2FHis%29 box helicase 11 like 1;gbkey=Gene;gene=DDX11L1;part=1%2F1;pseudo=true
     NC_000001.10    BestRefSeq      transcript   11874   14409           .       +       .       ID=rna0;Name=NR_046018.2;Parent=gene0;Dbxref=GeneID:100287102,Genbank:NR_046018.2,HGNC:37102;gbkey=misc_RNA;gene=DDX11L1;product=DEAD%2FH %28Asp-Glu-Ala-Asp%2FHis%29 box helicase 11 like 1;transcript_id=NR_046018.2
     NC_000001.10    BestRefSeq      exon         11874   12227           .       +       .       ID=id1;Parent=rna0;Dbxref=GeneID:100287102,Genbank:NR_046018.2,HGNC:37102;gbkey=misc_RNA;gene=DDX11L1;product=DEAD%2FH %28Asp-Glu-Ala-Asp%2FHis%29 box helicase 11 like 1;transcript_id=NR_046018.2
     ...

     RefSeq_knownGene.txt or UCSC_knownGene.txt (from http://genome.ucsc.edu/cgi-bin/hgTables)
     #hg19.knownGene.name  hg19.knownGene.chrom  hg19.knownGene.strand  hg19.knownGene.txStart  hg19.knownGene.txEnd  hg19.knownGene.exonCount  hg19.knownGene.exonStarts  hg19.knownGene.exonEnds  hg19.kgXref.geneSymbol
     uc001aaa.3	         chr1	               +	                  11873                   14409                 3                         11873,12612,13220,	      12227,12721,14409,	   DDX11L1
     ...

See more info in http://wiki.rd.astrazeneca.net/display/NG/SOP+-+Making+the+full+list+of+UCSC+exons+with+approved+HUGO+gene+symbols'''

    options = [
        # (['--bam'], dict(dest='bam', help='path to the BAM file to analyse',)),
    ]

    parser = OptionParser(description=description)
    for args, kwargs in options:
        parser.add_option(*args, **kwargs)
    opts, args = parser.parse_args()
    if len(args) == 0:
        parser.exit(1, 'Please provide genome name as the first argument')

    genome_name = args[0]
    chrom_order = ref.get_chrom_order(genome_name)
    canonical_transcripts_ids = ref.get_canonical_transcripts_ids(genome_name)
    if len(args) > 1:
        # Critical: the path is opened right below, so stop here if it is unusable.
        input_fpath = verify_file(args[1], is_critical=True)
    else:
        input_fpath = ba.get_refseq_gene(genome_name)

    output_dirpath = ba.get_refseq_dirpath()
    synonyms_fpath = ba.get_hgnc_gene_synonyms()
    not_approved_fpath = join(output_dirpath, 'not_approved.txt')

    info('Reading the features...')
    with open_gzipsafe(input_fpath) as inp:
        # The input format is chosen by file extension.
        if input_fpath.endswith('.gtf') or input_fpath.endswith('.gtf.gz'):
            gene_by_name_and_chrom = _proc_ensembl_gtf(inp, output_dirpath, chrom_order)
        elif input_fpath.endswith('.gff3') or input_fpath.endswith('.gff3.gz'):
            gene_by_name_and_chrom = _proc_refseq_gff3(inp, output_dirpath, chrom_order)
        else:
            gene_by_name_and_chrom = _proc_ucsc(inp, output_dirpath, chrom_order)

    if synonyms_fpath and DO_APPROVE:
        gene_by_name_and_chrom, not_approved_gene_names = _approve(gene_by_name_and_chrom, synonyms_fpath)

        info('')
        info('Not approved by HGNC - ' + str(len(not_approved_gene_names)) + ' genes.')
        if not_approved_fpath:
            with open(not_approved_fpath, 'w') as f:
                f.write('#Searched as\tStatus\n')
                f.writelines((l + '\n' for l in not_approved_gene_names))
            info('Saved not approved to ' + not_approved_fpath)

    info('Found:')
    info('  ' + str(len(gene_by_name_and_chrom)) + ' genes')

    genes = gene_by_name_and_chrom.values()

    coding_genes = [g for g in genes if any(t.coding for t in g.transcripts)]
    coding_transcripts = [t for g in coding_genes for t in g.transcripts if t.coding]
    rna_genes = [g for g in genes if all(not t.coding for t in g.transcripts)]
    rna_transcripts = [t for g in genes for t in g.transcripts if not t.coding]
    mixed_genes = [g for g in genes if any(not t.coding for t in g.transcripts) and any(t.coding for t in g.transcripts)]
    info('  ' + str(len(coding_genes)) + ' coding genes')
    info('  ' + str(len(coding_transcripts)) + ' coding transcripts')
    info('  ' + str(len(rna_genes)) + ' RNA genes')
    info('  ' + str(len(rna_transcripts)) + ' RNA transcripts')
    info('  ' + str(len(mixed_genes)) + ' genes with both coding and RNA transcripts')
    # A gene with at least one coding transcript is labelled protein coding.
    for g in coding_genes:
        g.coding = True
        g.biotype = 'protein_coding'
    for g in rna_genes:
        g.coding = False
        g.biotype = 'RNA'

    info()
    # info('Choosing genes with exons...')
    # genes = [g for g in genes if any(tx.exons for tx in g.transcripts)]
    # genes = [g for g in genes if any(tx.exons for tx in g.transcripts)]

    info('Choosing canonical...')
    canon_genes = choose_canonical(genes, canonical_transcripts_ids)

    info()
    info('Sorting and printing all regions...')
    all_features_fpath = ba.get_all_features(genome_name)
    write_all_features(genes, all_features_fpath, canon_only=False)
    all_features_fpath = bgzip_and_tabix(all_features_fpath, tabix_parameters='-p bed')

    info()
    info('Sorting and printing canonical regions...')
    canon_output_fpath = ba.get_all_features_canonical(genome_name, gzip=False)
    write_all_features(canon_genes, canon_output_fpath, canon_only=True)
    canon_output_fpath = bgzip_and_tabix(canon_output_fpath, tabix_parameters='-p bed')

    info()
    info('Sorting and printing canonical CDS...')
    cds_output_fpath = ba.get_cds(genome_name)
    write_all_features(canon_genes, cds_output_fpath, canon_only=True, cds_only=True)

    # info()
    # info('Sorting and printing CDS for Seq2C (unique transcript per gene)...')
    # seq2c_output_fpath = ga.get_seq2c_cds(genome_name)
    # write_all_features(canon_genes, seq2c_output_fpath, canon_only=True, cds_only=True, seq2c_cds=True)

    info()
    # The Seq2C output is not produced (see the disabled step above), so it
    # is not listed here.
    info('Saved all regions to\n   ' + all_features_fpath + '\n   ' + canon_output_fpath + '\n   ' + cds_output_fpath)
Exemple #59
0
    def find_vcf_file_from_sample_dir(sample, silent=False, caller=None):
        """Return the first existing VCF of `sample` for `caller`, or None.

        `caller` defaults to the somatic caller of the sample's bcbio
        project. Gzipped files are probed first (sample dir, `var/`,
        `var/raw/`), then the uncompressed ones in the same order. The first
        existing file is checked with a critical `verify_file` and returned.
        Each miss is logged at debug level; if nothing is found, a warning is
        logged unless `silent`.
        """
        caller = caller or sample.bcbio_project.somatic_caller
        vcf_fname = sample.get_name_for_files() + '-' + caller + '.vcf'
        gz_fname = vcf_fname + '.gz'

        sample_var_dirpath = join(sample.dirpath, 'var')

        # (path, message when found, message when missing), in lookup order.
        # The messages are kept exactly as they were, including the
        # asymmetric "Not found" wording of the uncompressed var/ entries.
        candidates = [
            (adjust_path(join(sample.dirpath, gz_fname)),
             'Found VCF ', 'Not found VCF '),
            (adjust_path(join(sample_var_dirpath, gz_fname)),
             'Found VCF in the var/ dir ', 'Not found VCF in the var/ dir '),
            (adjust_path(join(sample_var_dirpath, 'raw', gz_fname)),
             'Found VCF in the var/raw/ dir ', 'Not found VCF in the var/raw/ dir '),
            (adjust_path(join(sample.dirpath, vcf_fname)),
             'Found uncompressed VCF ', 'Not found uncompressed VCF '),
            (adjust_path(join(sample_var_dirpath, vcf_fname)),
             'Found uncompressed VCF in the var/ dir ', 'Not found VCF in the var/ dir '),
            (adjust_path(join(sample_var_dirpath, 'raw', vcf_fname)),
             'Found uncompressed VCF in the var/raw/ dir ', 'Not found VCF in the var/raw/ dir '),
        ]

        for fpath, found_msg, missing_msg in candidates:
            if isfile(fpath):
                verify_file(fpath, is_critical=True)
                if not silent:
                    info(found_msg + fpath)
                return fpath
            debug(missing_msg + fpath)

        if not silent:
            warn('Warning: no VCF found for ' + sample.name + ' (' + caller + '), gzip or uncompressed version in and outside '
                'the var directory. Phenotype is ' + str(sample.phenotype))
        return None