Example #1
    def _set_name_and_paths(self, name, variantcallers_data, ensemble=False, silent=False):
        self.raw_name = name
        self.name = self.raw_name.replace('.', '_')
        self.rgid = self.name
        self.dirpath = verify_dir(join(self.parent_project.final_dir, self.name))
        if not verify_dir(self.dirpath, silent=silent):
            critical(f'Sample "{self.name}" specified in bcbio YAML is not found in the final directory '
                     f'{self.parent_project.final_dir}. Please check consistency between the YAML '
                     f'{self.parent_project.bcbio_yaml_fpath} and the directories in `final`: '
                     f'to every "description" value in YAML, there should be a corresponding folder with the '
                     f'same name in `final`. You can use `-e` option to exclude samples (comma-separated) '
                     f'from consideration, if you are sure that missing folders are expected.')

        self.bam = self.find_bam(silent=silent)

        if self.is_rnaseq:
            gene_counts = adjust_path(join(self.dirpath, self.get_name_for_files() + '-ready.counts'))
            if isfile(gene_counts) and verify_file(gene_counts):
                self.counts_file = gene_counts
            else:
                if not silent: warn('Counts for ' + self.name + ' not found')
        else:
            if variantcallers_data:
                self._set_variant_callers(variantcallers_data, ensemble=ensemble)
            else:
                if not silent: warn('No variant callers set in config, skipping finding VCF files')
Example #2
def _add_to_ngb(work_dir, project_name, bam_by_sample, genome_build, bed_file,
                p_view):
    if is_us() or is_uk():
        try:
            from az.ngb import add_bcbio_project_to_ngb, add_data_to_ngb, add_file_to_ngb
        except ImportError:
            log.warn(
                'To expose projects to NGB, install NGS Reporting with '
                '`conda install -c vladsaveliev ngs_reporting`'
            )
        else:
            log.info('Exposing project to NGB...')
            try:
                dataset = project_name + '_Fingerprints'
                add_data_to_ngb(work_dir,
                                p_view,
                                bam_by_sample,
                                dict(),
                                dataset,
                                bed_file=bed_file,
                                genome=genome_build)
                add_file_to_ngb(work_dir,
                                get_dbsnp(genome_build),
                                genome_build,
                                dataset,
                                dataset,
                                skip_if_added=True)
            except Exception:
                traceback.print_exc()
                log.err('Error: cannot export to NGB')
            log.info('*' * 70)
Example #3
    def find_somatic_vcf(self, silent=False, caller=None):
        caller = caller or self.somatic_caller
        if not caller:
            if not silent:
                warn(f'Batch {self.name} has no variant caller info assigned, skipping finding somatic VCF')
            return

        # in datestamp. cwl-bcbio writes there
        vcf_cwl_fpath_gz = adjust_path(join(self.parent_project.date_dir, self.name + '-' + caller + '.vcf.gz'))
        # in datestamp. bcbio before 1.1.6
        vcf_old_fpath_gz = adjust_path(join(self.parent_project.date_dir, self.name + '-' + caller + '-annotated.vcf.gz'))
        # in sample dir. starting from bcbio 1.1.6, ~ Dec 2019
        vcf_fpath_gz = adjust_path(join(self.tumors[0].dirpath, self.tumors[0].name + '-' + caller + '.vcf.gz'))

        if isfile(vcf_fpath_gz):
            verify_file(vcf_fpath_gz, is_critical=True)
            if not silent: info(f'Found somatic VCF in <final-dir>/<tumor-name>/<tumor-name>-{caller}.vcf.gz (conventional bcbio): ' + vcf_fpath_gz)
            self.somatic_vcf = vcf_fpath_gz

        elif isfile(vcf_old_fpath_gz):
            verify_file(vcf_old_fpath_gz, is_critical=True)
            if not silent: info(f'Found somatic VCF in <date-dir>/<batch>-{caller}-annotated.vcf.gz (bcbio < v1.1.6): ' + vcf_old_fpath_gz)
            self.somatic_vcf = vcf_old_fpath_gz

        elif isfile(vcf_cwl_fpath_gz):
            verify_file(vcf_cwl_fpath_gz, is_critical=True)
            if not silent: info(f'Found somatic VCF in project/<batch>-{caller}.vcf.gz (CWL bcbio): ' + vcf_cwl_fpath_gz)
            self.somatic_vcf = vcf_cwl_fpath_gz

        elif not silent:
            warn(f'Could not find somatic variants files for batch {self.name}, caller {caller} neither as '
                 f'{self.parent_project.final_dir}/<tumor-name>/<tumor-name>-{caller}.vcf.gz (conventional bcbio), nor as '
                 f'{self.parent_project.date_dir}/<batch>-{caller}-annotated.vcf.gz (bcbio < v1.1.6), nor as '
                 f'project/<batch>-{caller}.vcf.gz (CWL bcbio).')
Example #4
def run_simple(cmd, silent=False):
    """Run the provided command, logging details and checking for errors.
    """
    # cmd = _normalize_cmd_args(cmd)
    if not silent:
        warn(' '.join(str(x) for x in cmd) if not isinstance(cmd, str) else cmd)
    subprocess.check_call(cmd, shell=True, executable=find_bash())
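run_simple leans on two project helpers, warn and find_bash, that are not shown here. A minimal self-contained usage sketch, with hypothetical stand-ins for both:

import shutil
import subprocess
import sys

def warn(msg):
    # Stand-in for the project's warn(): log to stderr.
    print(msg, file=sys.stderr)

def find_bash():
    # Stand-in: locate a bash executable on PATH, falling back to /bin/bash.
    return shutil.which('bash') or '/bin/bash'

def run_simple(cmd, silent=False):
    """Run the provided command, logging details and checking for errors."""
    if not silent:
        warn(' '.join(str(x) for x in cmd) if not isinstance(cmd, str) else cmd)
    subprocess.check_call(cmd, shell=True, executable=find_bash())

run_simple('echo hello')  # logs the command, then raises CalledProcessError on non-zero exit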
Example #5
    def find_germline_vcf(self, silent=False, caller=None):
        caller = caller or self.germline_caller
        if not caller:
            if not silent:
                warn(f'Batch {self.name} has no variant caller info assigned, skipping finding germline VCF')
            return
        assert caller

        # in datestamp dir. starting from bcbio 1.1.6, ~ Dec 2019
        vcf_fpath_gz = adjust_path(join(self.parent_project.date_dir,
                f'{self.normals[0].name}-germline-{caller}.vcf.gz'))
        # in datestamp. bcbio before 1.1.6
        vcf_old_fpath_gz = adjust_path(join(self.parent_project.date_dir,
                f'{self.normals[0].name}-germline-{caller}-annotated.vcf.gz'))

        if isfile(vcf_fpath_gz):
            verify_file(vcf_fpath_gz, is_critical=True)
            if not silent: info(f'Found germline VCF in <date-dir>/<normal-name>-germline-{caller}.vcf.gz: ' + vcf_fpath_gz)
            self.germline_vcf = vcf_fpath_gz

        elif isfile(vcf_old_fpath_gz):
            verify_file(vcf_old_fpath_gz, is_critical=True)
            if not silent: info(f'Found germline VCF in <date-dir>/<normal-name>-germline-{caller}-annotated.vcf.gz (bcbio < v1.1.6): ' + vcf_old_fpath_gz)
            self.germline_vcf = vcf_old_fpath_gz

        elif not silent:
            warn(f'Could not find germline variants files for batch {self.name}, caller {caller} neither as '
                 f'<date-dir>/<normal-name>-germline-{caller}.vcf.gz, nor as '
                 f'<date-dir>/<normal-name>-germline-{caller}-annotated.vcf.gz (bcbio < v1.1.6)')
Example #6
    def find_sv_vcf(self, silent=False, caller=None):
        caller = caller or self.sv_caller

        sv_prio   = join(self.tumors[0].dirpath, f'{self.name}-sv-prioritize-{caller}.vcf.gz')
        sv_unprio = join(self.tumors[0].dirpath, f'{self.name}-{caller}.vcf.gz')
        # CWL?
        sv_cwl_prio   = join(self.parent_project.date_dir,
                             f'{self.tumors[0].name}-{caller}-prioritized.vcf.gz')
        sv_cwl_unprio = join(self.parent_project.date_dir,
                             f'{self.tumors[0].name}-{caller}.vcf.gz')

        if isfile(sv_prio):
            verify_file(sv_prio, is_critical=True)
            if not silent: info(f'Found SV VCF in <tumor>/<batch>-sv-prioritize-{caller}.vcf.gz: ' + sv_prio)
            self.sv_vcf = sv_prio

        elif isfile(sv_unprio):
            verify_file(sv_unprio, is_critical=True)
            if not silent: info(f'Found SV VCF in <tumor>/<batch>-{caller}.vcf.gz: ' + sv_unprio)
            self.sv_vcf = sv_unprio

        elif isfile(sv_cwl_prio):
            verify_file(sv_cwl_prio, is_critical=True)
            if not silent: info(f'Found SV VCF in <date-dir>/<tumor-name>-{caller}-prioritized.vcf.gz: ' + sv_cwl_prio)
            self.sv_vcf = sv_cwl_prio

        elif isfile(sv_cwl_unprio):
            verify_file(sv_cwl_unprio, is_critical=True)
            if not silent: info(f'Found SV VCF in <date-dir>/<tumor-name>-{caller}.vcf.gz: ' + sv_cwl_unprio)
            self.sv_vcf = sv_cwl_unprio

        elif not silent:
            warn(f'Could not find SV VCF file for batch {self.name}, caller {caller} neither under sample folder as '
                 f'<tumor>/<batch>(-sv-prioritize)-{caller}.vcf.gz (conventional bcbio), '
                 f'nor in the project folder as project/<tumor>-{caller}(-prioritized).vcf.gz (CWL bcbio).')
Example #7
    def _set_name_and_paths(self, name, variantcallers_data, ensemble=False, silent=False):
        self.raw_name = name
        self.name = self.raw_name.replace('.', '_')
        self.dirpath = verify_dir(join(self.bcbio_project.final_dir, self.name))
        if not verify_dir(self.dirpath, silent=silent):
            if not silent:
                critical(f'Sample "{self.name}" specified in bcbio YAML is not found in the final directory '
                         f'{self.bcbio_project.final_dir}. Please check consistency between the YAML '
                         f'{self.bcbio_project.bcbio_yaml_fpath} and the directories in `final`: '
                         f'to every "description" value in YAML, there should be a corresponding folder with the '
                         f'same name in `final`. You can use `-e` option to exclude samples (comma-separated) '
                         f'from consideration, if you are sure that missing folders are expected.')
            else:
                return False
        self.var_dirpath = join(self.dirpath, BcbioProject.var_dir)

        self.bam = self.find_bam(silent=silent)

        if self.is_rnaseq:
            gene_counts = adjust_path(join(self.dirpath, self.get_name_for_files() + '-ready.counts'))
            if isfile(gene_counts) and verify_file(gene_counts):
                self.counts_file = gene_counts
            else:
                if not silent: warn('Counts for ' + self.name + ' not found')
        else:
            if variantcallers_data:
                self._set_variant_files(variantcallers_data, ensemble=ensemble)
            else:
                if not silent: warn('No variant callers set in config, skipping finding VCF files')
        return True
Example #8
    def find_bam(self, silent=False):
        name = self.get_name_for_files()

        to_try = [
            '-ready.cram',
            '-ready.bam',
            '-sort.bam',
        ]
        for ext in to_try:
            fpath = adjust_path(join(self.dirpath, name + ext))
            if verify_file(fpath):
                return fpath

        input_file = self.sample_info['files']
        if not isinstance(input_file, str):
            input_file = input_file[0]
        if isinstance(input_file, str) and input_file.endswith('.bam'):
            debug('Bcbio was run from BAM input')
            if not input_file.startswith('/'):
                input_file = abspath(join(self.bcbio_project.work_dir, input_file))
            if verify_file(input_file):
                debug('Using BAM file from input YAML ' + input_file)
                return input_file
            else:
                debug('Input BAM file for sample ' + self.name + ' in YAML ' + input_file + ' does not exist')

        if not silent:
            warn('No BAM or CRAM file found for ' + self.name)
Example #9
    def find_bam(self, silent=False):
        name = self.get_name_for_files()

        to_try = [
            '-ready.bam',
            '-ready.cram',
            '-sort.bam',
        ]
        for ext in to_try:
            fpath = adjust_path(join(self.dirpath, name + ext))
            if verify_file(fpath):
                return fpath

        input_file = self.sample_info['files']
        if not isinstance(input_file, str):
            input_file = input_file[0]
        if isinstance(input_file, str) and input_file.endswith('.bam'):
            debug('Bcbio was run from BAM input')
            if not input_file.startswith('/'):
                input_file = abspath(join(self.parent_project.work_dir, input_file))
            if verify_file(input_file):
                debug('Using BAM file from input YAML ' + input_file)
                return input_file
            else:
                debug('Input BAM file for sample ' + self.name + ' in YAML ' + input_file + ' does not exist')

        if not silent:
            warn('No BAM or CRAM file found for ' + self.name)
Example #10
def pair_dragen_directories(paths):
    # DRAGEN tumor/normal and normal directories are paired on the basis of the normal sample
    # name.
    #
    # Tumor and normal sample names are extracted from the BAM header. Specifically, the BAM
    # sample name is retrieved from the 'SM' (sample) field of the '@RG' (read group) header line.
    #
    # Tumor or normal identity of a sample is inferred from the BAM filename: if a BAM filename
    # ends with the '_tumor.bam' suffix, it and its sample name are set as the tumor; otherwise
    # they are set as the normal sample.
    #
    # The subject identifier is taken from the DRAGEN output directory name.
    #
    # Assumes a one-to-one pairing of DRAGEN tumor/normal and normal output directories, i.e. no
    # multiple tumor/normal runs map to a single normal run.

    # Group paths by normal sample name so that normal and tumor/normal runs are placed together
    paths_sorted = dict()
    for path in paths:
        dir_type = 'tumor_normal_run' if is_dragen_tumor_normal_directory(path) else 'normal_run'
        samples = get_samples_from_dragen_dir_bams(path)
        # Ensure we have found normal names
        if 'normal' not in samples:
            critical(
                f'Could not find normal sample name for DRAGEN directory {path}'
            )
        # Group by normal sample name; record path, output prefix, and subject ID
        sample_normal = samples['normal']
        if sample_normal not in paths_sorted:
            paths_sorted[sample_normal] = dict()
        assert dir_type not in paths_sorted[sample_normal]
        paths_sorted[sample_normal][dir_type] = samples
        paths_sorted[sample_normal][dir_type]['path'] = path
        paths_sorted[sample_normal][dir_type]['prefix'] = get_dragen_output_prefix(path)
        paths_sorted[sample_normal][dir_type]['subject_id'] = get_subject_id_from_dragen_dir(path)

    # Differentiate paired and unpaired paths
    paths_unpaired = list()
    paths_paired = list()
    for paths in paths_sorted.values():
        if 'normal_run' in paths and 'tumor_normal_run' in paths:
            # Ensure we have collected only one subject id for this set of inputs
            assert len({d['subject_id'] for d in paths.values()}) == 1
            paths['subject_id'] = paths['normal_run']['subject_id']
            paths_paired.append(paths)
        else:
            for dir_type, data in paths.items():
                paths_unpaired.append((dir_type, data['path']))
    # Emit warning for unpaired paths
    if paths_unpaired:
        paths_unpaired_strs = list()
        for dir_type, path in paths_unpaired:
            paths_unpaired_strs.append(f'{dir_type}: {path}')
        paths_unpaired_str = '\n\t'.join(paths_unpaired_strs)
        warn(f'could not pair DRAGEN directories:\n\t{paths_unpaired_str}')
    return paths_paired
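The function returns one dict per paired subject, keyed by run type. A hypothetical illustration of the shape (paths, prefixes, and sample names below are invented):

# Hypothetical shape of one element of the list returned by pair_dragen_directories.
paired_example = {
    'subject_id': 'SBJ00001',                    # shared across both runs
    'normal_run': {
        'normal': 'SAMPLE_N',                    # from the BAM @RG SM field
        'path': '/output/SBJ00001_normal',
        'prefix': 'SBJ00001_normal',
        'subject_id': 'SBJ00001',
    },
    'tumor_normal_run': {
        'normal': 'SAMPLE_N',                    # the pairing key
        'tumor': 'SAMPLE_T',                     # from the *_tumor.bam file
        'path': '/output/SBJ00001_tumor_normal',
        'prefix': 'SBJ00001_tumor_normal',
        'subject_id': 'SBJ00001',
    },
}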
Example #11
def run_simple(cmd, env_vars=None, silent=False):
    """Run the provided command, logging details and checking for errors.
    """
    env = _get_env(env_vars)
    cmd, shell_arg, executable_arg = _normalize_cmd_args(cmd)
    if not silent:
        warn(' '.join(str(x) for x in cmd) if not isinstance(cmd, str) else cmd)
    subprocess.check_call(cmd, shell=shell_arg, executable=executable_arg, env=env)
Example #12
    def find_vcf_file_from_sample_dir(sample, silent=False, caller=None):
        caller = caller or sample.bcbio_project.somatic_caller
        vcf_fname = sample.get_name_for_files() + '-' + caller + '.vcf'

        sample_var_dirpath = join(sample.dirpath, 'var')
        vcf_fpath_gz = adjust_path(join(sample.dirpath, vcf_fname + '.gz'))  # in sample dir
        var_vcf_fpath_gz = adjust_path(join(sample_var_dirpath, vcf_fname + '.gz'))  # in var/
        var_raw_vcf_fpath_gz = adjust_path(join(sample_var_dirpath, 'raw', vcf_fname + '.gz'))  # in var/raw/
        vcf_fpath = adjust_path(join(sample.dirpath, vcf_fname))  # in sample dir
        var_vcf_fpath = adjust_path(join(sample_var_dirpath, vcf_fname))  # in var/
        var_raw_vcf_fpath = adjust_path(join(sample_var_dirpath, 'raw', vcf_fname))  # in var/raw/

        if isfile(vcf_fpath_gz):
            verify_file(vcf_fpath_gz, is_critical=True)
            if not silent: info('Found VCF ' + vcf_fpath_gz)
            return vcf_fpath_gz
        else:
            debug('Not found VCF ' + vcf_fpath_gz)

        if isfile(var_vcf_fpath_gz):
            verify_file(var_vcf_fpath_gz, is_critical=True)
            if not silent: info('Found VCF in the var/ dir ' + var_vcf_fpath_gz)
            return var_vcf_fpath_gz
        else:
            debug('Not found VCF in the var/ dir ' + var_vcf_fpath_gz)

        if isfile(var_raw_vcf_fpath_gz):
            verify_file(var_raw_vcf_fpath_gz, is_critical=True)
            if not silent: info('Found VCF in the var/raw/ dir ' + var_raw_vcf_fpath_gz)
            return var_raw_vcf_fpath_gz
        else:
            debug('Not found VCF in the var/raw/ dir ' + var_raw_vcf_fpath_gz)

        if isfile(vcf_fpath):
            verify_file(vcf_fpath, is_critical=True)
            if not silent: info('Found uncompressed VCF ' + vcf_fpath)
            return vcf_fpath
        else:
            debug('Not found uncompressed VCF ' + vcf_fpath)

        if isfile(var_vcf_fpath):
            verify_file(var_vcf_fpath, is_critical=True)
            if not silent: info('Found uncompressed VCF in the var/ dir ' + var_vcf_fpath)
            return var_vcf_fpath
        else:
            debug('Not found VCF in the var/ dir ' + var_vcf_fpath)

        if isfile(var_raw_vcf_fpath):
            verify_file(var_raw_vcf_fpath, is_critical=True)
            if not silent: info('Found uncompressed VCF in the var/raw/ dir ' + var_raw_vcf_fpath)
            return var_raw_vcf_fpath
        else:
            debug('Not found VCF in the var/raw/ dir ' + var_raw_vcf_fpath)

        if not silent:
            warn('Warning: no VCF found for ' + sample.name + ' (' + caller + '), neither gzipped nor '
                'uncompressed, inside or outside the var directory. Phenotype is ' + str(sample.phenotype))
        return None
Example #13
def determine_sex(work_dir, bam_fpath, avg_depth, genome, target_bed=None):
    debug()
    debug('Determining sex')
    pybedtools.set_tempdir(safe_mkdir(join(work_dir, 'pybedtools_tmp')))

    male_bed = None
    for k in chry_key_regions_by_genome:
        if k in genome:
            male_bed = BedTool(chry_key_regions_by_genome.get(k))
            break
    if not male_bed:
        warn('Warning: no male key regions for ' + genome + ', cannot identify sex')
        return None

    male_area_size = get_total_bed_size(male_bed)
    debug('Male region total size: ' + str(male_area_size))

    if target_bed:
        target_male_bed = join(work_dir, 'male.bed')
        with file_transaction(work_dir, target_male_bed) as tx:
            BedTool(target_bed).intersect(male_bed).merge().saveas(tx)
        target_male_area_size = get_total_bed_size(target_male_bed)
        if target_male_area_size == 0:
            debug('The male non-PAR region does not overlap with the capture target - cannot determine sex.')
            return None
        male_bed = target_male_bed
    else:
        debug('WGS, determining sex based on chrY key regions coverage.')

    info('Detecting sex by comparing the Y chromosome key regions coverage and average coverage depth.')
    if not bam_fpath:
        critical('BAM file is required.')
    index_bam(bam_fpath)

    chry_mean_coverage = _calc_mean_coverage(work_dir, male_bed, bam_fpath, 1)
    debug('Y key regions average depth: ' + str(chry_mean_coverage))
    avg_depth = float(avg_depth)
    debug('Sample average depth: ' + str(avg_depth))
    if avg_depth < AVG_DEPTH_THRESHOLD_TO_DETERMINE_SEX:
        debug('Sample average depth is too low (less than ' + str(AVG_DEPTH_THRESHOLD_TO_DETERMINE_SEX) +
             ') - cannot determine sex')
        return None

    if chry_mean_coverage == 0:
        debug('Y depth is 0 - it\'s female')
        sex = 'F'
    else:
        factor = avg_depth / chry_mean_coverage
        debug('Sample depth / Y depth = ' + str(factor))
        if factor > FEMALE_Y_COVERAGE_FACTOR:  # if mean target coverage much higher than chrY coverage
            debug('Sample depth is more than ' + str(FEMALE_Y_COVERAGE_FACTOR) + ' times higher than Y depth - it\'s female')
            sex = 'F'
        else:
            debug('Sample depth is not more than ' + str(FEMALE_Y_COVERAGE_FACTOR) + ' times higher than Y depth - it\'s male')
            sex = 'M'
    debug('Sex is ' + sex)
    debug()
    return sex
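The sex call itself reduces to two threshold comparisons. A minimal self-contained sketch of that decision rule, with illustrative values standing in for the two module-level constants:

# Hypothetical values, for illustration only; the real constants live in the module.
FEMALE_Y_COVERAGE_FACTOR = 10.0
AVG_DEPTH_THRESHOLD_TO_DETERMINE_SEX = 5.0

def classify_sex(avg_depth, chry_mean_coverage):
    # Core comparison from determine_sex(), stripped of BED/BAM handling.
    if avg_depth < AVG_DEPTH_THRESHOLD_TO_DETERMINE_SEX:
        return None                            # too shallow to make a call
    if chry_mean_coverage == 0:
        return 'F'                             # no Y coverage at all
    factor = avg_depth / chry_mean_coverage
    return 'F' if factor > FEMALE_Y_COVERAGE_FACTOR else 'M'

assert classify_sex(30.0, 0.0) == 'F'
assert classify_sex(30.0, 25.0) == 'M'         # 30/25 = 1.2, below the factor
assert classify_sex(1.0, 0.5) is None          # below the average-depth threshold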
Example #14
def tmpdir():
    dirpath = make_tmpdir()
    try:
        yield dirpath
    finally:
        try:
            shutil.rmtree(dirpath)
        except OSError:
            warn('Warning: cannot clean up temporary dir ' + dirpath)
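tmpdir() yields, so it is presumably decorated with contextlib.contextmanager in the source module. A runnable self-contained sketch, assuming make_tmpdir is a thin wrapper over tempfile.mkdtemp:

import shutil
import tempfile
from contextlib import contextmanager

def warn(msg):
    # Stand-in for the project's warn() logger.
    print('WARNING:', msg)

def make_tmpdir():
    # Assumed to wrap tempfile.mkdtemp().
    return tempfile.mkdtemp()

@contextmanager
def tmpdir():
    dirpath = make_tmpdir()
    try:
        yield dirpath
    finally:
        try:
            shutil.rmtree(dirpath)
        except OSError:
            warn('Warning: cannot clean up temporary dir ' + dirpath)

with tmpdir() as d:
    print('working in', d)  # the directory is removed (best effort) on exit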
Example #15
    def find_qc_files(self, dst_dir, exclude_files=None, include_files=None):
        """
        Parses bcbio MultiQC file list and collects all QC files belonging to this batch

        :param dst_dir: destination directory where the QC files will be copied to
        :param exclude_files: skip files matching these patterns
        :param include_files: only include files matching these patterns
        :return: list of file paths copied into `dst_dir`
        """

        mq_dir = join(self.parent_project.date_dir, 'multiqc')
        mq_filelist = join(mq_dir, 'list_files_final.txt')
        verify_file(mq_filelist, is_critical=True)

        # Cromwell?
        cwl_targz = join(mq_dir, 'multiqc-inputs.tar.gz')
        tar_f_by_fp = dict()
        if isfile(cwl_targz):
            info(f'Found CWL MultiQC output {cwl_targz}, extracting required QC files from the archive')
            tar = tarfile.open(cwl_targz)
            for member in tar.getmembers():
                rel_fp = member.name
                if 'call-multiqc_summary/execution/qc/multiqc/' in rel_fp:
                    rel_fp = rel_fp.split('call-multiqc_summary/execution/qc/multiqc/')[1]
                tar_f_by_fp[rel_fp] = tar.extractfile(member)

        qc_files_not_found = []
        qc_files_found = []
        with open(mq_filelist) as inp:
            for fp in [l.strip() for l in inp if l.strip()]:
                if fp == 'trimmed' or fp.endswith('/trimmed'):
                    continue  # back-compatibility with bcbio
                if exclude_files:
                    if isinstance(exclude_files, str):
                        exclude_files = [exclude_files]
                    if any(re.search(ptn, fp) for ptn in exclude_files):
                        continue
                if include_files:
                    if isinstance(include_files, str):
                        include_files = [include_files]
                    if not any(re.search(ptn, fp) for ptn in include_files):
                        continue

                new_fp = _extract_qc_file(fp, dst_dir, self.parent_project.final_dir, tar_f_by_fp)
                if not new_fp:
                    qc_files_not_found.append(fp)
                    continue
                else:
                    qc_files_found.append(new_fp)

        if qc_files_not_found:
            warn('-')
            warn(f'Some QC files from list {mq_filelist} were not found:' +
                ''.join('\n  ' + fpath for fpath in qc_files_not_found))
        return qc_files_found
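The include/exclude filtering applies each pattern with re.search. A self-contained restatement of just that rule (the function name here is mine):

import re

def keep_path(fp, exclude_files=None, include_files=None):
    # Mirrors find_qc_files' include/exclude pattern rules for a single path.
    if isinstance(exclude_files, str):
        exclude_files = [exclude_files]
    if isinstance(include_files, str):
        include_files = [include_files]
    if exclude_files and any(re.search(ptn, fp) for ptn in exclude_files):
        return False
    if include_files and not any(re.search(ptn, fp) for ptn in include_files):
        return False
    return True

assert keep_path('qc/sampleA/fastqc.txt', exclude_files='sampleB')
assert not keep_path('qc/sampleB/fastqc.txt', exclude_files='sampleB')
assert not keep_path('qc/sampleA/fastqc.txt', include_files='coverage')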
Example #16
def get_dbsnp_multi_mafs(genome_cfg):
    if 'dbsnp_multi_mafs' not in genome_cfg:
        warn(
            'Warning: dbsnp_multi_mafs not provided in the system configuration file for the genome.'
        )
        return None
    return verify_file(
        genome_cfg['dbsnp_multi_mafs'],
        is_critical=True,
        description='dbSNP multi mafs file in system configuration file')
Example #17
    def update_batches(self, samples, silent=False):
        batch_by_name = {bn: BcbioBatch(name=bn, parent_project=self)
                         for bn in list(set([b for s in samples for b in s.batch_names]))}

        for sample in samples:
            for bn in sample.batch_names:
                batch_by_name[bn].name = bn
                sample.batches.append(batch_by_name[bn])
                if sample.phenotype == 'normal':
                    batch_by_name[bn].add_normal(sample)
                else:
                    batch_by_name[bn].add_tumor(sample)

        # Removing batches that do not have matching tumor samples
        batch_by_name = {bn: b for bn, b in batch_by_name.items() if b.tumors}

        # for batch in batch_by_name.values():
        #     if batch.normal and not batch.tumor:
        #         if not silent: info('Batch ' + batch.name + ' contains only normal, treating sample ' + batch.normal.name + ' as tumor')
        #         batch.normal.phenotype = 'tumor'
        #         batch.normal.batch = batch
        #         batch.tumor = batch.normal
        #         batch.normal = None

        # setting up batch properties
        for b in batch_by_name.values():
            for t in b.tumors:
                t.normal_matches = b.normals

        # setting variant caller names for batches
        for b in batch_by_name.values():
            if b.tumors[0].somatic_caller is None:
                if not silent:
                    warn(f'Tumor sample in batch {b.name} doesn\'t have somatic variant caller info, '
                         f'skipping assigning somatic variant caller to batch {b.name}')
            else:
                b.somatic_caller = b.tumors[0].somatic_caller
            if b.normals:
                if b.normals[0].germline_caller is None:
                    if not silent:
                        warn(f'Normal sample in batch {b.name} doesn\'t have germline variant caller info, '
                             f'skipping assigning germline variant caller to batch {b.name}')
                else:
                    b.germline_caller = b.normals[0].germline_caller

        # finding vcfs
        if not self.is_rnaseq:
            for b in batch_by_name.values():
                if b.tumors:
                    b.find_somatic_vcf(silent=silent)
                    b.find_sv_vcf(silent=silent)
                if b.normals:
                    b.find_germline_vcf(silent=silent)

        return batch_by_name
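A self-contained sketch of the grouping step at the top of update_batches; the Batch class and sample tuples here are simplified stand-ins for BcbioBatch and BcbioSample:

class Batch:
    def __init__(self, name):
        self.name = name
        self.tumors = []
        self.normals = []

# One batch object per unique batch name, samples attached by phenotype.
samples = [('sampleT', 'b1', 'tumor'), ('sampleN', 'b1', 'normal')]
batch_by_name = {}
for sname, bname, phenotype in samples:
    b = batch_by_name.setdefault(bname, Batch(bname))
    (b.normals if phenotype == 'normal' else b.tumors).append(sname)

assert batch_by_name['b1'].tumors == ['sampleT']
assert batch_by_name['b1'].normals == ['sampleN']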
Example #18
def workdir(cnf):
    if cnf.work_dir:
        verify_dir(cnf.work_dir, is_critical=True)
        yield cnf.work_dir
    else:
        cnf.work_dir = make_tmpdir()
        yield cnf.work_dir
        try:
            shutil.rmtree(cnf.work_dir)
        except OSError:
            warn('Warning: cannot clean up temporary dir ' + cnf.work_dir)
Example #19
    def calc_genomic_bp_pos(self):
        genomic_coord, is_in_intron = FusionSide.offset_to_genome_coord(
            self.trx, self.bp_offset)
        if genomic_coord is None:
            logger.critical(
                f'  Error: could not convert transcript {self.trx.id} offset {self.bp_offset} to genomic coordinate'
            )
            return False

        if genomic_coord == -1:
            logger.warn(
                f'  Fusion in {self} takes the whole transcript {self.trx.id}. That\'s suspicious, so we are skipping it.'
            )
            return False

        self.bp_genomic_pos = genomic_coord
        self.bp_is_in_intron = is_in_intron
        return True
Example #20
def load_filt_cfg(filt_cnf_fpath=None,
                  target_type=None,
                  vardict_min_freq=None,
                  is_wgs=False):
    """
    Specify either target_type, or vardict_min_freq and is_wgs
    """
    if not filt_cnf_fpath:
        if not target_type:
            if vardict_min_freq is not None:
                if vardict_min_freq <= 0.005:
                    info('Filtering config: min_allele_fraction=' +
                         str(vardict_min_freq) + ' which is less than 0.005, '
                         'setting config for panel')
                    target_type = 'panel'
                elif is_wgs is None:  # coverage interval is not defined
                    warn(
                        'Coverage interval is not defined, skipping variant filtering'
                    )
                    return None
                elif is_wgs is True:
                    target_type = 'genome'
                    info('Filtering config: setting config for genome')
                else:
                    target_type = 'exome'
                    info('Filtering config: min_allele_fraction=' +
                         str(vardict_min_freq) +
                         ' which is higher than 0.005, '
                         'setting config for exome')
            else:
                target_type = 'exome'
                info(
                    'Neither min freq nor filt config was provided, using settings for exome'
                )
        assert target_type in filt_cnf_fpaths, \
            'filt_cnf_fpath=' + str(filt_cnf_fpath) + '; ' + str(target_type) + ' not in ' + str(filt_cnf_fpaths.keys())
        filt_cnf_fpath = filt_cnf_fpaths[target_type]
    d = load_yaml_config(filt_cnf_fpath)
    if d.get('variant_filtering') and isinstance(d.get('variant_filtering'),
                                                 dict):
        d = d.get('variant_filtering', dict())
    d = fill_dict_from_defaults(d, filt_info_defaults)
    d['filt_cnf_fpath'] = filt_cnf_fpath
    return d
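The profile selection above is a small decision table. A self-contained sketch of just that fallback logic (the function name is mine; thresholds are taken from the code above):

def choose_target_type(vardict_min_freq, is_wgs):
    # Mirror load_filt_cfg's fallback logic for picking a filtering profile.
    if vardict_min_freq is not None:
        if vardict_min_freq <= 0.005:
            return 'panel'
        if is_wgs is None:
            return None        # coverage interval undefined: skip filtering
        return 'genome' if is_wgs else 'exome'
    return 'exome'             # neither min freq nor config given

assert choose_target_type(0.001, False) == 'panel'
assert choose_target_type(0.01, True) == 'genome'
assert choose_target_type(None, None) == 'exome'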
Example #21
def main():
    options = [
        (['-g', '--genome'],
         dict(
             dest='genome',
             help='Genome build. Accepted values: ' +
             ', '.join(ebl.SUPPORTED_GENOMES),
         )),
        (['-c', '--canonical'],
         dict(
             dest='canonical',
             action='store_true',
             help='Use canonical only',
         )),
    ]
    parser = OptionParser()
    for args, kwargs in options:
        parser.add_option(*args, **kwargs)
    opts, args = parser.parse_args()

    if not opts.genome:
        logger.critical(
            'Error: please specify genome build name with -g (e.g. `-g hg19`)'
        )
    genome = opts.genome

    logger.debug('Getting features from storage')
    features_bed = ebl.get_all_features(genome)
    if features_bed is None:
        logger.critical('Genome ' + genome + ' is not supported. Supported: ' +
                        ', '.join(ebl.SUPPORTED_GENOMES))

    logger.warn('Extracting features from Ensembl GTF')
    features_bed = features_bed.filter(
        lambda x: x[ebl.BedCols.FEATURE] == 'CDS')
    if opts.canonical:
        features_bed = features_bed.filter(
            ebl.get_only_canonical_filter(genome))

    logger.warn('Saving CDS regions...')
    output_fpath = adjust_path(
        join(dirname(__file__), pardir, genome, 'bed', 'CDS-canonical.bed'))
    with file_transaction(None, output_fpath) as tx:
        features_bed.cut(range(6)).saveas(tx)
    logger.warn('Done, saved to ' + output_fpath)
Example #22
    def set_samples(self, bcbio_cnf, include_samples=None, exclude_samples=None):
        debug('Reading sample details...')
        exclude_samples = [s.replace('.', '_') for s in exclude_samples] if exclude_samples else None
        include_samples = [s.replace('.', '_') for s in include_samples] if include_samples else None

        # First pass - just to get extra batch IDs that we need to include to have batches consistent
        extra_batches = set()
        all_sample_names = set()
        all_batch_names = set()
        if include_samples:
            for sample_info in bcbio_cnf['details']:
                sname, batch_names = BcbioSample.parse_sample_ids(sample_info)
                all_sample_names.add(sname)
                all_batch_names |= set(batch_names)
                if sname in include_samples:
                    for b in batch_names:
                        if b not in (include_samples or []) and b not in (exclude_samples or []):
                            extra_batches.add(b)

        # Second pass - including/excluding, and creating BcbioSample objects
        for sample_info in bcbio_cnf['details']:
            s = BcbioSample.load_from_sample_info(
                sample_info,
                bcbio_project=self,
                include_samples=include_samples,
                exclude_samples=exclude_samples,
                extra_batches=extra_batches,
                silent=self.silent)
            if s:
                self.samples.append(s)

        if not self.samples:
            if exclude_samples:
                critical(f'Error: no samples left with the exclusion of '
                         f'batch/sample name(s): {", ".join(exclude_samples)}\n'
                         f'Available samples from the YAML file {self.bcbio_yaml_fpath}:\n'
                         f'{", ".join(all_sample_names)}\nbatches: {", ".join(all_batch_names)}')
            if include_samples:
                critical(f'Error: could not find a batch or a sample with the name(s): '
                         f'{", ".join(include_samples)}\n'
                         f'Available samples from the YAML file {self.bcbio_yaml_fpath}:\n'
                         f'{", ".join(all_sample_names)}\nbatches: {", ".join(all_batch_names)}')
            critical(f'Error: could not parse any batch or samples in the bcbio project. '
                     f'Please check the bcbio YAML file: {self.bcbio_yaml_fpath}')

        not_found_samples = [s.name for s in self.samples if not s.bam]
        if not_found_samples:
            if not self.silent: warn(f'Warning: BAM files not found for {len(not_found_samples)}/{len(self.samples)} samples')

        self.samples.sort(key=lambda _s: _s.key_to_sort())
        self.batch_by_name = self.update_batches(self.samples, self.silent)

        def _check_dup_props(prop, is_critical=False):
            _vals = set([s_.__dict__.get(prop) for s_ in self.samples])
            if len(_vals) > 1:
                (critical if is_critical else err)('Got different ' + prop + ' values in samples in ' + self.project_name)
            else:
                self.__dict__[prop] = _vals.pop()
        _check_dup_props('genome_build')
        _check_dup_props('variant_regions_bed')
        _check_dup_props('coverage_bed')
        _check_dup_props('sv_regions_bed')
        _check_dup_props('is_rnaseq')
        _check_dup_props('min_allele_fraction')
        _check_dup_props('is_wgs', is_critical=False)
        _check_dup_props('coverage_interval', is_critical=False)
        if self.is_rnaseq:
            debug('RNAseq')
        elif self.coverage_interval:
            debug('Coverage interval: ' + str(self.coverage_interval))

        for s in self.samples:
            for caller in s.variantcallers:
                self.samples_by_caller[(caller, s.phenotype == 'germline')].append(s)

        debug('Done loading bcbio project ' + self.project_name)
Example #23
def main(genome=None, gtf_path=None, all_transcripts=False, principal=False, only_key_genes=False, gene_list=None,
         biotypes='', features=''):
    out = sys.stdout

    # GTF
    if not gtf_path:
        try:
            from hpc_utils import hpc
        except ImportError:
            critical('GTF file is needed. Either install hpc_utils, or provide GTF with --gtf')
        else:
            if genome == 'GRCh37':
                gtf_path = os.path.join(hpc.get_ref_file(key='pyensembl_data'), 'GRCh37/ensembl75/Homo_sapiens.GRCh37.75.gtf.gz')
            else:
                gtf_path = os.path.join(hpc.get_ref_file(key='pyensembl_data'), 'GRCh38/ensembl95/Homo_sapiens.GRCh38.95.gtf.gz')

    # Genes
    key_genes = None
    if only_key_genes:
        key_genes = get_key_genes_set()
    elif gene_list:
        key_genes = get_genes_from_file(gene_list)

    # Transcripts
    transcripts_by_gid = None
    if not all_transcripts:
        if principal:
            transcripts_by_gid = {
                k: [v] for k, v in
                canon_transcript_per_gene(genome, only_principal=True, use_gene_id=True).items()
            }
        else:
            transcripts_by_gid = canon_transcript_per_gene(genome, use_gene_id=True)

    # Options
    biotypes = biotypes.strip()
    if biotypes:
        biotypes = biotypes.split(',')
    features = features.strip()
    if features:
        features = features.split(',')

    genes_set = set()
    genes_without_canon = set()
    warn(f'Parsing {gtf_path}')
    with open_gzipsafe(gtf_path) as f:
        lines_cnt = 0
        region_cnt = 0
        for l in f:
            if not l.startswith('#') and l.strip():
                lines_cnt += 1
                fields = l.strip().split('\t')
                try:
                    chrom, _, feature, start, end, _, _, _, annotations = fields
                except:
                    warn(f'Cannot read fields {str(fields)}')
                    raise

                if genome.startswith('hg') and not chrom.startswith('chr'):
                    chrom = 'chr' + chrom

                if features:
                    if not any(feature == ft for ft in features):
                        continue

                annotations = {kv.split()[0].strip().strip('"'):
                                   kv.split()[1].strip().strip('"')
                               for kv in annotations.split('; ')}
                gene_name = annotations['gene_name']
                if only_key_genes and gene_name not in key_genes:
                    continue

                if biotypes:
                    biotype = annotations['gene_biotype']
                    if not any(bt == biotype for bt in biotypes):
                        continue

                if transcripts_by_gid:
                    gene_id = annotations['gene_id']
                    transcript_id = annotations['transcript_id']
                    canon_transcript_ids = transcripts_by_gid.get(gene_id)
                    if not canon_transcript_ids:
                        genes_without_canon.add(gene_name)
                        continue
                    if not transcript_id in canon_transcript_ids:
                        continue

                start = int(start) - 1
                end = int(end)
                if end - start >= 3:
                    out.write('\t'.join([chrom, str(start), str(end), gene_name]) + '\n')
                    genes_set.add(gene_name)
                    region_cnt += 1
                    if region_cnt % 10000 == 0:
                        warn(f'Processed {len(genes_set)} genes, written {region_cnt} regions...')

    warn(f'Done. Processed {len(genes_set)} genes, written {region_cnt} regions')
    if genes_without_canon:
        warn(f'No canonical transcript for {len(genes_without_canon)} gene ids')
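The attribute column of a GTF line has the form `key "value"; key "value"; ...`. A minimal demonstration of the dict-comprehension parsing used above, on a made-up line:

annotations_col = 'gene_id "ENSG00000141510"; gene_name "TP53"; gene_biotype "protein_coding"'
annotations = {kv.split()[0].strip().strip('"'):
                   kv.split()[1].strip().strip('"')
               for kv in annotations_col.split('; ')}
assert annotations == {'gene_id': 'ENSG00000141510',
                       'gene_name': 'TP53',
                       'gene_biotype': 'protein_coding'}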
Example #24
    def set_samples(self, bcbio_cnf, exclude_samples=None, include_samples=None):
        debug('Reading sample details...')
        exclude_samples = [s.replace('.', '_') for s in exclude_samples] if exclude_samples else None
        include_samples = [s.replace('.', '_') for s in include_samples] if include_samples else None

        # First pass - just to get extra batch IDs that we need to include to have batches consistent
        extra_batches = set()
        if include_samples:
            for sample_info in bcbio_cnf['details']:
                sname, batch_names = BcbioSample.parse_sample_ids(sample_info)
                if sname in include_samples:
                    for b in batch_names:
                        if b not in (include_samples or []) and b not in (exclude_samples or []):
                            extra_batches.add(b)

        # Second pass - including/excluding, and creating BcbioSample objects
        for sample_info in bcbio_cnf['details']:
            s = BcbioSample.load_from_sample_info(
                sample_info,
                bcbio_project=self,
                exclude_samples=exclude_samples,
                include_samples=include_samples,
                extra_batches=extra_batches,
                silent=self.silent)
            if s:
                self.samples.append(s)

        if not self.samples:
            if exclude_samples:
                critical(f'Error: no samples left with the exclusion of batch/sample name(s): {", ".join(exclude_samples)}. '
                         f'Check the YAML file for available options: {self.bcbio_yaml_fpath}.')
            if include_samples:
                critical(f'Error: could not find a batch or a sample with the name(s): {", ".join(include_samples)}. '
                         f'Check the YAML file for available options: {self.bcbio_yaml_fpath}')
            critical(f'Error: could not parse any batch or samples in the bcbio project. '
                     f'Please check the bcbio YAML file: {self.bcbio_yaml_fpath}')

        not_found_samples = [s.name for s in self.samples if not s.bam]
        if not_found_samples:
            if not self.silent: warn(f'Warning: BAM files not found for {len(not_found_samples)}/{len(self.samples)} samples')

        self.samples.sort(key=lambda _s: _s.key_to_sort())
        self.batch_by_name = self.update_batches(self.samples, self.silent)

        def _check_dup_props(prop, is_critical=False):
            _vals = set([s_.__dict__.get(prop) for s_ in self.samples])
            if len(_vals) > 1:
                (critical if is_critical else err)('Got different ' + prop + ' values in samples in ' + self.project_name)
            else:
                self.__dict__[prop] = _vals.pop()
        _check_dup_props('genome_build')
        _check_dup_props('variant_regions_bed')
        _check_dup_props('coverage_bed')
        _check_dup_props('sv_regions_bed')
        _check_dup_props('is_rnaseq')
        _check_dup_props('min_allele_fraction')
        _check_dup_props('is_wgs', is_critical=False)
        _check_dup_props('coverage_interval', is_critical=False)
        if self.is_rnaseq:
            debug('RNAseq')
        elif self.coverage_interval:
            debug('Coverage interval: ' + str(self.coverage_interval))

        for s in self.samples:
            for caller in s.variantcallers:
                self.samples_by_caller[(caller, s.phenotype == 'germline')].append(s)

        debug('Done loading bcbio project ' + self.project_name)
Example #25
    def find_vcf_file(self, batch_name, silent=False, caller=None):
        caller = caller or self.somatic_caller
        vcf_fname = batch_name + '-' + caller + '.vcf'
        annot_vcf_fname = batch_name + '-' + caller + '-annotated.vcf'

        vcf_annot_fpath_gz = adjust_path(join(self.date_dir, annot_vcf_fname + '.gz'))  # in datestamp
        var_raw_vcf_annot_fpath_gz = adjust_path(join(self.raw_var_dir, annot_vcf_fname + '.gz'))  # in datestamp/var/raw

        vcf_fpath_gz = adjust_path(join(self.date_dir, vcf_fname + '.gz'))  # in datestamp
        var_vcf_fpath_gz = adjust_path(join(self.var_dir, vcf_fname + '.gz'))  # in datestamp/var
        var_raw_vcf_fpath_gz = adjust_path(join(self.raw_var_dir, vcf_fname + '.gz'))  # in datestamp/var/raw

        vcf_fpath = adjust_path(join(self.date_dir, vcf_fname))  # in datestamp
        var_vcf_fpath = adjust_path(join(self.var_dir, vcf_fname))  # in datestamp/var
        var_raw_vcf_fpath = adjust_path(join(self.raw_var_dir, vcf_fname))  # in datestamp/var/raw

        if isfile(vcf_annot_fpath_gz):
            verify_file(vcf_annot_fpath_gz, is_critical=True)
            if not silent: info('Found annotated VCF in the datestamp dir ' + vcf_annot_fpath_gz)
            return vcf_annot_fpath_gz
        else:
            debug('Not found annotated VCF in the datestamp dir ' + vcf_annot_fpath_gz)

        if isfile(var_raw_vcf_annot_fpath_gz):
            verify_file(var_raw_vcf_annot_fpath_gz, is_critical=True)
            if not silent: info('Found annotated VCF in the datestamp/var/raw dir ' + var_raw_vcf_annot_fpath_gz)
            return var_raw_vcf_annot_fpath_gz
        else:
            debug('Not found annotated VCF in the datestamp/var/raw dir ' + var_raw_vcf_annot_fpath_gz)

        if isfile(vcf_fpath_gz):
            verify_file(vcf_fpath_gz, is_critical=True)
            if not silent: info('Found VCF in the datestamp dir ' + vcf_fpath_gz)
            return vcf_fpath_gz
        else:
            debug('Not found VCF in the datestamp dir ' + vcf_fpath_gz)

        if isfile(var_raw_vcf_fpath_gz):
            verify_file(var_raw_vcf_fpath_gz, is_critical=True)
            if not silent: info('Found VCF in the datestamp/var/raw dir ' + var_raw_vcf_fpath_gz)
            return var_raw_vcf_fpath_gz
        else:
            debug('Not found VCF in the datestamp/var/raw dir ' + var_raw_vcf_fpath_gz)

        if isfile(vcf_fpath):
            verify_file(vcf_fpath, is_critical=True)
            if not silent: info('Found uncompressed VCF in the datestamp dir ' + vcf_fpath)
            return vcf_fpath
        else:
            debug('Not found uncompressed VCF in the datestamp dir ' + vcf_fpath)

        if isfile(var_raw_vcf_fpath):
            verify_file(var_raw_vcf_fpath, is_critical=True)
            if not silent: info('Found uncompressed VCF in the datestamp/var/raw dir ' + var_raw_vcf_fpath)
            return var_raw_vcf_fpath
        else:
            debug('Not found uncompressed VCF in the datestamp/var/raw dir ' + var_raw_vcf_fpath)

        if isfile(var_vcf_fpath_gz):
            verify_file(var_vcf_fpath_gz, is_critical=True)
            if not silent: info('Found VCF in the datestamp/var dir ' + var_vcf_fpath_gz)
            return var_vcf_fpath_gz
        else:
            debug('Not found VCF in the datestamp/var dir ' + var_vcf_fpath_gz)

        if isfile(var_vcf_fpath):
            verify_file(var_vcf_fpath, is_critical=True)
            if not silent: info('Found uncompressed VCF in the datestamp/var dir ' + var_vcf_fpath)
            return var_vcf_fpath
        else:
            debug('Not found uncompressed VCF in the datestamp/var dir ' + var_vcf_fpath)

        if not silent:
            warn('Warning: no VCF found for batch ' + batch_name + ', ' + caller + ', neither gzipped nor '
                'uncompressed, in the datestamp directory (including var/ and var/raw/).')
        return None
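The long if/elif ladder above is a priority-ordered existence search. A hedged refactor sketch expressing the same idea as a candidate list (the helper name and example paths are mine):

from os.path import isfile

def find_first_existing(candidates):
    # Return the first (description, path) whose path exists, else (None, None).
    for description, path in candidates:
        if isfile(path):
            return description, path
    return None, None

# Hypothetical priority list mirroring find_vcf_file's search order:
candidates = [
    ('annotated VCF in the datestamp dir', '/final/2020-01-01_proj/batch1-vardict-annotated.vcf.gz'),
    ('VCF in the datestamp dir', '/final/2020-01-01_proj/batch1-vardict.vcf.gz'),
]
desc, path = find_first_existing(candidates)  # (None, None) unless the files exist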
Example #26
def main(prefix,
         output_bedpe,
         output_fasta=None,
         output_json=None,
         support=None,
         ensembl_release=None,
         peptide_flanking_len=None,
         debug=False):

    pizzly_flat_filt_fpath = prefix + '-flat-filtered.tsv'
    pizzly_json_fpath = prefix + '.json'
    input_fasta = prefix + '.fusions.fasta'
    output_bedpe = abspath(output_bedpe)

    logger.init(debug)

    ebl = EnsemblRelease(ensembl_release)

    # Reading filtered tsv
    filt_fusions = set()
    with open(pizzly_flat_filt_fpath) as f:
        for row in csv.DictReader(f, delimiter='\t'):
            filt_fusions.add((row['geneA.name'], row['geneB.name']))

    # Read json
    json_data = {'genes': []}
    with open(pizzly_json_fpath) as f:
        data = json.load(f)
        for g_event in data['genes']:
            gene_a, gene_b = g_event['geneA']['name'], g_event['geneB']['name']
            if (gene_a, gene_b) in filt_fusions:
                json_data['genes'].append(g_event)

    # Read fasta
    fasta_dict = SeqIO.index(input_fasta, 'fasta')

    filt_json_data = {'genes': []}
    filt_fasta_records = []
    filt_event_count = 0
    filt_transcript_event_count = 0

    # Write bedpe
    with open(output_bedpe, 'w') as bedpe_fh:
        bedpe_header = [
            'chr 5p',
            'start 5p',
            'end 5p',
            'chr 3p',
            'start 3p',
            'end 3p',
            'name',
            'tier',
            'strand 5p',
            'strand 3p',
            'support',
            'is canon bndry',
            'inframe',
            'peptide',
            'fusion pos',
            'nt in the break',
            'transcripts',
            'is canon intron dinuc',
        ]
        bedpe_writer = csv.DictWriter(bedpe_fh,
                                      fieldnames=bedpe_header,
                                      delimiter='\t')
        bedpe_writer.writeheader()

        # each g_event: {'geneA', 'geneB', 'paircount', 'splitcount', 'transcripts', 'readpairs'}
        for g_event in json_data['genes']:
            gene_a, gene_b = g_event['geneA']['name'], g_event['geneB']['name']
            logger.info(gene_a + '>>' + gene_b)

            # # first pass to select the longest transcripts
            # def _longest_tx(key):
            #     return max((ebl.transcript_by_id(te[f'transcript{key}']['id']) for te in g_event['transcripts']), key=lambda t: len(t))
            # a_tx = _longest_tx('A')
            # b_tx = _longest_tx('B')
            # print(f'Longest transcriptA: {a_tx.id}, Longest transcriptB: {b_tx.id}')
            # try:
            #     t_event = [te for te in g_event['transcripts'] if te['transcriptA']['id'] == a_tx.id and te['transcriptB']['id'] == b_tx.id][0]
            # except:
            #     print(f"No event with 2 longest transcripts. Available events: {', '.join(te['transcriptA']['id'] +
            #           '>>' + te['transcriptB']['id'] for te in g_event['transcripts'])}")
            #     raise

            filt_g_event = {
                k: v
                for k, v in g_event.items() if k != 'readpairs'
            }
            filt_g_event['transcripts'] = []

            met_event_keys = set()  # collecting to get rid of duplicate transcript events
            met_peptide_keys = set()  # collecting to get rid of duplicate peptides
            bedpe_entries = []
            for t_event in g_event['transcripts']:
                if t_event['support'] < support:
                    continue

                fusion = Fusion.create_from_pizzly_event(ebl, t_event)
                if not fusion:  # not a good transcript
                    continue

                # skipping duplicate events
                k = fusion.side_5p.trx.id, fusion.side_3p.trx.id, fusion.side_5p.bp_offset, fusion.side_3p.bp_offset
                if k in met_event_keys: continue
                met_event_keys.add(k)

                # for writing filtered json
                filt_g_event['transcripts'].append(t_event)
                filt_transcript_event_count += 1

                # writing bedpe
                entry = fusion.to_bedpe(peptide_flanking_len)
                if not entry:
                    continue

                # skipping duplicate peptides
                k = entry['name'], entry['peptide']
                if k in met_peptide_keys: continue
                met_peptide_keys.add(k)

                bedpe_entries.append(entry)

                # for writing filtered fasta
                pizzly_fasta_rec = fasta_dict[t_event['fasta_record']]
                _check_fusion_fasta(pizzly_fasta_rec, fusion)
                filt_fasta_records.append(pizzly_fasta_rec)

                if fusion.peptide:
                    _verify_peptides(pizzly_fasta_rec, fusion,
                                     peptide_flanking_len)

            if not bedpe_entries:
                logger.warn(
                    f'All transcript events filtered out for fusion {gene_a}>>{gene_b}, skipping'
                )
            else:
                filt_json_data['genes'].append(filt_g_event)
                filt_event_count += 1
                for bedpe_entry in bedpe_entries:
                    bedpe_writer.writerow(bedpe_entry)

    # _test_pvac(output_bedpe)

    # Write filtered json
    if output_json:
        with open(output_json, 'w') as f:
            json.dump(filt_json_data, f, indent=4)

    # Write fasta
    if output_fasta:
        SeqIO.write(filt_fasta_records, output_fasta, 'fasta')

    logger.info()
    logger.info(f'Written {filt_transcript_event_count} transcript events '
                f'for {filt_event_count} fusions into bedpe: {output_bedpe}')
Example #27
    def to_bedpe(self, peptide_flanking_len=None):
        bp_genomic_pos_5p, bp_in_intron_5p = self.side_5p.calc_genomic_bp_offset()
        if bp_genomic_pos_5p == -1:
            logger.warn(
                f'  Fusion in {self} takes the whole 5p transcript {self.side_5p.trx.id}. That\'s suspicious, so we are skipping it.'
            )
            return None

        bp_genomic_pos_3p, bp_in_intron_3p = self.side_3p.calc_genomic_bp_offset()
        if bp_genomic_pos_3p == -1:
            logger.warn(
                f'  Fusion in {self} takes the whole 3p transcript {self.side_3p.trx.id}. That\'s suspicious, so we are skipping it.'
            )
            return None

        self.is_canonical_boundary = bp_in_intron_5p and bp_in_intron_3p

        entry = {
            'chr 5p': self.side_5p.trx.contig,
            'start 5p': -1 if self.side_5p.trx.strand == '+' else bp_genomic_pos_5p,
            'end 5p': -1 if self.side_5p.trx.strand == '-' else bp_genomic_pos_5p,
            'chr 3p': self.side_3p.trx.contig,
            'start 3p': bp_genomic_pos_3p if self.side_3p.trx.strand == '+' else -1,
            'end 3p': bp_genomic_pos_3p if self.side_3p.trx.strand == '-' else -1,
            'name': self.side_5p.trx.gene.name + '>>' + self.side_3p.trx.gene.name,
            'tier': self.tier,
            'strand 5p': self.side_5p.trx.strand,
            'strand 3p': self.side_3p.trx.strand,
            'support': self.support,
            'is canon bndry': 'NA',
            'inframe': 'NA',
            'peptide': 'NA',
            'fusion pos': 'NA',
            'nt in the break': 'NA',
            'transcripts': 'NA',
            'is canon intron dinuc': 'NA',
        }
        self.make_peptide(peptide_flanking_len)
        if self.peptide:
            # ENST00000304636|ENST00000317840;ENST00000377795;ENST00000009530|ENST00000353334
            # 5' transcripts                 ;3' transcripts ;3' frameshift transcripts
            trx_line = self.side_5p.trx.transcript_id + ';' + \
                       (self.side_3p.trx.id if self.is_inframe else '') + ';' + \
                       (self.side_3p.trx.id if not self.is_inframe else '')

            entry.update({
                'is canon bndry': '1' if self.is_canonical_boundary else '0',
                'inframe': '1' if self.is_inframe else '0',
                'peptide': self.peptide,
                'fusion pos': self.fusion_offset_in_peptide,
                'nt in the break': self.num_of_nt_in_the_break,
                'transcripts': trx_line,
            })
        # fields += [self.side_a.transcript.transcript_id + ':' + str(len(self.side_a.transcript)),
        #            self.side_a.t_start, self.side_a.t_end]
        # fields += [self.side_b.transcript.transcript_id + ':' + str(len(self.side_b.transcript)),
        #            self.side_b.t_start, self.side_b.t_end]
        return entry
Example #28
def determine_sex(work_dir, bam_fpath, avg_depth, genome, target_bed=None):
    debug()
    debug('Determining sex')
    pybedtools.set_tempdir(safe_mkdir(join(work_dir, 'pybedtools_tmp')))

    male_bed = None
    for k in chry_key_regions_by_genome:
        if k in genome:
            male_bed = BedTool(chry_key_regions_by_genome.get(k))
            break
    if not male_bed:
        warn('No male key regions for ' + genome + ', cannot identify sex')
        return None

    male_area_size = get_total_bed_size(male_bed)
    debug('Male region total size: ' + str(male_area_size))

    if target_bed:
        target_male_bed = join(work_dir, 'male.bed')
        with file_transaction(work_dir, target_male_bed) as tx:
            BedTool(target_bed).intersect(male_bed).merge().saveas(tx)
        target_male_area_size = get_total_bed_size(target_male_bed)
        if target_male_area_size == 0:
            debug('The male non-PAR region does not overlap with the capture target - '
                  'cannot determine sex.')
            return None
        male_bed = target_male_bed
    else:
        debug('WGS, determining sex based on chrY key regions coverage.')

    info('Detecting sex by comparing the Y chromosome key regions coverage '
         'and average coverage depth.')
    if not bam_fpath:
        critical('BAM file is required.')
    index_bam(bam_fpath)

    chry_mean_coverage = _calc_mean_coverage(work_dir, male_bed, bam_fpath, 1)
    debug('Y key regions average depth: ' + str(chry_mean_coverage))
    avg_depth = float(avg_depth)
    debug('Sample average depth: ' + str(avg_depth))
    if avg_depth < AVG_DEPTH_THRESHOLD_TO_DETERMINE_SEX:
        debug('Sample average depth is too low (less than ' +
              str(AVG_DEPTH_THRESHOLD_TO_DETERMINE_SEX) +
              ') - cannot determine sex')
        return None

    if chry_mean_coverage == 0:
        debug('Y depth is 0 - it\'s female')
        sex = 'F'
    else:
        factor = avg_depth / chry_mean_coverage
        debug('Sample depth / Y depth = ' + str(factor))
        if factor > FEMALE_Y_COVERAGE_FACTOR:  # if mean target coverage is much higher than chrY coverage
            debug('Sample depth is more than ' + str(FEMALE_Y_COVERAGE_FACTOR) +
                  ' times higher than Y depth - it\'s female')
            sex = 'F'
        else:
            debug('Sample depth is not more than ' + str(FEMALE_Y_COVERAGE_FACTOR) +
                  ' times higher than Y depth - it\'s male')
            sex = 'M'
    debug('Sex is ' + sex)
    debug()
    return sex
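
A pure-function restatement of the decision rule above, handy for unit testing; the threshold default is a placeholder assumption (the real module defines FEMALE_Y_COVERAGE_FACTOR elsewhere):

def classify_sex(avg_depth, chry_depth, factor=10.0):
    # factor=10.0 is a placeholder; pass the module's FEMALE_Y_COVERAGE_FACTOR.
    if chry_depth == 0:
        return 'F'  # no chrY coverage at all
    return 'F' if avg_depth / chry_depth > factor else 'M'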
Exemple #31
0
def create_dragen_paired_directories_from_config(smconfig):
    # Set subject identifier
    tumor_subject_id_inferred = get_subject_id_from_dragen_dir(
        smconfig['dragen_somatic_dir'])
    normal_subject_id_inferred = get_subject_id_from_dragen_dir(
        smconfig['dragen_germline_dir'])
    if smconfig.get('dragen_subject_id'):
        subject_id = smconfig.get('dragen_subject_id')
    elif tumor_subject_id_inferred is None:
        critical('could not infer subject id from somatic dir, '
                 'please specify with --dragen_subject_id')
    elif normal_subject_id_inferred is None:
        critical('could not infer subject id from germline dir, '
                 'please specify with --dragen_subject_id')
    elif tumor_subject_id_inferred != normal_subject_id_inferred:
        critical(f'got different subject ids from somatic ({tumor_subject_id_inferred}) '
                 f'and germline ({normal_subject_id_inferred}) dirs, '
                 f'please specify with --dragen_subject_id')
    else:
        subject_id = tumor_subject_id_inferred
    # Set tumor identifier
    tumor_samples_inferred = get_samples_from_dragen_dir_bams(
        smconfig['dragen_somatic_dir'])
    if smconfig.get('dragen_tumor_id'):
        tumor_id = smconfig.get('dragen_tumor_id')
        if tumor_id != tumor_samples_inferred['tumor']:
            warn(
                f'provided DRAGEN tumor id ({tumor_id}) doesn\'t match id collected'
                f' from discovered BAM file ({tumor_samples_inferred["tumor"]})'
            )
    else:
        tumor_id = tumor_samples_inferred['tumor']
    # Set normal identifier
    normal_samples_inferred = get_samples_from_dragen_dir_bams(
        smconfig['dragen_germline_dir'])
    if smconfig.get('dragen_normal_id'):
        normal_id = smconfig.get('dragen_normal_id')
        if normal_id != normal_samples_inferred['normal']:
            warn(
                f'provided DRAGEN normal id ({normal_id}) doesn\'t match id collected'
                f' from discovered BAM file ({normal_samples_inferred["normal"]})'
            )
        if 'normal' in tumor_samples_inferred \
                and normal_id != tumor_samples_inferred['normal']:
            warn(f'provided DRAGEN normal id ({normal_id}) doesn\'t match id collected'
                 f' from discovered BAM file ({tumor_samples_inferred["normal"]})')
    else:
        normal_id = normal_samples_inferred['normal']
    # Create datastructure used for DragenProject init
    return [{
        'subject_id': subject_id,
        'tumor_normal_run': {
            'normal': normal_id,
            'tumor': tumor_id,
            'path': smconfig['dragen_somatic_dir'],
            'prefix': get_dragen_output_prefix(smconfig['dragen_somatic_dir'])
        },
        'normal_run': {
            'normal': normal_id,
            'path': smconfig['dragen_germline_dir'],
            'prefix': get_dragen_output_prefix(smconfig['dragen_germline_dir'])
        },
    }]
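
A hypothetical invocation of the function above; the directory paths are placeholders, and the helper functions are assumed to come from the same module:

smconfig = {
    'dragen_somatic_dir': '/data/dragen/SBJ00001_somatic',    # placeholder path
    'dragen_germline_dir': '/data/dragen/SBJ00001_germline',  # placeholder path
    # 'dragen_subject_id': 'SBJ00001',  # optional override if inference fails
}
run_configs = create_dragen_paired_directories_from_config(smconfig)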
Exemple #32
0
def main(genome=None,
         input_genomes_url=None,
         gtf_path=None,
         all_transcripts=False,
         principal=False,
         gene_list=None,
         biotypes='',
         features='',
         gene_contains=None):
    out = sys.stdout

    # GTF
    if not gtf_path:
        try:
            from reference_data import api as refdata
        except ImportError:
            critical(
                'GTF file is needed. Either install reference_data, or provide GTF with --gtf'
            )
        else:
            refdata.find_genomes_dir(input_genomes_url)
            if genome == 'GRCh37':
                gtf_path = os.path.join(
                    refdata.get_ref_file(genome, key='pyensembl_data'),
                    'GRCh37/ensembl75/Homo_sapiens.GRCh37.75.gtf.gz')
            else:
                gtf_path = os.path.join(
                    refdata.get_ref_file(genome, key='pyensembl_data'),
                    'GRCh38/ensembl95/Homo_sapiens.GRCh38.95.gtf.gz')

    # Genes
    target_genes = None
    if gene_list:
        target_genes = get_genes_from_file(gene_list)

    # Transcripts
    transcripts_by_gid = None
    if not all_transcripts:
        if principal:
            transcripts_by_gid = {
                gid: [tid]
                for gid, tid in canon_transcript_per_gene(
                    genome, only_principal=True, use_gene_id=True).items()
            }
        else:
            transcripts_by_gid = canon_transcript_per_gene(genome, use_gene_id=True)

    # Options
    biotypes = biotypes.strip()
    if biotypes:
        biotypes = biotypes.split(',')
    features = features.strip()
    if features:
        features = features.split(',')

    genes_set = set()
    genes_without_canon = set()
    warn(f'Parsing {gtf_path}')
    with open_gzipsafe(gtf_path) as f:
        lines_cnt = 0
        region_cnt = 0
        for l in f:
            if not l.startswith('#') and l.strip():
                lines_cnt += 1
                fields = l.strip().split('\t')
                try:
                    chrom, _, feature, start, end, _, strand, _, annotations = fields
                except ValueError:
                    warn(f'Cannot read fields {str(fields)}')
                    raise

                if genome.startswith('hg') and not chrom.startswith('chr'):
                    chrom = 'chr' + chrom

                if features and feature not in features:
                    continue

                annotations = {
                    kv.split()[0].strip().strip('"'): kv.split()[1].strip().strip('"')
                    for kv in annotations.split('; ')
                }
                gene_name = annotations['gene_name']
                if target_genes and gene_name not in target_genes:
                    continue
                if gene_contains is not None and gene_contains not in gene_name:
                    continue

                if biotypes and annotations['gene_biotype'] not in biotypes:
                    continue

                if transcripts_by_gid:
                    gene_id = annotations['gene_id']
                    transcript_id = annotations['transcript_id']
                    canon_transcript_ids = transcripts_by_gid.get(gene_id)
                    if not canon_transcript_ids:
                        genes_without_canon.add(gene_name)
                        continue
                    if transcript_id not in canon_transcript_ids:
                        continue

                start = int(start) - 1
                end = int(end)
                if end - start >= 3:
                    out.write('\t'.join(
                        [chrom, str(start), str(end), gene_name, '.', strand]) + '\n')
                    genes_set.add(gene_name)
                    region_cnt += 1

                    if region_cnt % 10000 == 0:
                        warn(f'Processed {len(genes_set)} genes, '
                             f'written {region_cnt} regions...')

    warn(f'Done. Processed {len(genes_set)} genes, written {region_cnt} regions')
    if genes_without_canon:
        warn(f'No canonical transcript for {len(genes_without_canon)} gene ids')
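
A hypothetical invocation of this GTF-to-BED converter; the GTF path is a placeholder, and all_transcripts=True is passed so the sketch does not need the canonical-transcript lookup:

main(genome='GRCh38',
     gtf_path='Homo_sapiens.GRCh38.95.gtf.gz',  # placeholder path
     all_transcripts=True,       # skip canon_transcript_per_gene (no reference_data needed)
     biotypes='protein_coding',  # comma-separated filter strings, as the function expects
     features='CDS',
     gene_contains='BRCA')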
Exemple #33
0
    def make_peptide(self, peptide_flanking_len=None):
        # 5' fasta
        if not self.side_5p.trx.contains_start_codon:
            logger.warn('No start codon in 5\' transcript')
            return

        transl_start = self.side_5p.trx.first_start_codon_spliced_offset
        if self.side_5p.bp_offset < transl_start:  # if the bp (t_end) falls before the beginning of translation
            return
        cds_seq_5p = Seq(self.side_5p.trx.sequence[transl_start:self.side_5p.bp_offset])
        fs_5p = len(cds_seq_5p) % 3
        if fs_5p != 0:
            logger.debug(f'  Frameshift of 5p sequence: {fs_5p}')

        pep_5p = _translate_from_start_codon(cds_seq_5p, to_stop=False, name='5\' fasta')
        if '*' in pep_5p:
            logger.info('   5\' peptide has a STOP codon before the breakpoint. Skipping.')
            assert min(self.side_5p.trx.stop_codon_spliced_offsets) < self.side_5p.bp_offset, \
                'We also expect pyensembl to report a STOP codon before the breakpoint'
            return

        # 3' fasta. Getting the full sequence in case it's an FS event that will produce a novel stop codon
        fs_3p = (self.side_3p.bp_offset -
                 self.side_3p.trx.first_start_codon_spliced_offset) % 3
        if fs_3p != 0:
            logger.debug(f'  Frameshift of 3p sequence: {fs_3p}')
        seq_3p = Seq(self.side_3p.trx.sequence[self.side_3p.bp_offset:])

        # checking if the fusion produced a frameshift
        fusion_fs = (fs_5p + fs_3p) % 3
        if fusion_fs != 0:
            logger.debug(f"  Result fusion frameshift:  {fusion_fs}")
        is_inframe = fusion_fs == 0

        # junction peptide
        junction_codon = cds_seq_5p[len(cds_seq_5p) - fs_5p:]
        if junction_codon:  # non-empty exactly when fs_5p != 0
            start_3p_from = 3 - fs_5p
            junction_codon += seq_3p[:start_3p_from]
            junction_pep = junction_codon.translate()
            if junction_pep == '*':
                logger.info('   Junction codon is STOP, skipping')
                return
        else:
            junction_pep = ''
            start_3p_from = 0

        # 3' peptide
        pep_3p = _trim3(seq_3p[start_3p_from:]).translate()
        if pep_3p[0] == '*':
            logger.info(
                f'   The new 3\' peptide starts from STOP ({seq_3p[start_3p_from:][:3]} '
                f'at position {self.side_3p.bp_offset}+{start_3p_from}), skipping translation.'
            )
            return
        if '*' not in pep_3p:
            logger.info('   No STOP codon in fused peptide, skipping translation.')
            return
        pep_3p = _trim3(seq_3p[start_3p_from:]).translate(to_stop=True)

        logger.debug(
            f'  5\' peptide (len={len(pep_5p)}): '
            f'{pep_5p if len(pep_5p) < 99 else pep_5p[:48] + "..." + pep_5p[-48:]}'
        )
        if junction_pep:
            logger.debug(f'  Junction peptide: {junction_pep}')
        logger.debug(
            f'  3\' peptide{f" (shifted by {fusion_fs} from original)" if fusion_fs else ""} '
            f'(len={len(pep_3p)}): '
            f'{pep_3p if len(pep_3p) < 99 else pep_3p[:48] + "..." + pep_3p[-48:]}'
        )

        # fusion peptide
        if peptide_flanking_len:
            # take peptide_flanking_len amino acids from the 5' side:
            pep_5p = pep_5p[-peptide_flanking_len:]
            # aim for a total peptide of length peptide_flanking_len * 2 + 1:
            pep_3p = pep_3p[:peptide_flanking_len + 1 - len(junction_pep)] \
                if is_inframe else pep_3p

        fusion_pep = pep_5p + junction_pep + pep_3p
        assert '*' not in fusion_pep

        self.peptide = fusion_pep
        self.is_inframe = is_inframe
        self.fusion_offset_in_peptide = peptide_flanking_len or len(pep_5p)
        self.num_of_nt_in_the_break = fs_5p
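
A toy restatement of the frameshift arithmetic in make_peptide(), with made-up offsets, to show how the inframe decision falls out:

fs_5p = 100 % 3                  # 100 nt of 5' CDS kept -> 1 dangling nt
fs_3p = (250 - 120) % 3          # 3' bp offset minus start-codon offset -> 1
fusion_fs = (fs_5p + fs_3p) % 3  # (1 + 1) % 3 = 2 -> out of frame
is_inframe = fusion_fs == 0      # False: a frameshifted fusion peptide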
Exemple #34
0
def _log(msg, silent, is_critical):
    if is_critical:
        critical(msg)
    if not silent:
        warn(msg)
Exemple #35
0
def main(prefix,
         output_bedpe,
         output_fasta=None,
         output_json=None,
         min_read_support=None,
         ensembl_release=None,
         peptide_flanking_len=None,
         debug=False,
         no_filtering=False,
         check_transcript=True,
         pizzly_ref_fa=None,
         reads=None,
         min_tpm=None):

    # input_flat_fpath = prefix + '-flat.tsv'
    input_json_fpath = prefix + '.json'
    input_fasta = prefix + '.fusions.fasta'
    output_bedpe = abspath(output_bedpe)

    logger.init(debug)

    global ENSEMBL_RELEASE
    ENSEMBL_RELEASE = ensembl_release
    ebl = EnsemblRelease(ENSEMBL_RELEASE)

    # Reading filtered tsv
    # filt_fusions = set()
    # with open(input_flat_fpath) as f:
    #     for row in csv.DictReader(f, delimiter='\t'):
    #         filt_fusions.add((row['geneA.name'], row['geneB.name']))

    # Read json
    json_data = {'genes': []}
    with open(input_json_fpath) as f:
        data = json.load(f)
        for g_event in data['genes']:
            gene_a, gene_b = g_event['geneA']['name'], g_event['geneB']['name']
            # if (gene_a, gene_b) in filt_fusions:
            json_data['genes'].append(g_event)

    # Read fasta
    fasta_dict = SeqIO.index(input_fasta, 'fasta')

    # First round: genomic coordinates and fasta
    logger.info(
        f'Round 1: reading {len(json_data["genes"])} gene-pairs events from pizzly JSON'
    )
    fusions = []
    # each g_event: {'geneA', 'geneB', 'paircount', 'splitcount', 'transcripts', 'readpairs'}
    for g_event in json_data['genes']:
        gene_a, gene_b = g_event['geneA']['name'], g_event['geneB']['name']
        # logger.info(f'Processing event {gene_a}>>{gene_b}')

        # collecting to get rid of duplicate transcript events
        met_fasta_keys = set()
        for t_event in g_event['transcripts']:
            fusion = Fusion.create_from_pizzly_event(ebl, t_event)

            if check_transcript:
                if not _transcript_is_good(fusion.side_5p.trx) \
                        or not _transcript_is_good(fusion.side_3p.trx):
                    # logger.info(f'Transcripts {fusion.side_5p.trx} and {fusion.side_3p.trx} didn\'t pass check')
                    continue

            if no_filtering is not True and fusion.support < min_read_support:
                continue

            calc_positions_ok = fusion.calc_genomic_positions()
            if not calc_positions_ok:
                continue

            # comparing our fasta to pizzly fasta
            fusion.fasta_rec = fasta_dict[t_event['fasta_record']]
            _check_fusion_fasta(fusion.fasta_rec, fusion)

            # skipping duplicate fastas
            k = fusion.side_5p.trx.id, fusion.side_3p.trx.id, fusion.fasta
            assert k not in met_fasta_keys
            met_fasta_keys.add(k)

            fusions.append(fusion)
        # if not met_fasta_keys:
        #     logger.info('   Filtered all fusions for this gene pair.')
        if met_fasta_keys:
            logger.info(f'Keeping {len(met_fasta_keys)} fusion(s) '
                        f'for the event {gene_a}-{gene_b}')

    if not fusions:
        logger.warn('Finished: no fusions passed filtering')

    # Calculate expression of fused transcripts
    expr_by_fusion = None
    if reads and fusions:
        # filtered fasta for re-calling expression
        work_dir = safe_mkdir(splitext(output_bedpe)[0] + '_quant')
        fasta_path = join(work_dir, 'fusions.fasta')
        fasta_recs = [f.fasta_rec for f in fusions]
        SeqIO.write(fasta_recs, fasta_path, 'fasta')

        if pizzly_ref_fa:
            expr_by_fusion = requanitify_pizzly(pizzly_ref_fa, fasta_path,
                                                work_dir, reads)
            # expr_by_fusion = {fusion-fasta-id -> {length  eff_length  est_counts   tpm}}

    # Second round: peptides and expression
    logger.info()
    logger.info(
        f'Round 2: making peptides for {len(fusions)} events in '
        f'{len(set([(f.side_3p.trx.gene.name, f.side_5p.trx.gene.name) for f in fusions]))} genes pairs'
    )
    met_peptide_keys = set()  # collecting to get rid of duplicate peptides
    bedpe_entries = []
    peptide_fusions = []
    if peptide_flanking_len is not None and peptide_flanking_len < 0:
        peptide_flanking_len = None
    for fusion in fusions:
        if fusion.side_3p.trx.contains_start_codon:
            logger.info(
                f'Translating {fusion.side_5p.trx.gene.name}>>{fusion.side_3p.trx.gene.name} fusion: {fusion}'
            )
            fusion.make_peptide(peptide_flanking_len)
            if fusion.peptide:
                _verify_peptides(fusion.fasta_rec, fusion,
                                 peptide_flanking_len)

            # skipping duplicate peptides
            k = fusion.side_5p.trx.gene.name, fusion.side_3p.trx.gene.name, fusion.peptide
            if k in met_peptide_keys:
                logger.debug(f'Skipping peptide {k}: already added')
                continue
            met_peptide_keys.add(k)

        # writing bedpe
        entry = fusion.to_bedpe()

        # add expression
        if expr_by_fusion:
            entry.update(expr_by_fusion[fusion.fasta_rec.id])
            tpm = float(entry['tpm'])
            if no_filtering is not True and tpm < min_tpm:
                logger.debug(
                    f'Skipping peptide {entry}: TPM={tpm} is below {min_tpm}')
                continue

        if fusion.peptide:
            peptide_fusions.append(fusion)
        bedpe_entries.append(entry)

    # Writing bedpe
    with open(output_bedpe, 'w') as bedpe_fh:
        bedpe_header = [
            'chr 5p',
            'start 5p',
            'end 5p',
            'chr 3p',
            'start 3p',
            'end 3p',
            'name',
            'tier',
            'strand 5p',
            'strand 3p',
            'support',
            'is canon bndry',
            'inframe',
            'peptide',
            'fusion pos',
            'nt in the break',
            'transcripts',
            'is canon intron dinuc',
        ]
        if expr_by_fusion:
            bedpe_header.extend(list(expr_by_fusion.values())[0].keys())
        bedpe_writer = csv.DictWriter(bedpe_fh,
                                      fieldnames=bedpe_header,
                                      delimiter='\t')
        bedpe_writer.writeheader()
        for bedpe_entry in bedpe_entries:
            bedpe_writer.writerow(bedpe_entry)

    # _test_pvac(output_bedpe)

    # Write fasta
    if output_fasta:
        SeqIO.write([f.fasta_rec for f in peptide_fusions], output_fasta,
                    'fasta')

    logger.info()
    logger.info(
        f'Written {len(peptide_fusions)} fusions in '
        f'{len(set([(f.side_3p.trx.gene.name, f.side_5p.trx.gene.name) for f in peptide_fusions]))} '
        f'gene pairs with good peptides to bedpe: {output_bedpe}')
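
A hypothetical invocation of this driver; the prefix must point at pizzly outputs ({prefix}.json and {prefix}.fusions.fasta), and every value below is a placeholder:

main(prefix='pizzly/SAMPLE',          # expects pizzly/SAMPLE.json and pizzly/SAMPLE.fusions.fasta
     output_bedpe='SAMPLE.bedpe',
     output_fasta='SAMPLE.filt.fasta',
     min_read_support=5,
     ensembl_release=95,
     peptide_flanking_len=14)          # reads/pizzly_ref_fa omitted: expression step is skipped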
Exemple #36
0
def _log(msg, silent, is_critical):
    if is_critical:
        critical(msg)
    if not silent:
        warn(msg)
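
A minimal sketch of the intended calling pattern; `critical` is assumed to raise and `warn` to print to stderr, matching their use elsewhere in this collection:

_log('Counts file not found', silent=False, is_critical=False)  # warns only
_log('Counts file not found', silent=True, is_critical=False)   # no output
_log('BAM file is required', silent=False, is_critical=True)    # aborts via critical()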