Example #1
def find_bams(args):
    """Collect the BAM files named in ``args``, keyed by sample name.

    Each argument is either ``path`` or ``path,sample_name``. Arguments whose
    path does not have the ``.bam`` extension are ignored here. A BAM accepted
    by ``verify_bam`` is stored under the explicit sample name when one is
    given, otherwise under the file's base name without its extension.

    Calls ``critical`` with the list of ``.bam`` arguments that failed
    verification.

    :param args: iterable of argument strings
    :return: OrderedDict of sample name -> verified BAM path, in input order
    """
    bam_by_sample = OrderedDict()
    bad_bam_fpaths = []

    for arg in args:
        # e.g. /path/to/project/bcbio/sample.bam,Kudos159
        parts = arg.split(',')
        fpath = parts[0]
        if splitext(fpath)[1] != '.bam':
            continue

        bam_fpath = verify_bam(fpath)
        if not bam_fpath:
            # Record the path the user gave: the verified value is falsy
            # and cannot be joined into the error message.
            bad_bam_fpaths.append(fpath)
            continue

        if len(parts) > 1:
            sname = parts[1]
        else:
            sname = basename(splitext(bam_fpath)[0])
        bam_by_sample[sname] = bam_fpath

    if bad_bam_fpaths:
        critical('BAM files cannot be found, empty or not BAMs: ' +
                 ', '.join(bad_bam_fpaths))

    return bam_by_sample
Example #2
def read_samples(args):
    """Split the input arguments into FastQ pairs and BAM files, by sample.

    BAMs are discovered with ``find_bams``; the remaining arguments are
    checked with ``verify_file`` and paired with ``find_fastq_pairs``.
    Calls ``critical`` when no usable input is found, or when a sample name
    appears both among the BAMs and among the FastQ pairs.

    :param args: iterable of input arguments
    :return: tuple ``(fastqs_by_sample, bam_by_sample)``
    """
    bam_by_sample = find_bams(args)
    if bam_by_sample:
        info('Found ' + str(len(bam_by_sample)) + ' BAM file' +
             ('s' if len(bam_by_sample) > 1 else ''))

    # NOTE(review): bam_by_sample is keyed by sample name, so this membership
    # test on a path only excludes an argument when the two coincide - confirm
    # that this is the intended filter.
    input_not_bam = [
        verify_file(fpath) for fpath in args
        if adjust_path(fpath) not in bam_by_sample
    ]
    # verify_file returns a falsy value for inputs it rejects; drop those.
    input_not_bam = [fpath for fpath in input_not_bam if fpath]

    fastqs_by_sample = dict()
    if not input_not_bam and not bam_by_sample:
        critical('No correct input files')

    if input_not_bam:
        info('Input ' + str(len(input_not_bam)) +
             ' correct input non-BAM files')
        fastqs_by_sample = find_fastq_pairs(input_not_bam)
        if fastqs_by_sample:
            info('Found ' + str(len(fastqs_by_sample)) + ' FastQ pairs')
        intersection = set(fastqs_by_sample.keys()) & set(bam_by_sample.keys())
        if intersection:
            critical('The following samples both had input BAMs and FastQ: ' +
                     ', '.join(list(intersection)))

    return fastqs_by_sample, bam_by_sample
Example #3
def get_chrom_lengths(genome=None, fai_fpath=None):
    """Return chromosome names and lengths for a genome.

    The lengths are read either from a FASTA file (``.fa``, parsed with
    Biopython) or from a FASTA index (``.fai``, first two whitespace-separated
    columns of every non-empty line).

    :param genome: genome build name, used to look up the index with
                   ``get_fai`` when ``fai_fpath`` is not given
    :param fai_fpath: path to a ``.fai`` or ``.fa`` file
    :return: list of ``(chrom, length)`` tuples in file order
    """
    assert genome or fai_fpath, f'One of genome or fai_fpath should be not None: genome={genome}, fai_fpath={fai_fpath}'

    if not fai_fpath:
        fai_fpath = get_fai(genome)
    # Validate the path whichever way it was obtained.
    fai_fpath = verify_file(fai_fpath, is_critical=True)
    if not fai_fpath.endswith(('.fai', '.fa')):
        critical('Error: .fai or .fa is accepted.')

    chr_lengths = []

    if fai_fpath.endswith('.fa'):
        debug('Reading genome sequence (.fa) to get chromosome lengths')
        with open(fai_fpath, 'r') as handle:
            # Imported lazily so Biopython is only needed for FASTA input.
            from Bio import SeqIO
            for record in SeqIO.parse(handle, 'fasta'):
                chr_lengths.append((record.id, len(record.seq)))
    else:
        # The two readers are mutually exclusive: feeding FASTA lines to the
        # index parser would fail on int().
        debug('Reading genome index file (.fai) to get chromosome lengths')
        with open(fai_fpath, 'r') as handle:
            for line in handle:
                fields = line.split()
                if fields:
                    chr_lengths.append((fields[0], int(fields[1])))

    return chr_lengths
Example #4
def safe_mkdir(dirpath, descriptive_name=''):
    """ Multiprocessing-safely and recursively creates a directory

    Calls ``critical`` when ``dirpath`` is empty or is an existing regular
    file. Creation is retried while the path does not exist; the OSError is
    re-raised once the retry budget is exhausted.

    :param dirpath: directory to create (parents are created as needed)
    :param descriptive_name: human-readable label used in error messages
    :return: ``dirpath``
    """
    import os
    import time

    if not dirpath:
        critical(f'Path is empty: {descriptive_name if descriptive_name else ""}')

    if isdir(dirpath):
        return dirpath

    if isfile(dirpath):
        critical(descriptive_name + ' ' + dirpath + ' is a file.')

    num_tries = 0
    max_tries = 10

    while not exists(dirpath):
        # we could get an error here if multiple processes are creating
        # the directory at the same time. Grr, concurrency.
        try:
            os.makedirs(dirpath)
        except OSError:
            if num_tries > max_tries:
                raise
            num_tries += 1
            # Give the competing process a moment before re-checking.
            time.sleep(2)
    return dirpath
Example #5
    def _set_date_dir(bcbio_cnf, final_dir, date_dir, create_dir=False, silent=False):
        if not date_dir:
            fc_date = bcbio_cnf.get('fc_date')
            fc_name = bcbio_cnf.get('fc_name') or 'project'
            if fc_date:
                # Date dirpath is from bcbio and named after fc_name, not our own project name
                date_dir = join(final_dir, fc_date + '_' + fc_name)
                if not create_dir and not verify_dir(date_dir, silent=True):
                    critical('Error: no project directory of format {fc_date}_{fc_name} or {fc_name}_{fc_date}')
                if isdir(join(final_dir, 'project')):  # bcbio-CWL?
                    date_dir = join(final_dir, 'project')
                    if not silent: info('Using the datestamp dir from bcbio-CWL: ' + date_dir)
                    regexs = [fr'^\d\d\d\d-[01][0-9]-[0-3][0-9]_{fc_name}']
                    date_dirs = [join(final_dir, dirpath)
                                 for dirpath in listdir(final_dir)
                                 if any(re.match(regex, dirpath) for regex in regexs)]
                    if len(date_dirs) == 0:
                        raise NoDateStampsException('Error: no datestamp directory!')
                    elif len(date_dirs) == 1:
                        date_dir = date_dirs[0]
                        dates = [(tuple(map(int, basename(d).split('_')[0].split('-'))), d) for d in date_dirs]
                        newest_date, newest_dir = sorted(dates, reverse=True)[0]
                        newest_dirs = [d_dir for d_dir in date_dirs if d_dir == newest_dir]
                        if len(newest_dirs) > 1:
                            raise MultipleDateStampsException(f'Error: multiple datestamp directory found, '
                               f'and can\'t select the most recent one because there are multiple latest dirs: {newest_dirs}')
                        date_dir = newest_dirs[0]

                    if not silent: info('Using the datestamp dir: ' + date_dir)
        if create_dir:
        return date_dir
def main():
    options = [
        (['-g', '--genome'], dict(
            help='Genome build. Accepted values: ' + ', '.join(ba.SUPPORTED_GENOMES),
    parser = OptionParser()
    for args, kwargs in options:
        parser.add_option(*args, **kwargs)
    opts, args = parser.parse_args()

    if not opts.genome:
        critical('Error: please, specify genome build name with -g (e.g. `-g hg19`)')
    genome = opts.genome

    debug('Getting features from storage')
    features_bed = ba.get_all_features(genome)
    if features_bed is None:
        critical('Genome ' + genome + ' is not supported. Supported: ' + ', '.join(ba.SUPPORTED_GENOMES))

    info('Extracting features from Ensembl GTF')
    features_bed = features_bed.filter(lambda x: x[ba.BedCols.FEATURE] == 'CDS')
    features_bed = features_bed.filter(ba.get_only_canonical_filter(genome))

    info('Saving CDS regions...')
    output_fpath = adjust_path(join(dirname(__file__), pardir, genome, 'bed', 'CDS-canonical.bed'))
    with file_transaction(None, output_fpath) as tx:
    info('Done, saved to ' + output_fpath)
Example #7
def extract_features(output_file, genome, only_canonical, high_confidence, coding_only,
    """ For debug purposes
    debug('Getting features from storage')
    features_bed = ba.get_all_features(genome)
    if features_bed is None:
        critical('Genome ' + genome + ' is not supported. Supported: ' + ', '.join(ba.SUPPORTED_GENOMES))

    if high_confidence:
        features_bed = features_bed.filter(ba.high_confidence_filter)
    if only_canonical:
        features_bed = features_bed.filter(ba.get_only_canonical_filter(genome))
    if coding_only:
        features_bed = features_bed.filter(ba.protein_coding_filter)
    # unique_tx_by_gene = find_best_tx_by_gene(features_bed)

    info('Extracting features from Ensembl GTF')
    feature_types = feature_types or ['exon', 'CDS', 'stop_codon', 'transcript']
    features_bed = features_bed.filter(lambda x: x[ba.BedCols.FEATURE] in feature_types)
        # x[ebl.BedCols.ENSEMBL_ID] == unique_tx_by_gene[x[ebl.BedCols.GENE]])

    info('Overlapping regions with Ensembl data')
    debug(f'Saved features to {output_file}')
Example #8
    def _set_date_dir(bcbio_cnf, final_dir, date_dir, create_dir=False, silent=False):
        if not date_dir:
            fc_date = bcbio_cnf.get('fc_date')
            fc_name = bcbio_cnf.get('fc_name') or 'project'
            if fc_date:
                # Date dirpath is from bcbio and named after fc_name, not our own project name
                date_dir = join(final_dir, fc_date + '_' + fc_name)
                if not create_dir and not verify_dir(date_dir, silent=True):
                    critical('Error: no project directory of format {fc_date}_{fc_name} or {fc_name}_{fc_date}')
                if isdir(join(final_dir, 'project')):  # bcbio-CWL?
                    date_dir = join(final_dir, 'project')
                    if not silent: info('Using the datestamp dir from bcbio-CWL: ' + date_dir)
                    regexs = [fr'^\d\d\d\d-[01][0-9]-[0-3][0-9]_{fc_name}']
                    date_dirs = [join(final_dir, dirpath)
                                 for dirpath in listdir(final_dir)
                                 if any(re.match(regex, dirpath) for regex in regexs)]
                    if len(date_dirs) == 0:
                        raise NoDateStampsException('Error: no datestamp directory!')
                    elif len(date_dirs) == 1:
                        date_dir = date_dirs[0]
                        dates = [(tuple(map(int, basename(d).split('_')[0].split('-'))), d) for d in date_dirs]
                        newest_date, newest_dir = sorted(dates, reverse=True)[0]
                        newest_dirs = [d_dir for d_dir in date_dirs if d_dir == newest_dir]
                        if len(newest_dirs) > 1:
                            raise MultipleDateStampsException(f'Error: multiple datestamp directory found, '
                               f'and can\'t select the most recent one because there are multiple latest dirs: {newest_dirs}')
                        date_dir = newest_dirs[0]

                    if not silent: info('Using the datestamp dir: ' + date_dir)
        if create_dir:
        return date_dir
Example #9
def safe_mkdir(dirpath, descriptive_name=''):
    """ Multiprocessing-safely and recursively creates a directory

    Calls ``critical`` when ``dirpath`` is empty or is an existing regular
    file. Creation is retried while the path does not exist; the OSError is
    re-raised once the retry budget is exhausted.

    :param dirpath: directory to create (parents are created as needed)
    :param descriptive_name: human-readable label used in error messages
    :return: ``dirpath``
    """
    import os
    import time

    if not dirpath:
        critical(f'Path is empty: {descriptive_name if descriptive_name else ""}')

    if isdir(dirpath):
        return dirpath

    if isfile(dirpath):
        critical(descriptive_name + ' ' + dirpath + ' is a file.')

    num_tries = 0
    max_tries = 10

    while not exists(dirpath):
        # we could get an error here if multiple processes are creating
        # the directory at the same time. Grr, concurrency.
        try:
            os.makedirs(dirpath)
        except OSError:
            if num_tries > max_tries:
                raise
            num_tries += 1
            # Give the competing process a moment before re-checking.
            time.sleep(2)
    return dirpath
Example #10
def get_or_create_run(projects, parall_view=None):
    """Return the ``Run`` for ``projects``, creating one when none is usable.

    An existing run found with ``Run.find_by_projects`` is reused unless
    ``Run.is_ready`` reports it as not ready, in which case it is discarded
    and a new run is created with ``Run.create``.

    :param projects: projects the run covers; all must share one genome
    :param parall_view: passed through to ``Run.create``
    :return: the existing or newly created run
    """
    genomes = set([p.genome for p in projects])
    if len(genomes) > 1:
        log.critical('Error: multiple genomes in projects: ' + str(genomes))
    run = Run.find_by_projects(projects)

    if run and run.rerun_on_usercall:
        log.info('Rebuilding tree on usercall')
        run.rerun_on_usercall = False
        return run

    if run and not Run.is_ready(run):
        log.debug('Tree files do not exist, recreating run for projects ' +
                  ', '.join(p.name for p in projects))
        run = None

    if run:
        log.debug('Found run for ' + ', '.join([p.name for p in projects]) +
                  ' with ID ' + str(run.id))
    else:
        # Only create a run when no usable one was found.
        log.debug('Creating new run for projects ' +
                  ', '.join(p.name for p in projects))
        run = Run.create(projects, parall_view)
        log.debug('Done creating new run with ID ' + str(run.id))
    return run
Example #11
def tx_tmpdir(base_dir, rollback_dirpath):
    """Context manager to create and remove a transactional temporary directory.

    Yields a fresh directory created with ``tempfile.mkdtemp`` inside
    ``base_dir``. When the caller is done (normally or through an exception),
    the directory, if it still exists, is renamed to ``rollback_dirpath``.

    Calls ``critical`` when ``rollback_dirpath`` already exists.

    :param base_dir: parent directory for the temporary directory
    :param rollback_dirpath: destination the temporary directory is moved to
    """
    if exists(rollback_dirpath):
        critical(rollback_dirpath + ' already exists')

    tmp_dir = tempfile.mkdtemp(dir=base_dir)
    try:
        yield tmp_dir
    finally:
        # Runs on error too, so the temporary directory is never left behind
        # under its random name.
        if tmp_dir and exists(tmp_dir):
            os.rename(tmp_dir, rollback_dirpath)
Example #12
def get_ref_fasta(genome):
    """Return the path to the reference FASTA of ``genome``.

    On an AZ host (``is_az()``) a fixed ``/ngs/reference_data`` location is
    tried first. Otherwise the genome is looked up through ``genomepy`` and
    installed from UCSC when it is not among the installed genomes.

    :param genome: genome build name
    :return: path to the genome FASTA file
    """
    if is_az():
        path = '/ngs/reference_data/genomes/Hsapiens/' + genome + '/seq/' + genome + '.fa'
        if isfile(path):
            logger.info('Found genome fasta at ' + path)
            return path

    if isdir(join(DATA_DIR, 'genomes', genome)):
        genome_dir = safe_mkdir(join(DATA_DIR, 'genomes'))
    else:
        genome_dir = safe_mkdir(join(DATA_DIR, '..', 'genomes'))

    if genome not in genomepy.list_installed_genomes(genome_dir):
        genome_rec = [
            rec for rec in genomepy.list_available_genomes()
            if rec[1] == genome
        ]
        if genome_rec:
            genome_rec = genome_rec[0]
        else:
            # NOTE(review): the code below indexes genome_rec, so this call is
            # presumably expected not to return - confirm.
            logger.critical('Error: genome ' + genome + ' is not available')
        logger.info('Downloading genome ' + genome + ' from ' + genome_rec[1] +
                    ' and installing into ' + genome_dir)
        genomepy.install_genome(genome, 'UCSC', genome_dir=genome_dir)

    genome_fasta_file = genomepy.Genome(genome, genome_dir=genome_dir).filename
    return genome_fasta_file
Example #13
    def _set_name_and_paths(self, name, variantcallers_data, ensemble=False, silent=False):
        """Set the sample's names, directories, BAM and result files.

        :param name: raw sample name; dots are replaced with underscores to
                     build ``self.name``
        :param variantcallers_data: variant caller configuration; when falsy,
                                    VCF discovery is skipped
        :param ensemble: passed through to ``_set_variant_files``
        :param silent: suppress warnings and the error for a missing sample directory
        :return: False when the sample directory is not found, True otherwise
        """
        self.raw_name = name
        self.name = self.raw_name.replace('.', '_')
        self.dirpath = verify_dir(join(self.bcbio_project.final_dir, self.name))
        if not verify_dir(self.dirpath, silent=silent):
            if not silent:
                critical(f'Sample "{self.name}" specified in bcbio YAML is not found in the final directory '
                         f'{self.bcbio_project.final_dir}. Please check consistency between the YAML '
                         f'{self.bcbio_project.bcbio_yaml_fpath} and the directories in `final`: '
                         f'to every "description" value in YAML, there should be a corresponding folder with the '
                         f'same name in `final`. You can use `-e` option to exclude samples (comma-separated) '
                         f'from consideration, if you are sure that missing folders are expected.')
            # Without a sample directory none of the paths below can be built.
            return False
        self.var_dirpath = join(self.dirpath, BcbioProject.var_dir)

        self.bam = self.find_bam(silent=silent)

        if self.is_rnaseq:
            gene_counts = adjust_path(join(self.dirpath, self.get_name_for_files() + '-ready.counts'))
            if isfile(gene_counts) and verify_file(gene_counts):
                self.counts_file = gene_counts
            else:
                if not silent: warn('Counts for ' + self.name + ' not found')
        else:
            if variantcallers_data:
                self._set_variant_files(variantcallers_data, ensemble=ensemble)
            else:
                if not silent: warn('No variant callers set in config, skipping finding VCF files')
        return True
Example #14
    def update_batches(samples, silent=False):
        """Group samples into batches and link tumor/normal pairs.

        A ``Batch`` is created for every batch name found in ``samples``.
        Samples with phenotype ``'normal'`` become the batch's normal, any
        other sample becomes its tumor. A batch that ends up with only a
        normal has that sample re-labelled as tumor.

        Calls ``critical`` when a batch has more than one normal sample.

        :param samples: samples exposing ``batch_names``, ``phenotype``, ``name``
        :param silent: suppress the informational message
        :return: dict of batch name -> ``Batch``
        """
        batch_by_name = {bn: Batch(bn) for bn in list(set([b for s in samples for b in s.batch_names]))}
        for sample in samples:
            for bn in sample.batch_names:
                batch = batch_by_name[bn]
                batch.name = bn
                sample.batch = batch
                if sample.phenotype == 'normal':
                    if batch.normal:
                        critical('Multiple normal samples for batch ' + bn)
                    batch.normal = sample
                else:
                    batch.tumor = sample

        for batch in batch_by_name.values():
            if batch.normal and not batch.tumor:
                if not silent: info('Batch ' + batch.name + ' contains only normal, treating sample ' + batch.normal.name + ' as tumor')
                batch.normal.phenotype = 'tumor'
                batch.normal.batch = batch
                batch.tumor = batch.normal
                batch.normal = None

        # setting up batch properties
        for b in batch_by_name.values():
            b.tumor.normal_match = b.normal

        return batch_by_name
Example #15
def find_bams(args):
    """Collect the BAM files named in ``args``, keyed by sample name.

    Each argument is either ``path`` or ``path,sample_name``. Arguments whose
    path does not have the ``.bam`` extension are ignored here. A BAM accepted
    by ``verify_bam`` is stored under the explicit sample name when one is
    given, otherwise under the file's base name without its extension.

    Calls ``critical`` with the list of ``.bam`` arguments that failed
    verification.

    :param args: iterable of argument strings
    :return: OrderedDict of sample name -> verified BAM path, in input order
    """
    bam_by_sample = OrderedDict()
    bad_bam_fpaths = []

    for arg in args:
        # e.g. /path/to/project/bcbio/sample.bam,Kudos159
        parts = arg.split(',')
        fpath = parts[0]
        if splitext(fpath)[1] != '.bam':
            continue

        bam_fpath = verify_bam(fpath)
        if not bam_fpath:
            # Record the path the user gave: the verified value is falsy
            # and cannot be joined into the error message.
            bad_bam_fpaths.append(fpath)
            continue

        if len(parts) > 1:
            sname = parts[1]
        else:
            sname = basename(splitext(bam_fpath)[0])
        bam_by_sample[sname] = bam_fpath

    if bad_bam_fpaths:
        critical('BAM files cannot be found, empty or not BAMs: ' + ', '.join(bad_bam_fpaths))

    return bam_by_sample
Example #16
def tx_tmpdir(base_dir, rollback_dirpath):
    """Context manager to create and remove a transactional temporary directory.

    Yields a fresh directory created with ``tempfile.mkdtemp`` inside
    ``base_dir``. When the caller is done (normally or through an exception),
    the directory, if it still exists, is renamed to ``rollback_dirpath``.

    Calls ``critical`` when ``rollback_dirpath`` already exists.

    :param base_dir: parent directory for the temporary directory
    :param rollback_dirpath: destination the temporary directory is moved to
    """
    if exists(rollback_dirpath):
        critical(rollback_dirpath + ' already exists')

    tmp_dir = tempfile.mkdtemp(dir=base_dir)
    try:
        yield tmp_dir
    finally:
        # Runs on error too, so the temporary directory is never left behind
        # under its random name.
        if tmp_dir and exists(tmp_dir):
            os.rename(tmp_dir, rollback_dirpath)
Example #17
def make_cluster_cmdl(log_dir, refdata, app_name, cluster_submit_cmd=None):
    """ Generates cluster command line parameters for snakemake
    if not cluster_submit_cmd and not refdata.cluster_cmd:
        logger.critical(f'Automatic cluster submission '
            f'is not supported for the machine "{refdata.name}". '
            f'Use exclicit --cluster-cmd')
    if not cluster_submit_cmd:
        cluster_submit_cmd = refdata.cluster_cmd
    # Replacing the curly braces to avoid confusing snakemake formatter which for some reason triggers
    cluster_submit_cmd = cluster_submit_cmd.replace('{', '[').replace('}', ']')

    cluster_submitter = get_submit_script()
    timestamp = datetime.now().strftime('%Y_%m_%d_%H_%M_%S')
    from reference_data import api as refdata
    cluster_cmdl = \
        f' --cluster "{cluster_submitter} {timestamp} {log_dir} {app_name} ' \

    # Also overriding jobscript?
    jobscript = refdata.cluster_jobscript
    if jobscript:
        jobscript_file = join(log_dir, 'jobscript.sh')
        with open(jobscript_file, 'w') as f_out:
            f_out.write(jobscript.replace('{path}', os.environ["PATH"]))
        cluster_cmdl += f' --jobscript "{jobscript_file}"'

    return cluster_cmdl
Example #18
def _get(relative_path, genome=None):
    :param relative_path: relative path of the file inside the repository
    :param genome: genome name. Can contain chromosome name after comma, like hg19-chr20,
                   in case of BED, the returning BedTool will be with added filter.
    :return: BedTools object if it's a BED file, or filepath
    chrom = None
    if genome:
        if '-chr' in genome:
            genome, chrom = genome.split('-')
        relative_path = relative_path.format(genome=genome)

    path = abspath(join(dirname(__file__), relative_path))
    if not isfile(path) and isfile(path + '.gz'):
        path += '.gz'

    if path.endswith('.bed') or path.endswith('.bed.gz'):
        if path.endswith('.bed.gz'):
            bedtools = which('bedtools')
            if not bedtools:
                critical('bedtools not found in PATH: ' + str(os.environ['PATH']))
            debug('BED is compressed, creating BedTool')
            bed = BedTool(path)
            debug('BED is uncompressed, creating BedTool')
            bed = BedTool(path)

        if chrom:
            debug('Filtering BEDTool for chrom ' + chrom)
            bed = bed.filter(lambda r: r.chrom == chrom)
        return bed
        return path
Example #19
def sort_bed_gsort(input_bed_fpath,
    input_bed_fpath = verify_bed(input_bed_fpath, is_critical=True)
    output_bed_fpath = adjust_path(output_bed_fpath) if output_bed_fpath \
        else intermediate_fname(work_dir, input_bed_fpath, 'sorted')

    debug('Sorting regions in ' + str(input_bed_fpath))
    if can_reuse(output_bed_fpath, input_bed_fpath):
        debug(output_bed_fpath + ' exists, reusing')
        return output_bed_fpath

    if fai_fpath:
        fai_fpath = verify_file(fai_fpath)
    elif genome:
        fai_fpath = verify_file(ref.get_fai(genome))
        critical('Either of fai_fpath or genome build name must be specified')

    with file_transaction(work_dir, output_bed_fpath) as tx:
        run('gsort {input_bed_fpath} {fai_fpath}'.format(**locals()),

    return output_bed_fpath
Example #20
    def _set_name_and_paths(self, name, variantcallers_data, ensemble=False, silent=False):
        """Set the sample's names, directory, BAM and result files.

        Calls ``critical`` when the sample directory is not found under the
        parent project's final directory.

        :param name: raw sample name; dots are replaced with underscores to
                     build ``self.name`` (also used as ``self.rgid``)
        :param variantcallers_data: variant caller configuration; when falsy,
                                    VCF discovery is skipped
        :param ensemble: passed through to ``_set_variant_callers``
        :param silent: suppress warnings
        """
        self.raw_name = name
        self.name = self.raw_name.replace('.', '_')
        self.rgid = self.name
        self.dirpath = verify_dir(join(self.parent_project.final_dir, self.name))
        if not verify_dir(self.dirpath, silent=silent):
            critical(f'Sample "{self.name}" specified in bcbio YAML is not found in the final directory '
                     f'{self.parent_project.final_dir}. Please check consistency between the YAML '
                     f'{self.parent_project.bcbio_yaml_fpath} and the directories in `final`: '
                     f'to every "description" value in YAML, there should be a corresponding folder with the '
                     f'same name in `final`. You can use `-e` option to exclude samples (comma-separated) '
                     f'from consideration, if you are sure that missing folders are expected.')

        self.bam = self.find_bam(silent=silent)

        if self.is_rnaseq:
            gene_counts = adjust_path(join(self.dirpath, self.get_name_for_files() + '-ready.counts'))
            if isfile(gene_counts) and verify_file(gene_counts):
                self.counts_file = gene_counts
            else:
                if not silent: warn('Counts for ' + self.name + ' not found')
        else:
            if variantcallers_data:
                self._set_variant_callers(variantcallers_data, ensemble=ensemble)
            else:
                if not silent: warn('No variant callers set in config, skipping finding VCF files')
Example #21
def cnv_to_bed(cnv_path, out_bed_path):
    with open(cnv_path) as fh:
        parse_fn = None
        header = next(fh).strip().split('\t')

        if header[0].startswith('##fileformat=VCF'):
            # Manta
            info(f'Detected {cnv_path} as caller "manta"')
            parse_fn = iter_manta

            for caller, hdr in header_by_caller.items():
                if header == hdr:
                        f'Parsing {cnv_path} as caller "{caller}" with header {hdr}'
                    parse_fn = get_iter_cnv(header,

        if not parse_fn:
            critical(f'Cannot detect CNV file format in {cnv_path}')

    with open(out_bed_path, 'w') as out:
        writer = csv.writer(out, delimiter='\t')
        for i, call in enumerate(parse_fn(cnv_path)):
            if call:
                bed_row = call.get_bed_raw()
                if i == 0:
Example #22
def get_chrom_lengths(genome=None, fai_fpath=None):
    """Return chromosome names and lengths for a genome.

    The lengths are read either from a FASTA file (``.fa``, parsed with
    Biopython) or from a FASTA index (``.fai``, first two whitespace-separated
    columns of every non-empty line).

    :param genome: genome build name, used to look up the index with
                   ``get_fai`` when ``fai_fpath`` is not given
    :param fai_fpath: path to a ``.fai`` or ``.fa`` file
    :return: list of ``(chrom, length)`` tuples in file order
    """
    assert genome or fai_fpath, f'One of genome or fai_fpath should be not None: genome={genome}, fai_fpath={fai_fpath}'

    if not fai_fpath:
        fai_fpath = get_fai(genome)
    # Validate the path whichever way it was obtained.
    fai_fpath = verify_file(fai_fpath, is_critical=True)
    if not fai_fpath.endswith(('.fai', '.fa')):
        critical('Error: .fai or .fa is accepted.')

    chr_lengths = []

    if fai_fpath.endswith('.fa'):
        debug('Reading genome sequence (.fa) to get chromosome lengths')
        with open(fai_fpath, 'r') as handle:
            # Imported lazily so Biopython is only needed for FASTA input.
            from Bio import SeqIO
            for record in SeqIO.parse(handle, 'fasta'):
                chr_lengths.append((record.id, len(record.seq)))
    else:
        # The two readers are mutually exclusive: feeding FASTA lines to the
        # index parser would fail on int().
        debug('Reading genome index file (.fai) to get chromosome lengths')
        with open(fai_fpath, 'r') as handle:
            for line in handle:
                fields = line.split()
                if fields:
                    chr_lengths.append((fields[0], int(fields[1])))

    return chr_lengths
Example #23
def lift_over(fpath, from_genome, to_genome):
    """Convert ``fpath`` from one genome build to another with ``liftOver``.

    The chain file is looked up in the ``over.chain`` directory next to this
    module. The result is written to ``fpath`` with ``to_genome`` added as a
    suffix (via ``add_suffix``); unmapped records go to ``<result>.unMapped``.

    :param fpath: input file
    :param from_genome: source genome build name
    :param to_genome: target genome build name
    :return: path of the lifted-over file
    """
    chain_fname = f'{from_genome}To{to_genome.title()}.over.chain.gz'
    chain_file = join(dirname(__file__), 'over.chain', chain_fname)
    if not verify_file(chain_file):
        log.critical(f'Error: conversion from {from_genome} to {to_genome} is not supported!')

    out_fpath = add_suffix(fpath, to_genome)
    unmapped_fpath = f'{out_fpath}.unMapped'
    cmdline = f'liftOver {fpath} {chain_file} {out_fpath} {unmapped_fpath}'
    call_process.run(cmdline)
    return out_fpath
Example #24
def pair_dragen_directories(paths):
    """Pair DRAGEN tumor/normal output directories with their normal run.

    :param paths: iterable of DRAGEN output directories
    :return: list with one dict per paired normal sample, holding the
             ``'normal_run'`` and ``'tumor_normal_run'`` data and a
             ``'subject_id'``; unpaired directories are reported with ``warn``
    """
    # DRAGEN tumor/normal and normal directories are paired on the basis of the normal sample
    # name.
    # Tumor and normal sample names are extracted from the BAM header. Specifically, the BAM
    # sample name is retrieved from the 'SM' (sample) field of the '@RG' (read group) header line.
    # Tumor or normal identity of a sample is inferred from the BAM filename: if a BAM filename
    # contains the '_tumor.bam' suffix then it and the sample name is set as the tumor, otherwise
    # set as the normal sample.
    # The subject identifier is from the DRAGEN output directory name.
    # Assumes a one-to-one pairing for DRAGEN tumor/normal and normal output directories i.e. no
    # multiple tumor/normal runs to a single normal run.

    # Sort paths by normal sample name so that normal and tumor/normal are placed together
    paths_sorted = dict()
    for path in paths:
        dir_type = 'tumor_normal_run' if is_dragen_tumor_normal_directory(
            path) else 'normal_run'
        samples = get_samples_from_dragen_dir_bams(path)
        # Ensure we have found normal names
        if 'normal' not in samples:
            critical(
                f'Could not find normal sample name for DRAGEN directory {path}'
            )
        # Sort by normal sample name, add path, subject ID to stored data
        sample_normal = samples['normal']
        if sample_normal not in paths_sorted:
            paths_sorted[sample_normal] = dict()
        assert dir_type not in paths_sorted[sample_normal]
        run_data = samples
        run_data['path'] = path
        run_data['prefix'] = get_dragen_output_prefix(path)
        run_data['subject_id'] = get_subject_id_from_dragen_dir(path)
        paths_sorted[sample_normal][dir_type] = run_data

    # Differentiated paired and unpaired paths
    paths_unpaired = list()
    paths_paired = list()
    # 'runs' rather than 'paths': avoid rebinding the function parameter.
    for runs in paths_sorted.values():
        if 'normal_run' in runs and 'tumor_normal_run' in runs:
            # Ensure we have collected only one subject id for this set of inputs
            assert len({d['subject_id'] for d in runs.values()}) == 1
            runs['subject_id'] = runs['normal_run']['subject_id']
            paths_paired.append(runs)
        else:
            for dir_type, data in runs.items():
                paths_unpaired.append((dir_type, data['path']))
    # Emit warning for unpaired paths
    if paths_unpaired:
        paths_unpaired_strs = list()
        for dir_type, path in paths_unpaired:
            paths_unpaired_strs.append(f'{dir_type}: {path}')
        paths_unpaired_str = '\n\t'.join(paths_unpaired_strs)
        warn(f'could not pair DRAGEN directories:\n\t{paths_unpaired_str}')
    return paths_paired
Example #25
def is_small_target(bed_file=None):
    """Tell whether ``bed_file`` covers fewer than 10 million bases.

    Calls ``critical`` when ``ngs_utils.bed_utils`` cannot be imported.

    :param bed_file: path to a BED file, or None
    :return: a falsy value when ``bed_file`` is not set or is not a file,
             otherwise whether its total size is below 10 Mb
    """
    try:  # to allow optional pybedtools
        from ngs_utils.bed_utils import get_total_bed_size
    except ImportError:
        critical('Please, install pybedtools (conda install -c bioconda -y pybedtools)')
    else:
        # Only reachable when the import succeeded, so the helper is bound.
        return bed_file and isfile(bed_file) and get_total_bed_size(bed_file) < 10 * 1000 * 1000
Example #26
def determine_sex(work_dir, bam_fpath, avg_depth, genome, target_bed=None):
    debug('Determining sex')
    pybedtools.set_tempdir(safe_mkdir(join(work_dir, 'pybedtools_tmp')))

    male_bed = None
    for k in chry_key_regions_by_genome:
        if k in genome:
            male_bed = BedTool(chry_key_regions_by_genome.get(k))
    if not male_bed:
        warn('Warning: no male key regions for ' + genome + ', cannot identify sex')
        return None

    male_area_size = get_total_bed_size(male_bed)
    debug('Male region total size: ' + str(male_area_size))

    if target_bed:
        target_male_bed = join(work_dir, 'male.bed')
        with file_transaction(work_dir, target_male_bed) as tx:
        target_male_area_size = get_total_bed_size(target_male_bed)
        if target_male_area_size == 0:
            debug('The male non-PAR region does not overlap with the capture target - cannot determine sex.')
            return None
        male_bed = target_male_bed
        debug('WGS, determining sex based on chrY key regions coverage.')

    info('Detecting sex by comparing the Y chromosome key regions coverage and average coverage depth.')
    if not bam_fpath:
        critical('BAM file is required.')

    chry_mean_coverage = _calc_mean_coverage(work_dir, male_bed, bam_fpath, 1)
    debug('Y key regions average depth: ' + str(chry_mean_coverage))
    avg_depth = float(avg_depth)
    debug('Sample average depth: ' + str(avg_depth))
        debug('Sample average depth is too low (less than ' + str(AVG_DEPTH_THRESHOLD_TO_DETERMINE_SEX) +
             ') - cannot determine sex')
        return None

    if chry_mean_coverage == 0:
        debug('Y depth is 0 - it\s female')
        sex = 'F'
        factor = avg_depth / chry_mean_coverage
        debug('Sample depth / Y depth = ' + str(factor))
        if factor > FEMALE_Y_COVERAGE_FACTOR:  # if mean target coverage much higher than chrY coverage
            debug('Sample depth is more than ' + str(FEMALE_Y_COVERAGE_FACTOR) + ' times higher than Y depth - it\s female')
            sex = 'F'
            debug('Sample depth is not more than ' + str(FEMALE_Y_COVERAGE_FACTOR) + ' times higher than Y depth - it\s male')
            sex = 'M'
    debug('Sex is ' + sex)
    return sex
Example #27
 def calc_genomic_bp_offset(self):
     """Convert this side's transcript breakpoint offset to a genomic coordinate.

     Uses ``FusionSide.offset_to_genome_coord`` on ``self.trx`` and
     ``self.bp_offset``.

     :return: ``(genomic_coord, is_in_intron)``, or None when the offset
              could not be converted
     """
     genomic_coord, is_in_intron = FusionSide.offset_to_genome_coord(
         self.trx, self.bp_offset)
     if genomic_coord is None:
         # NOTE(review): the reporting call was missing from this line;
         # logger.critical is assumed - confirm. The message now reports the
         # inputs rather than builtin `id` and the None result.
         logger.critical(
             f'  Error: could not convert transcript {self.trx} offset {self.bp_offset} to genomic coordinate')
         return None
     return genomic_coord, is_in_intron
Example #28
def get_dragen_output_prefix(dirpath):
    """Return the output prefix of a DRAGEN run directory.

    The prefix is the name of the first ``*replay.json`` file found in
    ``dirpath`` with the ``-replay.json`` suffix removed. Calls ``critical``
    when the directory has no such file.

    :param dirpath: ``pathlib.Path`` of the DRAGEN output directory
    :return: the output prefix string
    """
    for fp in dirpath.iterdir():
        if fp.match('*replay.json'):
            return fp.name.replace('-replay.json', '')
    critical(
        f'could not determine output prefix for DRAGEN directory \'{dirpath}\''
    )
Example #29
def load_yaml_config(fpath):
    """Load the bcbio YAML configuration at ``fpath``.

    The path is checked with ``verify_file`` first. Calls ``critical`` when
    the file cannot be parsed.

    :param fpath: path to the YAML file
    :return: the object returned by ``load_yaml``
    """
    verify_file(fpath, is_critical=True)
    try:
        # Context manager so the handle is closed even when parsing fails.
        with open(fpath) as f:
            dic = load_yaml(f)
    except Exception:
        critical('Could not parse bcbio YAML ' + fpath)
    else:
        return dic
Example #30
def load_yaml_config(fpath):
    """Parse the bcbio YAML file at ``fpath`` and return the loaded object.

    The path is first checked with ``verify_file(..., is_critical=True)``.
    Any exception raised by ``_load_yaml`` is reported through ``critical``.
    """
    verify_file(fpath, is_critical=True)
    try:
        dic = _load_yaml(fpath)
    except Exception:
        critical('Could not parse bcbio YAML ' + fpath)
    else:
        return dic
Example #31
 def find_in_log(self, fname, is_critical=False, silent=True):
     """Locate ``fname`` in ``self.log_dir`` or, failing that, ``self.date_dir``.

     Returns the first candidate path that is an existing file. When none
     exists, the miss is reported with ``critical`` (if ``is_critical``) or
     ``err`` (if not ``silent``), and ``None`` is returned implicitly.
     """
     candidates = [join(folder, fname) for folder in (self.log_dir, self.date_dir)]
     found = next((path for path in candidates if isfile(path)), None)
     if found is not None:
         return found

     message = 'Log file not found as ' + ', '.join(candidates)
     if is_critical:
         critical(message)
     elif not silent:
         err(message)
Example #32
def secondary_conda_env(env_name='pcgr', is_critical=False):
    """Return the path of the sibling conda environment ``<current env>_<env_name>``.

    The current environment is taken as the grandparent directory of
    ``sys.executable``.

    Args:
        env_name: suffix appended (after an underscore) to the current
            environment path.
        is_critical: when true, a missing environment is reported through
            ``critical``.

    Returns:
        The environment path if that directory exists, otherwise ``None``.
    """
    py_path = sys.executable  # e.g. /miniconda/envs/umccrise/bin/python
    env_path = dirname(dirname(py_path))  # e.g. /miniconda/envs/umccrise
    env_path = env_path + '_' + env_name  # e.g. /miniconda/envs/umccrise_pcgr
    if not isdir(env_path):
        if is_critical:
            critical(f'Can\'t find environment {env_path}')
        # Never hand back a path to a directory that does not exist.
        return None
    return env_path
Example #33
 def find_in_log(self, fname, is_critical=False, silent=True):
     """Locate ``fname`` in ``self.log_dir`` or, failing that, ``self.date_dir``.

     Returns the first candidate path that is an existing file. When none
     exists, the miss is reported with ``critical`` (if ``is_critical``) or
     ``err`` (if not ``silent``), and ``None`` is returned implicitly.
     """
     candidates = [join(folder, fname) for folder in (self.log_dir, self.date_dir)]
     found = next((path for path in candidates if isfile(path)), None)
     if found is not None:
         return found

     message = 'Log file not found as ' + ', '.join(candidates)
     if is_critical:
         critical(message)
     elif not silent:
         err(message)
def _translate_from_start_codon(seq, to_stop, name):
    """ Seq must start with START. Translates until STOP.

    Args:
        seq: sequence to translate; its first three letters are checked
            against the start codons of the 'Standard' codon table.
        to_stop: forwarded to ``translate(to_stop=...)``.
        name: label used in the error message.

    Returns:
        The translation of ``_trim3(seq)`` with its first residue replaced
        by 'M'.
    """
    codon_table = CodonTable.unambiguous_dna_by_name['Standard']
    if str(seq[:3]).upper() not in codon_table.start_codons:
        logger.critical(name + ' expected to start with a START codon: ' +
                        str(seq[:3]))
    pep_5p = _trim3(seq).translate(to_stop=to_stop)
    # for the case if the peptide starts with an alternative start codon, replace it with M
    return 'M' + pep_5p[1:]
Example #35
def sort_bed(input_bed_fpath, output_bed_fpath=None, work_dir=None, fai_fpath=None, chr_order=None, genome=None):
    """Sort the regions of a BED file and write them to ``output_bed_fpath``.

    Args:
        input_bed_fpath: BED file to sort (checked with ``verify_bed``).
        output_bed_fpath: destination; defaults to an intermediate
            ``sorted`` file name derived from the input inside ``work_dir``.
        work_dir: working directory for intermediate/transactional files.
        fai_fpath: index file used to derive the chromosome order when
            ``chr_order`` is not given.
        chr_order: mapping chromosome name -> rank; unknown chromosomes get -1.
        genome: genome build name used to look up the index file when neither
            ``chr_order`` nor ``fai_fpath`` is given.

    Returns:
        The path of the sorted BED file.
    """
    input_bed_fpath = verify_bed(input_bed_fpath, is_critical=True)
    output_bed_fpath = adjust_path(output_bed_fpath) if output_bed_fpath \
        else intermediate_fname(work_dir, input_bed_fpath, 'sorted')

    debug('Sorting regions in ' + str(input_bed_fpath))
    if can_reuse(output_bed_fpath, input_bed_fpath):
        debug(output_bed_fpath + ' exists, reusing')
        return output_bed_fpath

    regions = []

    if not chr_order:
        if fai_fpath:
            fai_fpath = verify_file(fai_fpath)
        elif genome:
            fai_fpath = verify_file(ref.get_fai(genome))
        else:
            critical(
                'Either of chr_order, fai_fpath, or genome build name must be specified'
            )
        chr_order = get_chrom_order(fai_fpath=fai_fpath)

    with open(input_bed_fpath) as f:
        with file_transaction(work_dir, output_bed_fpath) as tx:
            with open(tx, 'w') as out:
                for l in f:
                    stripped = l.strip()
                    # Blank lines and '#' comment lines carry no region.
                    if not stripped or stripped.startswith('#'):
                        continue

                    fs = stripped.split('\t')
                    chrom = fs[0]
                    start = int(fs[1])
                    end = int(fs[2])
                    other_fields = fs[3:]
                    order = chr_order.get(chrom, -1)
                    regions.append(
                        Region(chrom, start, end, other_fields, order))

                # NOTE(review): only chrom/start/end are written back; the
                # extra columns collected above are not emitted - confirm
                # this is intended.
                for region in sorted(regions, key=lambda r: r.get_key()):
                    fs = [region.chrom, str(region.start), str(region.end)]
                    out.write('\t'.join(fs) + '\n')

    debug('Sorted ' + str(len(regions)) + ' regions, saved to ' +
          output_bed_fpath)
    return output_bed_fpath
Example #36
def get_bed_genes(bed_fpath):
    """ Returns a list from the 4th column of a bed file

    Entries that are empty or equal to '.' are dropped. The helper is
    imported lazily so that pybedtools stays an optional dependency; a
    failed import is reported through ``critical``.
    """
    try:  # to allow optional pybedtools
        from ngs_utils.bed_utils import get_genes_from_bed
    except ImportError:
        critical('Please, install pybedtools (conda install -c bioconda -y pybedtools)')
    else:
        gene_set, gene_list = get_genes_from_bed(bed_fpath)
        return [gn for gn in gene_list if (gn and gn != '.')]
Example #37
def is_small_target(bed_file=None):
    """Tell whether ``bed_file`` exists and spans fewer than 10,000,000 bases.

    The result is the short-circuited ``and`` chain, so a falsy ``bed_file``
    is returned as-is. The size helper is imported lazily so that pybedtools
    stays optional; a failed import is reported through ``critical``.
    """
    try:  # to allow optional pybedtools
        from ngs_utils.bed_utils import get_total_bed_size
    except ImportError:
        critical(
            'Please, install pybedtools (conda install -c bioconda -y pybedtools)'
        )
    else:
        return bed_file and isfile(
            bed_file) and get_total_bed_size(bed_file) < 10 * 1000 * 1000
Example #38
def get_read_group_sample_name(bam_fp):
    """Return the single sample name found in the @RG SM fields of a BAM header.

    Zero sample names, or more than one distinct name, is reported through
    ``critical``.
    """
    # Context manager closes the BAM handle once the header has been read.
    with pysam.AlignmentFile(bam_fp) as bam:
        header = bam.header.to_dict()
    # Read groups without an SM tag are ignored rather than raising KeyError,
    # so they end up in the "could not retrieve" report below.
    samples = {rg['SM'] for rg in header.get('RG', list()) if 'SM' in rg}
    if len(samples) == 0:
        critical(
            f'could not retrieve sample name from the @RG SM field in {bam_fp}'
        )
    elif len(samples) > 1:
        critical('found more than one sample name in the @RG SM fields for '
                 f'{bam_fp}: {", ".join(samples)}')
    return samples.pop()
Example #39
def get_bed_genes(bed_fpath):
    """ Returns a list from the 4th column of a bed file

    Entries that are empty or equal to '.' are dropped. The helper is
    imported lazily so that pybedtools stays an optional dependency; a
    failed import is reported through ``critical``.
    """
    try:  # to allow optional pybedtools
        from ngs_utils.bed_utils import get_genes_from_bed
    except ImportError:
        critical(
            'Please, install pybedtools (conda install -c bioconda -y pybedtools)'
        )
    else:
        gene_set, gene_list = get_genes_from_bed(bed_fpath)
        return [gn for gn in gene_list if (gn and gn != '.')]
Example #40
 def set_final_dir(bcbio_cnf, config_dir, final_dir=None, create_dir=False):
     """Resolve the bcbio "final" directory.

     Order of precedence:
       1. an explicit ``final_dir`` argument, returned unchanged;
       2. ``bcbio_cnf['upload']['dir']``, joined onto ``config_dir``;
       3. a ``final`` directory next to ``config_dir``.

     Args:
         bcbio_cnf: parsed bcbio config (mapping).
         config_dir: directory holding the bcbio config.
         final_dir: explicit override.
         create_dir: create the resolved directory before verifying it.
     """
     if final_dir:
         return final_dir
     elif 'upload' in bcbio_cnf and 'dir' in bcbio_cnf['upload']:
         final_dirname = bcbio_cnf['upload']['dir']
         final_dir = adjust_path(join(config_dir, final_dirname))
         if create_dir: safe_mkdir(final_dir)
         verify_dir(final_dir, 'upload directory specified in the bcbio config', is_critical=True)
     else:
         # Fall back to the default only when the config names no upload dir.
         final_dir = abspath(join(config_dir, pardir, 'final'))
         if create_dir: safe_mkdir(final_dir)
         if not verify_dir(final_dir):
             critical('If final directory it is not named "final", please, specify it in the bcbio config.')
     return final_dir
Example #41
 def set_final_dir(bcbio_cnf, config_dir, final_dir=None, create_dir=False):
     """Resolve the bcbio "final" directory.

     Order of precedence:
       1. an explicit ``final_dir`` argument, returned unchanged;
       2. ``bcbio_cnf['upload']['dir']``, joined onto ``config_dir``;
       3. a ``final`` directory next to ``config_dir``.

     Args:
         bcbio_cnf: parsed bcbio config (mapping).
         config_dir: directory holding the bcbio config.
         final_dir: explicit override.
         create_dir: create the resolved directory before verifying it.
     """
     if final_dir:
         return final_dir
     elif 'upload' in bcbio_cnf and 'dir' in bcbio_cnf['upload']:
         final_dirname = bcbio_cnf['upload']['dir']
         final_dir = adjust_path(join(config_dir, final_dirname))
         if create_dir: safe_mkdir(final_dir)
         verify_dir(final_dir, 'upload directory specified in the bcbio config', is_critical=True)
     else:
         # Fall back to the default only when the config names no upload dir.
         final_dir = abspath(join(config_dir, pardir, 'final'))
         if create_dir: safe_mkdir(final_dir)
         if not verify_dir(final_dir):
             critical('If final directory it is not named "final", please, specify it in the bcbio config.')
     return final_dir
Example #42
def annotate_target(work_dir, target_bed, genome_build):
    """Annotate ``target_bed`` with the external bed_annotation tool.

    An existing up-to-date output is reused. Otherwise the executable is
    looked up as ``annotate_bed.py`` and then ``bed_annotation``, run on the
    target, and the result is passed through ``clean_bed``.

    Returns:
        Path of the annotated (and cleaned) BED file.
    """
    output_fpath = intermediate_fname(work_dir, target_bed, 'ann')
    if can_reuse(output_fpath, target_bed):
        info(output_fpath + ' exists, reusing')
        return output_fpath

    # Only fail when neither executable name is available.
    bed_annotation = which('annotate_bed.py') or which('bed_annotation')
    if not bed_annotation:
        critical('Error: bed_annotation not found in PATH, please install `conda install -c vladsaveliev bed_annotation`.')

    cmdline = f'{bed_annotation} {target_bed} -g {genome_build} -o {output_fpath}'
    run(cmdline, output_fpath, stdout_to_outputfile=False)
    output_fpath = clean_bed(output_fpath, work_dir)
    return output_fpath
Example #43
def find_fastq_pairs(fpaths):
    """Group FastQ files into (left, right) pairs keyed by sample name.

    Files are recognised by extension (.fq, .fq.gz, .fastq, .fastq.gz).
    A trailing ``_1``/``_R1`` marks left reads and ``_2``/``_R2`` right
    reads; the sample name is the remaining stem with any ``_S<digits>``
    suffix removed and '-' replaced by '_'. Files without a read marker are
    keyed by their whole stem.

    Returns:
        dict mapping sample name -> (left_fpath, right_fpath), containing
        only samples for which both mates were found.
    """
    info('Finding FastQ pairs...')
    fastqs_by_sample_name = dict()
    for fpath in fpaths:
        fn, ext = splitext_plus(basename(fpath))
        if ext not in ['.fq', '.fq.gz', '.fastq', '.fastq.gz']:
            continue

        sname, l_fpath, r_fpath = None, None, None
        if fn.endswith('_1'):
            sname = fn[:-2]
            l_fpath = fpath
        if fn.endswith('_R1'):
            sname = fn[:-3]
            l_fpath = fpath
        if fn.endswith('_2'):
            sname = fn[:-2]
            r_fpath = fpath
        if fn.endswith('_R2'):
            sname = fn[:-3]
            r_fpath = fpath

        if sname:
            m = re.match(r'(.*)_S\d+', sname)
            if m:
                sname = m.group(1)
            sname = sname.replace('-', '_')
        else:
            # No read marker: fall back to the file stem instead of
            # clobbering a detected sample name.
            sname = fn
            info('Cannot detect file for ' + sname)

        l, r = fastqs_by_sample_name.get(sname, (None, None))
        if l and l_fpath:
            critical('Duplicated left FastQ files for ' + sname + ': ' +
                     l + ' and ' + l_fpath)
        if r and r_fpath:
            critical('Duplicated right FastQ files for ' + sname + ': ' +
                     r + ' and ' + r_fpath)
        fastqs_by_sample_name[sname] = l or l_fpath, r or r_fpath

    fixed_fastqs_by_sample_name = dict()
    for sname, (l, r) in fastqs_by_sample_name.items():
        if not l:
            err('ERROR: for sample ' + sname + ', left reads not found')
        if not r:
            err('ERROR: for sample ' + sname + ', right reads not found')
        if l and r:
            fixed_fastqs_by_sample_name[sname] = l, r

    return fixed_fastqs_by_sample_name
Example #44
def calc_bases_within_threshs(bases_by_depth, total_size, depth_thresholds):
    """Count bases covered at or above each depth threshold.

    Args:
        bases_by_depth: mapping depth -> number of bases at that depth.
        total_size: denominator for the rates; rates stay ``None`` unless
            it is positive.
        depth_thresholds: thresholds to evaluate, in output order.

    Returns:
        ``(bases_within_threshs, rates_within_threshs)``: two OrderedDicts
        keyed by threshold. A rate above 1 is reported through ``critical``.
    """
    bases_within_threshs = OrderedDict.fromkeys(depth_thresholds, 0)
    rates_within_threshs = OrderedDict.fromkeys(depth_thresholds)

    for depth, n_bases in bases_by_depth.items():
        for thresh in depth_thresholds:
            if depth >= thresh:
                bases_within_threshs[thresh] += n_bases

    for thresh in depth_thresholds:
        covered = bases_within_threshs[thresh]
        if not total_size > 0:
            continue
        rate = 1.0 * bases_within_threshs[thresh] / total_size
        if rate > 1:
            critical('Error: rate is > 1: rate = ' + str(rate) + ', bases = ' + str(covered) + ', size = ' + str(total_size))
        rates_within_threshs[thresh] = rate

    return bases_within_threshs, rates_within_threshs
Example #45
def sort_bed(input_bed_fpath, output_bed_fpath=None, work_dir=None, fai_fpath=None, chr_order=None, genome=None):
    """Sort the regions of a BED file and write them to ``output_bed_fpath``.

    Args:
        input_bed_fpath: BED file to sort (checked with ``verify_bed``).
        output_bed_fpath: destination; defaults to an intermediate
            ``sorted`` file name derived from the input inside ``work_dir``.
        work_dir: working directory for intermediate/transactional files.
        fai_fpath: index file used to derive the chromosome order when
            ``chr_order`` is not given.
        chr_order: mapping chromosome name -> rank; unknown chromosomes get -1.
        genome: genome build name used to look up the index file when neither
            ``chr_order`` nor ``fai_fpath`` is given.

    Returns:
        The path of the sorted BED file.
    """
    input_bed_fpath = verify_bed(input_bed_fpath, is_critical=True)
    output_bed_fpath = adjust_path(output_bed_fpath) if output_bed_fpath \
        else intermediate_fname(work_dir, input_bed_fpath, 'sorted')

    debug('Sorting regions in ' + str(input_bed_fpath))
    if can_reuse(output_bed_fpath, input_bed_fpath):
        debug(output_bed_fpath + ' exists, reusing')
        return output_bed_fpath

    regions = []

    if not chr_order:
        if fai_fpath:
            fai_fpath = verify_file(fai_fpath)
        elif genome:
            fai_fpath = verify_file(ref.get_fai(genome))
        else:
            critical('Either of chr_order, fai_fpath, or genome build name must be specified')
        chr_order = get_chrom_order(fai_fpath=fai_fpath)

    with open(input_bed_fpath) as f:
        with file_transaction(work_dir, output_bed_fpath) as tx:
            with open(tx, 'w') as out:
                for l in f:
                    stripped = l.strip()
                    # Blank lines and '#' comment lines carry no region.
                    if not stripped or stripped.startswith('#'):
                        continue

                    fs = stripped.split('\t')
                    chrom = fs[0]
                    start = int(fs[1])
                    end = int(fs[2])
                    other_fields = fs[3:]
                    order = chr_order.get(chrom, -1)
                    regions.append(Region(chrom, start, end, other_fields, order))

                # NOTE(review): only chrom/start/end are written back; the
                # extra columns collected above are not emitted - confirm
                # this is intended.
                for region in sorted(regions, key=lambda r: r.get_key()):
                    fs = [region.chrom, str(region.start), str(region.end)]
                    out.write('\t'.join(fs) + '\n')

    debug('Sorted ' + str(len(regions)) + ' regions, saved to ' + output_bed_fpath)
    return output_bed_fpath
Example #46
def find_fastq_pairs(fpaths):
    """Group FastQ files into (left, right) pairs keyed by sample name.

    Files are recognised by extension (.fq, .fq.gz, .fastq, .fastq.gz).
    A trailing ``_1``/``_R1`` marks left reads and ``_2``/``_R2`` right
    reads; the sample name is the remaining stem with any ``_S<digits>``
    suffix removed and '-' replaced by '_'. Files without a read marker are
    keyed by their whole stem.

    Returns:
        dict mapping sample name -> (left_fpath, right_fpath), containing
        only samples for which both mates were found.
    """
    info('Finding FastQ pairs...')
    fastqs_by_sample_name = dict()
    for fpath in fpaths:
        fn, ext = splitext_plus(basename(fpath))
        if ext not in ['.fq', '.fq.gz', '.fastq', '.fastq.gz']:
            continue

        sname, l_fpath, r_fpath = None, None, None
        if fn.endswith('_1'):
            sname = fn[:-2]
            l_fpath = fpath
        if fn.endswith('_R1'):
            sname = fn[:-3]
            l_fpath = fpath
        if fn.endswith('_2'):
            sname = fn[:-2]
            r_fpath = fpath
        if fn.endswith('_R2'):
            sname = fn[:-3]
            r_fpath = fpath

        if sname:
            m = re.match(r'(.*)_S\d+', sname)
            if m:
                sname = m.group(1)
            sname = sname.replace('-', '_')
        else:
            # No read marker: fall back to the file stem instead of
            # clobbering a detected sample name.
            sname = fn
            info('Cannot detect file for ' + sname)

        l, r = fastqs_by_sample_name.get(sname, (None, None))
        if l and l_fpath:
            critical('Duplicated left FastQ files for ' + sname + ': ' + l + ' and ' + l_fpath)
        if r and r_fpath:
            critical('Duplicated right FastQ files for ' + sname + ': ' + r + ' and ' + r_fpath)
        fastqs_by_sample_name[sname] = l or l_fpath, r or r_fpath

    fixed_fastqs_by_sample_name = dict()
    for sname, (l, r) in fastqs_by_sample_name.items():
        if not l:
            err('ERROR: for sample ' + sname + ', left reads not found')
        if not r:
            err('ERROR: for sample ' + sname + ', right reads not found')
        if l and r:
            fixed_fastqs_by_sample_name[sname] = l, r

    return fixed_fastqs_by_sample_name
Example #47
def main():
    """Command-line entry point: save the CDS features of a genome build as BED.

    Options:
        -g/--genome     genome build name (required).
        -c/--canonical  restrict the output to canonical transcripts.

    The output path is ``<script dir>/../<genome>/bed/CDS-canonical.bed``.
    """
    # optparse derives `dest` from the long option name, so opts.genome and
    # opts.canonical are populated without an explicit `dest`.
    options = [
        (['-g', '--genome'], dict(
            help='Genome build. Accepted values: ' +
            ', '.join(ebl.SUPPORTED_GENOMES),
        )),
        # NOTE(review): `store_true` is assumed from how opts.canonical is
        # used below - confirm.
        (['-c', '--canonical'], dict(
            action='store_true',
            help='Use canonical only',
        )),
    ]
    parser = OptionParser()
    for args, kwargs in options:
        parser.add_option(*args, **kwargs)
    opts, args = parser.parse_args()

    if not opts.genome:
        logger.critical(
            'Error: please, specify genome build name with -g (e.g. `-g hg19`)'
        )
    genome = opts.genome

    logger.debug('Getting features from storage')
    features_bed = ebl.get_all_features(genome)
    if features_bed is None:
        logger.critical('Genome ' + genome + ' is not supported. Supported: ' +
                        ', '.join(ebl.SUPPORTED_GENOMES))

    logger.warn('Extracting features from Ensembl GTF')
    features_bed = features_bed.filter(
        lambda x: x[ebl.BedCols.FEATURE] == 'CDS')
    if opts.canonical:
        # NOTE(review): the filter argument was missing from the source;
        # `get_only_canonical_filter` is assumed - confirm against the module.
        features_bed = features_bed.filter(
            ebl.get_only_canonical_filter(genome))

    logger.warn('Saving CDS regions...')
    output_fpath = adjust_path(
        join(dirname(__file__), pardir, genome, 'bed', 'CDS-canonical.bed'))
    with file_transaction(None, output_fpath) as tx:
        # NOTE(review): the body of this block was missing; saving the
        # filtered features to the transactional path is assumed - confirm.
        features_bed.saveas(tx)
    logger.warn('Done, saved to ' + output_fpath)
Example #48
def main():
    """Command-line entry point: sort a BED file with ``sort_bed``.

    Usage: ``<script> -o Output_BED_file -g hg19 Input_BED_file``.
    The input BED is the first positional argument; ``-o`` is required.
    """
    parser = OptionParser(usage='Usage: ' + basename(__file__) + ' -o Output_BED_file -g hg19 Input_BED_file')
    parser.add_option('-o', '--output-bed', dest='output_fpath')
    parser.add_option('-g', '--genome', dest='genome')
    parser.add_option('--fai', dest='fai_fpath')
    (opts, args) = parser.parse_args(sys.argv[1:])

    # parser.error() prints the usage line plus the message and exits.
    if len(args) < 1:
        parser.error('Input BED file is required')

    if not opts.output_fpath:
        parser.error('Output BED file is required (-o)')

    sort_bed(input_bed_fpath=verify_bed(args[0], is_critical=True),
             output_bed_fpath=opts.output_fpath,
             fai_fpath=opts.fai_fpath,
             genome=opts.genome)
    def calc_genomic_bp_pos(self):
        """Compute and store the genomic position of this side's breakpoint.

        On success sets ``self.bp_genomic_pos`` and ``self.bp_is_in_intron``.

        Returns:
            ``True`` when the position was set; ``False`` when the offset
            could not be converted or the conversion yielded -1.
        """
        genomic_coord, is_in_intron = FusionSide.offset_to_genome_coord(
            self.trx, self.bp_offset)
        if genomic_coord is None:
            # The coordinate is always None here, so report the inputs.
            logger.critical(f'  Error: could not convert transcript {self.trx.id} '
                            f'offset {self.bp_offset} to genomic coordinate')
            return False

        if genomic_coord == -1:
            # NOTE(review): the logging call was missing from the source;
            # a non-fatal warning is assumed because the fusion is merely
            # skipped - confirm the intended level.
            logger.warn(
                f'  Fusion in takes the entire transcript {self.trx.id} '
                f'(genomic_coord={genomic_coord}, bp_offset={self.bp_offset}). '
                f'That\'s suspicious, so we are skipping it.')
            return False

        self.bp_genomic_pos = genomic_coord
        self.bp_is_in_intron = is_in_intron
        return True
Example #50
def read_samples(args):
    """Split command-line inputs into BAM files and FastQ pairs.

    BAMs are detected with ``find_bams``; every remaining argument that
    passes ``verify_file`` is handed to ``find_fastq_pairs``. Having no
    usable input at all, or a sample present both as BAM and as FastQ, is
    reported through ``critical``.

    Returns:
        ``(fastqs_by_sample, bam_by_sample)``.
    """
    bam_by_sample = find_bams(args)
    if bam_by_sample:
        bam_count = len(bam_by_sample)
        plural = 's' if bam_count > 1 else ''
        info('Found ' + str(bam_count) + ' BAM file' + plural)

    input_not_bam = []
    for fpath in args:
        if adjust_path(fpath) in bam_by_sample:
            continue
        verified = verify_file(fpath)
        if verified:
            input_not_bam.append(verified)

    fastqs_by_sample = dict()
    if not (input_not_bam or bam_by_sample):
        critical('No correct input files')

    if not input_not_bam:
        return fastqs_by_sample, bam_by_sample

    info('Input ' + str(len(input_not_bam)) + ' correct input non-BAM files')
    fastqs_by_sample = find_fastq_pairs(input_not_bam)
    if fastqs_by_sample:
        info('Found ' + str(len(fastqs_by_sample)) + ' FastQ pairs')
    shared_samples = set(fastqs_by_sample) & set(bam_by_sample)
    if shared_samples:
        critical('The following samples both had input BAMs and FastQ: ' + ', '.join(shared_samples))

    return fastqs_by_sample, bam_by_sample
Example #51
def make_cluster_cmdl(log_dir, app_name=''):
    """ Generates cluster command line parameters for snakemake

    Args:
        log_dir: directory passed to the submit script; the jobscript (if
            any) is also written there as ``jobscript.sh``.
        app_name: application name passed to the submit script.

    Returns:
        A string starting with `` --cluster "..."``, with `` --jobscript
        "..."`` appended when ``hpc.cluster_jobscript`` is set.
    """
    from hpc_utils import hpc
    if not hpc.cluster_cmd:
        logger.critical(f'Automatic cluster submission is not supported for the machine "{hpc.name or hpc.hostname}"')

    cluster_submitter = get_submit_script()
    timestamp = datetime.now().strftime('%Y_%m_%d_%H_%M_%S')
    cluster_cmdl = f' --cluster "{cluster_submitter} {timestamp} {log_dir} {app_name}"'

    # Also overriding jobscript?
    jobscript = hpc.cluster_jobscript
    if jobscript:
        jobscript_file = join(log_dir, 'jobscript.sh')
        with open(jobscript_file, 'w') as f_out:
            # The literal '{path}' placeholder is filled with the current PATH.
            f_out.write(jobscript.replace('{path}', os.environ["PATH"]))
        cluster_cmdl += f' --jobscript "{jobscript_file}"'

    return cluster_cmdl
Example #52
def sort_bed_gsort(input_bed_fpath, output_bed_fpath=None, work_dir=None, fai_fpath=None, genome=None):
    """Sort a BED file by running the external ``gsort`` command.

    Args:
        input_bed_fpath: BED file to sort (checked with ``verify_bed``).
        output_bed_fpath: destination; defaults to an intermediate
            ``sorted`` file name derived from the input inside ``work_dir``.
        work_dir: working directory for intermediate/transactional files.
        fai_fpath: index file passed to ``gsort``.
        genome: genome build name used to look up the index file when
            ``fai_fpath`` is not given.

    Returns:
        The path of the sorted BED file.
    """
    input_bed_fpath = verify_bed(input_bed_fpath, is_critical=True)
    output_bed_fpath = adjust_path(output_bed_fpath) if output_bed_fpath \
        else intermediate_fname(work_dir, input_bed_fpath, 'sorted')

    debug('Sorting regions in ' + str(input_bed_fpath))
    if can_reuse(output_bed_fpath, input_bed_fpath):
        debug(output_bed_fpath + ' exists, reusing')
        return output_bed_fpath

    if fai_fpath:
        fai_fpath = verify_file(fai_fpath)
    elif genome:
        fai_fpath = verify_file(ref.get_fai(genome))
    else:
        # Only an error when neither source of the index file is available.
        critical('Either of fai_fpath or genome build name must be specified')

    with file_transaction(work_dir, output_bed_fpath) as tx:
        run(f'gsort {input_bed_fpath} {fai_fpath}', output_fpath=tx)

    return output_bed_fpath
Example #53
def load_bcbio_cnf(config_dir, silent=False):
    """Find and load the single bcbio YAML config inside ``config_dir``.

    A bcbio YAML is any ``*.yaml`` file that does not end with
    ``-template.yaml`` and whose parsed content has a "details" key. Finding
    no YAML at all, no bcbio YAML, or more than one is reported through
    ``critical``.

    Returns:
        ``(parsed_config, yaml_fpath)``.
    """
    all_yamls = [
        abspath(join(config_dir, fname))
        for fname in listdir(config_dir)
        if fname.endswith('.yaml')]
    if len(all_yamls) == 0:
        critical('No YAML file in the config directory.')

    # Keep the parsed content next to the path so the selected config does
    # not have to be parsed a second time.
    bcbio_yamls = []
    cnf_by_fpath = dict()
    for fpath in all_yamls:
        if not fpath.endswith('-template.yaml'):
            cnf = load_yaml_config(fpath)
            if 'details' in cnf:
                bcbio_yamls.append(fpath)
                cnf_by_fpath[fpath] = cnf
    if len(bcbio_yamls) == 0:
        critical('No bcbio YAMLs found in the config directory: ' + config_dir +
                 ' (only ' + ', '.join(map(basename, all_yamls)) +
                 ' which do not have the "details" section)')
    if len(bcbio_yamls) > 1:
        critical('More than one bcbio YAML file found in the config directory ' +
                 config_dir + ': ' + ' '.join(bcbio_yamls))
    yaml_fpath = bcbio_yamls[0]
    if not silent: info('Using bcbio YAML config: ' + yaml_fpath)
    return cnf_by_fpath[yaml_fpath], yaml_fpath
def _annotate(bed, ref_bed, chr_order, fai_fpath, work_dir, ori_col_num,
              high_confidence=False, reannotate=False, is_debug=False, **kwargs):
    """Intersect ``bed`` with the reference features and resolve the overlaps.

    Args:
        bed: regions to annotate (object exposing ``intersect``).
        ref_bed: reference features to intersect with.
        chr_order: chromosome order, forwarded to ``_resolve_ambiguities``.
        fai_fpath: passed as ``g=`` to ``intersect`` when it has 2 columns.
        work_dir: directory for temporary and debug files.
        ori_col_num: number of columns in the input regions.
        high_confidence: use the HUGO column instead of the GENE column as
            the gene name of an overlap.
        reannotate: when false, the first extra input column is kept as the
            region's gene and overlaps with other genes are discarded.
        is_debug: cache the intersection in ``work_dir/intersection.bed``.
        **kwargs: forwarded to ``_resolve_ambiguities``.

    Returns:
        The result of ``_resolve_ambiguities``.
    """
    intersection_bed = None
    intersection_fpath = None
    pybedtools.set_tempdir(safe_mkdir(join(work_dir, 'bedtools')))
    if is_debug:
        intersection_fpath = join(work_dir, 'intersection.bed')
        if isfile(intersection_fpath):
            info('Loading from ' + intersection_fpath)
            intersection_bed = BedTool(intersection_fpath)
    if not intersection_bed:
        if count_bed_cols(fai_fpath) == 2:
            debug('Fai fields size is 2 ' + fai_fpath)
            intersection_bed = bed.intersect(ref_bed, wao=True, g=fai_fpath)
        else:
            debug('Fai fields is ' + str(count_bed_cols(fai_fpath)) + ', not 2')
            intersection_bed = bed.intersect(ref_bed, wao=True)
    if is_debug and not isfile(intersection_fpath):
        # Actually persist the intersection before announcing it was saved.
        intersection_bed = intersection_bed.saveas(intersection_fpath)
        debug('Saved intersection to ' + intersection_fpath)

    total_annotated = 0
    total_uniq_annotated = 0
    total_off_target = 0

    met = set()

    overlaps_by_tx_by_gene_by_loc = OrderedDefaultDict(lambda: OrderedDefaultDict(lambda: defaultdict(list)))

    expected_fields_num = ori_col_num + len(ba.BedCols.cols[:-4]) + 1
    for i, intersection_fields in enumerate(intersection_bed):
        inters_fields_list = list(intersection_fields)
        if len(inters_fields_list) < expected_fields_num:
            critical(
                f'Cannot parse the reference BED file - unexpected number of lines '
                f'({len(inters_fields_list)} in {inters_fields_list} (less than {expected_fields_num})')

        a_chr, a_start, a_end = intersection_fields[:3]
        a_extra_columns = intersection_fields[3:ori_col_num]

        # Pad the overlap part of the line to the full set of reference columns.
        overlap_fields = [None for _ in ba.BedCols.cols]
        overlap_fields[:len(intersection_fields[ori_col_num:])] = intersection_fields[ori_col_num:]

        keep_gene_column = not reannotate
        a_gene = None
        if keep_gene_column:
            a_gene = a_extra_columns[0]

        e_chr = overlap_fields[0]
        overlap_size = int(intersection_fields[-1])
        assert e_chr == '.' or a_chr == e_chr, f'Error on line {i}: chromosomes don\'t match ({a_chr} vs {e_chr}). Line: {intersection_fields}'

        reg = (a_chr, int(a_start), int(a_end), tuple(a_extra_columns))

        if e_chr == '.':
            # No overlapping reference feature: record the region as off-target.
            total_off_target += 1
            overlaps_by_tx_by_gene_by_loc[reg][a_gene] = OrderedDefaultDict(list)
        else:
            total_annotated += 1
            if (a_chr, a_start, a_end) not in met:
                total_uniq_annotated += 1
                met.add((a_chr, a_start, a_end))

            e_gene = overlap_fields[ba.BedCols.GENE] if not high_confidence else overlap_fields[ba.BedCols.HUGO]
            if keep_gene_column and e_gene != a_gene:
                # The overlap belongs to a different gene than the one kept
                # from the input: keep the region but drop this overlap.
                overlaps_by_tx_by_gene_by_loc[reg][a_gene] = OrderedDefaultDict(list)
            else:
                transcript_id = overlap_fields[ba.BedCols.ENSEMBL_ID]
                overlaps_by_tx_by_gene_by_loc[reg][e_gene][transcript_id].append((overlap_fields, overlap_size))

    info('  Total annotated regions: ' + str(total_annotated))
    info('  Total unique annotated regions: ' + str(total_uniq_annotated))
    info('  Total off target regions: ' + str(total_off_target))
    info('Resolving ambiguities...')
    annotated = _resolve_ambiguities(overlaps_by_tx_by_gene_by_loc, chr_order, **kwargs)

    return annotated
def _proc_ucsc(inp, output_fpath, chr_order):  #, approved_gene_by_name, approved_gnames_by_prev_gname, approved_gnames_by_synonym):
    """Parse UCSC/RefSeq-style transcript lines into Gene/Transcript/Exon objects.

    Args:
        inp: iterable of tab-separated lines; empty lines and lines starting
            with '#' are skipped. Lines with more than 9 fields use the
            13-column layout (with txStart/txEnd); shorter lines use the
            9-column layout shown in the table inside the body.
        output_fpath: unused here; kept for the callers' signature.
        chr_order: mapping chromosome name -> rank, stored on each Gene.

    Returns:
        dict mapping ``(gene_symbol, chrom)`` -> Gene.
    """
    gene_by_name_and_chrom = dict()

    for l in inp:
        if l and not l.startswith('#'):
            fs = l.replace('\n', '').split('\t')
            txStart, txEnd = None, None
            if len(fs) > 9:
                (_, transcript_id, ucsc_chrom, strand, txStart, txEnd, cdsStart, cdsEnd,
                 exonCount, exonStarts, exonEnds, _, gene_symbol) = fs[:13]
                txStart, txEnd = int(txStart), int(txEnd)
            else:
                (transcript_id, ucsc_chrom, strand, cdsStart, cdsEnd,
                 exonCount, exonStarts, exonEnds, gene_symbol) = fs
            cdsStart = int(cdsStart)
            cdsEnd = int(cdsEnd)
            exonCount = int(exonCount)
            exonStarts = [int(v) + 1 for v in exonStarts.split(',') if v]
            # Materialise as a list: it is indexed below, which a lazy map
            # object does not support.
            exonEnds = [int(v) for v in exonEnds.split(',') if v]

            # Derive the transcript bounds from the exons only when the line
            # did not provide them (a legitimate 0 start must be kept).
            if txStart is None:
                txStart = exonStarts[0] - 1
            if txEnd is None:
                txEnd = exonEnds[exonCount - 1]

            assert txStart <= cdsStart, l
            assert txEnd >= cdsEnd, l

            if (gene_symbol, ucsc_chrom) not in gene_by_name_and_chrom:
                gene = Gene(ucsc_chrom, chr_order.get(ucsc_chrom), gene_symbol, strand)
                gene_by_name_and_chrom[(gene_symbol, ucsc_chrom)] = gene
            gene = gene_by_name_and_chrom[(gene_symbol, ucsc_chrom)]

            transcript = Transcript(gene, transcript_id, txStart, txEnd, strand)  # one line - one transcript
            if transcript_id.startswith('NR_'):
                transcript.coding = False
                transcript.biotype = 'RNA'
            elif transcript_id.startswith('NM_'):
                transcript.coding = True
                transcript.biotype = 'protein_coding'
            else:
                critical('Unknown transcript ID prefix ' + transcript_id.split('_')[0] + ' in ' + transcript_id)

            r'''            cdsStart    cdsEnd      exonsCount  exonStarts      exonsEnds
NM_001303242	chr1	+	150981108	151006710	7           150980866,150990287,150990942,150997086,150997990,150999708,151006281,150981147,150990380,150991145,150997271,150998149,150999803,151008189,	PRUNE
NM_021222	    chr1	+	150981108	151006710	8	        150980866,150990287,150990942,150997086,150997990,150999708,151001261,151006281,	150981147,150990380,150991145,150997271,150998149,150999803,151001420,151008189,	PRUNE
NM_001303243	chr1	+	150991069	151006710	6	        150980866,150990287,150990942,150999708,151001261,151006281,	150981147,150990380,150991145,150999803,151001420,151008189,	PRUNE
NM_001303229	chr1	+	150998016	151006710	7	        150980866,150990287,150997086,150997990,150999708,151001261,151006281,	150981147,150990380,150997271,150998149,150999803,151001420,151008189,	PRUNE
NR_130132	    chr1	+	151008189	151008189	4	        150980866,150990287,150999708,151006281,	150981147,150990380,150999803,151008189,	PRUNE
NR_130131	    chr1	+	151008189	151008189	5	        150980866,150990287,150999708,151001261,151006281,	150981147,150990380,150999803,151001420,151008189,	PRUNE
NR_130130	    chr1	+	151008189	151008189	4	        150980866,150997990,150999708,151006281,	150981147,150998149,150999803,151008189,	PRUNE
NR_130135	    chr1	+	151008189	151008189	5	        150980866,150990287,150997990,150999708,151006281,	150981147,150990380,150998149,150999803,151008189,	PRUNE

             exonSt    cdsSt     exonEnd exonSt    exonEnd     exonSt    cdsEnd    exonEnd  ncRNA: CDS is empty
NM_001303242 0980866/ *0981108* \0981147 0990287/ \0990380 ... 1006281/ *1006710* \1008189
NM_001303243 0980866/ *0991069* \0981147 0990287/ \0990380 ... 1006281/ *1006710* \1008189
NM_001303229 0980866/ *0998016* \0981147 0990287/ \0990380 ... 1006281/ *1006710* \1008189
NR_130132	 0980866/           \0981147 0990287/ \0990380 ... 1006281/           \1008189 *1008189* *1008189*
            '''
            # NOTE(review): the exon counter was missing from the zip() in
            # the source; 0-based numbering via range(exonCount) is assumed -
            # confirm the intended base.
            for exon_number, eStart, eEnd in zip(
                        range(exonCount),
                        [s for s in exonStarts if s],
                        [e for e in exonEnds if e]):
                eStart -= 1

                if eEnd <= cdsStart or eStart > cdsEnd:
                    # Exon lies entirely outside the CDS.
                    biotype = 'UTR' if transcript.coding else transcript.biotype
                    transcript.exons.append(Exon(transcript, eStart, eEnd, 'Exon', exon_number, biotype))

                else:
                    assert transcript.coding, 'Non-coding NM_ transcript ' + transcript_id

                    # Split the exon into UTR part(s) and the CDS part.
                    if eStart < cdsStart:
                        transcript.exons.append(Exon(transcript, eStart, cdsStart, 'Exon', exon_number, biotype='UTR'))
                    if eEnd > cdsEnd:
                        transcript.exons.append(Exon(transcript, cdsEnd, eEnd, 'Exon', exon_number, biotype='UTR'))
                    transcript.exons.append(Exon(transcript, max(cdsStart, eStart), min(cdsEnd, eEnd), 'CDS', exon_number))

    return gene_by_name_and_chrom
Example #56
def get_executable():
    """Return the result of ``which('sambamba')``.

    A falsy lookup result is reported through ``critical`` before being
    returned.
    """
    sambamba_path = which('sambamba')
    if sambamba_path:
        return sambamba_path
    critical('Error: sambamba executable is not found')
    return sambamba_path
Example #57
def check_genome(genome):
    """Report through ``critical`` when ``genome`` is not in SUPPORTED_GENOMES."""
    if genome in SUPPORTED_GENOMES:
        return
    supported = ', '.join(SUPPORTED_GENOMES)
    critical('Genome ' + str(genome) + ' is not supported. Supported genomes: ' + supported)
def annotate(input_bed_fpath, output_fpath, work_dir, genome=None,
             reannotate=True, high_confidence=False, only_canonical=False,
             coding_only=False, short=False, extended=False, is_debug=False, **kwargs):
    """Annotate the regions of a BED file with overlapping features and write them out.

    Args:
        input_bed_fpath: path to the BED file to annotate.
        output_fpath: path the annotated regions are written to.
        work_dir: directory for intermediate files (sorted BED, bedtools
            temp dir, debug dumps).
        genome: genome name used to fetch features and, when set, the
            reference index and chromosome order. When falsy, the chromosome
            order is taken from the input BED itself.
        reannotate: passed on to ``_annotate``; forced to true when the input
            has exactly 3 columns.
        high_confidence: filter features with ``ba.high_confidence_filter``.
        only_canonical: filter features with ``ba.get_only_canonical_filter``.
        coding_only: filter features with ``ba.protein_coding_filter``.
        short: write only the first 4 columns of every annotated row.
        extended: write every column but the last, preceded by ``##`` comment
            lines and a header line. Takes precedence over ``short``.
        is_debug: additionally save the input regions and the features into
            ``work_dir``.
        **kwargs: forwarded to ``_annotate``.

    Returns:
        ``output_fpath``.
    """
    debug('Getting features from storage')
    features_bed = ba.get_all_features(genome)
    if features_bed is None:
        # str() keeps the message printable when genome is None.
        critical('Genome ' + str(genome) + ' is not supported. Supported: ' + ', '.join(ba.SUPPORTED_GENOMES))

    if genome:
        fai_fpath = reference_data.get_fai(genome)
        chr_order = reference_data.get_chrom_order(genome)
    else:
        # Without a genome there is no reference index; fall back to the
        # chromosome order found in the input BED.
        fai_fpath = None
        chr_order = bed_chrom_order(input_bed_fpath)

    input_bed_fpath = sort_bed(input_bed_fpath, work_dir=work_dir, chr_order=chr_order, genome=genome)

    ori_bed = BedTool(input_bed_fpath)
    ori_col_num = ori_bed.field_count()
    # A 3-column BED carries no annotation yet, so it always gets annotated.
    reannotate = reannotate or ori_col_num == 3
    pybedtools.set_tempdir(safe_mkdir(join(work_dir, 'bedtools')))
    ori_bed = BedTool(input_bed_fpath)

    if high_confidence:
        features_bed = features_bed.filter(ba.high_confidence_filter)
    if only_canonical:
        features_bed = features_bed.filter(ba.get_only_canonical_filter(genome))
    if coding_only:
        features_bed = features_bed.filter(ba.protein_coding_filter)

    info('Extracting features from Ensembl GTF')
    features_bed = features_bed.filter(lambda x:
                                       x[ba.BedCols.FEATURE] in ['exon', 'CDS', 'stop_codon', 'transcript'])

    info('Overlapping regions with Ensembl data')
    if is_debug:
        ori_bed = ori_bed.saveas(join(work_dir, 'bed.bed'))
        debug(f'Saved regions to {ori_bed.fn}')
        features_bed = features_bed.saveas(join(work_dir, 'features.bed'))
        debug(f'Saved features to {features_bed.fn}')
    # The high-confidence filter (if requested) was already applied above.
    annotated = _annotate(ori_bed, features_bed, chr_order, fai_fpath, work_dir, ori_col_num,
                          high_confidence=False, reannotate=reannotate, is_debug=is_debug, **kwargs)

    full_header = [ba.BedCols.names[i] for i in ba.BedCols.cols]
    add_ori_extra_fields = ori_col_num > 3
    if not reannotate and ori_col_num == 4:
        add_ori_extra_fields = False  # no need to report the original gene field if we are not re-annotating

    def _report_columns(row):
        """Pick the columns of *row* to report; used for both header and data rows."""
        if extended:
            columns = list(row[:-1])
        elif short:
            columns = list(row[:4])
        else:
            columns = list(row[:6])
        if add_ori_extra_fields:
            # NOTE(review): the body of this branch was missing in the source;
            # reconstructed as "append the last column", which is the only
            # column every slice above leaves out - confirm against upstream.
            columns.append(row[-1])
        return columns

    info('Saving annotated regions...')
    total = 0
    with file_transaction(work_dir, output_fpath) as tx:
        with open(tx, 'w') as out:
            if extended:
                out.write('## ' + ba.BedCols.names[ba.BedCols.TX_OVERLAP_PERCENTAGE] +
                          ': part of region overlapping with transcripts\n')
                out.write('## ' + ba.BedCols.names[ba.BedCols.EXON_OVERLAPS_PERCENTAGE] +
                          ': part of region overlapping with exons\n')
                out.write('## ' + ba.BedCols.names[ba.BedCols.CDS_OVERLAPS_PERCENTAGE] +
                          ': part of region overlapping with protein coding regions\n')
                # The header line is written only in extended mode.
                out.write('\t'.join(_report_columns(full_header)) + '\n')
            for full_fields in annotated:
                out.write('\t'.join(map(_format_field, _report_columns(full_fields))) + '\n')
                total += 1
    debug('Saved ' + str(total) + ' total annotated regions')
    return output_fpath