def find_bams(args):
    bam_by_sample = OrderedDict()
    bad_bam_fpaths = []
    good_args = []
    # Example args:
    #   /ngs/oncology/Analysis/bioscience/Bio_0038_KudosCellLinesExomes/Bio_0038_150521_D00443_0159_AHK2KTADXX/bcbio,Kudos159
    #   /ngs/oncology/Analysis/bioscience/Bio_0038_KudosCellLinesExomes/Bio_0038_150521_D00443_0160_BHKWMNADXX/bcbio,Kudos160
    for arg in args:
        fpath = arg.split(',')[0]
        fname, ext = splitext(fpath)
        if ext == '.bam':
            bam_fpath = verify_bam(fpath)
            if not bam_fpath:
                bad_bam_fpaths.append(fpath)
            else:
                if len(arg.split(',')) > 1:
                    sname = arg.split(',')[1]
                else:
                    sname = basename(splitext(bam_fpath)[0])
                bam_by_sample[sname] = bam_fpath
                good_args.append(arg)
    if bad_bam_fpaths:
        critical('BAM files cannot be found, are empty, or are not valid BAMs: ' + ', '.join(bad_bam_fpaths))
    for arg in good_args:
        args.remove(arg)
    return bam_by_sample

def read_samples(args):
    bam_by_sample = find_bams(args)
    if bam_by_sample:
        info('Found ' + str(len(bam_by_sample)) + ' BAM file' + ('s' if len(bam_by_sample) > 1 else ''))

    input_not_bam = [verify_file(fpath) for fpath in args if adjust_path(fpath) not in bam_by_sample]
    input_not_bam = [fpath for fpath in input_not_bam if fpath]
    fastqs_by_sample = dict()
    if not input_not_bam and not bam_by_sample:
        critical('No correct input files')
    if input_not_bam:
        info('Found ' + str(len(input_not_bam)) + ' correct non-BAM input files')
        fastqs_by_sample = find_fastq_pairs(input_not_bam)
        if fastqs_by_sample:
            info('Found ' + str(len(fastqs_by_sample)) + ' FastQ pairs')
        intersection = set(fastqs_by_sample.keys()) & set(bam_by_sample.keys())
        if intersection:
            critical('The following samples have both BAM and FastQ inputs: ' + ', '.join(intersection))

    return fastqs_by_sample, bam_by_sample

def get_chrom_lengths(genome=None, fai_fpath=None):
    assert genome or fai_fpath, f'One of genome or fai_fpath should be not None: genome={genome}, fai_fpath={fai_fpath}'

    if not fai_fpath:
        check_genome(genome)
        fai_fpath = get_fai(genome)
    else:
        fai_fpath = verify_file(fai_fpath, is_critical=True)
        if not fai_fpath.endswith('.fai') and not fai_fpath.endswith('.fa'):
            critical('Error: only .fai or .fa files are accepted.')

    chr_lengths = []

    if fai_fpath.endswith('.fa'):
        debug('Reading genome sequence (.fa) to get chromosome lengths')
        with open(fai_fpath, 'r') as handle:
            from Bio import SeqIO
            reference_records = SeqIO.parse(handle, 'fasta')
            for record in reference_records:
                chrom = record.id
                chr_lengths.append((chrom, len(record.seq)))
    else:
        debug('Reading genome index file (.fai) to get chromosome lengths')
        with open(fai_fpath, 'r') as handle:
            for line in handle:
                line = line.strip()
                if line:
                    chrom, length = line.split()[0], int(line.split()[1])
                    chr_lengths.append((chrom, length))

    return chr_lengths

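# A minimal usage sketch for get_chrom_lengths(), assuming verify_file() is importable as above.
# The chromosome names and lengths below are made up; a .fai line has the sequence name and its
# length in the first two columns, which is all this function reads.
def _example_get_chrom_lengths():
    import tempfile
    with tempfile.NamedTemporaryFile('w', suffix='.fa.fai', delete=False) as fh:
        fh.write('chr1\t1000\t52\t60\t61\n')
        fh.write('chr2\t800\t1120\t60\t61\n')
        fai = fh.name
    # Expected: [('chr1', 1000), ('chr2', 800)]
    return get_chrom_lengths(fai_fpath=fai)
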
def safe_mkdir(dirpath, descriptive_name=''):
    """Multiprocessing-safely and recursively creates a directory
    """
    if not dirpath:
        critical(f'Path is empty: {descriptive_name if descriptive_name else ""}')

    if isdir(dirpath):
        return dirpath

    if isfile(dirpath):
        critical(descriptive_name + ' ' + dirpath + ' is a file.')

    num_tries = 0
    max_tries = 10

    while not exists(dirpath):
        # We could get an error here if multiple processes are creating
        # the directory at the same time. Grr, concurrency.
        try:
            os.makedirs(dirpath)
        except OSError:
            if num_tries > max_tries:
                raise
            num_tries += 1
            time.sleep(2)
    return dirpath

def _set_date_dir(bcbio_cnf, final_dir, date_dir, create_dir=False, silent=False):
    if not date_dir:
        fc_date = bcbio_cnf.get('fc_date')
        fc_name = bcbio_cnf.get('fc_name') or 'project'
        if fc_date:
            # The date dirpath comes from bcbio and is named after fc_name, not after our own project name
            date_dir = join(final_dir, fc_date + '_' + fc_name)
            if not create_dir and not verify_dir(date_dir, silent=True):
                critical('Error: no project directory of format {fc_date}_{fc_name} or {fc_name}_{fc_date}')
        else:
            if isdir(join(final_dir, 'project')):  # bcbio-CWL?
                date_dir = join(final_dir, 'project')
                if not silent:
                    info('Using the datestamp dir from bcbio-CWL: ' + date_dir)
            else:
                regexs = [fr'^\d\d\d\d-[01][0-9]-[0-3][0-9]_{fc_name}']
                date_dirs = [join(final_dir, dirpath)
                             for dirpath in listdir(final_dir)
                             if any(re.match(regex, dirpath) for regex in regexs)]
                if len(date_dirs) == 0:
                    raise NoDateStampsException('Error: no datestamp directory!')
                elif len(date_dirs) == 1:
                    date_dir = date_dirs[0]
                else:
                    dates = [(tuple(map(int, basename(d).split('_')[0].split('-'))), d) for d in date_dirs]
                    newest_date, newest_dir = sorted(dates, reverse=True)[0]
                    newest_dirs = [d_dir for d_dir in date_dirs if d_dir == newest_dir]
                    if len(newest_dirs) > 1:
                        raise MultipleDateStampsException(
                            f'Error: multiple datestamp directories found, and cannot select the most recent one '
                            f'because there are multiple latest dirs: {newest_dirs}')
                    date_dir = newest_dirs[0]
                if not silent:
                    info('Using the datestamp dir: ' + date_dir)
    if create_dir:
        safe_mkdir(date_dir)
    return date_dir

def main():
    options = [
        (['-g', '--genome'], dict(
            dest='genome',
            help='Genome build. Accepted values: ' + ', '.join(ba.SUPPORTED_GENOMES),
        )),
    ]
    parser = OptionParser()
    for args, kwargs in options:
        parser.add_option(*args, **kwargs)
    opts, args = parser.parse_args()

    if not opts.genome:
        critical('Error: please specify the genome build name with -g (e.g. `-g hg19`)')
    genome = opts.genome

    debug('Getting features from storage')
    features_bed = ba.get_all_features(genome)
    if features_bed is None:
        critical('Genome ' + genome + ' is not supported. Supported: ' + ', '.join(ba.SUPPORTED_GENOMES))

    info('Extracting features from Ensembl GTF')
    features_bed = features_bed.filter(lambda x: x[ba.BedCols.FEATURE] == 'CDS')
    features_bed = features_bed.filter(ba.get_only_canonical_filter(genome))

    info('Saving CDS regions...')
    output_fpath = adjust_path(join(dirname(__file__), pardir, genome, 'bed', 'CDS-canonical.bed'))
    with file_transaction(None, output_fpath) as tx:
        features_bed.cut(range(6)).saveas(tx)
    info('Done, saved to ' + output_fpath)

def extract_features(output_file, genome, only_canonical, high_confidence, coding_only, feature_types):
    """For debugging purposes
    """
    debug('Getting features from storage')
    features_bed = ba.get_all_features(genome)
    if features_bed is None:
        critical('Genome ' + genome + ' is not supported. Supported: ' + ', '.join(ba.SUPPORTED_GENOMES))

    if high_confidence:
        features_bed = features_bed.filter(ba.high_confidence_filter)
    if only_canonical:
        features_bed = features_bed.filter(ba.get_only_canonical_filter(genome))
    if coding_only:
        features_bed = features_bed.filter(ba.protein_coding_filter)
    # unique_tx_by_gene = find_best_tx_by_gene(features_bed)

    info('Extracting features from Ensembl GTF')
    feature_types = feature_types or ['exon', 'CDS', 'stop_codon', 'transcript']
    features_bed = features_bed.filter(lambda x: x[ba.BedCols.FEATURE] in feature_types)
        # x[ebl.BedCols.ENSEMBL_ID] == unique_tx_by_gene[x[ebl.BedCols.GENE]])

    info('Overlapping regions with Ensembl data')
    features_bed.saveas(output_file)
    debug(f'Saved features to {output_file}')

def get_or_create_run(projects, parall_view=None):
    genomes = set([p.genome for p in projects])
    if len(genomes) > 1:
        log.critical('Error: multiple genomes in projects: ' + str(genomes))

    run = Run.find_by_projects(projects)
    if run and run.rerun_on_usercall:
        log.info()
        log.info('Rebuilding tree on usercall')
        build_tree(run)
        run.rerun_on_usercall = False
        db.session.commit()
        return run

    if run and not Run.is_ready(run):
        log.debug('Tree files do not exist, recreating run for projects ' + ', '.join(p.name for p in projects))
        db.session.delete(run)
        db.session.commit()
        run = None

    if run:
        log.debug('Found run for ' + ', '.join([p.name for p in projects]) + ' with ID ' + str(run.id))
    else:
        log.debug('Creating new run for projects ' + ', '.join(p.name for p in projects))
        run = Run.create(projects, parall_view)
        log.debug('Done creating new run with ID ' + str(run.id))
    return run

@contextmanager  # requires `from contextlib import contextmanager`; without it the `with` protocol below cannot work
def tx_tmpdir(base_dir, rollback_dirpath):
    """Context manager to create and remove a transactional temporary directory.
    """
    # tmp_dir_base = join(base_dir, 'tx', str(uuid.uuid4()))
    # unique_attempts = 0
    # while os.path.exists(tmp_dir_base):
    #     if unique_attempts > 5:
    #         break
    #     tmp_dir_base = join(base_dir, 'tx', str(uuid.uuid4()))
    #     time.sleep(1)
    #     unique_attempts += 1

    # if base_dir is not None:
    #     tmp_dir_base = os.path.join(base_dir, "tx")
    # else:
    #     tmp_dir_base = os.path.join(os.getcwd(), "tx")

    if exists(rollback_dirpath):
        critical(rollback_dirpath + ' already exists')

    tmp_dir = tempfile.mkdtemp(dir=base_dir)
    safe_mkdir(tmp_dir)
    try:
        yield tmp_dir
    finally:
        if tmp_dir and exists(tmp_dir):
            os.rename(tmp_dir, rollback_dirpath)

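# A hedged usage sketch for tx_tmpdir(): work happens inside a temporary directory under
# base_dir, and when the block exits the directory is renamed to the rollback path. Both
# paths below are hypothetical.
def _example_tx_tmpdir():
    with tx_tmpdir('/work', '/work/rolled_back_run') as tmp_dir:
        with open(join(tmp_dir, 'partial.txt'), 'w') as fh:
            fh.write('intermediate output\n')
    # After the block, /work/rolled_back_run holds whatever was written into tmp_dir.
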
def get_ref_fasta(genome):
    if is_az():
        path = '/ngs/reference_data/genomes/Hsapiens/' + genome + '/seq/' + genome + '.fa'
        if isfile(path):
            logger.info('Found genome fasta at ' + path)
            return path

    if isdir(join(DATA_DIR, 'genomes', genome)):
        genome_dir = safe_mkdir(join(DATA_DIR, 'genomes'))
    else:
        genome_dir = safe_mkdir(join(DATA_DIR, '..', 'genomes'))

    if genome not in genomepy.list_installed_genomes(genome_dir):
        genome_rec = [rec for rec in genomepy.list_available_genomes() if rec[1] == genome]
        if genome_rec:
            genome_rec = genome_rec[0]
        else:
            logger.critical('Error: genome ' + genome + ' is not available')
        logger.info('Downloading genome ' + genome + ' from ' + genome_rec[1] + ' and installing into ' + genome_dir)
        genomepy.install_genome(genome, 'UCSC', genome_dir=genome_dir)

    genome_fasta_file = genomepy.Genome(genome, genome_dir=genome_dir).filename
    return genome_fasta_file

def _set_name_and_paths(self, name, variantcallers_data, ensemble=False, silent=False):
    self.raw_name = name
    self.name = self.raw_name.replace('.', '_')
    self.dirpath = verify_dir(join(self.bcbio_project.final_dir, self.name))
    if not verify_dir(self.dirpath, silent=silent):
        if not silent:
            critical(f'Sample "{self.name}" specified in bcbio YAML is not found in the final directory '
                     f'{self.bcbio_project.final_dir}. Please check consistency between the YAML '
                     f'{self.bcbio_project.bcbio_yaml_fpath} and the directories in `final`: '
                     f'for every "description" value in the YAML, there should be a folder with the '
                     f'same name in `final`. You can use the `-e` option to exclude samples (comma-separated) '
                     f'from consideration, if you are sure that the missing folders are expected.')
        else:
            return False

    self.var_dirpath = join(self.dirpath, BcbioProject.var_dir)

    self.bam = self.find_bam(silent=silent)

    if self.is_rnaseq:
        gene_counts = adjust_path(join(self.dirpath, self.get_name_for_files() + '-ready.counts'))
        if isfile(gene_counts) and verify_file(gene_counts):
            self.counts_file = gene_counts
        else:
            if not silent:
                warn('Counts for ' + self.name + ' not found')
    else:
        if variantcallers_data:
            self._set_variant_files(variantcallers_data, ensemble=ensemble)
        else:
            if not silent:
                warn('No variant callers set in config, skipping finding VCF files')
    return True

def update_batches(samples, silent=False):
    batch_by_name = {bn: Batch(bn) for bn in list(set([b for s in samples for b in s.batch_names]))}
    for sample in samples:
        for bn in sample.batch_names:
            batch_by_name[bn].name = bn
            sample.batch = batch_by_name[bn]
            if sample.phenotype == 'normal':
                if batch_by_name[bn].normal:
                    critical('Multiple normal samples for batch ' + bn)
                batch_by_name[bn].normal = sample
            else:
                batch_by_name[bn].tumor = sample

    for batch in batch_by_name.values():
        if batch.normal and not batch.tumor:
            if not silent:
                info('Batch ' + batch.name + ' contains only normal, treating sample ' +
                     batch.normal.name + ' as tumor')
            batch.normal.phenotype = 'tumor'
            batch.normal.batch = batch
            batch.tumor = batch.normal
            batch.normal = None

    # setting up batch properties
    for b in batch_by_name.values():
        b.tumor.normal_match = b.normal

    return batch_by_name

def make_cluster_cmdl(log_dir, refdata, app_name, cluster_submit_cmd=None):
    """Generates cluster command line parameters for snakemake
    """
    if not cluster_submit_cmd and not refdata.cluster_cmd:
        logger.critical(f'Automatic cluster submission '
                        f'is not supported for the machine "{refdata.name}". '
                        f'Use explicit --cluster-cmd')
    if not cluster_submit_cmd:
        cluster_submit_cmd = refdata.cluster_cmd

    # Replacing the curly braces to avoid confusing the snakemake formatter, which triggers on them
    cluster_submit_cmd = cluster_submit_cmd.replace('{', '[').replace('}', ']')

    cluster_submitter = get_submit_script()
    timestamp = datetime.now().strftime('%Y_%m_%d_%H_%M_%S')
    from reference_data import api as refdata
    cluster_cmdl = \
        f' --cluster "{cluster_submitter} {timestamp} {log_dir} {app_name} ' \
        f'\'{cluster_submit_cmd}\'"'

    # Also overriding jobscript?
    jobscript = refdata.cluster_jobscript
    if jobscript:
        safe_mkdir(log_dir)
        jobscript_file = join(log_dir, 'jobscript.sh')
        with open(jobscript_file, 'w') as f_out:
            f_out.write(jobscript.replace('{path}', os.environ["PATH"]))
        cluster_cmdl += f' --jobscript "{jobscript_file}"'

    return cluster_cmdl

def _get(relative_path, genome=None):
    """
    :param relative_path: relative path of the file inside the repository
    :param genome: genome name. Can contain a chromosome name after a dash, like hg19-chr20;
        in the case of a BED file, the returned BedTool will have a corresponding filter added.
    :return: BedTool object if it's a BED file, otherwise the file path
    """
    chrom = None
    if genome:
        if '-chr' in genome:
            genome, chrom = genome.split('-')
        check_genome(genome)
        relative_path = relative_path.format(genome=genome)

    path = abspath(join(dirname(__file__), relative_path))
    if not isfile(path) and isfile(path + '.gz'):
        path += '.gz'

    if path.endswith('.bed') or path.endswith('.bed.gz'):
        if path.endswith('.bed.gz'):
            bedtools = which('bedtools')
            if not bedtools:
                critical('bedtools not found in PATH: ' + str(os.environ['PATH']))
            debug('BED is compressed, creating BedTool')
            bed = BedTool(path)
        else:
            debug('BED is uncompressed, creating BedTool')
            bed = BedTool(path)

        if chrom:
            debug('Filtering BEDTool for chrom ' + chrom)
            bed = bed.filter(lambda r: r.chrom == chrom)

        return bed
    else:
        return path

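# A hedged usage sketch for _get(); the relative path template below is hypothetical.
# 'hg19-chr20' requests the hg19 copy of the file, with the returned BedTool filtered to chr20.
def _example_get():
    bed = _get('{genome}/bed/some_regions.bed', 'hg19-chr20')
    return bed  # a BedTool restricted to chr20 records
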
def sort_bed_gsort(input_bed_fpath, output_bed_fpath=None, work_dir=None, fai_fpath=None, genome=None):
    input_bed_fpath = verify_bed(input_bed_fpath, is_critical=True)
    output_bed_fpath = adjust_path(output_bed_fpath) if output_bed_fpath \
        else intermediate_fname(work_dir, input_bed_fpath, 'sorted')

    debug('Sorting regions in ' + str(input_bed_fpath))
    if can_reuse(output_bed_fpath, input_bed_fpath):
        debug(output_bed_fpath + ' exists, reusing')
        return output_bed_fpath

    if fai_fpath:
        fai_fpath = verify_file(fai_fpath)
    elif genome:
        fai_fpath = verify_file(ref.get_fai(genome))
    else:
        critical('Either fai_fpath or a genome build name must be specified')

    with file_transaction(work_dir, output_bed_fpath) as tx:
        run('gsort {input_bed_fpath} {fai_fpath}'.format(**locals()), output_fpath=tx)

    return output_bed_fpath

def _set_name_and_paths(self, name, variantcallers_data, ensemble=False, silent=False):
    self.raw_name = name
    self.name = self.raw_name.replace('.', '_')
    self.rgid = self.name
    self.dirpath = verify_dir(join(self.parent_project.final_dir, self.name))
    if not verify_dir(self.dirpath, silent=silent):
        critical(f'Sample "{self.name}" specified in bcbio YAML is not found in the final directory '
                 f'{self.parent_project.final_dir}. Please check consistency between the YAML '
                 f'{self.parent_project.bcbio_yaml_fpath} and the directories in `final`: '
                 f'for every "description" value in the YAML, there should be a folder with the '
                 f'same name in `final`. You can use the `-e` option to exclude samples (comma-separated) '
                 f'from consideration, if you are sure that the missing folders are expected.')

    self.bam = self.find_bam(silent=silent)

    if self.is_rnaseq:
        gene_counts = adjust_path(join(self.dirpath, self.get_name_for_files() + '-ready.counts'))
        if isfile(gene_counts) and verify_file(gene_counts):
            self.counts_file = gene_counts
        else:
            if not silent:
                warn('Counts for ' + self.name + ' not found')
    else:
        if variantcallers_data:
            self._set_variant_callers(variantcallers_data, ensemble=ensemble)
        else:
            if not silent:
                warn('No variant callers set in config, skipping finding VCF files')

def cnv_to_bed(cnv_path, out_bed_path):
    with open(cnv_path) as fh:
        parse_fn = None
        header = next(fh).strip().split('\t')
        if header[0].startswith('##fileformat=VCF'):  # Manta
            info(f'Detected {cnv_path} as caller "manta"')
            parse_fn = iter_manta
        else:
            for caller, hdr in header_by_caller.items():
                if header == hdr:
                    print(f'Parsing {cnv_path} as caller "{caller}" with header {hdr}')
                    parse_fn = get_iter_cnv(header, parse_row_by_caller[caller])
        if not parse_fn:
            critical(f'Cannot detect CNV file format in {cnv_path}')

    with open(out_bed_path, 'w') as out:
        writer = csv.writer(out, delimiter='\t')
        for i, call in enumerate(parse_fn(cnv_path)):
            if call:
                bed_row = call.get_bed_raw()
                writer.writerow(bed_row)
                if i == 0:
                    print(bed_row)
    print('')

def lift_over(fpath, from_genome, to_genome):
    chain_file = join(dirname(__file__), 'over.chain', f'{from_genome}To{to_genome.title()}.over.chain.gz')
    if not verify_file(chain_file):
        log.critical(f'Error: conversion from {from_genome} to {to_genome} is not supported!')
    out_fpath = add_suffix(fpath, to_genome)
    call_process.run(f'liftOver {fpath} {chain_file} {out_fpath} {out_fpath}.unMapped')
    return out_fpath

def pair_dragen_directories(paths):
    # DRAGEN tumor/normal and normal directories are paired on the basis of the normal sample
    # name.
    #
    # Tumor and normal sample names are extracted from the BAM header. Specifically, the BAM
    # sample name is retrieved from the 'SM' (sample) field of the '@RG' (read group) header line.
    #
    # Tumor or normal identity of a sample is inferred from the BAM filename: if a BAM filename
    # contains the '_tumor.bam' suffix, then it and the sample name are set as the tumor;
    # otherwise they are set as the normal sample.
    #
    # The subject identifier is taken from the DRAGEN output directory name.
    #
    # Assumes a one-to-one pairing of DRAGEN tumor/normal and normal output directories, i.e. no
    # multiple tumor/normal runs for a single normal run.

    # Sort paths by normal sample name so that normal and tumor/normal are placed together
    paths_sorted = dict()
    for path in paths:
        dir_type = 'tumor_normal_run' if is_dragen_tumor_normal_directory(path) else 'normal_run'
        samples = get_samples_from_dragen_dir_bams(path)
        # Ensure we have found the normal sample name
        if 'normal' not in samples:
            critical(f'Could not find normal sample name for DRAGEN directory {path}')
        # Sort by normal sample name; add path, prefix, and subject ID to the stored data
        sample_normal = samples['normal']
        if sample_normal not in paths_sorted:
            paths_sorted[sample_normal] = dict()
        assert dir_type not in paths_sorted[sample_normal]
        paths_sorted[sample_normal][dir_type] = samples
        paths_sorted[sample_normal][dir_type]['path'] = path
        paths_sorted[sample_normal][dir_type]['prefix'] = get_dragen_output_prefix(path)
        paths_sorted[sample_normal][dir_type]['subject_id'] = get_subject_id_from_dragen_dir(path)

    # Differentiate paired and unpaired paths
    paths_unpaired = list()
    paths_paired = list()
    for paths in paths_sorted.values():
        if 'normal_run' in paths and 'tumor_normal_run' in paths:
            # Ensure we have collected only one subject ID for this set of inputs
            assert len({d['subject_id'] for d in paths.values()}) == 1
            paths['subject_id'] = paths['normal_run']['subject_id']
            paths_paired.append(paths)
        else:
            for dir_type, data in paths.items():
                paths_unpaired.append((dir_type, data['path']))

    # Emit a warning for unpaired paths
    if paths_unpaired:
        paths_unpaired_strs = list()
        for dir_type, path in paths_unpaired:
            paths_unpaired_strs.append(f'{dir_type}: {path}')
        paths_unpaired_str = '\n\t'.join(paths_unpaired_strs)
        warn(f'could not pair DRAGEN directories:\n\t{paths_unpaired_str}')

    return paths_paired

def is_small_target(bed_file=None):
    try:  # to allow optional pybedtools
        from ngs_utils.bed_utils import get_total_bed_size
    except ImportError:
        print_exc()
        critical('Please install pybedtools (conda install -c bioconda -y pybedtools)')
    else:
        return bed_file and isfile(bed_file) and get_total_bed_size(bed_file) < 10 * 1000 * 1000

def determine_sex(work_dir, bam_fpath, avg_depth, genome, target_bed=None):
    debug()
    debug('Determining sex')
    pybedtools.set_tempdir(safe_mkdir(join(work_dir, 'pybedtools_tmp')))

    male_bed = None
    for k in chry_key_regions_by_genome:
        if k in genome:
            male_bed = BedTool(chry_key_regions_by_genome.get(k))
            break
    if not male_bed:
        warn('Warning: no male key regions for ' + genome + ', cannot identify sex')
        return None

    male_area_size = get_total_bed_size(male_bed)
    debug('Male region total size: ' + str(male_area_size))

    if target_bed:
        target_male_bed = join(work_dir, 'male.bed')
        with file_transaction(work_dir, target_male_bed) as tx:
            BedTool(target_bed).intersect(male_bed).merge().saveas(tx)
        target_male_area_size = get_total_bed_size(target_male_bed)
        if target_male_area_size == 0:
            debug('The male non-PAR region does not overlap with the capture target - cannot determine sex.')
            return None
        male_bed = target_male_bed
    else:
        debug('WGS, determining sex based on chrY key regions coverage.')

    info('Detecting sex by comparing the Y chromosome key regions coverage and average coverage depth.')
    if not bam_fpath:
        critical('BAM file is required.')
    index_bam(bam_fpath)

    chry_mean_coverage = _calc_mean_coverage(work_dir, male_bed, bam_fpath, 1)
    debug('Y key regions average depth: ' + str(chry_mean_coverage))
    avg_depth = float(avg_depth)
    debug('Sample average depth: ' + str(avg_depth))
    if avg_depth < AVG_DEPTH_THRESHOLD_TO_DETERMINE_SEX:
        debug('Sample average depth is too low (less than ' + str(AVG_DEPTH_THRESHOLD_TO_DETERMINE_SEX) +
              ') - cannot determine sex')
        return None

    if chry_mean_coverage == 0:
        debug('Y depth is 0 - it\'s female')
        sex = 'F'
    else:
        factor = avg_depth / chry_mean_coverage
        debug('Sample depth / Y depth = ' + str(factor))
        if factor > FEMALE_Y_COVERAGE_FACTOR:  # mean target coverage is much higher than chrY coverage
            debug('Sample depth is more than ' + str(FEMALE_Y_COVERAGE_FACTOR) +
                  ' times higher than Y depth - it\'s female')
            sex = 'F'
        else:
            debug('Sample depth is not more than ' + str(FEMALE_Y_COVERAGE_FACTOR) +
                  ' times higher than Y depth - it\'s male')
            sex = 'M'
    debug('Sex is ' + sex)
    debug()
    return sex

def calc_genomic_bp_offset(self):
    genomic_coord, is_in_intron = FusionSide.offset_to_genome_coord(self.trx, self.bp_offset)
    if genomic_coord is None:
        logger.critical(f'  Error: could not convert transcript {self.trx.id} offset {self.bp_offset} '
                        f'to a genomic coordinate')
        return None
    return genomic_coord, is_in_intron

def get_dragen_output_prefix(dirpath):
    for fp in dirpath.iterdir():
        if not fp.match('*replay.json'):
            continue
        return fp.name.replace('-replay.json', '')
    else:
        critical(f'could not determine output prefix for DRAGEN directory \'{dirpath}\'')

def load_yaml_config(fpath):
    verify_file(fpath, is_critical=True)
    try:
        dic = load_yaml(open(fpath))
    except Exception:
        err(format_exc())
        critical('Could not parse bcbio YAML ' + fpath)
    else:
        return dic

def load_yaml_config(fpath):
    verify_file(fpath, is_critical=True)
    try:
        dic = _load_yaml(fpath)
    except Exception:
        err(format_exc())
        critical('Could not parse bcbio YAML ' + fpath)
    else:
        return dic

def find_in_log(self, fname, is_critical=False, silent=True):
    options = [join(self.log_dir, fname), join(self.date_dir, fname)]
    for fpath in options:
        if isfile(fpath):
            return fpath
    if is_critical:
        critical('Log file not found as ' + ', '.join(options))
    elif not silent:
        err('Log file not found as ' + ', '.join(options))

def secondary_conda_env(env_name='pcgr', is_critical=False):
    py_path = sys.executable                 # e.g. /miniconda/envs/umccrise/bin/python
    env_path = dirname(dirname(py_path))     # e.g. /miniconda/envs/umccrise
    env_path = env_path + '_' + env_name     # e.g. /miniconda/envs/umccrise_pcgr
    if not isdir(env_path):
        if is_critical:
            critical(f'Can\'t find environment {env_path}')
        else:
            return None
    return env_path

def _translate_from_start_codon(seq, to_stop, name):
    """Seq must start with a START codon. Translates until STOP.
    """
    codon_table = CodonTable.unambiguous_dna_by_name['Standard']
    if str(seq[:3]).upper() not in codon_table.start_codons:
        logger.critical(name + ' is expected to start with a START codon: ' + str(seq[:3]))
    pep_5p = _trim3(seq).translate(to_stop=to_stop)
    # If the peptide starts with an alternative start codon, replace the first residue with M
    return 'M' + pep_5p[1:]

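# A hedged sketch of the translation behaviour above, using Biopython directly on a made-up
# sequence that begins with the alternative start codon TTG (a valid start in the Standard table).
def _example_translate_from_alt_start():
    from Bio.Seq import Seq
    from Bio.Data import CodonTable

    seq = Seq('TTGGCCAAATAA')  # TTG GCC AAA TAA
    codon_table = CodonTable.unambiguous_dna_by_name['Standard']
    assert str(seq[:3]).upper() in codon_table.start_codons

    pep = seq.translate(to_stop=True)   # naive translation gives 'LAK' (TTG -> L)
    return 'M' + str(pep)[1:]           # 'MAK' - first residue forced to M, as in the function above
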
def sort_bed(input_bed_fpath, output_bed_fpath=None, work_dir=None, fai_fpath=None, chr_order=None, genome=None):
    input_bed_fpath = verify_bed(input_bed_fpath, is_critical=True)
    output_bed_fpath = adjust_path(output_bed_fpath) if output_bed_fpath \
        else intermediate_fname(work_dir, input_bed_fpath, 'sorted')

    debug('Sorting regions in ' + str(input_bed_fpath))
    if can_reuse(output_bed_fpath, input_bed_fpath):
        debug(output_bed_fpath + ' exists, reusing')
        return output_bed_fpath

    regions = []

    if not chr_order:
        if fai_fpath:
            fai_fpath = verify_file(fai_fpath)
        elif genome:
            fai_fpath = verify_file(ref.get_fai(genome))
        else:
            critical('One of chr_order, fai_fpath, or a genome build name must be specified')
        chr_order = get_chrom_order(fai_fpath=fai_fpath)

    with open(input_bed_fpath) as f:
        with file_transaction(work_dir, output_bed_fpath) as tx:
            with open(tx, 'w') as out:
                for l in f:
                    if not l.strip():
                        continue
                    if l.strip().startswith('#'):
                        out.write(l)
                        continue

                    fs = l.strip().split('\t')
                    chrom = fs[0]
                    start = int(fs[1])
                    end = int(fs[2])
                    other_fields = fs[3:]
                    order = chr_order.get(chrom, -1)
                    regions.append(Region(chrom, start, end, other_fields, order))

                for region in sorted(regions, key=lambda r: r.get_key()):
                    fs = [region.chrom, str(region.start), str(region.end)]
                    fs.extend(region.other_fields)
                    out.write('\t'.join(fs) + '\n')

    debug('Sorted ' + str(len(regions)) + ' regions, saved to ' + output_bed_fpath)
    return output_bed_fpath

def get_bed_genes(bed_fpath):
    """Returns the list of genes from the 4th column of a BED file
    """
    try:  # to allow optional pybedtools
        from ngs_utils.bed_utils import get_genes_from_bed
    except ImportError:
        print_exc()
        critical('Please install pybedtools (conda install -c bioconda -y pybedtools)')
    else:
        gene_set, gene_list = get_genes_from_bed(bed_fpath)
        return [gn for gn in gene_list if (gn and gn != '.')]

def get_read_group_sample_name(bam_fp):
    bam = pysam.AlignmentFile(bam_fp)
    header = bam.header.to_dict()
    samples = {rg['SM'] for rg in header.get('RG', list())}
    if len(samples) == 0:
        critical(f'could not retrieve sample name from the @RG SM field in {bam_fp}')
    elif len(samples) > 1:
        critical('found more than one sample name in the @RG SM fields for '
                 f'{bam_fp}: {", ".join(samples)}')
    return samples.pop()

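# A hedged usage sketch: the BAM path and the returned sample name are hypothetical; the value
# comes from the SM field of the @RG header line, which pysam exposes via header.to_dict().
def _example_read_group_sample_name():
    sample_name = get_read_group_sample_name('/data/SBJ00001/SAMPLE_T_tumor.bam')
    return sample_name  # e.g. 'SAMPLE_T'
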
def set_final_dir(bcbio_cnf, config_dir, final_dir=None, create_dir=False):
    if final_dir:
        return final_dir
    elif 'upload' in bcbio_cnf and 'dir' in bcbio_cnf['upload']:
        final_dirname = bcbio_cnf['upload']['dir']
        final_dir = adjust_path(join(config_dir, final_dirname))
        if create_dir:
            safe_mkdir(final_dir)
        verify_dir(final_dir, 'upload directory specified in the bcbio config', is_critical=True)
    else:
        final_dir = abspath(join(config_dir, pardir, 'final'))
        if create_dir:
            safe_mkdir(final_dir)
        if not verify_dir(final_dir):
            critical('If the final directory is not named "final", please specify it in the bcbio config.')
    return final_dir

def annotate_target(work_dir, target_bed, genome_build):
    output_fpath = intermediate_fname(work_dir, target_bed, 'ann')
    if can_reuse(output_fpath, target_bed):
        info(output_fpath + ' exists, reusing')
        return output_fpath

    bed_annotation = which('annotate_bed.py')
    if not bed_annotation:
        bed_annotation = which('bed_annotation')
        if not bed_annotation:
            critical('Error: bed_annotation not found in PATH, please install it with '
                     '`conda install -c vladsaveliev bed_annotation`.')

    cmdline = '{bed_annotation} {target_bed} -g {genome_build} -o {output_fpath}'.format(**locals())
    run(cmdline, output_fpath, stdout_to_outputfile=False)
    output_fpath = clean_bed(output_fpath, work_dir)
    return output_fpath

def find_fastq_pairs(fpaths):
    info('Finding FastQ pairs...')
    fastqs_by_sample_name = dict()
    for fpath in fpaths:
        fn, ext = splitext_plus(basename(fpath))
        if ext in ['.fq', '.fq.gz', '.fastq', '.fastq.gz']:
            sname, l_fpath, r_fpath = None, None, None
            if fn.endswith('_1'):
                sname = fn[:-2]
                l_fpath = fpath
            if fn.endswith('_R1'):
                sname = fn[:-3]
                l_fpath = fpath
            if fn.endswith('_2'):
                sname = fn[:-2]
                r_fpath = fpath
            if fn.endswith('_R2'):
                sname = fn[:-3]
                r_fpath = fpath

            if sname:
                m = re.match(r'(.*)_S\d+', sname)
                if m:
                    sname = m.group(1)
                sname = sname.replace('-', '_')
            else:
                sname = fn
                info('Cannot detect the read number in the file name for ' + sname)

            l, r = fastqs_by_sample_name.get(sname, (None, None))
            if l and l_fpath:
                critical('Duplicated left FastQ files for ' + sname + ': ' + l + ' and ' + l_fpath)
            if r and r_fpath:
                critical('Duplicated right FastQ files for ' + sname + ': ' + r + ' and ' + r_fpath)
            fastqs_by_sample_name[sname] = l or l_fpath, r or r_fpath

    fixed_fastqs_by_sample_name = dict()
    for sname, (l, r) in fastqs_by_sample_name.items():
        if not l:
            err('ERROR: for sample ' + sname + ', left reads not found')
        if not r:
            err('ERROR: for sample ' + sname + ', right reads not found')
        if l and r:
            fixed_fastqs_by_sample_name[sname] = l, r

    return fixed_fastqs_by_sample_name

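# A hedged usage sketch with made-up file names. The pairing keys on '_R1'/'_R2' (or '_1'/'_2')
# suffixes, and an optional '_S<n>' lane part is stripped from the sample name.
def _example_find_fastq_pairs():
    pairs = find_fastq_pairs([
        '/data/fq/SampleA_S1_R1.fastq.gz',
        '/data/fq/SampleA_S1_R2.fastq.gz',
    ])
    # Expected: {'SampleA': ('/data/fq/SampleA_S1_R1.fastq.gz', '/data/fq/SampleA_S1_R2.fastq.gz')}
    return pairs
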
def calc_bases_within_threshs(bases_by_depth, total_size, depth_thresholds):
    bases_within_threshs = OrderedDict((depth, 0) for depth in depth_thresholds)
    rates_within_threshs = OrderedDict((depth, None) for depth in depth_thresholds)

    for depth, bases in bases_by_depth.items():
        for t in depth_thresholds:
            if depth >= t:
                bases_within_threshs[t] += bases

    for t in depth_thresholds:
        bs = bases_within_threshs[t]
        if total_size > 0:
            rate = 1.0 * bases_within_threshs[t] / total_size
            if rate > 1:
                critical('Error: rate is > 1: rate = ' + str(rate) + ', bases = ' + str(bs) +
                         ', size = ' + str(total_size))
            rates_within_threshs[t] = rate

    return bases_within_threshs, rates_within_threshs

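# A hedged worked example with made-up numbers: 100 bp covered at 1x, 50 bp at 5x and 10 bp at 30x
# over a 200 bp target. Counts are cumulative, i.e. bases covered at >= each threshold.
def _example_calc_bases_within_threshs():
    bases_by_depth = {1: 100, 5: 50, 30: 10}
    bases, rates = calc_bases_within_threshs(bases_by_depth, total_size=200, depth_thresholds=[1, 5, 10, 30])
    # bases == {1: 160, 5: 60, 10: 10, 30: 10}
    # rates == {1: 0.8, 5: 0.3, 10: 0.05, 30: 0.05}
    return bases, rates
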
def main():
    options = [
        (['-g', '--genome'], dict(
            dest='genome',
            help='Genome build. Accepted values: ' + ', '.join(ebl.SUPPORTED_GENOMES),
        )),
        (['-c', '--canonical'], dict(
            dest='canonical',
            action='store_true',
            help='Use canonical only',
        )),
    ]
    parser = OptionParser()
    for args, kwargs in options:
        parser.add_option(*args, **kwargs)
    opts, args = parser.parse_args()

    if not opts.genome:
        logger.critical('Error: please specify the genome build name with -g (e.g. `-g hg19`)')
    genome = opts.genome

    logger.debug('Getting features from storage')
    features_bed = ebl.get_all_features(genome)
    if features_bed is None:
        logger.critical('Genome ' + genome + ' is not supported. Supported: ' + ', '.join(ebl.SUPPORTED_GENOMES))

    logger.warn('Extracting features from Ensembl GTF')
    features_bed = features_bed.filter(lambda x: x[ebl.BedCols.FEATURE] == 'CDS')
    if opts.canonical:
        features_bed = features_bed.filter(ebl.get_only_canonical_filter(genome))

    logger.warn('Saving CDS regions...')
    output_fpath = adjust_path(join(dirname(__file__), pardir, genome, 'bed', 'CDS-canonical.bed'))
    with file_transaction(None, output_fpath) as tx:
        features_bed.cut(range(6)).saveas(tx)
    logger.warn('Done, saved to ' + output_fpath)

def main():
    parser = OptionParser(usage='Usage: ' + basename(__file__) + ' -o Output_BED_file -g hg19 Input_BED_file')
    parser.add_option('-o', '--output-bed', dest='output_fpath')
    parser.add_option('-g', '--genome', dest='genome')
    parser.add_option('--fai', dest='fai_fpath')
    (opts, args) = parser.parse_args(sys.argv[1:])

    if len(args) < 1:
        parser.print_help(file=sys.stderr)
        sys.exit(1)
    if not opts.output_fpath:
        critical(parser.usage)

    sort_bed(input_bed_fpath=verify_bed(args[0], is_critical=True),
             output_bed_fpath=adjust_path(opts.output_fpath),
             fai_fpath=adjust_path(opts.fai_fpath),
             genome=opts.genome)

def calc_genomic_bp_pos(self):
    genomic_coord, is_in_intron = FusionSide.offset_to_genome_coord(self.trx, self.bp_offset)
    if genomic_coord is None:
        logger.critical(f'  Error: could not convert transcript {self.trx.id} '
                        f'offset {self.bp_offset} to a genomic coordinate')
        return False
    if genomic_coord == -1:
        logger.debug(
            f'  Fusion takes the entire transcript {self.trx.id} '
            f'(genomic_coord={genomic_coord}, bp_offset={self.bp_offset}). '
            f'That\'s suspicious, so we are skipping it.')
        return False
    self.bp_genomic_pos = genomic_coord
    self.bp_is_in_intron = is_in_intron
    return True

def make_cluster_cmdl(log_dir, app_name=''):
    """Generates cluster command line parameters for snakemake
    """
    from hpc_utils import hpc
    if not hpc.cluster_cmd:
        logger.critical(f'Automatic cluster submission is not supported for the machine "{hpc.name or hpc.hostname}"')

    cluster_submitter = get_submit_script()
    timestamp = datetime.now().strftime('%Y_%m_%d_%H_%M_%S')
    cluster_cmdl = f' --cluster "{cluster_submitter} {timestamp} {log_dir} {app_name}"'

    # Also overriding jobscript?
    jobscript = hpc.cluster_jobscript
    if jobscript:
        jobscript_file = join(log_dir, 'jobscript.sh')
        with open(jobscript_file, 'w') as f_out:
            f_out.write(jobscript.replace('{path}', os.environ["PATH"]))
        cluster_cmdl += f' --jobscript "{jobscript_file}"'

    return cluster_cmdl

def load_bcbio_cnf(config_dir, silent=False):
    all_yamls = [
        abspath(join(config_dir, fname))
        for fname in listdir(config_dir)
        if fname.endswith('.yaml')]
    if len(all_yamls) == 0:
        critical('No YAML file in the config directory.')

    bcbio_yamls = []
    for fpath in all_yamls:
        if not fpath.endswith('-template.yaml'):
            if 'details' in load_yaml_config(fpath):
                bcbio_yamls.append(fpath)
    if len(bcbio_yamls) == 0:
        critical('No bcbio YAMLs found in the config directory: ' + config_dir +
                 ' (only ' + ', '.join(map(basename, all_yamls)) +
                 ' which do not have the "details" section)')
    if len(bcbio_yamls) > 1:
        critical('More than one bcbio YAML file found in the config directory ' +
                 config_dir + ': ' + ' '.join(bcbio_yamls))
    yaml_fpath = bcbio_yamls[0]
    if not silent:
        info('Using bcbio YAML config: ' + yaml_fpath)
    return load_yaml_config(yaml_fpath), yaml_fpath

def _annotate(bed, ref_bed, chr_order, fai_fpath, work_dir, ori_col_num,
              high_confidence=False, reannotate=False, is_debug=False, **kwargs):
    # if genome:
    #     genome_fpath = cut(fai_fpath, 2, output_fpath=intermediate_fname(work_dir, fai_fpath, 'cut2'))
    #     intersection = bed.intersect(ref_bed, sorted=True, wao=True, g='<(cut -f1,2 ' + fai_fpath + ')')
    #     intersection = bed.intersect(ref_bed, sorted=True, wao=True, genome=genome.split('-')[0])
    # else:

    intersection_bed = None
    intersection_fpath = None

    pybedtools.set_tempdir(safe_mkdir(join(work_dir, 'bedtools')))
    if is_debug:
        intersection_fpath = join(work_dir, 'intersection.bed')
        if isfile(intersection_fpath):
            info('Loading from ' + intersection_fpath)
            intersection_bed = BedTool(intersection_fpath)
    if not intersection_bed:
        if count_bed_cols(fai_fpath) == 2:
            debug('Fai fields size is 2 ' + fai_fpath)
            intersection_bed = bed.intersect(ref_bed, wao=True, g=fai_fpath)
        else:
            debug('Fai fields is ' + str(count_bed_cols(fai_fpath)) + ', not 2')
            intersection_bed = bed.intersect(ref_bed, wao=True)
    if is_debug and not isfile(intersection_fpath):
        intersection_bed.saveas(intersection_fpath)
        debug('Saved intersection to ' + intersection_fpath)

    total_annotated = 0
    total_uniq_annotated = 0
    total_off_target = 0

    met = set()

    overlaps_by_tx_by_gene_by_loc = OrderedDefaultDict(lambda: OrderedDefaultDict(lambda: defaultdict(list)))
    # off_targets = list()

    expected_fields_num = ori_col_num + len(ba.BedCols.cols[:-4]) + 1
    for i, intersection_fields in enumerate(intersection_bed):
        inters_fields_list = list(intersection_fields)
        if len(inters_fields_list) < expected_fields_num:
            critical(
                f'Cannot parse the reference BED file - unexpected number of fields '
                f'({len(inters_fields_list)}) in {inters_fields_list} (less than {expected_fields_num})')

        a_chr, a_start, a_end = intersection_fields[:3]
        a_extra_columns = intersection_fields[3:ori_col_num]

        overlap_fields = [None for _ in ba.BedCols.cols]
        overlap_fields[:len(intersection_fields[ori_col_num:])] = intersection_fields[ori_col_num:]

        keep_gene_column = not reannotate
        a_gene = None
        if keep_gene_column:
            a_gene = a_extra_columns[0]

        e_chr = overlap_fields[0]
        overlap_size = int(intersection_fields[-1])
        assert e_chr == '.' or a_chr == e_chr, \
            f'Error on line {i}: chromosomes don\'t match ({a_chr} vs {e_chr}). Line: {intersection_fields}'

        # fs = [None for _ in ebl.BedCols.cols]
        # fs[:3] = [a_chr, a_start, a_end]
        reg = (a_chr, int(a_start), int(a_end), tuple(a_extra_columns))

        if e_chr == '.':
            total_off_target += 1
            # off_targets.append(fs)
            overlaps_by_tx_by_gene_by_loc[reg][a_gene] = OrderedDefaultDict(list)
        else:
            # fs[3:-1] = db_feature_fields[3:-1]
            total_annotated += 1
            if (a_chr, a_start, a_end) not in met:
                total_uniq_annotated += 1
                met.add((a_chr, a_start, a_end))

            e_gene = overlap_fields[ba.BedCols.GENE] if not high_confidence else overlap_fields[ba.BedCols.HUGO]
            if keep_gene_column and e_gene != a_gene:
                overlaps_by_tx_by_gene_by_loc[reg][a_gene] = OrderedDefaultDict(list)
            else:
                transcript_id = overlap_fields[ba.BedCols.ENSEMBL_ID]
                overlaps_by_tx_by_gene_by_loc[reg][e_gene][transcript_id].append((overlap_fields, overlap_size))

    info('  Total annotated regions: ' + str(total_annotated))
    info('  Total unique annotated regions: ' + str(total_uniq_annotated))
    info('  Total off target regions: ' + str(total_off_target))
    info('Resolving ambiguities...')
    annotated = _resolve_ambiguities(overlaps_by_tx_by_gene_by_loc, chr_order, **kwargs)

    return annotated

def _proc_ucsc(inp, output_fpath, chr_order):
    # , approved_gene_by_name, approved_gnames_by_prev_gname, approved_gnames_by_synonym
    gene_by_name_and_chrom = dict()

    for l in inp:
        if l and not l.startswith('#'):
            fs = l.replace('\n', '').split('\t')
            txStart, txEnd = None, None
            if len(fs) > 9:
                _, transcript_id, ucsc_chrom, strand, txStart, txEnd, cdsStart, cdsEnd, exonCount, exonStarts, exonEnds, \
                    _, gene_symbol = fs[:13]
                txStart, txEnd = int(txStart), int(txEnd)
            else:
                transcript_id, ucsc_chrom, strand, cdsStart, cdsEnd, exonCount, exonStarts, exonEnds, gene_symbol = \
                    l.replace('\n', '').split('\t')
            cdsStart = int(cdsStart)
            cdsEnd = int(cdsEnd)
            exonCount = int(exonCount)
            exonStarts = [int(v) + 1 for v in exonStarts.split(',') if v]
            exonEnds = list(map(int, filter(None, exonEnds.split(','))))

            # if ucsc_chrom != prev_chrom:  # RefGene is not sorted
            #     info(ucsc_chrom)
            #     prev_chrom = ucsc_chrom

            # approved_gene_symbol, status = get_approved_gene_symbol(
            #     approved_gene_by_name, approved_gnames_by_prev_gname, approved_gnames_by_synonym,
            #     gene_symbol, ucsc_id, ucsc_chrom)
            #
            # if not approved_gene_symbol:
            #     not_approved_gene_names.append(gene_symbol + '\t' + status)
            #     if DO_APPROVE:
            #         continue
            #     else:
            #         approved_gene_symbol = gene_symbol

            txStart = txStart or exonStarts[0] - 1
            txEnd = txEnd or exonEnds[exonCount - 1]

            # out.write('\t'.join([ucsc_chrom, str(min(txStart, cdsStart)), str(max(txEnd, cdsEnd)),
            #                      gene_symbol, '.', strand, 'Gene', '.']) + '\n')

            assert txStart <= cdsStart, l
            assert txEnd >= cdsEnd, l

            if (gene_symbol, ucsc_chrom) not in gene_by_name_and_chrom:
                gene = Gene(ucsc_chrom, chr_order.get(ucsc_chrom), gene_symbol, strand)
                gene_by_name_and_chrom[(gene_symbol, ucsc_chrom)] = gene
            gene = gene_by_name_and_chrom[(gene_symbol, ucsc_chrom)]

            transcript = Transcript(gene, transcript_id, txStart, txEnd, strand)  # one line - one transcript
            gene.transcripts.append(transcript)

            if transcript_id.startswith('NR_'):
                transcript.coding = False
                transcript.biotype = 'RNA'
            elif transcript_id.startswith('NM_'):
                transcript.coding = True
                transcript.biotype = 'protein_coding'
            else:
                critical('Unknown transcript ID prefix ' + transcript_id.split('_')[0] + ' in ' + transcript_id)

            r'''                  cdsStart    cdsEnd     exonCount  exonStarts                                                                           exonEnds
NM_001303242  chr1  +  150981108  151006710  7  150980866,150990287,150990942,150997086,150997990,150999708,151006281,            150981147,150990380,150991145,150997271,150998149,150999803,151008189,            PRUNE
NM_021222     chr1  +  150981108  151006710  8  150980866,150990287,150990942,150997086,150997990,150999708,151001261,151006281,  150981147,150990380,150991145,150997271,150998149,150999803,151001420,151008189,  PRUNE
NM_001303243  chr1  +  150991069  151006710  6  150980866,150990287,150990942,150999708,151001261,151006281,                      150981147,150990380,150991145,150999803,151001420,151008189,                      PRUNE
NM_001303229  chr1  +  150998016  151006710  7  150980866,150990287,150997086,150997990,150999708,151001261,151006281,            150981147,150990380,150997271,150998149,150999803,151001420,151008189,            PRUNE
NR_130132     chr1  +  151008189  151008189  4  150980866,150990287,150999708,151006281,                                          150981147,150990380,150999803,151008189,                                          PRUNE
NR_130131     chr1  +  151008189  151008189  5  150980866,150990287,150999708,151001261,151006281,                                150981147,150990380,150999803,151001420,151008189,                                  PRUNE
NR_130130     chr1  +  151008189  151008189  4  150980866,150997990,150999708,151006281,                                          150981147,150998149,150999803,151008189,                                          PRUNE
NR_130135     chr1  +  151008189  151008189  5  150980866,150990287,150997990,150999708,151006281,                                150981147,150990380,150998149,150999803,151008189,                                  PRUNE

              exonSt    cdsSt      exonEnd   exonSt    exonEnd  ...  exonSt    cdsEnd     exonEnd       ncRNA: CDS is empty
NM_001303242  0980866/  *0981108*  \0981147  0990287/  \0990380 ...  1006281/  *1006710*  \1008189
NM_001303243  0980866/  *0991069*  \0981147  0990287/  \0990380 ...  1006281/  *1006710*  \1008189
NM_001303229  0980866/  *0998016*  \0981147  0990287/  \0990380 ...  1006281/  *1006710*  \1008189
NR_130132     0980866/             \0981147  0990287/  \0990380 ...  1006281/             \1008189     *1008189* *1008189*
            '''

            for exon_number, eStart, eEnd in zip(
                    range(exonCount),
                    [s for s in exonStarts if s],
                    [e for e in exonEnds if e]):
                eStart -= 1
                if eEnd <= cdsStart or eStart > cdsEnd:
                    biotype = 'UTR' if transcript.coding else transcript.biotype
                    transcript.exons.append(Exon(transcript, eStart, eEnd, 'Exon', exon_number, biotype))
                else:
                    assert transcript.coding, 'Non-coding NM_ transcript ' + transcript_id
                    if eStart < cdsStart:
                        transcript.exons.append(Exon(transcript, eStart, cdsStart, 'Exon', exon_number, biotype='UTR'))
                    if eEnd > cdsEnd:
                        transcript.exons.append(Exon(transcript, cdsEnd, eEnd, 'Exon', exon_number, biotype='UTR'))
                    transcript.exons.append(Exon(transcript, max(cdsStart, eStart), min(cdsEnd, eEnd), 'CDS', exon_number))

    return gene_by_name_and_chrom

def get_executable():
    sys_path = which('sambamba')
    if not sys_path:
        critical('Error: sambamba executable is not found')
    return sys_path

def check_genome(genome):
    if genome not in SUPPORTED_GENOMES:
        critical('Genome ' + str(genome) + ' is not supported. Supported genomes: ' + ', '.join(SUPPORTED_GENOMES))

def annotate(input_bed_fpath, output_fpath, work_dir, genome=None,
             reannotate=True, high_confidence=False, only_canonical=False,
             coding_only=False, short=False, extended=False, is_debug=False, **kwargs):

    debug('Getting features from storage')
    features_bed = ba.get_all_features(genome)
    if features_bed is None:
        critical('Genome ' + genome + ' is not supported. Supported: ' + ', '.join(ba.SUPPORTED_GENOMES))

    if genome:
        fai_fpath = reference_data.get_fai(genome)
        chr_order = reference_data.get_chrom_order(genome)
    else:
        fai_fpath = None
        chr_order = bed_chrom_order(input_bed_fpath)

    input_bed_fpath = sort_bed(input_bed_fpath, work_dir=work_dir, chr_order=chr_order, genome=genome)

    ori_bed = BedTool(input_bed_fpath)
    ori_col_num = ori_bed.field_count()
    reannotate = reannotate or ori_col_num == 3
    pybedtools.set_tempdir(safe_mkdir(join(work_dir, 'bedtools')))
    ori_bed = BedTool(input_bed_fpath)

    if high_confidence:
        features_bed = features_bed.filter(ba.high_confidence_filter)
    if only_canonical:
        features_bed = features_bed.filter(ba.get_only_canonical_filter(genome))
    if coding_only:
        features_bed = features_bed.filter(ba.protein_coding_filter)
    # unique_tx_by_gene = find_best_tx_by_gene(features_bed)

    info('Extracting features from Ensembl GTF')
    features_bed = features_bed.filter(
        lambda x: x[ba.BedCols.FEATURE] in ['exon', 'CDS', 'stop_codon', 'transcript'])
        # x[ebl.BedCols.ENSEMBL_ID] == unique_tx_by_gene[x[ebl.BedCols.GENE]])

    info('Overlapping regions with Ensembl data')
    if is_debug:
        ori_bed = ori_bed.saveas(join(work_dir, 'bed.bed'))
        debug(f'Saved regions to {ori_bed.fn}')
        features_bed = features_bed.saveas(join(work_dir, 'features.bed'))
        debug(f'Saved features to {features_bed.fn}')
    annotated = _annotate(ori_bed, features_bed, chr_order, fai_fpath, work_dir, ori_col_num,
                          high_confidence=False, reannotate=reannotate, is_debug=is_debug, **kwargs)

    full_header = [ba.BedCols.names[i] for i in ba.BedCols.cols]
    add_ori_extra_fields = ori_col_num > 3
    if not reannotate and ori_col_num == 4:
        add_ori_extra_fields = False  # no need to report the original gene field if we are not re-annotating

    info('Saving annotated regions...')
    total = 0
    with file_transaction(work_dir, output_fpath) as tx:
        with open(tx, 'w') as out:
            header = full_header[:6]
            if short:
                header = full_header[:4]
            if extended:
                header = full_header[:-1]
            if add_ori_extra_fields:
                header.append(full_header[-1])

            if extended:
                out.write('## ' + ba.BedCols.names[ba.BedCols.TX_OVERLAP_PERCENTAGE] +
                          ': part of region overlapping with transcripts\n')
                out.write('## ' + ba.BedCols.names[ba.BedCols.EXON_OVERLAPS_PERCENTAGE] +
                          ': part of region overlapping with exons\n')
                out.write('## ' + ba.BedCols.names[ba.BedCols.CDS_OVERLAPS_PERCENTAGE] +
                          ': part of region overlapping with protein coding regions\n')
                out.write('\t'.join(header) + '\n')

            for full_fields in annotated:
                fields = full_fields[:6]
                if short:
                    fields = full_fields[:4]
                if extended:
                    fields = full_fields[:-1]
                if add_ori_extra_fields:
                    fields.append(full_fields[-1])
                out.write('\t'.join(map(_format_field, fields)) + '\n')
                total += 1

    debug('Saved ' + str(total) + ' total annotated regions')
    return output_fpath