def _proc_file(inp_f, out_f, ctx=None):
    max_bunch_size = 1000 * 1000
    written_lines = 0
    bunch = []

    for i, line in enumerate(inp_f):
        clean_line = line.replace('\n', '')
        if clean_line:
            if ctx:
                new_l = proc_line_fun(clean_line, i, ctx)
            else:
                new_l = proc_line_fun(clean_line, i)
            if new_l is not None:
                bunch.append(new_l + '\n')
                written_lines += 1
        else:
            bunch.append(line)
            written_lines += 1

        if len(bunch) >= max_bunch_size:
            out_f.writelines(bunch)
            debug('Written lines: ' + str(written_lines))
            bunch = []

    out_f.writelines(bunch)
    debug('Written lines: ' + str(written_lines))
def main():
    options = [
        (['-g', '--genome'], dict(
            dest='genome',
            help='Genome build. Accepted values: ' + ', '.join(ba.SUPPORTED_GENOMES),
        )),
    ]
    parser = OptionParser()
    for args, kwargs in options:
        parser.add_option(*args, **kwargs)
    opts, args = parser.parse_args()

    if not opts.genome:
        critical('Error: please, specify genome build name with -g (e.g. `-g hg19`)')
    genome = opts.genome

    debug('Getting features from storage')
    features_bed = ba.get_all_features(genome)
    if features_bed is None:
        critical('Genome ' + genome + ' is not supported. Supported: ' + ', '.join(ba.SUPPORTED_GENOMES))

    info('Extracting features from Ensembl GTF')
    features_bed = features_bed.filter(lambda x: x[ba.BedCols.FEATURE] == 'CDS')
    features_bed = features_bed.filter(ba.get_only_canonical_filter(genome))

    info('Saving CDS regions...')
    output_fpath = adjust_path(join(dirname(__file__), pardir, genome, 'bed', 'CDS-canonical.bed'))
    with file_transaction(None, output_fpath) as tx:
        features_bed.cut(range(6)).saveas(tx)
    info('Done, saved to ' + output_fpath)
def run_prank(run_id):
    project_names = run_id.split(',')
    projects = [Project.query.filter_by(name=pn).first() for pn in project_names]
    if not projects:
        log.err('Projects ' + ', '.join(project_names) + ' not found in database')
        abort(404)

    work_dirpath = safe_mkdir(join(config.DATA_DIR, '_AND_'.join(project_names)))
    safe_mkdir(work_dirpath)
    merged_fasta_fpath = merge_fasta(projects, work_dirpath)

    prank_out = os.path.join(work_dirpath, os.path.splitext(os.path.basename(merged_fasta_fpath))[0])
    cmdl = prank_bin + ' -d=' + merged_fasta_fpath + ' -o=' + prank_out + ' -showtree'
    log.debug('Starting prank ' + cmdl)
    proc = subprocess.Popen(cmdl.split(), stderr=subprocess.STDOUT, stdout=subprocess.PIPE)
    # lines = []
    # prev_time = time.time()
    for stdout_line in iter(proc.stdout.readline, ''):
        print(stdout_line.rstrip())
        # lines.append(stdout_line)
        cur_time = time.time()
        # if cur_time - prev_time > 2:
        emit('running', json.dumps({
            'finished': False,
            'lines': [stdout_line.rstrip()],
        }))
        # lines = []
    emit('running', json.dumps({
        'finished': True,
        'lines': [],
    }))
def requanitify_pizzly(pizzly_ref_fa, fusions_fasta, work_dir, fastq):
    """ Returns dict fusion-fasta-id -> {length  eff_length  est_counts  tpm}
    """
    trx_with_fusions = join(work_dir, 'transcripts_with_fusions.fasta.gz')
    kidx = join(work_dir, 'transcripts_with_fusions.kidx')

    if not isfile(trx_with_fusions):
        run_simple(f"cat {pizzly_ref_fa} {fusions_fasta} | gzip -c > {trx_with_fusions}")

    if not isfile(kidx):
        run_simple(f"kallisto index -k31 -i {kidx} {trx_with_fusions}")

    abundance = join(work_dir, 'abundance.tsv')
    if not isfile(abundance):
        run_simple(f"kallisto quant -i {kidx} -o {work_dir} {' '.join(fastq)}")

    logger.debug(f'Reading expression from {abundance}')
    expr_by_fusion = dict()
    with open(abundance) as f:
        header = f.readline().strip().split('\t')
        for row in csv.DictReader(f, delimiter='\t', fieldnames=header):
            expr_by_fusion[row['target_id']] = row
    return expr_by_fusion
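# Hypothetical usage sketch for requanitify_pizzly() above (file names are made up;
# assumes kallisto is on PATH and run_simple/join/isfile come from this module):
#
#     expr_by_fusion = requanitify_pizzly(
#         pizzly_ref_fa='ref/transcripts.fa.gz',
#         fusions_fasta='pizzly_out/fusions.fasta',
#         work_dir='work/requant',
#         fastq=['sample_R1.fq.gz', 'sample_R2.fq.gz'],
#     )
#     for fusion_id, row in expr_by_fusion.items():
#         print(fusion_id, row['est_counts'], row['tpm'])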
def get_chrom_lengths(genome=None, fai_fpath=None):
    assert genome or fai_fpath, f'One of genome or fai_fpath should be not None: genome={genome}, fai_fpath={fai_fpath}'

    if not fai_fpath:
        check_genome(genome)
        fai_fpath = get_fai(genome)
    else:
        fai_fpath = verify_file(fai_fpath, is_critical=True)
        if not fai_fpath.endswith('.fai') and not fai_fpath.endswith('.fa'):
            critical('Error: .fai or .fa is accepted.')

    chr_lengths = []

    if fai_fpath.endswith('.fa'):
        debug('Reading genome sequence (.fa) to get chromosome lengths')
        with open(fai_fpath, 'r') as handle:
            from Bio import SeqIO
            reference_records = SeqIO.parse(handle, 'fasta')
            for record in reference_records:
                chrom = record.id
                chr_lengths.append((chrom, len(record.seq)))
    else:
        debug('Reading genome index file (.fai) to get chromosome lengths')
        with open(fai_fpath, 'r') as handle:
            for line in handle:
                line = line.strip()
                if line:
                    chrom, length = line.split()[0], int(line.split()[1])
                    chr_lengths.append((chrom, length))

    return chr_lengths
def get_all_features(genome, high_confidence=False, features=None, gene_names=None, only_canonical=False):
    _canon_filt = get_only_canonical_filter(genome) if only_canonical else None

    ori_genome = genome
    genome = genome.replace('GRCh37', 'hg19')
    genome = genome.replace('GRCh38', 'hg38')

    bed = _get_ensembl_file('ensembl.bed', genome)

    def _filter(x):
        if high_confidence:
            if x[BedCols.HUGO] in ['', '.', None]:
                return False
        if features:
            if x[BedCols.FEATURE] not in features:
                return False
        if gene_names:
            if x[BedCols.GENE] not in gene_names:
                return False
        if _canon_filt:
            if not _canon_filt(x):
                return False
        return True

    debug('Filtering BEDTool for: HUGO annotation, specific features, specific genes, canonical')
    bed = bed.filter(_filter)

    if ori_genome.startswith('GRCh'):
        def fix_chr(r):
            r.chrom = r.chrom.replace('chrM', 'MT').replace('chr', '')
            return r
        bed = bed.each(fix_chr)

    return bed
def extract_features(output_file, genome, only_canonical, high_confidence, coding_only, feature_types):
    """ For debug purposes
    """
    debug('Getting features from storage')
    features_bed = ba.get_all_features(genome)
    if features_bed is None:
        critical('Genome ' + genome + ' is not supported. Supported: ' + ', '.join(ba.SUPPORTED_GENOMES))

    if high_confidence:
        features_bed = features_bed.filter(ba.high_confidence_filter)
    if only_canonical:
        features_bed = features_bed.filter(ba.get_only_canonical_filter(genome))
    if coding_only:
        features_bed = features_bed.filter(ba.protein_coding_filter)
    # unique_tx_by_gene = find_best_tx_by_gene(features_bed)

    info('Extracting features from Ensembl GTF')
    feature_types = feature_types or ['exon', 'CDS', 'stop_codon', 'transcript']
    features_bed = features_bed.filter(lambda x: x[ba.BedCols.FEATURE] in feature_types)
    # x[ebl.BedCols.ENSEMBL_ID] == unique_tx_by_gene[x[ebl.BedCols.GENE]])

    info('Overlapping regions with Ensembl data')
    features_bed.saveas(output_file)
    debug(f'Saved features to {output_file}')
def convert_file(work_dir, input_fpath, convert_file_fn, suffix=None, output_fpath=None,
                 check_result=True, overwrite=False, reuse=True, ctx=None):
    assert output_fpath or suffix, str(output_fpath) + ' ' + str(suffix)
    output_fpath = output_fpath or intermediate_fname(work_dir, input_fpath, suf=suffix)
    if output_fpath.endswith('.gz'):
        debug('output_fpath is .gz, but writing to uncompressed.')
        output_fpath = splitext(output_fpath)[0]

    if not overwrite:
        if can_reuse(output_fpath, cmp_f=input_fpath):
            debug('Reusing ' + output_fpath)
            return output_fpath
        if can_reuse(output_fpath + '.gz', cmp_f=input_fpath):
            debug('Reusing ' + output_fpath + '.gz')
            return output_fpath

    if islink(output_fpath):
        os.unlink(output_fpath)

    debug('Writing to ' + output_fpath)
    with file_transaction(work_dir, output_fpath) as tx_fpath:
        with open_gzipsafe(input_fpath) as inp_f, open(tx_fpath, 'w') as out_f:
            if ctx:
                convert_file_fn(inp_f, out_f, ctx)
            else:
                convert_file_fn(inp_f, out_f)

    if suffix or output_fpath:
        debug('Saved to ' + output_fpath)

    verify_file(output_fpath, is_critical=check_result)
    return output_fpath
def check_md5(work_dir, fpath, file_type, silent=False):
    md5_fpath = join(work_dir, file_type + '_md5.txt')
    new_md5 = md5(fpath)
    info('md5 of ' + fpath + ' is ' + str(new_md5))

    prev_md5 = None
    if isfile(md5_fpath):
        with open(md5_fpath) as f:
            prev_md5 = f.read()
    else:
        info('Previous md5 file ' + md5_fpath + ' does not exist')
    info('Checking previous md5 from ' + md5_fpath + ': ' + str(prev_md5))

    if prev_md5 == new_md5:
        if not silent:
            debug('Reusing previous ' + file_type.upper() + ' files.')
        return True
    else:
        if not silent:
            info('Pre-processing input ' + file_type.upper() + ' file')
        if prev_md5:
            if not silent:
                info('Prev ' + file_type.upper() + ' md5: ' + str(prev_md5))
                info('New ' + file_type.upper() + ' md5: ' + str(new_md5))
        with open(md5_fpath, 'w') as f:
            f.write(str(new_md5))
        return False
def run_analysis_socket_handler(project_names_line):
    log.debug('Received request to start analysis for ' + project_names_line)
    ws = request.environ.get('wsgi.websocket', None)
    if not ws:
        raise RuntimeError('Environment lacks WSGI WebSocket support')

    def _run_cmd(cmdl):
        log.debug(cmdl)
        proc = subprocess.Popen(cmdl.split(), stderr=subprocess.STDOUT, stdout=subprocess.PIPE, env=os.environ)
        for stdout_line in iter(proc.stdout.readline, None):
            if not stdout_line:
                break
            if not six.PY2:
                stdout_line = stdout_line.decode()
            if '#(' not in stdout_line.strip():
                _send_line(ws, stdout_line)
        log.debug('Exit from the subprocess')

    manage_py = abspath(join(dirname(__file__), '..', 'manage.py'))
    _run_cmd(sys.executable + ' ' + manage_py + ' analyse_projects ' + project_names_line)

    run = Run.find_by_project_names_line(project_names_line)
    if not run:
        _send_line(ws, 'Run for projects ' + project_names_line +
                   ' cannot be found. Has genotyping failed?', error=True)

    ws.send(json.dumps({'finished': True}))
    return ''
def sort_bed_gsort(input_bed_fpath, output_bed_fpath=None, work_dir=None, fai_fpath=None, genome=None):
    input_bed_fpath = verify_bed(input_bed_fpath, is_critical=True)
    output_bed_fpath = adjust_path(output_bed_fpath) if output_bed_fpath \
        else intermediate_fname(work_dir, input_bed_fpath, 'sorted')

    debug('Sorting regions in ' + str(input_bed_fpath))
    if can_reuse(output_bed_fpath, input_bed_fpath):
        debug(output_bed_fpath + ' exists, reusing')
        return output_bed_fpath

    if fai_fpath:
        fai_fpath = verify_file(fai_fpath)
    elif genome:
        fai_fpath = verify_file(ref.get_fai(genome))
    else:
        critical('Either of fai_fpath or genome build name must be specified')

    with file_transaction(work_dir, output_bed_fpath) as tx:
        run('gsort {input_bed_fpath} {fai_fpath}'.format(**locals()), output_fpath=tx)

    return output_bed_fpath
def set_up_log(log_dir, log_fname):
    log_fpath = join(log_dir, log_fname)
    logger.set_log_path(log_fpath, save_previous=True)
    debug('Logging to ' + log_fpath)
    debug()
    return log_fpath
def get_parallel_view(n_samples, parallel_cfg):
    if parallel_cfg.scheduler and parallel_cfg.threads > 1:
        debug('Starting' + (' test' if not is_cluster() else '') +
              ' cluster (scheduler: ' + parallel_cfg.scheduler + ', queue: ' + parallel_cfg.queue + ') '
              'using ' + str(parallel_cfg.num_jobs(n_samples)) + ' nodes, ' +
              str(parallel_cfg.cores_per_job(n_samples)) + ' threads per each sample')
        return ClusterView(n_samples, parallel_cfg)
    else:
        debug('Running locally using ' + str(parallel_cfg.num_jobs(n_samples)) + ' thread(s)')
        return ThreadedView(n_samples, parallel_cfg)
def bam_to_bed(bam_fpath, to_gzip=True):
    debug('Converting the BAM to BED to save some memory.')
    # from here: http://davetang.org/muse/2015/08/05/creating-a-coverage-plot-using-bedtools-and-r/
    bam_bed_fpath = splitext_plus(bam_fpath)[0] + ('.bed.gz' if to_gzip else '.bed')
    if can_reuse(bam_bed_fpath, bam_fpath):
        return bam_bed_fpath
    bedtools = which('bedtools')
    gzip = which('gzip')
    cmdline = '{bedtools} bamtobed -i {bam_fpath}'.format(**locals())
    cmdline += ' | {gzip}'.format(**locals()) if to_gzip else ''
    call_process.run(cmdline, output_fpath=bam_bed_fpath)
    return bam_bed_fpath
def get_merged_cds(genome):
    """ Returns all CDS merged, used:
        - for TargQC general reports CDS coverage statistics for WGS
        - for Seq2C CNV calling when no capture BED available
    """
    bed = get_all_features(genome)
    debug('Filtering BEDTool for high confidence CDS and stop codons')
    return bed\
        .filter(lambda r: r.fields[BedCols.FEATURE] in ['CDS', 'stop_codon'])\
        .filter(high_confidence_filter)\
        .merge()
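# A minimal usage sketch for get_merged_cds() (the output path is hypothetical):
#
#     merged_cds = get_merged_cds('hg19')       # BedTool of merged high-confidence CDS + stop codons
#     merged_cds.saveas('hg19_cds_merged.bed')  # e.g. used when no capture BED is available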
def set_up_dirs(proc_name, output_dir=None, work_dir=None, log_dir=None):
    """ Creates output_dir, work_dir, and sets up log
    """
    output_dir = safe_mkdir(adjust_path(output_dir or join(os.getcwd(), proc_name)), 'output_dir')
    debug('Saving results into ' + output_dir)

    work_dir = safe_mkdir(work_dir or join(output_dir, 'work'), 'working directory')
    info('Using work directory ' + work_dir)

    log_fpath = set_up_log(log_dir or safe_mkdir(join(work_dir, 'log')), proc_name + '.log')

    return output_dir, work_dir, log_fpath
def __init__(self, input_dir=None, silent=False, include_samples=None, exclude_samples=None,
             genome_build=None, **kwargs):
    BaseProject.__init__(self, input_dir=input_dir, **kwargs)
    self.genome_build = genome_build

    debug(f'Parsing project {input_dir}')
    self.batch_by_name = DragenProject.find_batches(self.dir, silent=silent,
                                                    include_samples=include_samples,
                                                    exclude_samples=exclude_samples,
                                                    parent_project=self)
    if len(self.batch_by_name) == 1:
        self.project_name = list(self.batch_by_name.values())[0].name
    else:
        self.project_name = basename(input_dir)
def _run_cmd(cmdl):
    log.debug(cmdl)
    proc = subprocess.Popen(cmdl.split(), stderr=subprocess.STDOUT, stdout=subprocess.PIPE, env=os.environ)
    for stdout_line in iter(proc.stdout.readline, None):
        if not stdout_line:
            break
        if not six.PY2:
            stdout_line = stdout_line.decode()
        if '#(' not in stdout_line.strip():
            _send_line(ws, stdout_line)
    log.debug('Exit from the subprocess')
def safe_symlink_to(fpath, dst_dirpath, rel=False):
    if rel:
        fpath = os.path.relpath(fpath, dst_dirpath)
    dst = join(dst_dirpath, basename(fpath))
    if not exists(dst):
        try:
            if os.lstat(dst):  # broken symlink
                os.remove(dst)
        except OSError:
            pass
        debug('Symlink ' + fpath + ' -> ' + dst)
        os.symlink(fpath, dst)
    return dst
def bam_to_bed(bam_fpath, to_gzip=True):
    debug('Converting the BAM to BED to save some memory.')
    # from here: http://davetang.org/muse/2015/08/05/creating-a-coverage-plot-using-bedtools-and-r/
    bam_bed_fpath = splitext_plus(bam_fpath)[0] + ('.bed.gz' if to_gzip else '.bed')
    if can_reuse(bam_bed_fpath, bam_fpath):
        return bam_bed_fpath
    bedtools = which('bedtools')
    gzip = which('gzip')
    cmdline = '{bedtools} bamtobed -i {bam_fpath}'.format(**locals())
    cmdline += ' | {gzip}'.format(**locals()) if to_gzip else ''
    call_process.run(cmdline, output_fpath=bam_bed_fpath)
    return bam_bed_fpath
def can_reuse(fpath, cmp_f, silent=False):
    """Check if a file `fpath` exists, is non-empty and is more recent than `cmp_f`
    """
    do_reuse = os.environ.get('REUSE', '1')
    if do_reuse == '0':
        return False
    if not fpath or not isfile(fpath):
        return False
    elif verify_file(fpath, cmp_f=cmp_f, silent=True):
        if not silent:
            debug('Reusing ' + fpath)
        return True
    else:
        return False
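# Usage sketch for can_reuse(): skip regenerating an output when it is newer than its
# input (paths below are hypothetical; set the REUSE=0 environment variable to force a rerun):
#
#     if not can_reuse('sample.sorted.bed', cmp_f='sample.bed'):
#         sort_bed('sample.bed', output_bed_fpath='sample.sorted.bed', genome='hg19')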
def get_or_create_run(projects, parall_view=None):
    genomes = set([p.genome for p in projects])
    if len(genomes) > 1:
        log.critical('Error: multiple genomes in projects: ' + str(genomes))

    run = Run.find_by_projects(projects)
    if run and run.rerun_on_usercall:
        log.info()
        log.info('Rebuilding tree on usercall')
        build_tree(run)
        run.rerun_on_usercall = False
        db.session.commit()
        return run

    if run and not Run.is_ready(run):
        log.debug('Tree files do not exist, recreating run for projects ' + ', '.join(p.name for p in projects))
        db.session.delete(run)
        db.session.commit()
        run = None

    if run:
        log.debug('Found run for ' + ', '.join([p.name for p in projects]) + ' with ID ' + str(run.id))
    else:
        log.debug('Creating new run for projects ' + ', '.join(p.name for p in projects))
        run = Run.create(projects, parall_view)
        log.debug('Done creating new run with ID ' + str(run.id))
    return run
def clean_bed(bed_fpath, work_dir):
    clean_fpath = intermediate_fname(work_dir, bed_fpath, 'clean')

    if not can_reuse(clean_fpath, bed_fpath):
        pybedtools.set_tempdir(safe_mkdir(join(work_dir, 'pybedtools_tmp')))
        bed = BedTool(bed_fpath)
        bed = bed.filter(lambda x: x.chrom and
                         not any(x.chrom.startswith(e) for e in ['#', ' ', 'track', 'browser']))
        bed = bed.remove_invalid()
        with file_transaction(work_dir, clean_fpath) as tx_out_file:
            bed.saveas(tx_out_file)
        verify_bed(clean_fpath, is_critical=True)
        debug('Saved clean BED file into ' + clean_fpath)

    return clean_fpath
def clean_bed(bed_fpath, work_dir):
    clean_fpath = intermediate_fname(work_dir, bed_fpath, 'clean')

    if not can_reuse(clean_fpath, bed_fpath):
        import pybedtools
        pybedtools.set_tempdir(safe_mkdir(join(work_dir, 'pybedtools_tmp')))
        bed = pybedtools.BedTool(bed_fpath)
        bed = bed.filter(lambda x: x.chrom and
                         not any(x.chrom.startswith(e) for e in ['#', ' ', 'track', 'browser']))
        bed = bed.remove_invalid()
        with file_transaction(work_dir, clean_fpath) as tx_out_file:
            bed.saveas(tx_out_file)
        verify_bed(clean_fpath, is_critical=True)
        debug('Saved clean BED file into ' + clean_fpath)

    return clean_fpath
def set_up_dirs(proc_name, output_dir=None, work_dir=None, log_dir=None):
    """ Creates output_dir, work_dir, and sets up log
    """
    output_dir = safe_mkdir(adjust_path(output_dir or join(os.getcwd(), proc_name)), 'output_dir')
    debug('Saving results into ' + output_dir)

    work_dir = safe_mkdir(work_dir or join(output_dir, 'work'), 'working directory')
    info('Using work directory ' + work_dir)

    log_fpath = set_up_log(log_dir or safe_mkdir(join(work_dir, 'log')), proc_name + '.log')

    return output_dir, work_dir, log_fpath
def main(input_bed, output_file, output_features=False, genome=None, only_canonical=False, short=False,
         extended=False, high_confidence=False, ambiguities_method=False, coding_only=False,
         collapse_exons=False, work_dir=False, is_debug=False):
    """ Annotating BED file based on reference features annotations.
    """
    logger.init(is_debug_=is_debug)

    if not genome:
        raise click.BadParameter('Error: please, specify genome build name with -g (e.g. `-g hg19`)', param='genome')

    if short:
        if extended:
            raise click.BadParameter('--short and --extended can\'t be set both', param='extended')
        if output_features:
            raise click.BadParameter('--short and --output-features can\'t be set both', param='output_features')
    elif output_features or extended:
        extended = True
        short = False

    if not verify_file(input_bed):
        raise click.BadParameter(f'Usage: {__file__} Input_BED_file -g hg19 -o Annotated_BED_file [options]',
                                 param='input_bed')
    input_bed = verify_file(input_bed, is_critical=True, description=f'Input BED file for {__file__}')

    # remember whether the work dir was provided by the user, so a temporary one can be removed at the end
    keep_work_dir = bool(work_dir)
    if work_dir:
        work_dir = join(adjust_path(work_dir), os.path.splitext(basename(input_bed))[0])
        safe_mkdir(work_dir)
        info(f'Created work directory {work_dir}')
    else:
        work_dir = mkdtemp('bed_annotate')
        debug(f'Created temporary work directory {work_dir}')

    input_bed = clean_bed(input_bed, work_dir)
    input_bed = verify_bed(input_bed, is_critical=True, description=f'Input BED file for {__file__} after cleaning')

    output_file = adjust_path(output_file)

    output_file = annotate(
        input_bed, output_file, work_dir, genome=genome,
        only_canonical=only_canonical, short=short, extended=extended,
        high_confidence=high_confidence, collapse_exons=collapse_exons,
        output_features=output_features, ambiguities_method=ambiguities_method,
        coding_only=coding_only, is_debug=is_debug)

    if not keep_work_dir:
        debug(f'Removing work directory {work_dir}')
        shutil.rmtree(work_dir)

    info(f'Done, saved to {output_file}')
def main():
    options = [
        (['-g', '--genome'], dict(
            dest='genome',
            help='Genome build. Accepted values: ' + ', '.join(ebl.SUPPORTED_GENOMES),
        )),
        (['-c', '--canonical'], dict(
            dest='canonical',
            action='store_true',
            help='Use canonical only',
        )),
    ]
    parser = OptionParser()
    for args, kwargs in options:
        parser.add_option(*args, **kwargs)
    opts, args = parser.parse_args()

    if not opts.genome:
        logger.critical('Error: please, specify genome build name with -g (e.g. `-g hg19`)')
    genome = opts.genome

    logger.debug('Getting features from storage')
    features_bed = ebl.get_all_features(genome)
    if features_bed is None:
        logger.critical('Genome ' + genome + ' is not supported. Supported: ' + ', '.join(ebl.SUPPORTED_GENOMES))

    logger.warn('Extracting features from Ensembl GTF')
    features_bed = features_bed.filter(lambda x: x[ebl.BedCols.FEATURE] == 'CDS')
    if opts.canonical:
        features_bed = features_bed.filter(ebl.get_only_canonical_filter(genome))

    logger.warn('Saving CDS regions...')
    output_fpath = adjust_path(join(dirname(__file__), pardir, genome, 'bed', 'CDS-canonical.bed'))
    with file_transaction(None, output_fpath) as tx:
        features_bed.cut(range(6)).saveas(tx)
    logger.warn('Done, saved to ' + output_fpath)
def get_gtf_db(gtf, in_memory=False):
    """ create a gffutils DB
    """
    db_file = gtf + '.db'
    if gtf.endswith('.gz'):
        db_file = gtf[:-3] + '.db'
    if file_exists(db_file):
        return gffutils.FeatureDB(db_file)
    db_file = ':memory:' if in_memory else db_file
    if in_memory or not file_exists(db_file):
        debug('GTF database does not exist, creating...')
        infer_extent = guess_infer_extent(gtf)
        db = gffutils.create_db(gtf, dbfn=db_file, infer_gene_extent=infer_extent)
        return db
    else:
        return gffutils.FeatureDB(db_file)
def calc_genomic_bp_pos(self):
    genomic_coord, is_in_intron = FusionSide.offset_to_genome_coord(self.trx, self.bp_offset)
    if genomic_coord is None:
        logger.critical(f'  Error: could not convert transcript {self.trx.id} '
                        f'offset {self.bp_offset} to genomic coordinate')
        return False

    if genomic_coord == -1:
        logger.debug(f'  Fusion takes in the entire transcript {self.trx.id} '
                     f'(genomic_coord={genomic_coord}, bp_offset={self.bp_offset}). '
                     f'That\'s suspicious, so we are skipping it.')
        return False

    self.bp_genomic_pos = genomic_coord
    self.bp_is_in_intron = is_in_intron
    return True
def run_processing(project_names_line, redirect_to, email=None):
    pnames = project_names_line.split('--')
    log.debug(f'Received request to start analysis for {project_names_line}')

    run_id = hashlib.sha256(str(project_names_line).encode()).hexdigest()
    run_log = join(DATA_DIR, f'run_{run_id}.log')

    if isfile(run_log):
        msg = f'''<p>Run for projects {project_names_line} already started. Please, wait until it finished.<br>
<p>Follow the log at:</p>
<pre>{run_log}</pre>
<p>And reload the page when it\'s finished.</p>'''
    else:
        manage_py = abspath(join(dirname(__file__), '..', 'manage.py'))
        vardict = which('vardict')
        assert vardict, 'vardict is not in PATH. Are you running from "clearup" environment?'
        back_url = f"http://{clearup.HOST_IP}:{clearup.PORT}{redirect_to}"
        cmdl = f'{sys.executable} {manage_py} analyse_projects {project_names_line} --back_url={back_url}'
        if email:
            cmdl += f' --email={email}'
        log.debug(cmdl)
        process = subprocess.Popen(cmdl, stderr=subprocess.STDOUT, stdout=open(run_log, 'w'),
                                   env=os.environ, close_fds=True, shell=True)
        msg = f'''<p>Starting analysis with a command line:</p>
<pre>{cmdl}</pre>
<p>Process is running under ID={process.pid}. Follow the log at:</p>
<pre>{run_log}</pre>
<p>And reload the page when it\'s finished.</p>'''
        if email:
            msg += f'<br>When the run finished, you will be notified by an email sent to {email}'

    return render_template('submitted.html',
                           projects=pnames,
                           title='Comparing projects ' + ', '.join(pnames),
                           project_names_line=project_names_line,
                           redirect_to=redirect_to,
                           message=msg)
def _get(relative_path, genome=None):
    """
    :param relative_path: relative path of the file inside the repository
    :param genome: genome name. Can contain chromosome name after a dash, like hg19-chr20;
                   in case of BED, the returned BedTool will have a chromosome filter added.
    :return: BedTool object if it's a BED file, or filepath
    """
    chrom = None
    if genome:
        if '-chr' in genome:
            genome, chrom = genome.split('-')
        check_genome(genome)
        relative_path = relative_path.format(genome=genome)

    path = abspath(join(dirname(__file__), relative_path))
    if not isfile(path) and isfile(path + '.gz'):
        path += '.gz'

    if path.endswith('.bed') or path.endswith('.bed.gz'):
        if path.endswith('.bed.gz'):
            bedtools = which('bedtools')
            if not bedtools:
                critical('bedtools not found in PATH: ' + str(os.environ['PATH']))
            debug('BED is compressed, creating BedTool')
            bed = BedTool(path)
        else:
            debug('BED is uncompressed, creating BedTool')
            bed = BedTool(path)

        if chrom:
            debug('Filtering BEDTool for chrom ' + chrom)
            bed = bed.filter(lambda r: r.chrom == chrom)

        return bed
    else:
        return path
def _sex_from_x_snps(vcf_file):
    log.debug('Calling sex from ' + vcf_file)
    het_calls_num = 0
    hom_calls_num = 0
    for rec in VCF(vcf_file):
        if rec.CHROM == 'chrX':
            if rec.num_het > 0:
                het_calls_num += 1
            if rec.num_hom > 0:
                hom_calls_num += 1

    if het_calls_num + hom_calls_num > 10:
        if het_calls_num > 1.5 * hom_calls_num:
            return 'F'
        elif het_calls_num < 0.5 * hom_calls_num:
            return 'M'
        else:
            log.debug('het/hom ratio on chrX is ' + str(het_calls_num / hom_calls_num) +
                      ' - between 1.5 and 0.5, not confident enough to call sex.')
    else:
        log.debug('Total chrX calls number is ' + str(het_calls_num + hom_calls_num) +
                  ' - less than 10, not confident enough to call sex.')
    return None
def phylo_tree_page(run_id):
    project_names = run_id.split(',')
    projects = [Project.query.filter_by(name=pn).first() for pn in project_names]
    if not projects:
        log.err('Projects ' + ', '.join(project_names) + ' not found in database')
        abort(404)
    color_by_proj = {p.name: PROJ_COLORS[i % len(PROJ_COLORS)] for i, p in enumerate(projects)}

    work_dirpath = safe_mkdir(join(config.DATA_DIR, '_AND_'.join(project_names)))
    safe_mkdir(work_dirpath)
    merged_fasta_fpath = merge_fasta(projects, work_dirpath)

    prank_out = os.path.join(work_dirpath, os.path.splitext(os.path.basename(merged_fasta_fpath))[0])
    tree_fpath = os.path.join(prank_out + '.best.dnd')
    if not can_reuse(tree_fpath, merged_fasta_fpath):
        return render_template(
            'processing.html',
            projects=[{
                'name': p.name,
            } for i, p in enumerate(projects)],
            run_id=run_id,
            title='Processing ' + ', '.join(project_names),
        )

    log.debug('Prank results found, rendering tree!')
    tree = next(Phylo.parse(tree_fpath, 'newick'))
    seq_by_id = read_fasta(merged_fasta_fpath)
    tree_json = tree_to_json_for_d3(tree, seq_by_id, color_by_proj, run_id=run_id)

    all_samples_count = sum(len(p.samples.all()) for p in projects)
    return render_template(
        'tree.html',
        projects=[{
            'name': p.name,
            'color': color_by_proj[p.name],
        } for i, p in enumerate(projects)],
        title=', '.join(project_names),
        data=tree_json,
        tree_height=20 * all_samples_count,
        tree_width=5 * all_samples_count,
    )
def find_bam(self, silent=False):
    name = self.get_name_for_files()

    to_try = [
        '-ready.cram',
        '-ready.bam',
        '-sort.bam',
    ]
    for ext in to_try:
        fpath = adjust_path(join(self.dirpath, name + ext))
        if verify_file(fpath):
            return fpath

    input_file = self.sample_info['files']
    if not isinstance(input_file, str):
        input_file = input_file[0]
    if isinstance(input_file, str) and input_file.endswith('.bam'):
        debug('Bcbio was run from BAM input')
        if not input_file.startswith('/'):
            input_file = abspath(join(self.bcbio_project.work_dir, input_file))
        if verify_file(input_file):
            debug('Using BAM file from input YAML ' + input_file)
            return input_file
        else:
            debug('Input BAM file for sample ' + self.name + ' in YAML ' + input_file + ' does not exist')

    if not silent:
        warn('No BAM or CRAM file found for ' + self.name)
def find_bam(self, silent=False):
    name = self.get_name_for_files()

    to_try = [
        '-ready.bam',
        '-ready.cram',
        '-sort.bam',
    ]
    for ext in to_try:
        fpath = adjust_path(join(self.dirpath, name + ext))
        if verify_file(fpath):
            return fpath

    input_file = self.sample_info['files']
    if not isinstance(input_file, str):
        input_file = input_file[0]
    if isinstance(input_file, str) and input_file.endswith('.bam'):
        debug('Bcbio was run from BAM input')
        if not input_file.startswith('/'):
            input_file = abspath(join(self.parent_project.work_dir, input_file))
        if verify_file(input_file):
            debug('Using BAM file from input YAML ' + input_file)
            return input_file
        else:
            debug('Input BAM file for sample ' + self.name + ' in YAML ' + input_file + ' does not exist')

    if not silent:
        warn('No BAM or CRAM file found for ' + self.name)
def sort_bed(input_bed_fpath, output_bed_fpath=None, work_dir=None, fai_fpath=None, chr_order=None, genome=None):
    input_bed_fpath = verify_bed(input_bed_fpath, is_critical=True)
    output_bed_fpath = adjust_path(output_bed_fpath) if output_bed_fpath \
        else intermediate_fname(work_dir, input_bed_fpath, 'sorted')

    debug('Sorting regions in ' + str(input_bed_fpath))
    if can_reuse(output_bed_fpath, input_bed_fpath):
        debug(output_bed_fpath + ' exists, reusing')
        return output_bed_fpath

    regions = []

    if not chr_order:
        if fai_fpath:
            fai_fpath = verify_file(fai_fpath)
        elif genome:
            fai_fpath = verify_file(ref.get_fai(genome))
        else:
            critical('Either of chr_order, fai_fpath, or genome build name must be specified')
        chr_order = get_chrom_order(fai_fpath=fai_fpath)

    with open(input_bed_fpath) as f:
        with file_transaction(work_dir, output_bed_fpath) as tx:
            with open(tx, 'w') as out:
                for l in f:
                    if not l.strip():
                        continue
                    if l.strip().startswith('#'):
                        out.write(l)
                        continue

                    fs = l.strip().split('\t')
                    chrom = fs[0]
                    start = int(fs[1])
                    end = int(fs[2])
                    other_fields = fs[3:]
                    order = chr_order.get(chrom, -1)
                    regions.append(Region(chrom, start, end, other_fields, order))

                for region in sorted(regions, key=lambda r: r.get_key()):
                    fs = [region.chrom, str(region.start), str(region.end)]
                    fs.extend(region.other_fields)
                    out.write('\t'.join(fs) + '\n')

    debug('Sorted ' + str(len(regions)) + ' regions, saved to ' + output_bed_fpath)
    return output_bed_fpath
def find_raw_vcf(self, silent=False, caller=None):
    caller = caller or self.bcbio_project.somatic_caller
    vcf_fpath = None
    if self.batch and self.phenotype != 'normal':
        vcf_fpath = self.bcbio_project.find_vcf_file(self.batch.name, silent=silent, caller=caller)
    if not vcf_fpath:  # in sample dir?
        if not silent:
            debug('-')
            debug('Not found VCF in the datestamp dir, looking at the sample-level dir')
            debug('-')
        vcf_fpath = self.bcbio_project.find_vcf_file_from_sample_dir(
            self, silent=silent or self.phenotype == 'normal', caller=caller)
    return vcf_fpath
def sort_bed(input_bed_fpath, output_bed_fpath=None, work_dir=None, fai_fpath=None, chr_order=None, genome=None):
    input_bed_fpath = verify_bed(input_bed_fpath, is_critical=True)
    output_bed_fpath = adjust_path(output_bed_fpath) if output_bed_fpath \
        else intermediate_fname(work_dir, input_bed_fpath, 'sorted')

    debug('Sorting regions in ' + str(input_bed_fpath))
    if can_reuse(output_bed_fpath, input_bed_fpath):
        debug(output_bed_fpath + ' exists, reusing')
        return output_bed_fpath

    regions = []

    if not chr_order:
        if fai_fpath:
            fai_fpath = verify_file(fai_fpath)
        elif genome:
            fai_fpath = verify_file(ref.get_fai(genome))
        else:
            critical('Either of chr_order, fai_fpath, or genome build name must be specified')
        chr_order = get_chrom_order(fai_fpath=fai_fpath)

    with open(input_bed_fpath) as f:
        with file_transaction(work_dir, output_bed_fpath) as tx:
            with open(tx, 'w') as out:
                for l in f:
                    if not l.strip():
                        continue
                    if l.strip().startswith('#'):
                        out.write(l)
                        continue

                    fs = l.strip().split('\t')
                    chrom = fs[0]
                    start = int(fs[1])
                    end = int(fs[2])
                    other_fields = fs[3:]
                    order = chr_order.get(chrom, -1)
                    regions.append(Region(chrom, start, end, other_fields, order))

                for region in sorted(regions, key=lambda r: r.get_key()):
                    fs = [region.chrom, str(region.start), str(region.end)]
                    fs.extend(region.other_fields)
                    out.write('\t'.join(fs) + '\n')

    debug('Sorted ' + str(len(regions)) + ' regions, saved to ' + output_bed_fpath)
    return output_bed_fpath
def __init__(self, n_samples, parallel_cfg):
    BaseView.__init__(self, n_samples, parallel_cfg)
    self._view = CV(**parallel_cfg.get_cluster_params(n_samples))
    debug('Starting cluster with ' + str(self.num_jobs) + ' open nodes, ' +
          str(self.cores_per_job) + ' cores per node')
def determine_sex(work_dir, bam_fpath, avg_depth, genome, target_bed=None):
    debug()
    debug('Determining sex')
    pybedtools.set_tempdir(safe_mkdir(join(work_dir, 'pybedtools_tmp')))

    male_bed = None
    for k in chry_key_regions_by_genome:
        if k in genome:
            male_bed = BedTool(chry_key_regions_by_genome.get(k))
            break
    if not male_bed:
        warn('Warning: no male key regions for ' + genome + ', cannot identify sex')
        return None

    male_area_size = get_total_bed_size(male_bed)
    debug('Male region total size: ' + str(male_area_size))

    if target_bed:
        target_male_bed = join(work_dir, 'male.bed')
        with file_transaction(work_dir, target_male_bed) as tx:
            BedTool(target_bed).intersect(male_bed).merge().saveas(tx)
        target_male_area_size = get_total_bed_size(target_male_bed)
        if target_male_area_size == 0:
            debug('The male non-PAR region does not overlap with the capture target - cannot determine sex.')
            return None
        male_bed = target_male_bed
    else:
        debug('WGS, determining sex based on chrY key regions coverage.')

    info('Detecting sex by comparing the Y chromosome key regions coverage and average coverage depth.')
    if not bam_fpath:
        critical('BAM file is required.')

    index_bam(bam_fpath)
    chry_mean_coverage = _calc_mean_coverage(work_dir, male_bed, bam_fpath, 1)
    debug('Y key regions average depth: ' + str(chry_mean_coverage))
    avg_depth = float(avg_depth)
    debug('Sample average depth: ' + str(avg_depth))
    if avg_depth < AVG_DEPTH_THRESHOLD_TO_DETERMINE_SEX:
        debug('Sample average depth is too low (less than ' + str(AVG_DEPTH_THRESHOLD_TO_DETERMINE_SEX) +
              ') - cannot determine sex')
        return None

    if chry_mean_coverage == 0:
        debug('Y depth is 0 - it\'s female')
        sex = 'F'
    else:
        factor = avg_depth / chry_mean_coverage
        debug('Sample depth / Y depth = ' + str(factor))
        if factor > FEMALE_Y_COVERAGE_FACTOR:  # if mean target coverage much higher than chrY coverage
            debug('Sample depth is more than ' + str(FEMALE_Y_COVERAGE_FACTOR) +
                  ' times higher than Y depth - it\'s female')
            sex = 'F'
        else:
            debug('Sample depth is not more than ' + str(FEMALE_Y_COVERAGE_FACTOR) +
                  ' times higher than Y depth - it\'s male')
            sex = 'M'
    debug('Sex is ' + sex)
    debug()
    return sex
def run(self, fn, param_lists):
    debug('Starting multithreaded function ' + str(fn))
    assert self.n_samples == len(param_lists)
    return self._view(delayed(fn)(*params) for params in param_lists)
def load_from_sample_info(sample_info, bcbio_project, exclude_samples=None, include_samples=None,
                          extra_batches=None, silent=False):
    # Get sample and batch names and exclude/include based on exclude_samples and include_samples
    description = str(sample_info['description']).replace('.', '_')

    batch_names = sample_info.get('metadata', dict()).get('batch')
    if isinstance(batch_names, int) or isinstance(batch_names, float):
        batch_names = str(batch_names)
    if isinstance(batch_names, str):
        batch_names = [batch_names]
    batch_names = [b.replace('.', '_') for b in batch_names if b]

    if exclude_samples:
        # Sample name
        if description in exclude_samples:
            if not silent:
                info(f'Skipping sample {description}')
            return None
        # Batch names
        if batch_names:
            filtered_batch_names = [b for b in batch_names if b not in exclude_samples]
            if not filtered_batch_names:
                if not silent:
                    info(f'Skipping sample {description} with batch info {", ".join(batch_names)}')
                return None
            batch_names = filtered_batch_names

    if include_samples:
        # Sample name
        if description in include_samples:
            if not silent:
                info(f'Using sample {description} and all samples sharing batches {batch_names}')
        else:
            # Batch names
            if batch_names:
                incl_batch_names = [b for b in batch_names if b in include_samples]
                if incl_batch_names:
                    if not silent:
                        info(f'Using sample {description} with batch info {", ".join(batch_names)}')

                extr_batch_names = [b for b in batch_names if extra_batches and b in extra_batches]
                if extr_batch_names and not incl_batch_names:
                    if not silent:
                        info(f'Using sample {description} as it shares batches {extr_batch_names} with included samples')
                    incl_batch_names += extr_batch_names

                if incl_batch_names:
                    batch_names = incl_batch_names
                else:
                    return None

    # Creating BcbioSample object
    s = BcbioSample(bcbio_project)
    s.sample_info = sample_info
    if 'description_original' in sample_info:
        s.old_name = str(sample_info['description_original']).replace('.', '_')

    # Setting phenotype and batches
    s.phenotype = sample_info.get('metadata', dict()).get('phenotype', 'tumor')
    if not batch_names:
        batch_names = [s.get_name_for_files() + '-batch']
    if len(batch_names) > 1 and s.phenotype != 'normal':
        critical('Multiple batches for non-normal ' + s.phenotype + ' sample ' + s.name +
                 ': ' + ', '.join(batch_names))
    s.batch_names = batch_names

    # Setting genome build based reference paths
    s.genome_build = sample_info['genome_build']
    s.variant_regions_bed = s.bcbio_project.config_path(val=sample_info['algorithm'].get('variant_regions'))
    s.sv_regions_bed = s.bcbio_project.config_path(val=sample_info['algorithm'].get('sv_regions')) or s.variant_regions_bed
    s.coverage_bed = s.bcbio_project.config_path(val=sample_info['algorithm'].get('coverage')) or s.sv_regions_bed
    if s.coverage_bed and not isfile(s.coverage_bed):
        if not silent:
            debug('coverage bed ' + str(s.coverage_bed) + ' not found. Looking relative to the genome "basedir"')
        try:
            import az
        except ImportError:
            pass
        else:
            genome_cfg = az.get_refdata(s.genome_build)
            ref_basedir = genome_cfg.get('basedir')
            if not ref_basedir:
                critical('coverage bed ' + str(s.coverage_bed) + ' not found and "basedir" not provided in system config')
            s.coverage_bed = join(ref_basedir, 'coverage', 'prioritize', s.coverage_bed) + '.bed'

    s.is_rnaseq = 'rna' in sample_info['analysis'].lower()
    s.min_allele_fraction = (1.0 / 100) * float(sample_info['algorithm'].get('min_allele_fraction', 1.0))
    if s.variant_regions_bed is None:
        s.coverage_interval = 'genome'
    else:
        s.coverage_interval = 'regional'
    s.is_wgs = s.coverage_interval == 'genome'

    if s._set_name_and_paths(
            name=description,
            variantcallers_data=sample_info['algorithm'].get('variantcaller'),
            ensemble='ensemble' in sample_info['algorithm'],
            silent=silent):
        return s
    else:
        return None
def _annotate(bed, ref_bed, chr_order, fai_fpath, work_dir, ori_col_num,
              high_confidence=False, reannotate=False, is_debug=False, **kwargs):
    # if genome:
    #     genome_fpath = cut(fai_fpath, 2, output_fpath=intermediate_fname(work_dir, fai_fpath, 'cut2'))
    #     intersection = bed.intersect(ref_bed, sorted=True, wao=True, g='<(cut -f1,2 ' + fai_fpath + ')')
    #     intersection = bed.intersect(ref_bed, sorted=True, wao=True, genome=genome.split('-')[0])
    # else:

    intersection_bed = None
    intersection_fpath = None

    pybedtools.set_tempdir(safe_mkdir(join(work_dir, 'bedtools')))
    if is_debug:
        intersection_fpath = join(work_dir, 'intersection.bed')
        if isfile(intersection_fpath):
            info('Loading from ' + intersection_fpath)
            intersection_bed = BedTool(intersection_fpath)
    if not intersection_bed:
        if count_bed_cols(fai_fpath) == 2:
            debug('Fai fields size is 2 ' + fai_fpath)
            intersection_bed = bed.intersect(ref_bed, wao=True, g=fai_fpath)
        else:
            debug('Fai fields is ' + str(count_bed_cols(fai_fpath)) + ', not 2')
            intersection_bed = bed.intersect(ref_bed, wao=True)
    if is_debug and not isfile(intersection_fpath):
        intersection_bed.saveas(intersection_fpath)
        debug('Saved intersection to ' + intersection_fpath)

    total_annotated = 0
    total_uniq_annotated = 0
    total_off_target = 0
    met = set()

    overlaps_by_tx_by_gene_by_loc = OrderedDefaultDict(lambda: OrderedDefaultDict(lambda: defaultdict(list)))
    # off_targets = list()

    expected_fields_num = ori_col_num + len(ba.BedCols.cols[:-4]) + 1
    for i, intersection_fields in enumerate(intersection_bed):
        inters_fields_list = list(intersection_fields)
        if len(inters_fields_list) < expected_fields_num:
            critical(f'Cannot parse the reference BED file - unexpected number of fields '
                     f'({len(inters_fields_list)}) in {inters_fields_list} (less than {expected_fields_num})')

        a_chr, a_start, a_end = intersection_fields[:3]
        a_extra_columns = intersection_fields[3:ori_col_num]

        overlap_fields = [None for _ in ba.BedCols.cols]
        overlap_fields[:len(intersection_fields[ori_col_num:])] = intersection_fields[ori_col_num:]
        keep_gene_column = not reannotate
        a_gene = None
        if keep_gene_column:
            a_gene = a_extra_columns[0]
        e_chr = overlap_fields[0]
        overlap_size = int(intersection_fields[-1])
        assert e_chr == '.' or a_chr == e_chr, \
            f'Error on line {i}: chromosomes don\'t match ({a_chr} vs {e_chr}). Line: {intersection_fields}'

        # fs = [None for _ in ebl.BedCols.cols]
        # fs[:3] = [a_chr, a_start, a_end]
        reg = (a_chr, int(a_start), int(a_end), tuple(a_extra_columns))

        if e_chr == '.':
            total_off_target += 1
            # off_targets.append(fs)
            overlaps_by_tx_by_gene_by_loc[reg][a_gene] = OrderedDefaultDict(list)
        else:
            # fs[3:-1] = db_feature_fields[3:-1]
            total_annotated += 1
            if (a_chr, a_start, a_end) not in met:
                total_uniq_annotated += 1
                met.add((a_chr, a_start, a_end))

            e_gene = overlap_fields[ba.BedCols.GENE] if not high_confidence else overlap_fields[ba.BedCols.HUGO]
            if keep_gene_column and e_gene != a_gene:
                overlaps_by_tx_by_gene_by_loc[reg][a_gene] = OrderedDefaultDict(list)
            else:
                transcript_id = overlap_fields[ba.BedCols.ENSEMBL_ID]
                overlaps_by_tx_by_gene_by_loc[reg][e_gene][transcript_id].append((overlap_fields, overlap_size))

    info('  Total annotated regions: ' + str(total_annotated))
    info('  Total unique annotated regions: ' + str(total_uniq_annotated))
    info('  Total off target regions: ' + str(total_off_target))
    info('Resolving ambiguities...')
    annotated = _resolve_ambiguities(overlaps_by_tx_by_gene_by_loc, chr_order, **kwargs)

    return annotated
def find_vcf_file(self, batch_name, silent=False, caller=None):
    caller = caller or self.somatic_caller
    vcf_fname = batch_name + '-' + caller + '.vcf'
    annot_vcf_fname = batch_name + '-' + caller + '-annotated.vcf'

    vcf_annot_fpath_gz = adjust_path(join(self.date_dir, annot_vcf_fname + '.gz'))  # in datestamp
    var_raw_vcf_annot_fpath_gz = adjust_path(join(self.raw_var_dir, annot_vcf_fname + '.gz'))  # in datestamp/var/raw

    vcf_fpath_gz = adjust_path(join(self.date_dir, vcf_fname + '.gz'))  # in datestamp
    var_vcf_fpath_gz = adjust_path(join(self.var_dir, vcf_fname + '.gz'))  # in datestamp/var
    var_raw_vcf_fpath_gz = adjust_path(join(self.raw_var_dir, vcf_fname + '.gz'))  # in datestamp/var/raw

    vcf_fpath = adjust_path(join(self.date_dir, vcf_fname))  # in datestamp
    var_vcf_fpath = adjust_path(join(self.var_dir, vcf_fname))  # in datestamp/var
    var_raw_vcf_fpath = adjust_path(join(self.raw_var_dir, vcf_fname))  # in datestamp/var/raw

    if isfile(vcf_annot_fpath_gz):
        verify_file(vcf_annot_fpath_gz, is_critical=True)
        if not silent:
            info('Found annotated VCF in the datestamp dir ' + vcf_annot_fpath_gz)
        return vcf_annot_fpath_gz
    else:
        debug('Not found annotated VCF in the datestamp dir ' + vcf_annot_fpath_gz)

    if isfile(var_raw_vcf_annot_fpath_gz):
        verify_file(var_raw_vcf_annot_fpath_gz, is_critical=True)
        if not silent:
            info('Found annotated VCF in the datestamp/var/raw dir ' + var_raw_vcf_annot_fpath_gz)
        return var_raw_vcf_annot_fpath_gz
    else:
        debug('Not found annotated VCF in the datestamp/var/raw dir ' + var_raw_vcf_annot_fpath_gz)

    if isfile(vcf_fpath_gz):
        verify_file(vcf_fpath_gz, is_critical=True)
        if not silent:
            info('Found VCF in the datestamp dir ' + vcf_fpath_gz)
        return vcf_fpath_gz
    else:
        debug('Not found VCF in the datestamp dir ' + vcf_fpath_gz)

    if isfile(var_raw_vcf_fpath_gz):
        verify_file(var_raw_vcf_fpath_gz, is_critical=True)
        if not silent:
            info('Found VCF in the datestamp/var/raw dir ' + var_raw_vcf_fpath_gz)
        return var_raw_vcf_fpath_gz
    else:
        debug('Not found VCF in the datestamp/var/raw dir ' + var_raw_vcf_fpath_gz)

    if isfile(vcf_fpath):
        verify_file(vcf_fpath, is_critical=True)
        if not silent:
            info('Found uncompressed VCF in the datestamp dir ' + vcf_fpath)
        return vcf_fpath
    else:
        debug('Not found uncompressed VCF in the datestamp dir ' + vcf_fpath)

    if isfile(var_raw_vcf_fpath):
        verify_file(var_raw_vcf_fpath, is_critical=True)
        if not silent:
            info('Found uncompressed VCF in the datestamp/var/raw dir ' + var_raw_vcf_fpath)
        return var_raw_vcf_fpath
    else:
        debug('Not found uncompressed VCF in the datestamp/var/raw dir ' + var_raw_vcf_fpath)

    if isfile(var_vcf_fpath_gz):
        verify_file(var_vcf_fpath_gz, is_critical=True)
        if not silent:
            info('Found VCF in the datestamp/var dir ' + var_vcf_fpath_gz)
        return var_vcf_fpath_gz
    else:
        debug('Not found VCF in the datestamp/var dir ' + var_vcf_fpath_gz)

    if isfile(var_vcf_fpath):
        verify_file(var_vcf_fpath, is_critical=True)
        if not silent:
            info('Found uncompressed VCF in the datestamp/var dir ' + var_vcf_fpath)
        return var_vcf_fpath
    else:
        debug('Not found uncompressed VCF in the datestamp/var dir ' + var_vcf_fpath)

    if not silent:
        warn('Warning: no VCF found for batch ' + batch_name + ', ' + caller + ', gzip or '
             'uncompressed version in the datestamp directory.')
    return None
def find_vcf_file_from_sample_dir(sample, silent=False, caller=None):
    caller = caller or sample.bcbio_project.somatic_caller
    vcf_fname = sample.get_name_for_files() + '-' + caller + '.vcf'

    sample_var_dirpath = join(sample.dirpath, 'var')
    vcf_fpath_gz = adjust_path(join(sample.dirpath, vcf_fname + '.gz'))  # in var
    var_vcf_fpath_gz = adjust_path(join(sample_var_dirpath, vcf_fname + '.gz'))  # in var
    var_raw_vcf_fpath_gz = adjust_path(join(sample_var_dirpath, 'raw', vcf_fname + '.gz'))  # in var
    vcf_fpath = adjust_path(join(sample.dirpath, vcf_fname))
    var_vcf_fpath = adjust_path(join(sample_var_dirpath, vcf_fname))  # in var
    var_raw_vcf_fpath = adjust_path(join(sample_var_dirpath, 'raw', vcf_fname))  # in var

    if isfile(vcf_fpath_gz):
        verify_file(vcf_fpath_gz, is_critical=True)
        if not silent:
            info('Found VCF ' + vcf_fpath_gz)
        return vcf_fpath_gz
    else:
        debug('Not found VCF ' + vcf_fpath_gz)

    if isfile(var_vcf_fpath_gz):
        verify_file(var_vcf_fpath_gz, is_critical=True)
        if not silent:
            info('Found VCF in the var/ dir ' + var_vcf_fpath_gz)
        return var_vcf_fpath_gz
    else:
        debug('Not found VCF in the var/ dir ' + var_vcf_fpath_gz)

    if isfile(var_raw_vcf_fpath_gz):
        verify_file(var_raw_vcf_fpath_gz, is_critical=True)
        if not silent:
            info('Found VCF in the var/raw/ dir ' + var_raw_vcf_fpath_gz)
        return var_raw_vcf_fpath_gz
    else:
        debug('Not found VCF in the var/raw/ dir ' + var_raw_vcf_fpath_gz)

    if isfile(vcf_fpath):
        verify_file(vcf_fpath, is_critical=True)
        if not silent:
            info('Found uncompressed VCF ' + vcf_fpath)
        return vcf_fpath
    else:
        debug('Not found uncompressed VCF ' + vcf_fpath)

    if isfile(var_vcf_fpath):
        verify_file(var_vcf_fpath, is_critical=True)
        if not silent:
            info('Found uncompressed VCF in the var/ dir ' + var_vcf_fpath)
        return var_vcf_fpath
    else:
        debug('Not found VCF in the var/ dir ' + var_vcf_fpath)

    if isfile(var_raw_vcf_fpath):
        verify_file(var_raw_vcf_fpath, is_critical=True)
        if not silent:
            info('Found uncompressed VCF in the var/raw/ dir ' + var_raw_vcf_fpath)
        return var_raw_vcf_fpath
    else:
        debug('Not found VCF in the var/raw/ dir ' + var_raw_vcf_fpath)

    if not silent:
        warn('Warning: no VCF found for ' + sample.name + ' (' + caller + '), gzip or uncompressed '
             'version in and outside the var directory. Phenotype is ' + str(sample.phenotype))
    return None
def annotate(input_bed_fpath, output_fpath, work_dir, genome=None,
             reannotate=True, high_confidence=False, only_canonical=False,
             coding_only=False, short=False, extended=False, is_debug=False, **kwargs):

    debug('Getting features from storage')
    features_bed = ba.get_all_features(genome)
    if features_bed is None:
        critical('Genome ' + genome + ' is not supported. Supported: ' + ', '.join(ba.SUPPORTED_GENOMES))

    if genome:
        fai_fpath = reference_data.get_fai(genome)
        chr_order = reference_data.get_chrom_order(genome)
    else:
        fai_fpath = None
        chr_order = bed_chrom_order(input_bed_fpath)

    input_bed_fpath = sort_bed(input_bed_fpath, work_dir=work_dir, chr_order=chr_order, genome=genome)

    ori_bed = BedTool(input_bed_fpath)
    ori_col_num = ori_bed.field_count()
    reannotate = reannotate or ori_col_num == 3
    pybedtools.set_tempdir(safe_mkdir(join(work_dir, 'bedtools')))
    ori_bed = BedTool(input_bed_fpath)

    if high_confidence:
        features_bed = features_bed.filter(ba.high_confidence_filter)
    if only_canonical:
        features_bed = features_bed.filter(ba.get_only_canonical_filter(genome))
    if coding_only:
        features_bed = features_bed.filter(ba.protein_coding_filter)
    # unique_tx_by_gene = find_best_tx_by_gene(features_bed)

    info('Extracting features from Ensembl GTF')
    features_bed = features_bed.filter(lambda x: x[ba.BedCols.FEATURE] in ['exon', 'CDS', 'stop_codon', 'transcript'])
    # x[ebl.BedCols.ENSEMBL_ID] == unique_tx_by_gene[x[ebl.BedCols.GENE]])

    info('Overlapping regions with Ensembl data')
    if is_debug:
        ori_bed = ori_bed.saveas(join(work_dir, 'bed.bed'))
        debug(f'Saved regions to {ori_bed.fn}')
        features_bed = features_bed.saveas(join(work_dir, 'features.bed'))
        debug(f'Saved features to {features_bed.fn}')
    annotated = _annotate(ori_bed, features_bed, chr_order, fai_fpath, work_dir, ori_col_num,
                          high_confidence=False, reannotate=reannotate, is_debug=is_debug, **kwargs)

    full_header = [ba.BedCols.names[i] for i in ba.BedCols.cols]
    add_ori_extra_fields = ori_col_num > 3
    if not reannotate and ori_col_num == 4:
        add_ori_extra_fields = False  # no need to report the original gene field if we are not re-annotating

    info('Saving annotated regions...')
    total = 0
    with file_transaction(work_dir, output_fpath) as tx:
        with open(tx, 'w') as out:
            header = full_header[:6]
            if short:
                header = full_header[:4]
            if extended:
                header = full_header[:-1]
            if add_ori_extra_fields:
                header.append(full_header[-1])

            if extended:
                out.write('## ' + ba.BedCols.names[ba.BedCols.TX_OVERLAP_PERCENTAGE] +
                          ': part of region overlapping with transcripts\n')
                out.write('## ' + ba.BedCols.names[ba.BedCols.EXON_OVERLAPS_PERCENTAGE] +
                          ': part of region overlapping with exons\n')
                out.write('## ' + ba.BedCols.names[ba.BedCols.CDS_OVERLAPS_PERCENTAGE] +
                          ': part of region overlapping with protein coding regions\n')
                out.write('\t'.join(header) + '\n')
            for full_fields in annotated:
                fields = full_fields[:6]
                if short:
                    fields = full_fields[:4]
                if extended:
                    fields = full_fields[:-1]
                if add_ori_extra_fields:
                    fields.append(full_fields[-1])
                out.write('\t'.join(map(_format_field, fields)) + '\n')
                total += 1

    debug('Saved ' + str(total) + ' total annotated regions')
    return output_fpath