def _add_to_ngb(work_dir, project_name, bam_by_sample, genome_build, bed_file, p_view):
    if is_us() or is_uk():
        try:
            from az.ngb import add_bcbio_project_to_ngb, add_data_to_ngb, add_file_to_ngb
        except ImportError:
            log.warn('If you want to, install NGS Reporting with '
                     '`conda install -c vladsaveliev ngs_reporting`')
        else:
            log.info('Exposing project to NGB...')
            try:
                dataset = project_name + '_Fingerprints'
                add_data_to_ngb(work_dir, p_view, bam_by_sample, dict(), dataset,
                                bed_file=bed_file, genome=genome_build)
                add_file_to_ngb(work_dir, get_dbsnp(genome_build), genome_build,
                                dataset, dataset, skip_if_added=True)
            except Exception:
                traceback.print_exc()
                log.err('Error: cannot export to NGB')
    log.info('*' * 70)
def open_gzipsafe(f, mode='r'):
    # mode_t = mode.replace('b', '')
    # mode_b = mode if 'b' in mode else mode + 'b'
    if f.endswith('.gz') or f.endswith('.gzip') or f.endswith('.gz.tx') or f.endswith('.gzip.tx'):
        try:
            h = gzip.open(f, mode=mode + 't', encoding='UTF-8')
        except IOError as e:
            err('Error opening gzip ' + f + ': ' + str(e) + ', opening as plain text')
            return open(f, mode=mode)
        else:
            if 'w' in mode:
                return h
            else:
                try:
                    h.read(1)  # probe: is this really gzip-compressed?
                except IOError as e:
                    err('Error opening gzip ' + f + ': ' + str(e) + ', opening as plain text')
                    h.close()
                    return open(f, mode=mode)
                else:
                    h.close()
                    h = gzip.open(f, mode=mode + 't')
                    return h
    else:
        return open(f, mode=mode)
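# Usage sketch (hypothetical path, illustrative wrapper name): read through a
# possibly-gzipped VCF. open_gzipsafe() above transparently falls back to a
# plain-text open() when gzip decoding fails, so callers can treat both cases
# the same way.
def _example_read_vcf_header(vcf_fpath='example.vcf.gz'):
    with open_gzipsafe(vcf_fpath) as fh:
        return [line for line in fh if line.startswith('#')]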
def file_nonempty_check(output_fpath=None, input_fpath=None):
    if output_fpath is None:
        return True
    # existence alone is not enough here: also require a non-zero size
    ok = file_exists_check(output_fpath) and os.path.getsize(output_fpath) > 0
    if not ok:
        err("Did not find non-empty output file {0}".format(output_fpath))
    return ok
def file_exists_check(output_fpath=None, input_fpath=None):
    if output_fpath is None:
        return True
    ok = os.path.exists(output_fpath)
    if not ok:
        err("Did not find output file {0}".format(output_fpath))
    return ok
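# Sketch: the *_check helpers above look like post-run output validators; a
# hypothetical caller (wrapper name is illustrative, not part of the original
# code) might chain them like this:
def _example_validate_output(output_fpath, input_fpath=None):
    return (file_exists_check(output_fpath, input_fpath)
            and file_nonempty_check(output_fpath, input_fpath))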
def classify_tp53(self, aa_chg, pos, ref, alt):
    aa_chg = aa_chg.replace(' ', '')
    if str(pos) in self.splice_positions_by_gene['TP53'] and len(ref) == 1 and len(alt) == 1:
        return 6
    aa_chg = aa_chg.replace('p.', '')
    aa_num = 0
    if aa_chg:
        aa_num_str = re.sub('[^0-9]', '', aa_chg)
        if not aa_num_str:
            logger.err('TP53: cannot parse aa num from aa_chg=' + str(aa_chg))
        else:
            aa_num = int(aa_num_str)
    if aa_snp_chg_pattern.match(aa_chg):
        for i in [1, 2, 3]:
            if aa_chg in self.tp53_groups['Group ' + str(i)]:
                return i
    elif stop_gain_pattern.match(aa_chg):
        if aa_num < 359:
            return 4
    elif fs_pattern.match(aa_chg):
        if aa_num < 359:
            return 5
    return None
def bgzip_and_tabix(fpath, reuse=False, tabix_parameters='', **kwargs):
    gzipped_fpath = fpath + '.gz'
    tbi_fpath = gzipped_fpath + '.tbi'

    if reuse and \
            file_exists(gzipped_fpath) and \
            (getctime(gzipped_fpath) >= getctime(fpath) if file_exists(fpath) else True) and \
            file_exists(tbi_fpath) and getctime(tbi_fpath) >= getctime(gzipped_fpath):
        info('Actual compressed file and index exist, reusing')
        return gzipped_fpath

    info('Compressing and tabixing file, writing ' + gzipped_fpath + '(.tbi)')
    bgzip = which('bgzip')
    tabix = which('tabix')
    if not bgzip:
        err('Cannot index file because bgzip is not found')
    if not tabix:
        err('Cannot index file because tabix is not found')
    if not bgzip and not tabix:
        return fpath

    if isfile(gzipped_fpath):
        os.remove(gzipped_fpath)
    if isfile(tbi_fpath):
        os.remove(tbi_fpath)

    info('BGzipping ' + fpath)
    cmdline = '{bgzip} {fpath}'.format(**locals())
    call_process.run(cmdline)

    info('Tabixing ' + gzipped_fpath)
    cmdline = '{tabix} {tabix_parameters} {gzipped_fpath}'.format(**locals())
    call_process.run(cmdline)

    return gzipped_fpath
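# Usage sketch (hypothetical path, illustrative wrapper name): compress and
# index a coordinate-sorted BED. '-p bed' is a standard tabix preset flag;
# reuse=True skips the work when the .gz/.tbi pair is already up to date.
def _example_index_bed(bed_fpath='regions.sorted.bed'):
    return bgzip_and_tabix(bed_fpath, reuse=True, tabix_parameters='-p bed')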
def run_prank(run_id):
    project_names = run_id.split(',')
    projects = [Project.query.filter_by(name=pn).first() for pn in project_names]
    if not projects:
        log.err('Projects ' + ', '.join(project_names) + ' not found in database')
        abort(404)

    work_dirpath = safe_mkdir(join(config.DATA_DIR, '_AND_'.join(project_names)))
    merged_fasta_fpath = merge_fasta(projects, work_dirpath)
    prank_out = os.path.join(work_dirpath,
                             os.path.splitext(os.path.basename(merged_fasta_fpath))[0])
    cmdl = prank_bin + ' -d=' + merged_fasta_fpath + ' -o=' + prank_out + ' -showtree'
    log.debug('Starting prank ' + cmdl)
    # text mode so that iter(readline, '') terminates on EOF
    proc = subprocess.Popen(cmdl.split(), stderr=subprocess.STDOUT,
                            stdout=subprocess.PIPE, universal_newlines=True)
    for stdout_line in iter(proc.stdout.readline, ''):
        print(stdout_line.rstrip())
        emit('running', json.dumps({
            'finished': False,
            'lines': [stdout_line.rstrip()],
        }))
    emit('running', json.dumps({
        'finished': True,
        'lines': [],
    }))
def send_file_for_igv(fpath):
    # Handle the igv.js Range header, which it uses to request a subset of a BAM file.
    range_header = request.headers.get('Range', None)
    if not range_header:
        return send_file(fpath)

    m = re.search(r'(\d+)-(\d*)', range_header)
    if not m:
        error_msg = "ERROR: unexpected range header syntax: %s" % range_header
        log.err(error_msg)
        return error_msg

    size = os.path.getsize(fpath)
    offset = int(m.group(1))
    # HTTP byte ranges are inclusive: "bytes=100-199" means 100 bytes
    length = (int(m.group(2)) + 1 if m.group(2) else size) - offset
    with open(fpath, 'rb') as f:
        f.seek(offset)
        data = f.read(length)

    rv = Response(data, 206, mimetype="application/octet-stream", direct_passthrough=True)
    rv.headers.add('Content-Range',
                   'bytes {0}-{1}/{2}'.format(offset, offset + length - 1, size))
    log.info("GET range request: %s-%s %s" % (m.group(1), m.group(2), fpath))
    return rv
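# Worked example of the range math above (hypothetical numbers): igv.js sending
# "Range: bytes=100-199" against a 1000-byte BAM yields a 206 response carrying
# 100 bytes and "Content-Range: bytes 100-199/1000"; an open-ended
# "bytes=100-" reads to EOF.
def _example_content_range(range_header='bytes=100-199', size=1000):
    m = re.search(r'(\d+)-(\d*)', range_header)
    offset = int(m.group(1))
    length = (int(m.group(2)) + 1 if m.group(2) else size) - offset
    return 'bytes {0}-{1}/{2}'.format(offset, offset + length - 1, size)  # -> 'bytes 100-199/1000'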
def file_exists_check(output_fpath=None, input_fpaths=None):
    if output_fpath is None:
        return True
    ok = os.path.exists(output_fpath)
    if not ok:
        err(f'Did not find output file {output_fpath}')
    return ok
def file_nonempty_check(output_fpath=None, input_fpaths=None):
    if output_fpath is None:
        return True
    ok = verify_file(output_fpath)
    if not ok:
        err(f'Did not find non-empty output file {output_fpath}')
    return ok
def offset_to_genome_coord(trx, offset):
    genomic_coord = None
    is_in_intron = None

    length = len(trx)
    offset = offset if trx.strand == '+' else length - offset
    if offset == 0 or offset == length:
        return -1, None
    assert 0 < offset < length, \
        f'Coordinate {offset} must be above 0 and below transcript length {length}, ' \
        f'transcript: {trx}'

    if not trx.exons:
        logger.err(f'  No exons for transcript {trx.id}')
        return None, None

    offset_remain = offset
    exons = trx.exons
    if trx.strand == '-':
        exons = reversed(exons)
    for exon in exons:
        assert offset_remain > 0
        next_offset_remain = offset_remain - len(exon)
        if next_offset_remain <= 0:
            genomic_coord = exon.start - 1 + offset_remain  # -1 to convert from 1-based to 0-based
            is_in_intron = next_offset_remain == 0
            break
        offset_remain = next_offset_remain

    assert genomic_coord is not None  # correct code should always produce something
    return genomic_coord, is_in_intron
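# Worked example for offset_to_genome_coord() (the stub classes below are
# hypothetical; the real transcript/exon types come from the surrounding
# codebase). A '+' strand transcript with exon lengths 100 and 50: offset 120
# lands 20 bases into the second exon, so with a 1-based exon start of 300 the
# function returns (319, False).
class _StubExon:
    def __init__(self, start, length):
        self.start = start  # 1-based, matching the -1 conversion above
        self._length = length

    def __len__(self):
        return self._length


class _StubTrx:
    strand = '+'
    id = 'stub_tx'

    def __init__(self, exons):
        self.exons = exons

    def __len__(self):
        return sum(len(e) for e in self.exons)

# offset_to_genome_coord(_StubTrx([_StubExon(100, 100), _StubExon(300, 50)]), 120)
# -> (319, False)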
def detect_bcbio_dir(input_dir, silent=False):
    """
    :param input_dir: `config` dir, or `final` dir, or datestamp dir, or the directory root to `final`
    :return: (config_dir, final_dir, date_dir)
    """
    config_dir, final_dir, date_dir = None, None, None

    input_dir = abspath(input_dir)

    # We are inside `*final*`
    if 'final' in basename(input_dir):  # allow prefixes and postfixes
        final_dir = input_dir
        root_dir = dirname(final_dir)
        config_dir = join(root_dir, 'config')
        if not isdir(config_dir):
            err(f'Are you running on a bcbio output?\n'
                f'The input folder appears to be `final` ({input_dir}), '
                f'however can\'t find a `config` directory at the same level ({config_dir})')
            raise NoConfigDirException('No config dir')

    # We are inside `config`
    elif basename(input_dir) == 'config':
        config_dir = input_dir

    # We are in a parent dir to `config` (and possibly `final`, called otherwise)
    elif isdir(join(input_dir, 'config')):
        config_dir = join(input_dir, 'config')

    # We are inside a date dir
    elif isdir(abspath(join(input_dir, pardir, pardir, 'config'))):
        final_dir = abspath(join(input_dir, pardir))
        root_dir = abspath(join(input_dir, pardir, pardir))
        config_dir = abspath(join(root_dir, 'config'))
        # if 'final' not in basename(final_dir):
        #     err(f'Are you running on a bcbio output?\n'
        #         f'Found config directory 2 levels up at {config_dir}, assuming your input {input_dir} '
        #         f'is a datestamp directory. However, the parent directory is not called `*final*`')
        #     raise NoConfigDirException('No final dir')

    else:
        if not silent:
            err(f'Are you running on a bcbio output?\n'
                f'{input_dir} is not `config` or `*final*`, and '
                f'can\'t find a `config` directory at {join(input_dir, "config")}, '
                f'or {abspath(join(input_dir, pardir, "config"))}. '
                f'Make sure that you changed to a bcbio root or final directory, '
                f'or provided it as a first argument.')
        raise NoConfigDirException('No config dir')

    if not silent:
        info('Bcbio config directory: ' + config_dir)
        if final_dir:
            info('"final" directory: ' + final_dir)
        if date_dir:
            info('"datestamp" directory: ' + date_dir)

    return config_dir, final_dir, date_dir
def count_bed_cols(bed_fpath):
    with open(bed_fpath) as f:
        for l in f:
            if l and l.strip() and not l.startswith('#'):
                return len(l.split('\t'))
    # return len(next(dropwhile(lambda x: x.strip().startswith('#'), open(bed_fpath))).split('\t'))
    err('Empty bed file: ' + bed_fpath)
    return None
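# Usage sketch (hypothetical path, illustrative wrapper name): dispatch on BED
# width. In BED, columns 1-3 are chrom/start/end; column 4, when present,
# carries the region name.
def _example_bed_has_name_column(bed_fpath='targets.bed'):
    ncols = count_bed_cols(bed_fpath)
    return ncols is not None and ncols >= 4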
def load_yaml_config(fpath):
    verify_file(fpath, is_critical=True)
    try:
        dic = _load_yaml(fpath)
    except Exception:
        err(format_exc())
        critical('Could not parse bcbio YAML ' + fpath)
    else:
        return dic
def load_yaml_config(fpath):
    verify_file(fpath, is_critical=True)
    try:
        dic = load_yaml(open(fpath))
    except Exception:
        err(format_exc())
        critical('Could not parse bcbio YAML ' + fpath)
    else:
        return dic
def find_in_log(self, fname, is_critical=False, silent=True):
    options = [join(self.log_dir, fname), join(self.date_dir, fname)]
    for fpath in options:
        if isfile(fpath):
            return fpath
    if is_critical:
        critical('Log file not found as ' + ', '.join(options))
    elif not silent:
        err('Log file not found as ' + ', '.join(options))
def add_user_call(run_id, prev_sample_id, edit_sample_id, snp_index):
    sample = Sample.query.filter_by(id=edit_sample_id).first()
    if not sample:
        log.err('Sample not found')
        return redirect('/' + run_id + '/' + prev_sample_id)
    fingerprint = sample.fingerprints.filter_by(index=snp_index).first()
    fingerprint.usercall = request.form['usercall']
    db.session.commit()
    return redirect('/' + run_id + '/' + prev_sample_id)
def compare(fp1, fp2):
    try:
        res = scipy.stats.spearmanr(fp1.flatten(), fp2.flatten())
    except ValueError as e:
        log.err(e)
        log.err('Error calculating correlation between fingerprints, '
                'likely too small a number of mutations. Try increasing target '
                'size or filtering criteria, or decrease L.')
        return None, None
    else:
        return res.correlation, res.pvalue
def bam_to_bed_nocnf(bam_fpath, bedtools='bedtools', gzip='gzip'):
    info('Converting the BAM to BED to save some memory.')
    # from here: http://davetang.org/muse/2015/08/05/creating-a-coverage-plot-using-bedtools-and-r/
    bam_bed_fpath = splitext_plus(bam_fpath)[0] + '.bed.gz'
    cmdline = '{bedtools} bamtobed -i {bam_fpath} | {gzip} > {bam_bed_fpath}'.format(**locals())
    info(cmdline)
    os.system(cmdline)
    bam_bed_fpath = verify_file(bam_bed_fpath)
    if bam_bed_fpath:
        info('Done, saved to ' + bam_bed_fpath)
    else:
        err('Error, result is non-existent or empty')
    return bam_bed_fpath
def run(self, fn, param_lists):
    if self.n_samples == 0:
        return []
    assert self.n_samples == len(param_lists)
    n_params = len(param_lists[0])
    for sample_i, params in enumerate(param_lists):
        if params is None:
            err('Parameter list for sample ' + str(sample_i) + ' is None')
        if len(params) != n_params:
            err('Parameter list for sample ' + str(sample_i) + ' (' + str(len(params)) +
                ') does not equal the one for the first sample (' + str(n_params) + ')')
    res = self._view.view.map(
        fn, *([params[param_i] for params in param_lists] for param_i in range(n_params)))
    return res
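# Sketch of the param_lists shape run() expects (worker function and file
# names below are hypothetical): one list of positional arguments per sample,
# all of equal length; run() transposes them into per-parameter sequences for
# the ipyparallel-style map above.
#
# view.run(align_sample, [
#     ['sampleA.fq', 'outA.bam'],
#     ['sampleB.fq', 'outB.bam'],
# ])  # -> align_sample('sampleA.fq', 'outA.bam'), align_sample('sampleB.fq', 'outB.bam')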
def get_metric(self, names):
    if isinstance(names, str):
        names = [names]
    if not self.sample_info or not self.sample_info.get('metrics'):
        return None
    metrics = self.sample_info['metrics']
    val = None
    for k in metrics:
        if k.lower() in [n.lower() for n in names] and metrics[k] != 'NA':
            val = metrics[k]
    if val is None:
        err('Cannot find ' + ', '.join(names) + ' in metrics for ' + self.name)
    return val
def server_error(error):
    log.err('Error: ' + str(error))
    log.err(traceback.format_exc())
    lines = []
    for l in traceback.format_exc().split('\n'):
        if l.strip():
            lines.append(l.replace(' ', ' ' * 4))
    return render_template(
        'error.html',
        title='Internal Server Error',
        error='Error: ' + str(error),
        traceback=traceback.format_exc().split('\n'),
    ), 500
def find_fastq_pairs(fpaths):
    info('Finding FastQ pairs...')
    fastqs_by_sample_name = dict()
    for fpath in fpaths:
        fn, ext = splitext_plus(basename(fpath))
        if ext in ['.fq', '.fq.gz', '.fastq', '.fastq.gz']:
            sname, l_fpath, r_fpath = None, None, None
            # check '_R1'/'_R2' before '_1'/'_2' since '_R1' also ends with '1'
            if fn.endswith('_R1'):
                sname = fn[:-3]
                l_fpath = fpath
            elif fn.endswith('_1'):
                sname = fn[:-2]
                l_fpath = fpath
            elif fn.endswith('_R2'):
                sname = fn[:-3]
                r_fpath = fpath
            elif fn.endswith('_2'):
                sname = fn[:-2]
                r_fpath = fpath
            if sname:
                m = re.match(r'(.*)_S\d+', sname)
                if m:
                    sname = m.group(1)
                sname = sname.replace('-', '_')
            else:
                sname = fn
                info('Cannot detect left/right read suffix for ' + sname)

            l, r = fastqs_by_sample_name.get(sname, (None, None))
            if l and l_fpath:
                critical('Duplicated left FastQ files for ' + sname + ': ' + l + ' and ' + l_fpath)
            if r and r_fpath:
                critical('Duplicated right FastQ files for ' + sname + ': ' + r + ' and ' + r_fpath)
            fastqs_by_sample_name[sname] = l or l_fpath, r or r_fpath

    fixed_fastqs_by_sample_name = dict()
    for sname, (l, r) in fastqs_by_sample_name.items():
        if not l:
            err('ERROR: for sample ' + sname + ', left reads not found')
        if not r:
            err('ERROR: for sample ' + sname + ', right reads not found')
        if l and r:
            fixed_fastqs_by_sample_name[sname] = l, r

    return fixed_fastqs_by_sample_name
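# Worked example for find_fastq_pairs() (hypothetical file names): the Illumina
# _S<N> suffix is stripped and '-' is normalized to '_', so these four files
# pair up into two samples:
#
# find_fastq_pairs([
#     'My-Sample_S1_R1.fastq.gz', 'My-Sample_S1_R2.fastq.gz',
#     'ctrl_1.fq', 'ctrl_2.fq',
# ])
# -> {'My_Sample': ('My-Sample_S1_R1.fastq.gz', 'My-Sample_S1_R2.fastq.gz'),
#     'ctrl': ('ctrl_1.fq', 'ctrl_2.fq')}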
def setup_tibanna(tibanna_id=None, buckets=None):
    try:
        subprocess.check_call('tibanna --version', shell=True)
    except subprocess.CalledProcessError:
        logger.err('Error: tibanna is not installed. Please run `pip install -U tibanna`')
        sys.exit(1)

    if not tibanna_id:
        tibanna_id = ''.join(random.choice(string.ascii_uppercase + string.ascii_lowercase +
                                           string.digits) for _ in range(8))
        assert not check_tibanna_id_exists(tibanna_id), \
            'Random tibanna ID already exists: ' + tibanna_id

    step_func_name = f'tibanna_unicorn_{tibanna_id}'
    if not check_tibanna_id_exists(tibanna_id):
        buckets_str = '' if not buckets else ('-b ' + ','.join(buckets))
        run_simple(f'tibanna deploy_unicorn -g {step_func_name} {buckets_str} --no-setenv')
    return step_func_name
def file_reasonable_size(output_fpath, input_fpath):
    ok = file_exists_check(output_fpath)
    if not ok:
        return ok
    # named pipes -- we can't calculate size
    if input_fpath.strip().startswith("<("):
        return True
    if input_fpath.endswith((".bam", ".gz")):
        scale = 7.0
    else:
        scale = 10.0
    orig_size = os.path.getsize(input_fpath) / pow(1024.0, 3)
    out_size = os.path.getsize(output_fpath) / pow(1024.0, 3)
    if out_size < (orig_size / scale):
        err("Output file unexpectedly small. %.1fGb for output versus "
            "%.1fGb for the input file. This often indicates a truncated "
            "BAM file or memory errors during the run." % (out_size, orig_size))
        return False
    else:
        return True
def file_reasonable_size(output_fpath, input_fpaths):
    ok = file_nonempty_check(output_fpath)
    if not ok:
        return ok
    # named pipes -- we can't calculate size
    if input_fpaths[0].strip().startswith("<("):
        return True
    if input_fpaths[0].endswith((".bam", ".gz")):
        scale = 7.0
    else:
        scale = 10.0
    orig_size = os.path.getsize(input_fpaths[0]) / pow(1024.0, 3)
    out_size = os.path.getsize(output_fpath) / pow(1024.0, 3)
    if out_size < (orig_size / scale):
        err(f'Output file unexpectedly small. {out_size:.1f}Gb for output versus '
            f'{orig_size:.1f}Gb for the input file. This often indicates a truncated '
            'BAM file or memory errors during the run.')
        return False
    else:
        return True
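# Worked example of the size heuristic above (hypothetical sizes, illustrative
# wrapper name): a 70 Gb .bam/.gz input (scale 7.0) tolerates outputs down to
# 10 Gb; other formats use scale 10.0. Outputs below input_gb / scale trigger
# the truncation warning.
def _example_min_expected_gb(input_gb=70.0, is_bam_or_gz=True):
    scale = 7.0 if is_bam_or_gz else 10.0
    return input_gb / scale  # -> 10.0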
def phylo_tree_page(run_id):
    project_names = run_id.split(',')
    projects = [Project.query.filter_by(name=pn).first() for pn in project_names]
    if not projects:
        log.err('Projects ' + ', '.join(project_names) + ' not found in database')
        abort(404)
    color_by_proj = {p.name: PROJ_COLORS[i % len(PROJ_COLORS)] for i, p in enumerate(projects)}

    work_dirpath = safe_mkdir(join(config.DATA_DIR, '_AND_'.join(project_names)))
    merged_fasta_fpath = merge_fasta(projects, work_dirpath)
    prank_out = os.path.join(work_dirpath,
                             os.path.splitext(os.path.basename(merged_fasta_fpath))[0])
    tree_fpath = prank_out + '.best.dnd'
    if not can_reuse(tree_fpath, merged_fasta_fpath):
        return render_template(
            'processing.html',
            projects=[{
                'name': p.name,
            } for i, p in enumerate(projects)],
            run_id=run_id,
            title='Processing ' + ', '.join(project_names),
        )

    log.debug('Prank results found, rendering tree!')
    tree = next(Phylo.parse(tree_fpath, 'newick'))
    seq_by_id = read_fasta(merged_fasta_fpath)
    tree_json = tree_to_json_for_d3(tree, seq_by_id, color_by_proj, run_id=run_id)

    all_samples_count = sum(len(p.samples.all()) for p in projects)
    return render_template(
        'tree.html',
        projects=[{
            'name': p.name,
            'color': color_by_proj[p.name],
        } for i, p in enumerate(projects)],
        title=', '.join(project_names),
        data=tree_json,
        tree_height=20 * all_samples_count,
        tree_width=5 * all_samples_count,
    )
def _proc_ensembl_gtf(inp, out, chr_order, additional_feature_list=None):
    if additional_feature_list is None:
        additional_feature_list = []
    info('additional_feature_list = ' + str(additional_feature_list))

    gene_by_name = OrderedDict()
    gene_by_id = OrderedDict()

    info('Parsing Ensembl input...')
    total_lines = 0
    total_non_coding_genes = 0

    for l in inp:
        if l and not l.startswith('#'):
            chrom, _, feature, start, end, _, strand, _, props_line = \
                l.replace('\n', '').split('\t')

            total_lines += 1
            if total_lines % 1000 == 0:
                info(str(total_lines // 1000) + 'k lines, ' + str(len(gene_by_name)) + ' genes found')
                sys.stdout.flush()

            try:
                _prop_dict = dict((t.strip().split(' ')[0], ' '.join(t.strip().split(' ')[1:]))
                                  for t in props_line.split(';') if t.strip())
            except ValueError:
                sys.stderr.write(format_exc())
                sys.stderr.write(l)
                continue  # malformed attributes field; skip the record

            gene_symbol = _rm_quotes(_prop_dict['gene_name'])
            gene_id = _rm_quotes(_prop_dict['gene_id'])
            gene_biotype = _rm_quotes(_prop_dict['gene_biotype'])
            gene_source = _rm_quotes(_prop_dict['gene_source'])

            if not ALL_EXONS and gene_biotype not in [
                'protein_coding',
                'nonsense_mediated_decay',
                'non_stop_decay',
                'processed_transcript',
                'polymorphic_pseudogene',
                'sense_intronic',
                'sense_overlapping',
                'antisense',
            ] and not any(b in gene_biotype for b in ['RNA', 'IG_', 'TR_']):
                total_non_coding_genes += 1
                continue

            full_feature_list = ['gene', 'CDS', 'stop_codon', 'exon'] + additional_feature_list
            if ALL_EXONS:
                full_feature_list = ['gene', 'exon']
            if feature not in full_feature_list:
                continue

            start, end = int(start) - 1, int(end)  # GTF is 1-based; convert start to 0-based
            if end <= start:
                info('Error: start >= end: ' + l)
                continue

            chrom = parse_ensembl_chrom(chrom)
            if not chrom:
                continue

            if feature == 'gene':
                gene = Gene(chrom, chr_order.get(chrom), start, end, gene_symbol,
                            strand, gene_biotype, gene_id, gene_source)

                if gene.name in gene_by_name:
                    prev_gene = gene_by_name[gene.name]

                    if gene.source != prev_gene.source:
                        err('   Duplicated gene in different databases:')
                        err('     This: ' + gene.__repr__())
                        err('     Prev: ' + prev_gene.__repr__())
                        # Prefer the havana record over the ensembl one.
                        if gene.source == 'ensembl' or prev_gene.source == 'havana':
                            del gene_by_name[prev_gene.name]
                            del gene_by_id[prev_gene.db_id]
                            err('     Picking up this one.')
                        if prev_gene.source == 'ensembl' or gene.source == 'havana':
                            err('     Picking up previous one.')
                            continue
                    else:
                        err('   Duplicated gene in ' + gene.source + ':')
                        err('     ' + gene.__repr__())
                        prev_gene.start = min(prev_gene.start, gene.start)
                        prev_gene.end = max(prev_gene.end, gene.end)
                        prev_gene.feature = 'Multi_Gene'
                        continue
                    err('')

                gene_by_name[gene_symbol] = gene
                gene_by_id[gene_id] = gene

            elif feature in ['CDS', 'stop_codon'] \
                    or feature == 'exon' and ('RNA' in gene_biotype or ALL_EXONS) \
                    or feature in additional_feature_list:
                assert gene_symbol in gene_by_name, \
                    'Error: ' + feature + ' record before gene record ' + gene_symbol + ', ' + \
                    gene_id + '; gene_by_name: ' + str(gene_by_name.keys())
                gene = gene_by_name[gene_symbol]
                if gene.gene_id == gene_id:
                    assert gene_biotype == gene.biotype, \
                        feature + ': gene_biotype "' + gene_biotype + '" does not match biotype "' + \
                        gene.biotype + '" for ' + gene_symbol
                    exon = Exon(gene, start, end, gene_biotype, feature)
                    gene.exons.append(exon)

    info()
    info('Processed ' + str(total_lines) + ' lines, ' + str(total_non_coding_genes) +
         ' non-coding genes skipped, ' + str(len(gene_by_name)) + ' coding genes found')
    info()
    return gene_by_name
def _send_line(ws, line, error=False):
    if error:
        log.err(line.rstrip())
    else:
        log.debug(line.rstrip())
    ws.send(json.dumps({'line': line.rstrip(), 'error': error}))
def choose_canonical(genes, canonical_transcripts_ids):
    not_found_in_canon_coding_num = 0
    not_found_in_canon_coding_num_one_transcript = 0
    not_found_in_canon_rna_num = 0
    not_found_in_canon_other_num = 0
    many_canon_coding_num = 0
    many_canon_rna_num = 0
    many_canon_other_num = 0

    canon_genes = []
    for g in genes:
        _canon_tx = []
        for t in g.transcripts:
            if t.transcript_id in canonical_transcripts_ids:
                t.is_canonical = True
                _canon_tx.append(t)

        if len(_canon_tx) > 1:
            if any(t.coding for t in g.transcripts):
                many_canon_coding_num += 1
                # Checking overlapping
                for i, t1 in enumerate(_canon_tx):
                    for j in range(i + 1, len(_canon_tx)):
                        t2 = _canon_tx[j]
                        if t1.start <= t2.start < t1.end or t1.start <= t2.end < t1.end:
                            err('Transcripts ' + t1.transcript_id +
                                ' (' + str(t1.start) + ':' + str(t1.end) + ') and ' +
                                t2.transcript_id + ' (' + str(t2.start) + ':' + str(t2.end) + ')' +
                                ' in gene ' + g.name + ' ' + g.chrom + ' overlap')
            elif any(not t.coding for t in g.transcripts):
                many_canon_rna_num += 1
            else:
                many_canon_other_num += 1

        if len(_canon_tx) == 0:
            if any(t.coding for t in g.transcripts):
                not_found_in_canon_coding_num += 1
                if len(g.transcripts) == 1:
                    not_found_in_canon_coding_num_one_transcript += 1
                # longest_t = max(g.transcripts, key=Transcript.length)
                # longest_t.is_canonical = True
            elif any(not t.coding for t in g.transcripts):
                not_found_in_canon_rna_num += 1
            else:
                not_found_in_canon_other_num += 1

        g.canonical_transcripts = [t for t in g.transcripts if t.is_canonical]
        if g.canonical_transcripts:
            canon_genes.append(g)

    info('Coding genes with canonical transcripts: ' +
         str(sum(1 for g in canon_genes if any(t.coding for t in g.canonical_transcripts))))
    info('Coding canonical transcripts: ' +
         str(sum(1 for g in canon_genes for t in g.canonical_transcripts if t.coding)))
    info('RNA genes with canonical transcripts: ' +
         str(sum(1 for g in canon_genes if any(not t.coding for t in g.canonical_transcripts))))
    info('RNA canonical transcripts: ' +
         str(sum(1 for g in canon_genes for t in g.canonical_transcripts if not t.coding)))
    info()
    info('Coding genes with no canonical transcripts (picking longest out of the rest): ' +
         str(not_found_in_canon_coding_num))
    info('RNA genes with no canonical transcripts (skipping all): ' +
         str(not_found_in_canon_rna_num))
    info('Other genes with no canonical transcripts (skipping all): ' +
         str(not_found_in_canon_other_num))
    info('Coding genes with many canonical transcripts (picking longest): ' +
         str(many_canon_coding_num))
    info('RNA genes with many canonical transcripts (keeping all): ' + str(many_canon_rna_num))
    info('Other genes with many canonical transcripts (keeping all): ' + str(many_canon_other_num))
    return canon_genes
def render_closest_comparison_page(project_names_line, sample_id, selected_idx=None,
                                   rerun_if_usercall=True):
    run = Run.find_by_project_names_line(project_names_line)
    if not Run.is_ready(run) or (run.rerun_on_usercall and rerun_if_usercall):
        return run_processing(project_names_line, redirect_to=url_for(
            'closest_comparison_page',
            project_names_line=project_names_line,
            sample_id=sample_id))

    run = Run.find_by_project_names_line(project_names_line)
    if not run:
        log.err('Run ' + str(project_names_line) + ' not found')
        abort(404, {'message': 'Phylogenetic comparison for ' + str(project_names_line) +
                               ' is not found'})

    sample = Sample.query.get(sample_id)
    if not sample:
        log.err('Sample ' + sample_id + ' not found in ' + str(project_names_line))
        abort(404, {'message': 'Sample ' + sample_id + ' not found in ' +
                               str(project_names_line)})

    matching_sample = _find_closest_match(sample, run)
    if not matching_sample:
        log.err('No matching sample for ' + sample.long_name())
        abort(404, {'message': 'No matching sample for ' + sample.long_name()})

    snps_dict = defaultdict(int)
    snp_tables = []
    snp_records = []
    snps_a_by_rsid = sample.snps_from_run(run)
    snps_b_by_rsid = matching_sample.snps_from_run(run)
    ngb_link_tmpl, ngb_link = None, None
    # if is_us() or is_uk():
    #     ngb_link_tmpl = get_ngb_link_template(
    #         run.work_dir_path(), sample.name, sample.project.genome, sample.project.name,
    #         sample.project.bed_fpath, matching_sample.name, matching_sample.bam)
    for i, l in enumerate(run.locations):
        snp_a = snps_a_by_rsid[l.rsid]
        snp_b = snps_b_by_rsid[l.rsid]
        # ngb_link = get_ngb_link(run.work_dir_path(), ngb_link_tmpl, snp_a.chrom, snp_a.pos) if ngb_link_tmpl else None
        snp_records.append(_get_snp_record(snps_dict, snp_a, snp_b, i + 1, ngb_link=ngb_link))
        if (i + 1) % SNPS_IN_ROW == 0:
            snp_tables.append(snp_records)
            snp_records = []
    if snp_records:
        snp_tables.append(snp_records)

    snps_dict['total_score'] = sum(rec['score'] for recs in snp_tables for rec in recs)

    bam_fpath_a = '/%s/bams/%s' % (run.id, sample.long_name() + '.bam')
    bam_fpath_b = '/%s/bams/%s' % (run.id, matching_sample.long_name() + '.bam')
    snps_bed = '/%s/snps_bed' % project_names_line
    sample_a = {
        'id': sample.id,
        'name': sample.name,
        'project': sample.project.name,
        'bam': bam_fpath_a,
    }
    sample_b = {
        'id': matching_sample.id,
        'name': matching_sample.name,
        'project': matching_sample.project.name,
        'bam': bam_fpath_b,
    }
    t = render_template(
        'sample.html',
        project_names_line=project_names_line,
        genome=sample.project.genome,
        sampleA=sample_a,
        sampleB=sample_b,
        snps_data=snps_dict,
        snp_tables=snp_tables,
        snps_bed=snps_bed,
        selected_idx=selected_idx or "null",
        total_snps=sum(len(snps) for snps in snp_tables),
        snps_in_row=SNPS_IN_ROW,
    )
    return t
def __init__(self, genome, filt_cnf, tricky_regions_dir, transcripts_fpath,
             reg_exp_sample=None, platform=None):
    self.all_reject_counter = OrderedDefaultDict(int)
    self.all_counter = OrderedDefaultDict(int)
    self.gene_blacklist_counter = OrderedDefaultDict(int)
    self.region_blacklist_counter = OrderedDefaultDict(int)

    compendia_fpath = verify_file(filt_ref_data.compendia(genome), 'compendia_ms7_hotspot')
    actionable_fpath = verify_file(filt_ref_data.actionable(genome), 'actionable')
    filter_common_snp_fpath = verify_file(filt_ref_data.common_snp(genome), 'filter_common_snp')
    filter_common_arti_fpath = verify_file(filt_ref_data.common_art(genome), 'filter_common_artifacts')
    splice_fpath = verify_file(filt_ref_data.splice(genome), 'splice')
    suppressors_fpath = verify_file(filt_ref_data.suppressors(), 'suppressors')
    oncogenes_fpath = verify_file(filt_ref_data.oncogenes(), 'oncogenes')
    ruledir = verify_dir(filt_ref_data.ruledir(), 'ruledir')
    snpeffect_polymorph_fpath = verify_file(filt_ref_data.snpeffect_export_polymorphic(),
                                            'snpeffect_export_polymorphic')
    actionable_hotspot_fpath = verify_file(filt_ref_data.actionable_hotspot(), 'actionable_hotspot')
    specific_mutations_fpath = verify_file(filt_ref_data.specific_mutations(), 'specific_mutations')
    last_critical_aa_fpath = verify_file(filt_ref_data.last_critical_aa(), 'last_critical_aa')
    incidentalome_dir = verify_dir(filt_ref_data.incidentalome_dir(), 'incidentalome')
    comments_fpath = verify_file(filt_ref_data.ngs_reports_comments(), 'ngs_reports_comments')
    if not all([compendia_fpath, actionable_fpath, filter_common_snp_fpath,
                filter_common_arti_fpath, splice_fpath, suppressors_fpath, oncogenes_fpath,
                ruledir, snpeffect_polymorph_fpath, actionable_hotspot_fpath,
                specific_mutations_fpath, last_critical_aa_fpath, incidentalome_dir,
                comments_fpath]):
        logger.err('Error: some of the required files are not found or empty (see above)')

    self.suppressors = parse_genes_list(adjust_path(suppressors_fpath))
    self.oncogenes = parse_genes_list(adjust_path(oncogenes_fpath))

    self.reg_exp_sample = reg_exp_sample
    self.platform = platform

    transcripts_fpath = verify_file(transcripts_fpath, silent=True)
    if transcripts_fpath:
        logger.info('Using canonical transcripts from ' + transcripts_fpath)
        with open(transcripts_fpath) as f:
            self.transcripts = [tr.strip().split('.')[0] for tr in f]

    self.max_ratio = filt_cnf['max_ratio']
    self.max_sample_cnt = filt_cnf['max_sample_cnt']
    self.min_freq = filt_cnf['min_freq']  # for all variants
    self.act_min_freq = filt_cnf['act_min_freq']
    self.act_min_freq = self.act_min_freq or self.min_freq // 2
    self.germline_min_freq = filt_cnf['germline_min_freq']
    self.filt_depth = filt_cnf['filt_depth']
    self.min_vd = filt_cnf['min_vd']
    self.min_gmaf = filt_cnf['min_gmaf']
    self.keep_utr_intronic = filt_cnf['keep_utr_intronic']
    self.keep_whole_genome = filt_cnf['keep_whole_genome']
    self.keep_hla = filt_cnf['keep_hla']
    self.damage_p_value = filt_cnf.get('damage_p_value')

    logger.info('Parsing filtering data...')
    self.tp53_groups = {
        'Group 1': parse_mut_tp53(join(ruledir, 'DNE.txt')),
        'Group 2': parse_mut_tp53(join(ruledir, 'TA0-25.txt')),
        'Group 3': parse_mut_tp53(join(ruledir, 'TA25-50_SOM_10x.txt')),
    }

    self.splice_positions_by_gene = defaultdict(set)
    for l in iter_lines(splice_fpath):
        pos, g = l.split('\t')
        self.splice_positions_by_gene[g].add(pos)

    self.last_critical_aa_pos_by_gene = dict()
    for l in iter_lines(last_critical_aa_fpath):
        g, aa_pos, _ = l.split('\t')
        self.last_critical_aa_pos_by_gene[g] = int(aa_pos)

    self.filter_snp = set()
    for l in iter_lines(filter_common_snp_fpath):
        fields = l.split('\t')
        self.filter_snp.add('-'.join(fields[1:5]))

    self.snpeff_snp = set()
    self.snpeff_snp_rsids = set()
    for l in iter_lines(snpeffect_polymorph_fpath):
        fields = l.split('\t')
        snpeff_aachg = fields[2]
        snpeff_rsid = fields[5]
        if len(fields) > 11 and fields[11]:
            snpeff_gene = fields[11]
            self.snpeff_snp.add('-'.join([snpeff_gene, snpeff_aachg]))
        elif snpeff_rsid != '-':
            self.snpeff_snp_rsids.add(snpeff_rsid)

    self.filter_artifacts = set()
    self.filter_rules_by_gene = defaultdict(list)
    for l in iter_lines(filter_common_arti_fpath):
        fields = l.split('\t')
        if fields[5] == 'rule':
            gene, chrom, start, end, action, _, _, _, note = fields[:9]
            rule = Rule(gene, chrom=chrom, start=int(start), end=int(end),
                        action=action, note=note)
            self.filter_rules_by_gene[gene].append(rule)
        else:
            gene, chrom, start, ref, alt = fields[:5]
            self.filter_artifacts.add('-'.join([chrom, start, ref, alt]))

    self.actionable_hotspot_by_gene = defaultdict(dict)
    self.common_snps_by_gene = defaultdict(set)
    with open(actionable_hotspot_fpath) as f:
        for l in f:
            l = l.replace('\n', '')
            if not l or l.startswith('##'):
                continue
            fields = l.split('\t')
            gene = fields[0]
            prot_change = fields[1]
            if gene.startswith('#'):  # VUS, no special treatment for now
                gene = gene[1:]
            elif gene.startswith('^'):
                gene = gene[1:]
                self.common_snps_by_gene[gene].add(prot_change)
            else:
                is_somatic = fields[2] == 'somatic'
                self.actionable_hotspot_by_gene[gene][prot_change] = \
                    'somatic' if is_somatic else 'germline'

    self.ngs_reports_comments = defaultdict(dict)
    with open(comments_fpath) as f:
        for r in csv.DictReader((row for row in f if not row.startswith('#')), delimiter='\t'):
            gene = r['Gene']
            prot_change = r['AA_Change']
            if gene.startswith('^'):
                gene = gene[1:]  # remove leading ^ character, e.g. ^EGFR -> EGFR
                is_somatic = 'somatic' in r['Note']
                self.actionable_hotspot_by_gene[gene][prot_change] = \
                    'somatic' if is_somatic else 'germline'
            else:
                self.ngs_reports_comments[gene][prot_change] = r['Note']

    self.act_somatic = dict()
    self.act_germline = set()
    self.rules = defaultdict(list)
    for l in iter_lines(actionable_fpath):
        fields = l.split('\t')
        if fields[7] == 'germline':
            key = '-'.join(fields[1:5])
            self.act_germline.add(key)
        elif fields[7] == 'somatic':
            change = fields[8].strip()
            if fields[6] == 'rule':
                if fields[4] == '*' and len(fields[3]) == 1:
                    key = '-'.join(fields[1:4])
                    self.act_somatic[key] = change
                else:
                    indel_type = ''
                    if 'indel' in fields[5]:
                        indel_type = 'indel'
                    elif 'ins' in fields[5]:
                        indel_type = 'ins'
                    elif 'del' in fields[5]:
                        indel_type = 'del'
                    rule = Rule(gene=fields[0],
                                chrom=fields[1],
                                start=int(fields[2]),
                                end=int(fields[3]),
                                length=int(fields[4]),
                                required_inframe='inframe' in fields[5],
                                indel_type=indel_type,
                                change=change)
                    self.rules[rule.gene].append(rule)
                # elif fields[5] == inframe_del:
                #     self.rules[inframe_del].setdefault(fields[0], []).append([fields[1]] + [int(f) for f in fields[2:5]])
                # elif fields[5] == inframe_ins:
                #     self.rules[inframe_ins].setdefault(fields[0], []).append([fields[1]] + [int(f) for f in fields[2:5]])
            else:
                key = '-'.join(fields[1:5])
                self.act_somatic[key] = change

    self.hotspot_nucleotides = set()
    self.hotspot_proteins = set()
    for l in iter_lines(compendia_fpath):
        fields = l.split('\t')
        if fields[5].startswith('g.'):
            continue
        self.hotspot_nucleotides.add('-'.join(fields[1:5]))
        if not fields[6]:
            continue
        self.hotspot_proteins.add('-'.join([fields[0], fields[6]]))

    logger.info('Parsing gene blacklists...')
    anno_cfg = get_anno_config()
    self.gene_blacklists_by_reason = parse_gene_blacklists(anno_cfg['blacklist']['genes'],
                                                           incidentalome_dir)
    for r in self.gene_blacklists_by_reason.keys():
        self.gene_blacklist_counter[r] = 0
    self.gene_blacklist_counter['hardfilter'] = 0
    # self.gene_to_soft_filter = list(iter_lines(join(incidentalome_dir, 'soft_filter.txt')))

    # self.region_blacklists_by_reason = dict()
    # if tricky_regions_dir:
    #     info('Parsing region blacklists...')
    #     self.region_blacklists_by_reason = load_tricky_regions(anno_cfg['blacklist']['regions'], tricky_regions_dir)
    #     for r in self.region_blacklists_by_reason.keys():
    #         self.region_blacklist_counter[r] = 0

    logger.info('Parsing actionable rules and specific mutations...')
    self.tier_by_specific_mutations, self.tier_by_type_by_region_by_gene, self.sensitizations_by_gene = \
        parse_specific_mutations(specific_mutations_fpath)

    if not all([self.rules, self.splice_positions_by_gene, self.act_somatic,
                self.act_germline, self.actionable_hotspot_by_gene]):
        if not self.rules:
            logger.err('No rules, cannot proceed')
        if not self.splice_positions_by_gene:
            logger.err('No tp53_positions, cannot proceed')
        if not self.act_somatic:
            logger.err('No act_somatic, cannot proceed')
        if not self.act_germline:
            logger.err('No act_germline, cannot proceed')
        if not self.actionable_hotspot_by_gene:
            logger.err('No actionable_hotspots, cannot proceed')

    self.status = None
    self.reason_by_status = None

    self.output_f = None
    self.fm_output_f = None
    self.rejected_output_f = None
def main(output_dir=None, normal_bam=None, tumor_bam=None, snv_vcf=None,
         normal_name=None, tumor_name=None, sample=None, genome=None,
         genomes_dir=None, gridss_ref_dir=None, ref_fa=None, threads=None,
         jvmheap=None):
    gridss_linx_dir = abspath(join(package_path(), '..', 'gridss-purple-linx'))
    gridss_scripts_dir = abspath(join(package_path(), '..', 'gridss/scripts'))

    normal_name = normal_name or splitext_plus(basename(normal_bam))[0]\
        .replace('-ready', '').replace('-sorted', '')
    tumor_name = tumor_name or splitext_plus(basename(tumor_bam))[0]\
        .replace('-ready', '').replace('-sorted', '')
    sample = sample or tumor_name

    output_dir = safe_mkdir(abspath(output_dir or 'gridss'))
    logger.init(log_fpath_=join(output_dir, 'gridss.log'), save_previous=True)
    output_vcf = join(output_dir, f'{sample}-gridss-purple-linx.vcf')

    assert genome == 'GRCh37', 'Only GRCh37 is supported for GRIDSS yet'
    if genomes_dir:
        refdata.find_genomes_dir(genomes_dir)
    if not gridss_ref_dir:
        gridss_ref_dir = refdata.get_ref_file(genome, 'gridss_purple_linx_dir')
    if not ref_fa:
        ref_fa = refdata.get_ref_file(genome, 'fa')

    hmf_env_path = conda_utils.secondary_conda_env('hmf')

    gridss_jar = glob.glob(join(hmf_env_path, 'share/gridss-*/gridss.jar'))[0]
    amber_jar = glob.glob(join(hmf_env_path, 'share/hmftools-amber-*/amber.jar'))[0]
    cobalt_jar = glob.glob(join(hmf_env_path, 'share/hmftools-cobalt-*/cobalt.jar'))[0]
    purple_jar = glob.glob(join(hmf_env_path, 'share/hmftools-purple-*/purple.jar'))[0]
    linx_jar = glob.glob(join(hmf_env_path, 'share/hmftools-linx-*/sv-linx.jar'))[0]

    cmd = f"""
PATH={hmf_env_path}/bin:$PATH \
THREADS={threads} \
GRIDSS_JAR={gridss_jar} \
AMBER_JAR={amber_jar} \
COBALT_JAR={cobalt_jar} \
PURPLE_JAR={purple_jar} \
LINX_JAR={linx_jar} \
bash -x {join(gridss_linx_dir, 'gridss-purple-linx.sh')} \
-n {normal_bam} \
-t {tumor_bam} \
-v {output_vcf} \
-s {sample} \
--normal_sample {normal_name} \
--tumour_sample {tumor_name} \
--snvvcf {snv_vcf} \
--ref_dir {gridss_ref_dir} \
--install_dir {gridss_scripts_dir} \
--reference {ref_fa} \
--output_dir {output_dir} \
{f"--jvmheap {jvmheap}" if jvmheap else ""}
""".strip()

    try:
        run_simple(cmd)
    except subprocess.SubprocessError:
        err('--------\n')
        err('Error running GRIDSS-PURPLE-LINX.\n')
        raise
def _approve(gene_by_name, synonyms_fpath):
    approved_gene_by_name, approved_gnames_by_prev_gname, approved_gnames_by_synonym = \
        read_approved_genes(synonyms_fpath)

    not_approved_gene_names = list()
    gene_after_approving_by_name = OrderedDict()
    total_approved = 0
    total_not_approved = 0
    j = 0
    for g in gene_by_name.values():
        if len(g.exons) == 0:
            continue

        gene_after_approving_by_name[g.name] = g
        if is_approved_symbol(g.name, approved_gene_by_name):
            total_approved += 1
        else:
            not_approved_gene_names.append(g.name)
            total_not_approved += 1

        j += 1
        if j % 1000 == 0:
            info('processed ' + str(j // 1000) + 'k genes...')

    info('-----')
    info('Total: ' + str(j))
    if approved_gene_by_name:
        info('Total approved: ' + str(total_approved))
        info('Total not approved: ' + str(total_not_approved))
    info()
    info('Saving genes...')

    gene_features = 0
    features_counter = defaultdict(int)
    biotypes_counter = defaultdict(int)
    no_exon_gene_num = 0

    filtered_gene_after_approving_by_name = OrderedDict()
    for g in gene_after_approving_by_name.values():
        if len(g.exons) == 0:
            no_exon_gene_num += 1
        else:
            filtered_gene_after_approving_by_name[g.name] = g

            gene_features += 1
            features_counter[g.feature] += 1
            biotypes_counter[g.biotype] += 1

            for e in g.exons:
                features_counter[e.feature] += 1
                if e.feature == 'exon':
                    e.feature = 'Exon'
                elif e.feature == 'stop_codon':
                    e.feature = 'CDS'
                else:
                    e.feature = e.feature[0].upper() + e.feature[1:]

    info('Skipped {} genes with no sub-features.'.format(no_exon_gene_num))
    info('Approved {} genes, including:'.format(gene_features))
    info('    Gene: {}'.format(features_counter['Gene']))
    info('    Multi_Gene: {}'.format(features_counter['Multi_Gene']))
    info('')
    info('Out of total: {} protein coding genes, {} ncRNA genes, including:'.format(
        biotypes_counter['protein_coding'],
        sum(biotypes_counter.values()) - biotypes_counter['protein_coding']))
    for bt, cnt in biotypes_counter.items():
        if bt != 'protein_coding':
            err('    ' + bt + ': ' + str(cnt))
    info()
    if ALL_EXONS:
        info('Found {} exons.'.format(features_counter['exon']))
    else:
        info('Also found {} CDS, {} stop codons, and {} ncRNA exons.'.format(
            features_counter['CDS'], features_counter['stop_codon'], features_counter['exon']))

    return filtered_gene_after_approving_by_name, not_approved_gene_names
def get_approved_gene_symbol(approved_gene_by_name, approved_gnames_by_prev_gname,
                             approved_gnames_by_synonym, gene_symbol,
                             db_id='', db_chrom='', indent=''):
    if gene_symbol in approved_gene_by_name:
        if _check_gene_symbol(approved_gene_by_name[gene_symbol], gene_symbol, db_id, db_chrom):
            return approved_gene_by_name[gene_symbol].name, None

    info(indent + 'Gene name ' + gene_symbol + ' is not approved, searching for an approved version... ',
         ending='', print_date=False)

    def _get_approved_genes_by_kind(approved_genes, kind):
        if not approved_genes:
            return 'NOT FOUND'

        if len(approved_genes) > 1:
            approved_genes_same_ucsc = [g for g in approved_genes if g.db_id == db_id]

            if len(approved_genes_same_ucsc) > 1:
                err('  ERROR: multiple approved gene names for ' + gene_symbol + ' (as ' + kind +
                    ') with ucsc_id ' + db_id + ': ' +
                    ', '.join(g.name for g in approved_genes_same_ucsc), print_date=False)
                return 'AMBIGUOUS'

            if len(approved_genes_same_ucsc) == 1:
                if _check_gene_symbol(approved_genes_same_ucsc[0], gene_symbol, db_id, db_chrom):
                    err('  found approved gene for ' + gene_symbol + ' (as ' + kind +
                        ') with ucsc_id ' + db_id, print_date=False)
                    return approved_genes_same_ucsc[0].name

            # Ok, no genes with same ucsc id, or not the same chromosome for them.

            approved_genes_same_chrom = [g for g in approved_genes if g.chrom == db_chrom]

            if len(approved_genes_same_chrom) > 1:
                err('  ERROR: multiple approved gene names for ' + gene_symbol + ' (as ' + kind +
                    ') with chrom ' + db_chrom + ': ' +
                    ', '.join(g.name for g in approved_genes_same_chrom), print_date=False)
                return 'AMBIGUOUS'

            if len(approved_genes_same_chrom) == 1:
                g = approved_genes_same_chrom[0]
                info('  only ' + g.name + ' for ' + gene_symbol + ' (as ' + kind +
                     ') has the same chrom ' + db_chrom + ', picking it', print_date=False)
                if _check_gene_symbol(g, gene_symbol, db_id, db_chrom):
                    return g.name
                else:
                    return 'NOT FOUND'

            if len(approved_genes_same_chrom) == 0:
                err('  ERROR: no approved gene names for ' + gene_symbol + ' (as ' + kind +
                    ') with same chrom ' + db_chrom, print_date=False)
                return 'NOT FOUND'

        if len(approved_genes) == 1:
            if _check_gene_symbol(approved_genes[0], gene_symbol, db_id, db_chrom):
                info('  found approved gene symbol for ' + gene_symbol + ': ' +
                     approved_genes[0].name + ' (as ' + kind + ')', print_date=False)
                return approved_genes[0].name

        return 'NOT FOUND'

    res = _get_approved_genes_by_kind(approved_gnames_by_prev_gname.get(gene_symbol), 'prev')
    if res == 'AMBIGUOUS':
        return None, 'AMBIGUOUS\tAS PREV'
    elif res == 'NOT FOUND':
        res = _get_approved_genes_by_kind(approved_gnames_by_synonym.get(gene_symbol), 'synonym')
        if res == 'AMBIGUOUS':
            return None, res + '\tAS SYNONYM'
        if res == 'NOT FOUND':
            err(' not found.', print_date=False)
            return None, res
        else:
            info(indent + 'Finally found approved gene for ' + gene_symbol + ' (as synonym): ' + res,
                 print_date=False)
            return res, None
    else:
        info(indent + 'Finally found approved gene for ' + gene_symbol + ' (as prev): ' + res,
             print_date=False)
        return res, None
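# Usage sketch for get_approved_gene_symbol() (the symbol, IDs, and dict
# contents below are hypothetical; the three lookup tables come from
# read_approved_genes()):
#
# name, status = get_approved_gene_symbol(
#     approved_gene_by_name, approved_gnames_by_prev_gname,
#     approved_gnames_by_synonym, 'MLL2', db_id='uc001vvs.3', db_chrom='chr12')
# if name is None:
#     print(status)  # e.g. 'NOT FOUND' or 'AMBIGUOUS\tAS PREV'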