def compare_pairwise(run):
    import itertools as it

    all_samples = [s for p in run.projects for s in p.samples]
    pairwise_dict = defaultdict(dict)
    for s1, s2 in it.combinations_with_replacement(all_samples, 2):
        snps_a_by_rsid = s1.snps_from_run(run)
        snps_b_by_rsid = s2.snps_from_run(run)
        matches = 0
        total_locs = 0
        for l in run.locations:
            snp_a = snps_a_by_rsid[l.rsid]
            snp_b = snps_b_by_rsid[l.rsid]
            seq_a, seq_b = snp_a.get_gt(), snp_b.get_gt()
            if seq_a == 'NN' or seq_b == 'NN':
                pass
            elif seq_a == seq_b:
                matches += 2
            elif seq_a[0] == seq_b[0] or seq_a[1] == seq_b[1]:
                matches += 1
            total_locs += 2
        dist = matches / total_locs
        log.info(f'    {s1.name} VS {s2.name}: {dist:.2f}')
        pairwise_dict[s1.name][s2.name] = dist
        pairwise_dict[s2.name][s1.name] = dist
    return pairwise_dict
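# Worked example of the match scoring above (hypothetical genotypes):
# comparing 'AG' vs 'AG' adds 2 matches, 'AG' vs 'AA' adds 1 (one shared
# allele), 'AG' vs 'CC' adds 0, and 'NN' vs anything adds 0; every location
# still adds 2 to total_locs, so over those four locations
# dist = (2 + 1 + 0 + 0) / 8 = 0.375.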
def update_batches(samples, silent=False):
    batch_names = set(b for s in samples for b in s.batch_names)
    batch_by_name = {bn: Batch(bn) for bn in batch_names}
    for sample in samples:
        for bn in sample.batch_names:
            sample.batch = batch_by_name[bn]
            if sample.phenotype == 'normal':
                if batch_by_name[bn].normal:
                    critical('Multiple normal samples for batch ' + bn)
                batch_by_name[bn].normal = sample
            else:
                batch_by_name[bn].tumor = sample

    for batch in batch_by_name.values():
        if batch.normal and not batch.tumor:
            if not silent:
                info('Batch ' + batch.name + ' contains only normal, treating sample ' +
                     batch.normal.name + ' as tumor')
            batch.normal.phenotype = 'tumor'
            batch.normal.batch = batch
            batch.tumor = batch.normal
            batch.normal = None

    # setting up batch properties
    for b in batch_by_name.values():
        b.tumor.normal_match = b.normal

    return batch_by_name
def _add_to_ngb(work_dir, project_name, bam_by_sample, genome_build, bed_file, p_view):
    if is_us() or is_uk():
        try:
            from az.ngb import add_data_to_ngb, add_file_to_ngb
        except ImportError:
            log.warn('If you want to expose the project to NGB, install NGS Reporting with '
                     '`conda install -c vladsaveliev ngs_reporting`')
        else:
            log.info('Exposing project to NGB...')
            try:
                dataset = project_name + '_Fingerprints'
                add_data_to_ngb(work_dir, p_view, bam_by_sample, dict(), dataset,
                                bed_file=bed_file, genome=genome_build)
                add_file_to_ngb(work_dir, get_dbsnp(genome_build), genome_build, dataset,
                                dataset, skip_if_added=True)
            except Exception:
                traceback.print_exc()
                log.err('Error: cannot export to NGB')
    log.info('*' * 70)
def _set_date_dir(bcbio_cnf, final_dir, date_dir, create_dir=False, silent=False):
    if not date_dir:
        fc_date = bcbio_cnf.get('fc_date')
        fc_name = bcbio_cnf.get('fc_name') or 'project'
        if fc_date:
            # The date dir path comes from bcbio and is named after fc_name, not our own project name
            date_dir = join(final_dir, fc_date + '_' + fc_name)
            if not create_dir and not verify_dir(date_dir, silent=True):
                critical('Error: no project directory of format {fc_date}_{fc_name} or {fc_name}_{fc_date}')
        else:
            if isdir(join(final_dir, 'project')):  # bcbio-CWL?
                date_dir = join(final_dir, 'project')
                if not silent:
                    info('Using the datestamp dir from bcbio-CWL: ' + date_dir)
            else:
                regexs = [fr'^\d\d\d\d-[01][0-9]-[0-3][0-9]_{fc_name}']
                date_dirs = [join(final_dir, dirpath)
                             for dirpath in listdir(final_dir)
                             if any(re.match(regex, dirpath) for regex in regexs)]
                if len(date_dirs) == 0:
                    raise NoDateStampsException('Error: no datestamp directory!')
                elif len(date_dirs) == 1:
                    date_dir = date_dirs[0]
                else:
                    dates = [(tuple(map(int, basename(d).split('_')[0].split('-'))), d)
                             for d in date_dirs]
                    newest_date, newest_dir = sorted(dates, reverse=True)[0]
                    newest_dirs = [d_dir for d_date, d_dir in dates if d_date == newest_date]
                    if len(newest_dirs) > 1:
                        raise MultipleDateStampsException(
                            f'Error: multiple datestamp directories found, and can\'t select '
                            f'the most recent one because there are multiple latest dirs: {newest_dirs}')
                    date_dir = newest_dirs[0]
                if not silent:
                    info('Using the datestamp dir: ' + date_dir)
    if create_dir:
        safe_mkdir(date_dir)
    return date_dir
def find_germline_vcf(self, silent=False, caller=None):
    caller = caller or self.germline_caller
    if not caller:
        if not silent:
            warn(f'Batch {self.name} has no variant caller info assigned, '
                 f'skipping finding germline VCF')
        return

    # in datestamp dir, starting from bcbio 1.1.6, ~ Dec 2019
    vcf_fpath_gz = adjust_path(join(self.parent_project.date_dir,
                                    f'{self.normals[0].name}-germline-{caller}.vcf.gz'))
    # in datestamp dir, bcbio before 1.1.6
    vcf_old_fpath_gz = adjust_path(join(self.parent_project.date_dir,
                                        f'{self.normals[0].name}-germline-{caller}-annotated.vcf.gz'))

    if isfile(vcf_fpath_gz):
        verify_file(vcf_fpath_gz, is_critical=True)
        if not silent:
            info(f'Found germline VCF in <date-dir>/<normal-name>-germline-{caller}.vcf.gz: ' +
                 vcf_fpath_gz)
        self.germline_vcf = vcf_fpath_gz
    elif isfile(vcf_old_fpath_gz):
        verify_file(vcf_old_fpath_gz, is_critical=True)
        if not silent:
            info(f'Found germline VCF in <date-dir>/<normal-name>-germline-{caller}-annotated.vcf.gz '
                 f'(bcbio < v1.1.6): ' + vcf_old_fpath_gz)
        self.germline_vcf = vcf_old_fpath_gz
    elif not silent:
        warn(f'Could not find germline variants files for batch {self.name}, caller {caller}, '
             f'neither as <date-dir>/<normal-name>-germline-{caller}.vcf.gz, nor as '
             f'<date-dir>/<normal-name>-germline-{caller}-annotated.vcf.gz (bcbio < v1.1.6)')
def get_ref_fasta(genome):
    if is_az():
        path = '/ngs/reference_data/genomes/Hsapiens/' + genome + '/seq/' + genome + '.fa'
        if isfile(path):
            logger.info('Found genome fasta at ' + path)
            return path

    if isdir(join(DATA_DIR, 'genomes', genome)):
        genome_dir = safe_mkdir(join(DATA_DIR, 'genomes'))
    else:
        genome_dir = safe_mkdir(join(DATA_DIR, '..', 'genomes'))

    if genome not in genomepy.list_installed_genomes(genome_dir):
        genome_recs = [rec for rec in genomepy.list_available_genomes() if rec[1] == genome]
        if genome_recs:
            genome_rec = genome_recs[0]
        else:
            logger.critical('Error: genome ' + genome + ' is not available')
        # rec[0] is assumed to be the provider name (the record layout matching
        # the rec[1] == genome filter above)
        logger.info('Downloading genome ' + genome + ' from ' + genome_rec[0] +
                    ' and installing into ' + genome_dir)
        genomepy.install_genome(genome, 'UCSC', genome_dir=genome_dir)
    genome_fasta_file = genomepy.Genome(genome, genome_dir=genome_dir).filename
    return genome_fasta_file
def _split_reference_by_priority(cnf, features_bed_fpath):
    features = ['CDS', 'Exon', 'Transcript', 'Gene']
    info('Splitting the reference file into ' + ', '.join(features))
    features_and_beds = []
    for f in features:
        # bind the current feature name: a plain `lambda x: x[6] == f` would
        # late-bind `f`, making every filter use the last feature in the list
        features_and_beds.append(
            (f, BedTool(features_bed_fpath).filter(lambda x, ftype=f: x[6] == ftype)))
    return features_and_beds
def remove_run(project_names_line_or_id):
    try:
        run_id = int(project_names_line_or_id)
    except ValueError:
        project_names = project_names_line_or_id.split('--')
        projects = Project.query.filter(Project.name.in_(project_names))
        if projects.count() < len(project_names):
            raise RuntimeError(
                'Some projects in ' + str(project_names) + ' are not found in the database: ' +
                str(set(project_names) - set(p.name for p in projects)))
        run = Run.find_by_projects(projects)
        if not run:
            raise RuntimeError(
                'Cannot find run ' + str(project_names_line_or_id) +
                ' - some projects are not found in the database: ' +
                str(set(project_names) - set(p.name for p in projects)))
    else:
        run = Run.query.filter(Run.id == run_id).first()

    if run:
        log.info('Deleting run ' + str(run.id))
        run.delete()
        db.session.commit()
    else:
        log.info('Could not find run')
def batch_callable_bed(bam_files, output_bed_file, work_dir, genome_fasta_file, min_depth,
                       parall_view=None):
    """ Calculates a callable BED for each sample, then intersects them and keeps
        the regions callable in at least 80% of the samples: a trade-off between
        requiring callability in every sample of a huge batch, and relying on a
        single sample with outstanding coverage.
    """
    if can_reuse(output_bed_file, bam_files):
        return output_bed_file

    work_dir = safe_mkdir(join(work_dir, 'callable_work'))

    # random.seed(1234)  # seeding random for reproducibility
    # bam_files = random.sample(bam_files, min(len(bam_files), 3))

    if parall_view:
        callable_beds = parall_view.run(_calculate, [
            [bf, work_dir, genome_fasta_file, min_depth] for bf in bam_files])
    else:
        with parallel_view(len(bam_files), ParallelCfg(threads=len(bam_files)), work_dir) as view:
            callable_beds = view.run(_calculate, [
                [bf, work_dir, genome_fasta_file, min_depth] for bf in bam_files])

    good_overlap_sample_fraction = 0.8  # pick regions covered in at least 80% of samples
    good_overlap_count = max(1, good_overlap_sample_fraction * len(callable_beds))
    info(f'Intersecting callable regions and picking good overlaps with >={good_overlap_count} '
         f'samples ({100 * good_overlap_sample_fraction}% of {len(callable_beds)})')
    with file_transaction(work_dir, output_bed_file) as tx:
        pybedtools.set_tempdir(safe_mkdir(join(work_dir, 'pybedtools_tmp')))
        intersection = pybedtools.BedTool() \
            .multi_intersect(i=callable_beds) \
            .filter(lambda r: len(r[4].split(',')) >= good_overlap_count)
        intersection.saveas(tx)
    info(f'Saved to {output_bed_file}')
    return output_bed_file
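# A worked example of the 80% rule above (hypothetical numbers): with 10
# per-sample callable BEDs, good_overlap_count = max(1, 0.8 * 10) = 8.0.
# multi_intersect reports the contributing samples in column 5 (index 4) as a
# comma-separated list, so a region listed for '1,2,3,5,6,7,8,10' (8 samples)
# passes the filter, while one seen in only 7 samples is dropped.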
def _do_run(cmd, checks, env=None, output_fpath=None, input_fpath=None):
    """Perform running and check results, raising errors for issues.
    """
    cmd, shell_arg, executable_arg = _normalize_cmd_args(cmd)
    s = subprocess.Popen(cmd, shell=shell_arg, executable=executable_arg,
                         stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
                         close_fds=True, env=env)
    debug_stdout = collections.deque(maxlen=100)
    while True:
        line = s.stdout.readline()
        if line:
            line = line.decode(errors='replace')
            debug_stdout.append(line)
            info('   ' + line.rstrip())
        exitcode = s.poll()
        if exitcode is not None:
            # drain the rest of the output
            for line in s.stdout:
                line = line.decode(errors='replace')
                debug_stdout.append(line)
            if exitcode != 0:
                error_msg = ' '.join(cmd) if not isinstance(cmd, str) else cmd
                error_msg += '\n'
                error_msg += ''.join(debug_stdout)
                s.communicate()
                s.stdout.close()
                raise subprocess.CalledProcessError(exitcode, cmd=cmd, output=error_msg)
            else:
                break
    s.communicate()
    s.stdout.close()
    # Check for problems not identified by shell return codes
    if checks:
        for check in checks:
            if not check(output_fpath, input_fpath):
                raise IOError('External command failed')
def send_file_for_igv(fpath):
    # handle the igv.js Range header which it uses to request a subset of a BAM file:
    range_header = request.headers.get('Range', None)
    if not range_header:
        return send_file(fpath)

    m = re.search(r'(\d+)-(\d*)', range_header)
    if not m:
        error_msg = 'ERROR: unexpected range header syntax: %s' % range_header
        log.err(error_msg)
        return error_msg

    size = os.path.getsize(fpath)
    offset = int(m.group(1))
    # HTTP byte ranges are inclusive, so an explicit end position covers end+1-offset bytes;
    # an open-ended range ("N-") covers the rest of the file
    length = (int(m.group(2)) + 1 if m.group(2) else size) - offset

    with open(fpath, 'rb') as f:
        f.seek(offset)
        data = f.read(length)

    rv = Response(data, 206, mimetype='application/octet-stream', direct_passthrough=True)
    rv.headers.add('Content-Range',
                   'bytes {0}-{1}/{2}'.format(offset, offset + length - 1, size))
    log.info('GET range request: %s-%s %s' % (m.group(1), m.group(2), fpath))
    return rv
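# Worked example of the range math (hypothetical request): for
# 'Range: bytes=100-299' on a 1000-byte file, offset = 100 and
# length = 299 + 1 - 100 = 200, so 200 bytes are returned with status 206 and
# 'Content-Range: bytes 100-299/1000'. For an open-ended 'bytes=100-',
# length = 1000 - 100 covers the rest of the file.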
def cnv_to_bed(cnv_path, out_bed_path):
    with open(cnv_path) as fh:
        parse_fn = None
        header = next(fh).strip().split('\t')
        if header[0].startswith('##fileformat=VCF'):  # Manta
            info(f'Detected {cnv_path} as caller "manta"')
            parse_fn = iter_manta
        else:
            for caller, hdr in header_by_caller.items():
                if header == hdr:
                    info(f'Parsing {cnv_path} as caller "{caller}" with header {hdr}')
                    parse_fn = get_iter_cnv(header, parse_row_by_caller[caller])
        if not parse_fn:
            critical(f'Cannot detect CNV file format in {cnv_path}')

    with open(out_bed_path, 'w') as out:
        writer = csv.writer(out, delimiter='\t')
        for i, call in enumerate(parse_fn(cnv_path)):
            if call:
                bed_row = call.get_bed_raw()
                writer.writerow(bed_row)
                if i == 0:
                    info(f'First BED row: {bed_row}')
def adjust_ncpus_per_job(ncpus, max_ncpus_per_job=10, msg='', silent=False):
    """ Adjusts the number of cpus per job to a number below <max_ncpus_per_job>.
        Say, if we have more than 20 cpus on a node and only 1 batch, we should use
        only half of them for that batch, so that 2 different jobs (say, AMBER and
        COBALT) can run in parallel: using all 20 cpus for one job is a waste.
    """
    if ncpus > max_ncpus_per_job:
        factor = math.ceil(ncpus / max_ncpus_per_job)
        new_ncpus = ncpus // factor
        if not silent:
            logger.info(
                (msg if msg else 'The number of cpus per batch is ') +
                f'{ncpus} >{max_ncpus_per_job}. '
                f'This is usually wasteful, so we are adjusting it '
                f'to the number <={max_ncpus_per_job}: {new_ncpus} = {ncpus} // {factor}, so '
                f'{factor} different rules can be run in parallel (say, AMBER and COBALT '
                f'at the same time).')
        ncpus = new_ncpus
    return ncpus
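# Worked example (using the `silent` flag introduced above): on a 24-cpu node
# with the default 10-cpu cap, factor = ceil(24 / 10) = 3, so each job gets
# 24 // 3 = 8 cpus and three rules can run side by side.
# >>> adjust_ncpus_per_job(24, max_ncpus_per_job=10, silent=True)
# 8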
def _load_bcbio_project(self, bcbio_project_path):
    proj = self._parsed_bcbio_projects_by_path.get(bcbio_project_path)
    if not proj:
        info(f'Loading project {bcbio_project_path}')
        proj = BcbioProject(bcbio_project_path, silent=True)
        self._parsed_bcbio_projects_by_path[bcbio_project_path] = proj
    return proj
def get_or_create_run(projects, parall_view=None):
    genomes = set([p.genome for p in projects])
    if len(genomes) > 1:
        log.critical('Error: multiple genomes in projects: ' + str(genomes))

    run = Run.find_by_projects(projects)
    if run and run.rerun_on_usercall:
        log.info()
        log.info('Rebuilding tree on usercall')
        build_tree(run)
        run.rerun_on_usercall = False
        db.session.commit()
        return run

    if run and not Run.is_ready(run):
        log.debug('Tree files do not exist, recreating run for projects ' +
                  ', '.join(p.name for p in projects))
        db.session.delete(run)
        db.session.commit()
        run = None

    if run:
        log.debug('Found run for ' + ', '.join([p.name for p in projects]) +
                  ' with ID ' + str(run.id))
    else:
        log.debug('Creating new run for projects ' + ', '.join(p.name for p in projects))
        run = Run.create(projects, parall_view)
        log.debug('Done creating new run with ID ' + str(run.id))
    return run
def extract_features(output_file, genome, only_canonical, high_confidence, coding_only,
                     feature_types):
    """ For debug purposes """
    debug('Getting features from storage')
    features_bed = ba.get_all_features(genome)
    if features_bed is None:
        critical('Genome ' + genome + ' is not supported. Supported: ' +
                 ', '.join(ba.SUPPORTED_GENOMES))

    if high_confidence:
        features_bed = features_bed.filter(ba.high_confidence_filter)
    if only_canonical:
        features_bed = features_bed.filter(ba.get_only_canonical_filter(genome))
    if coding_only:
        features_bed = features_bed.filter(ba.protein_coding_filter)
    # unique_tx_by_gene = find_best_tx_by_gene(features_bed)

    info('Extracting features from Ensembl GTF')
    feature_types = feature_types or ['exon', 'CDS', 'stop_codon', 'transcript']
    features_bed = features_bed.filter(lambda x: x[ba.BedCols.FEATURE] in feature_types)
    # x[ebl.BedCols.ENSEMBL_ID] == unique_tx_by_gene[x[ebl.BedCols.GENE]]

    info('Overlapping regions with Ensembl data')
    features_bed.saveas(output_file)
    debug(f'Saved features to {output_file}')
def get_padded_bed_file(work_dir, bed, padding, fai_fpath):
    genome_fpath = fai_fpath
    info('Making bed file for padded regions...')
    bedtools = which('bedtools')
    cmdline = '{bedtools} slop -i {bed} -g {genome_fpath} -b {padding}'.format(**locals())
    output_fpath = intermediate_fname(work_dir, bed, 'padded')
    call_process.run(cmdline, output_fpath=output_fpath)
    return output_fpath
def determine_sex(work_dir, bam_fpath, avg_depth, genome, target_bed=None):
    debug()
    debug('Determining sex')
    pybedtools.set_tempdir(safe_mkdir(join(work_dir, 'pybedtools_tmp')))

    male_bed = None
    for k in chry_key_regions_by_genome:
        if k in genome:
            male_bed = BedTool(chry_key_regions_by_genome.get(k))
            break
    if not male_bed:
        warn('Warning: no male key regions for ' + genome + ', cannot identify sex')
        return None

    male_area_size = get_total_bed_size(male_bed)
    debug('Male region total size: ' + str(male_area_size))

    if target_bed:
        target_male_bed = join(work_dir, 'male.bed')
        with file_transaction(work_dir, target_male_bed) as tx:
            BedTool(target_bed).intersect(male_bed).merge().saveas(tx)
        target_male_area_size = get_total_bed_size(target_male_bed)
        if target_male_area_size == 0:
            debug('The male non-PAR region does not overlap with the capture target - '
                  'cannot determine sex.')
            return None
        male_bed = target_male_bed
    else:
        debug('WGS, determining sex based on chrY key regions coverage.')

    info('Detecting sex by comparing the Y chromosome key regions coverage and '
         'average coverage depth.')
    if not bam_fpath:
        critical('BAM file is required.')
    index_bam(bam_fpath)

    chry_mean_coverage = _calc_mean_coverage(work_dir, male_bed, bam_fpath, 1)
    debug('Y key regions average depth: ' + str(chry_mean_coverage))
    avg_depth = float(avg_depth)
    debug('Sample average depth: ' + str(avg_depth))
    if avg_depth < AVG_DEPTH_THRESHOLD_TO_DETERMINE_SEX:
        debug('Sample average depth is too low (less than ' +
              str(AVG_DEPTH_THRESHOLD_TO_DETERMINE_SEX) + ') - cannot determine sex')
        return None

    if chry_mean_coverage == 0:
        debug('Y depth is 0 - it\'s female')
        sex = 'F'
    else:
        factor = avg_depth / chry_mean_coverage
        debug('Sample depth / Y depth = ' + str(factor))
        if factor > FEMALE_Y_COVERAGE_FACTOR:  # mean target coverage much higher than chrY coverage
            debug('Sample depth is more than ' + str(FEMALE_Y_COVERAGE_FACTOR) +
                  ' times higher than Y depth - it\'s female')
            sex = 'F'
        else:
            debug('Sample depth is not more than ' + str(FEMALE_Y_COVERAGE_FACTOR) +
                  ' times higher than Y depth - it\'s male')
            sex = 'M'
    debug('Sex is ' + sex)
    debug()
    return sex
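# Hypothetical example of the ratio test, assuming FEMALE_Y_COVERAGE_FACTOR = 10:
# average depth 60x with chrY key-region depth 30x gives factor = 60 / 30 = 2 <= 10,
# so the call is 'M'; average depth 60x with chrY depth 2x gives factor = 30 > 10,
# so the call is 'F' (a female sample has no Y, so any residual chrY signal is
# noise or mismapping).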
def find_qc_files(self, dst_dir, exclude_files=None, include_files=None):
    """ Parses the bcbio MultiQC file list and collects all QC files belonging to this batch

    :param dst_dir: destination directory where the QC files will be copied to
    :param exclude_files: exclude files matching these patterns
    :param include_files: only include files matching these patterns
    :return: list of file paths copied into `dst_dir`
    """
    mq_dir = join(self.parent_project.date_dir, 'multiqc')
    mq_filelist = join(mq_dir, 'list_files_final.txt')
    verify_file(mq_filelist, is_critical=True)

    # Cromwell?
    cwl_targz = join(mq_dir, 'multiqc-inputs.tar.gz')
    tar_f_by_fp = dict()
    if isfile(cwl_targz):
        info(f'Found CWL MultiQC output {cwl_targz}, extracting required QC files '
             f'from the archive')
        tar = tarfile.open(cwl_targz)
        for member in tar.getmembers():
            rel_fp = member.name
            if 'call-multiqc_summary/execution/qc/multiqc/' in rel_fp:
                rel_fp = rel_fp.split('call-multiqc_summary/execution/qc/multiqc/')[1]
                tar_f_by_fp[rel_fp] = tar.extractfile(member)

    qc_files_not_found = []
    qc_files_found = []
    with open(mq_filelist) as inp:
        for fp in [l.strip() for l in inp if l.strip()]:
            if fp == 'trimmed' or fp.endswith('/trimmed'):
                continue  # back-compatibility with bcbio
            if exclude_files:
                if isinstance(exclude_files, str):
                    exclude_files = [exclude_files]
                if any(re.search(ptn, fp) for ptn in exclude_files):
                    continue
            if include_files:
                if isinstance(include_files, str):
                    include_files = [include_files]
                if not any(re.search(ptn, fp) for ptn in include_files):
                    continue
            new_fp = _extract_qc_file(fp, dst_dir, self.parent_project.final_dir, tar_f_by_fp)
            if not new_fp:
                qc_files_not_found.append(fp)
            else:
                qc_files_found.append(new_fp)

    if qc_files_not_found:
        warn('-')
        warn(f'Some QC files from list {mq_filelist} were not found:' +
             ''.join('\n  ' + fpath for fpath in qc_files_not_found))
    return qc_files_found
def run(cmd, output_fpath=None, input_fpaths=None, checks=None, stdout_to_outputfile=True,
        stdout_tx=True, reuse=False, env_vars=None):
    """Run the provided command, logging details and checking for errors.
    """
    if output_fpath and reuse:
        if verify_file(output_fpath, silent=True):
            info(output_fpath + ' exists, reusing')
            return output_fpath
        if not output_fpath.endswith('.gz') and verify_file(output_fpath + '.gz', silent=True):
            info(output_fpath + '.gz exists, reusing')
            return output_fpath

    if input_fpaths is not None:
        if isinstance(input_fpaths, str):
            input_fpaths = [input_fpaths]
        for fpath in input_fpaths:
            verify_file(fpath, is_critical=True)

    env = _get_env(env_vars)
    # info('env: ' + str(env))

    if checks is None:
        checks = [file_nonempty_check]

    def _try_run(_cmd, _output_fpath, _input_fpaths):
        try:
            info(' '.join(str(x) for x in _cmd) if not isinstance(_cmd, str) else _cmd)
            _do_run(_cmd, checks, env, _output_fpath, _input_fpaths)
        except:
            # remove an incomplete output file before re-raising
            if output_fpath and isfile(output_fpath):
                os.remove(output_fpath)
            raise

    if output_fpath:
        if stdout_tx:
            with file_transaction(None, output_fpath) as tx_out_file:
                if stdout_to_outputfile:
                    cmd += ' > ' + tx_out_file
                else:
                    cmd += '\n'
                    cmd = cmd.replace(' ' + output_fpath + ' ', ' ' + tx_out_file + ' ') \
                             .replace(' "' + output_fpath + '" ', ' "' + tx_out_file + '" ') \
                             .replace(' \'' + output_fpath + '\' ', ' \'' + tx_out_file + '\' ') \
                             .replace(' ' + output_fpath + '\n', ' ' + tx_out_file) \
                             .replace(' "' + output_fpath + '"\n', ' "' + tx_out_file + '"') \
                             .replace(' \'' + output_fpath + '\'\n', ' \'' + tx_out_file + '\'') \
                             .replace('\n', '')
                _try_run(cmd, tx_out_file, input_fpaths)
        else:
            _try_run(cmd, output_fpath, input_fpaths)
    else:
        _try_run(cmd, None, input_fpaths)
def ungzip_if_needed(cnf, fpath, silent=False):
    if fpath.endswith('.gz'):
        fpath = fpath[:-3]
    if not file_exists(fpath) and file_exists(fpath + '.gz'):
        gz_fpath = fpath + '.gz'
        cmdline = 'gunzip -c {gz_fpath} > {fpath}'.format(**locals())
        res = run_simple(cmdline)
        if not silent:
            info()
        if not res:
            return None
    return fpath
def genotype(samples, snp_bed, parall_view, work_dir, output_dir, genome_build):
    genome_fasta_file = get_ref_fasta(genome_build)
    info('** Running VarDict **')
    vcfs = parall_view.run(_vardict_pileup_sample,
                           [[s, work_dir, output_dir, genome_fasta_file, snp_bed]
                            for s in samples])
    vcf_by_sample = OrderedDict(zip([s.name for s in samples], vcfs))
    info('** Finished running VarDict **')
    return vcf_by_sample
def ungzip_if_needed(cnf, fpath, silent=False):
    if fpath.endswith('.gz'):
        fpath = fpath[:-3]
    if not file_exists(fpath) and file_exists(fpath + '.gz'):
        gz_fpath = fpath + '.gz'
        cmdline = 'gunzip -c {gz_fpath}'.format(**locals())
        res = run(cmdline, output_fpath=fpath)
        if not silent:
            info()
        if not res:
            return None
    return fpath
def set_up_dirs(proc_name, output_dir=None, work_dir=None, log_dir=None):
    """ Creates output_dir and work_dir, and sets up the log """
    output_dir = safe_mkdir(adjust_path(output_dir or join(os.getcwd(), proc_name)),
                            'output_dir')
    debug('Saving results into ' + output_dir)
    work_dir = safe_mkdir(work_dir or join(output_dir, 'work'), 'working directory')
    info('Using work directory ' + work_dir)
    log_fpath = set_up_log(log_dir or safe_mkdir(join(work_dir, 'log')), proc_name + '.log')
    return output_dir, work_dir, log_fpath
def plot_heatmap(pairwise, run_dir, title):
    df = pd.DataFrame(data=pairwise)
    log.info(df)

    # Generate a mask for the upper triangle above the main diagonal
    mask = np.zeros_like(df, dtype=bool)
    mask[np.triu_indices_from(mask, k=1)] = True

    # Set up the matplotlib figure
    n = len(pairwise)
    figsize = (n / 2, n * 7 / 20)  # for 20 samples, take 10x7
    log.info(f'Saving figure of size {figsize}')
    f, ax = plt.subplots(figsize=figsize)
    if title:
        ax.set_title(title)

    # Generate a custom diverging colormap
    cmap = sns.diverging_palette(220, 10, as_cmap=True)

    # Draw the heatmap with the mask and correct aspect ratio
    g = sns.heatmap(df, vmin=0.5, vmax=1, center=0.75, mask=mask, cmap=cmap,
                    annot=True, fmt='.2f', ax=ax, annot_kws={'size': 8})
    g.set_yticklabels(g.get_yticklabels(), rotation=0, fontsize=7)
    g.set_xticklabels(g.get_xticklabels(), rotation=90, fontsize=7)
    sns.set(font_scale=2)
    matplotlib.pyplot.subplots_adjust(left=0.2, right=1, top=0.93, bottom=0.29)

    png_file = join(run_dir, str_to_filename(title) + '.png')
    if isfile(png_file):
        os.remove(png_file)
    matplotlib.pyplot.savefig(png_file)
    if isfile(png_file):
        log.info('')
        log.info('Saved heatmap into ' + adjust_path(png_file))
        try:
            from az.webserver.exposing import convert_gpfs_path_to_url
        except ImportError:
            pass
        else:
            url = convert_gpfs_path_to_url(png_file)
            if url:
                log.info('  url: ' + url)
                return url
    return png_file
def bgzip_and_tabix(fpath, reuse=False, tabix_parameters='', **kwargs):
    gzipped_fpath = fpath + '.gz'
    tbi_fpath = gzipped_fpath + '.tbi'

    if reuse and \
            file_exists(gzipped_fpath) and \
            (getctime(gzipped_fpath) >= getctime(fpath) if file_exists(fpath) else True) and \
            file_exists(tbi_fpath) and getctime(tbi_fpath) >= getctime(gzipped_fpath):
        info('Actual compressed file and index exist, reusing')
        return gzipped_fpath

    info('Compressing and tabixing file, writing ' + gzipped_fpath + '(.tbi)')
    bgzip = which('bgzip')
    tabix = which('tabix')
    if not bgzip:
        err('Cannot index file because bgzip is not found')
    if not tabix:
        err('Cannot index file because tabix is not found')
    if not bgzip or not tabix:
        return fpath

    if isfile(gzipped_fpath):
        os.remove(gzipped_fpath)
    if isfile(tbi_fpath):
        os.remove(tbi_fpath)

    info('BGzipping ' + fpath)
    cmdline = '{bgzip} {fpath}'.format(**locals())
    call_process.run(cmdline)

    info('Tabixing ' + gzipped_fpath)
    cmdline = '{tabix} {tabix_parameters} {gzipped_fpath}'.format(**locals())
    call_process.run(cmdline)

    return gzipped_fpath
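# Typical usage sketch (hypothetical path): compress and index a sorted BED,
# then use the returned .gz path downstream; '-p bed' matches how other
# callers in this codebase invoke tabix.
# >>> bed_gz = bgzip_and_tabix('/data/regions.bed', tabix_parameters='-p bed')
# >>> bed_gz
# '/data/regions.bed.gz'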
def main(host, port):
    clearup.HOST_IP = host
    clearup.PORT = port
    log.init(True, join(DATA_DIR, 'log_server.txt'), save_previous=True)
    os.environ['FLASK_DEBUG'] = '1'
    # log_path = join(DATA_DIR, 'flask.log')
    # handler = RotatingFileHandler(log_path, maxBytes=10000, backupCount=10)
    # handler.setLevel(logging.INFO)
    # app.logger.addHandler(handler)
    http_server = WSGIServer((host, port), app, handler_class=WebSocketHandler)
    log.info('Starting a webserver at ' + host + ':' + str(port))
    http_server.serve_forever()
def find_sv_vcf(self, silent=False, caller=None):
    caller = caller or self.sv_caller
    sv_prio = join(self.tumors[0].dirpath, f'{self.name}-sv-prioritize-{caller}.vcf.gz')
    sv_unprio = join(self.tumors[0].dirpath, f'{self.name}-{caller}.vcf.gz')
    # CWL?
    sv_cwl_prio = join(self.parent_project.date_dir,
                       f'{self.tumors[0].name}-{caller}-prioritized.vcf.gz')
    sv_cwl_unprio = join(self.parent_project.date_dir,
                         f'{self.tumors[0].name}-{caller}.vcf.gz')

    if isfile(sv_prio):
        verify_file(sv_prio, is_critical=True)
        if not silent:
            info(f'Found SV VCF in <tumor>/<batch>-sv-prioritize-{caller}.vcf.gz: ' + sv_prio)
        self.sv_vcf = sv_prio
    elif isfile(sv_unprio):
        verify_file(sv_unprio, is_critical=True)
        if not silent:
            info(f'Found SV VCF in <tumor>/<batch>-{caller}.vcf.gz: ' + sv_unprio)
        self.sv_vcf = sv_unprio
    elif isfile(sv_cwl_prio):
        verify_file(sv_cwl_prio, is_critical=True)
        if not silent:
            info(f'Found SV VCF in <date-dir>/<tumor-name>-{caller}-prioritized.vcf.gz: ' +
                 sv_cwl_prio)
        self.sv_vcf = sv_cwl_prio
    elif isfile(sv_cwl_unprio):
        verify_file(sv_cwl_unprio, is_critical=True)
        if not silent:
            info(f'Found SV VCF in <date-dir>/<tumor-name>-{caller}.vcf.gz: ' + sv_cwl_unprio)
        self.sv_vcf = sv_cwl_unprio
    elif not silent:
        warn(f'Could not find SV VCF file for batch {self.name}, caller {caller}, neither under '
             f'the sample folder as <tumor>/<batch>(-sv-prioritize)-{caller}.vcf.gz '
             f'(conventional bcbio), nor in the project folder as '
             f'project/<tumor>-{caller}(-prioritized).vcf.gz (CWL bcbio).')
def load_bcbio_project(bcbio_dir, name=None, use_callable=False):
    log.info('-' * 70)
    log.info('Loading project into the fingerprints database from ' + bcbio_dir)
    log.info('-' * 70)
    log.info()

    bcbio_proj = BcbioProject()
    bcbio_proj.load_from_bcbio_dir(bcbio_dir, project_name=name, proc_name='clearup')
    _add_project(
        bam_by_sample={s.name: s.bam for s in bcbio_proj.samples},
        project_name=name or bcbio_proj.project_name,
        bed_file=bcbio_proj.coverage_bed,
        use_callable=use_callable,
        data_dir=bcbio_proj.final_dir,
        genome=bcbio_proj.genome_build,
        min_depth=DEPTH_CUTOFF,
        depth_by_sample={s.name: s.get_avg_depth() for s in bcbio_proj.samples},
    )
def annotate_target(work_dir, target_bed, genome_build):
    output_fpath = intermediate_fname(work_dir, target_bed, 'ann')
    if can_reuse(output_fpath, target_bed):
        info(output_fpath + ' exists, reusing')
        return output_fpath

    bed_annotation = which('annotate_bed.py')
    if not bed_annotation:
        bed_annotation = which('bed_annotation')
        if not bed_annotation:
            critical('Error: bed_annotation not found in PATH, please install it with '
                     '`conda install -c vladsaveliev bed_annotation`.')

    cmdline = '{bed_annotation} {target_bed} -g {genome_build} -o {output_fpath}'.format(**locals())
    run(cmdline, output_fpath, stdout_to_outputfile=False)
    output_fpath = clean_bed(output_fpath, work_dir)
    return output_fpath
def find_fastq_pairs(fpaths):
    info('Finding FastQ pairs...')
    fastqs_by_sample_name = dict()
    for fpath in fpaths:
        fn, ext = splitext_plus(basename(fpath))
        if ext in ['.fq', '.fq.gz', '.fastq', '.fastq.gz']:
            sname, l_fpath, r_fpath = None, None, None
            if fn.endswith('_R1'):
                sname = fn[:-3]
                l_fpath = fpath
            elif fn.endswith('_1'):
                sname = fn[:-2]
                l_fpath = fpath
            elif fn.endswith('_R2'):
                sname = fn[:-3]
                r_fpath = fpath
            elif fn.endswith('_2'):
                sname = fn[:-2]
                r_fpath = fpath
            if sname:
                m = re.match(r'(.*)_S\d+', sname)
                if m:
                    sname = m.group(1)
                sname = sname.replace('-', '_')
            else:
                sname = fn
                info('Cannot detect the read side (R1/R2) in the file name for ' + sname)

            l, r = fastqs_by_sample_name.get(sname, (None, None))
            if l and l_fpath:
                critical('Duplicated left FastQ files for ' + sname + ': ' +
                         l + ' and ' + l_fpath)
            if r and r_fpath:
                critical('Duplicated right FastQ files for ' + sname + ': ' +
                         r + ' and ' + r_fpath)
            fastqs_by_sample_name[sname] = l or l_fpath, r or r_fpath

    fixed_fastqs_by_sample_name = dict()
    for sname, (l, r) in fastqs_by_sample_name.items():
        if not l:
            err('ERROR: for sample ' + sname + ', left reads not found')
        if not r:
            err('ERROR: for sample ' + sname + ', right reads not found')
        if l and r:
            fixed_fastqs_by_sample_name[sname] = l, r
    return fixed_fastqs_by_sample_name
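# Pairing example (hypothetical file names): 'MySample-1_S3_R1.fastq.gz' and
# 'MySample-1_S3_R2.fastq.gz' both resolve to sample 'MySample_1' (the '_S3'
# bcl2fastq suffix is stripped by the regex and '-' is replaced with '_'),
# giving one (left, right) pair; a lone 'Other_R1.fq.gz' is reported as
# missing its right reads and dropped from the result.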
def parse_hgnc_chrom(chrom):
    if chrom in ['reserved', 'c10_B']:
        return None

    CHROMS = ['Y', 'X', 'mitochondria']
    for i in range(22, 0, -1):
        CHROMS.append(str(i))

    for c in CHROMS:
        if chrom.startswith(c):
            if c == 'mitochondria':
                return 'chrM'
            return 'chr' + c

    info('  Notice: cannot parse chromosome ' + chrom)
    return None
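# Examples of the mapping (hypothetical HGNC location strings):
# '10q24.31' -> 'chr10' (matched before '1' because CHROMS runs from 22 down
# to 1), 'Xp22' -> 'chrX', 'mitochondria' -> 'chrM', and 'reserved' -> None.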
def main(input_bed, output_file, output_features=False, genome=None, only_canonical=False,
         short=False, extended=False, high_confidence=False, ambiguities_method=False,
         coding_only=False, collapse_exons=False, work_dir=False, is_debug=False):
    """ Annotates a BED file based on reference features annotations.
    """
    logger.init(is_debug_=is_debug)

    if not genome:
        raise click.BadParameter('Error: please, specify genome build name with -g '
                                 '(e.g. `-g hg19`)', param='genome')

    if short:
        if extended:
            raise click.BadParameter('--short and --extended can\'t be set both',
                                     param='extended')
        if output_features:
            raise click.BadParameter('--short and --output-features can\'t be set both',
                                     param='output_features')
    elif output_features or extended:
        extended = True
        short = False

    if not verify_file(input_bed):
        raise click.BadParameter(f'Usage: {__file__} Input_BED_file -g hg19 '
                                 f'-o Annotated_BED_file [options]', param='input_bed')
    input_bed = verify_file(input_bed, is_critical=True,
                            description=f'Input BED file for {__file__}')

    if work_dir:
        work_dir = join(adjust_path(work_dir), os.path.splitext(basename(input_bed))[0])
        safe_mkdir(work_dir)
        info(f'Created work directory {work_dir}')
        keep_work_dir = True
    else:
        work_dir = mkdtemp('bed_annotate')
        debug(f'Created temporary work directory {work_dir}')
        keep_work_dir = False

    input_bed = clean_bed(input_bed, work_dir)
    input_bed = verify_bed(input_bed, is_critical=True,
                           description=f'Input BED file for {__file__} after cleaning')

    output_file = adjust_path(output_file)
    output_file = annotate(
        input_bed, output_file, work_dir, genome=genome, only_canonical=only_canonical,
        short=short, extended=extended, high_confidence=high_confidence,
        collapse_exons=collapse_exons, output_features=output_features,
        ambiguities_method=ambiguities_method, coding_only=coding_only, is_debug=is_debug)

    if not keep_work_dir:
        debug(f'Removing temporary work directory {work_dir}')
        shutil.rmtree(work_dir)

    info(f'Done, saved to {output_file}')
def run(cmd, output_fpath=None, input_fpath=None, checks=None, stdout_to_outputfile=True,
        stdout_tx=True, reuse=False, env_vars=None):
    """Run the provided command, logging details and checking for errors.
    """
    if output_fpath and reuse:
        if verify_file(output_fpath, silent=True):
            info(output_fpath + ' exists, reusing')
            return output_fpath
        if not output_fpath.endswith('.gz') and verify_file(output_fpath + '.gz', silent=True):
            info(output_fpath + '.gz exists, reusing')
            return output_fpath

    env = _get_env(env_vars)

    if checks is None:
        checks = [file_nonempty_check]

    def _try_run(_cmd, _output_fpath, _input_fpath):
        try:
            info(' '.join(str(x) for x in _cmd) if not isinstance(_cmd, str) else _cmd)
            _do_run(_cmd, checks, env, _output_fpath, _input_fpath)
        except:
            # remove an incomplete output file before re-raising
            if output_fpath and isfile(output_fpath):
                os.remove(output_fpath)
            raise

    if output_fpath:
        if stdout_tx:
            with file_transaction(None, output_fpath) as tx_out_file:
                if stdout_to_outputfile:
                    cmd += ' > ' + tx_out_file
                else:
                    cmd += '\n'
                    cmd = cmd.replace(' ' + output_fpath + ' ', ' ' + tx_out_file + ' ') \
                             .replace(' "' + output_fpath + '" ', ' "' + tx_out_file + '" ') \
                             .replace(' \'' + output_fpath + '\' ', ' \'' + tx_out_file + '\' ') \
                             .replace(' ' + output_fpath + '\n', ' ' + tx_out_file) \
                             .replace(' "' + output_fpath + '"\n', ' "' + tx_out_file + '"') \
                             .replace(' \'' + output_fpath + '\'\n', ' \'' + tx_out_file + '\'') \
                             .replace('\n', '')
                _try_run(cmd, tx_out_file, input_fpath)
        else:
            _try_run(cmd, output_fpath, input_fpath)
    else:
        _try_run(cmd, None, input_fpath)
def _load_datasets(subdirs):
    datasets = []
    for subdir in subdirs:
        dataset = Dataset()
        if ':' in subdir:
            subdir, dataset.genome = subdir.split(':')
        else:
            dataset.genome = 'hg19'
        dir_path = subdir
        if glob(join(dir_path, '*.vcf.gz')):
            # Simple directory with VCF files and an optional BED file?
            log.info(f'Found .vcf.gz files in directory {dir_path}')
            dataset.name = subdir.replace('/', '__')
            if glob(join(dir_path, '*.bed')):
                dataset.bed_file = glob(join(dir_path, '*.bed'))[0]
            for vcf_fpath in glob(join(dir_path, '*.vcf.gz')):
                label = join(subdir, basename(splitext_plus(vcf_fpath)[0])).replace('/', '__')
                dataset.vcf_by_label[label] = vcf_fpath
        else:
            # Bcbio directory?
            log.info(f'Not found any .vcf.gz files in directory {dir_path}. '
                     f'Checking if that\'s a bcbio folder.')
            bcbio_proj = BcbioProject()
            bcbio_proj.load_from_bcbio_dir(subdir, proc_name='clearup')
            dataset.name = bcbio_proj.project_name
            dataset.genome = bcbio_proj.genome_build
            for s in bcbio_proj.samples:
                vcf_file = s.find_raw_vcf()
                if vcf_file:
                    dataset.vcf_by_label[bcbio_proj.project_name + '__' + s.name] = vcf_file
            if bcbio_proj.coverage_bed:
                dataset.bed_file = bcbio_proj.coverage_bed
        datasets.append(dataset)
    return datasets
def detect_run_info_in_config_dir(config_dir):
    run_info_fpaths_in_config = [
        abspath(join(config_dir, fname))
        for fname in os.listdir(config_dir)
        if fname.startswith('run_info') and fname.endswith('.yaml')]

    if len(run_info_fpaths_in_config) > 1:
        critical('More than one YAML file containing run_info in name found in the config '
                 'directory ' + config_dir + ': ' + ' '.join(run_info_fpaths_in_config))
    if len(run_info_fpaths_in_config) == 0:
        return None

    run_cnf = verify_file(run_info_fpaths_in_config[0], is_critical=True)
    info('Using run configuration from the config directory ' + run_cnf)
    return run_cnf
def add_user_call(project_names_line, sample_id):
    log.info('Adding user call for ' + str(sample_id))
    edit_sample_id = request.form['editSampleId']
    sample = Sample.query.get(edit_sample_id)
    if not sample:
        log.error('Sample with ID=' + str(edit_sample_id) + ' not found')
        return redirect(url_for('closest_comparison_page',
                                project_names_line=project_names_line,
                                sample_id=sample_id))

    # snp = sample.snps.join(Location).filter(Location.rsid == request.form['rsid']).first()
    snp = sample.snps.filter(SNP.rsid == request.form['rsid']).first()
    usercall = request.form['usercall']

    msg = 'ClearUp: usercall for sample ' + sample.name + ' of run ' + \
          project_names_line + ' added:\n'
    msg += 'SNP {}:{} {} {}|{}'.format(str(snp.location.chrom), str(snp.location.pos),
                                       snp.location.rsid, snp.allele1, snp.allele2)
    if snp.usercall:
        msg += ', previous usercall ' + snp.usercall
    msg += ', setting usercall ' + usercall

    snp.usercall = usercall
    db.session.commit()

    # Forcing rebuilding the trees of affected runs
    for run in Run.query.all():
        if sample.project in run.projects:
            if any(l for l in run.locations if l.rsid == snp.rsid):
                # log.debug('Removing tree file ' + run.tree_file_path())
                # os.rename(run.fasta_file_path(), run.fasta_file_path() + '.bak')
                # os.rename(run.tree_file_path(), run.tree_file_path() + '.bak')
                run.rerun_on_usercall = True
                db.session.commit()

    log.send_email(msg, subj='ClearUp usercall', only_me=True)
    return render_closest_comparison_page(project_names_line, sample_id,
                                          selected_idx=request.form['snpIndex'],
                                          rerun_if_usercall=False)
def write_all_features(genes, output_fpath, canon_only, cds_only=False, seq2c_cds=False):
    regions = []
    already_added_gene_features = set()
    transcripts = []
    for g in genes:
        _canon_tx = []
        for t in g.transcripts:
            if not canon_only or t.is_canonical:
                _canon_tx.append(t)
        if seq2c_cds and len(_canon_tx) > 1:
            # Need to select a single one for the Seq2C CDS file
            transcripts.append(max(_canon_tx, key=Transcript.get_length_key))
        else:
            transcripts.extend(_canon_tx)

    for t in sorted(transcripts, key=lambda _tr: _tr.get_key()):
        to_add_gene = (
            not cds_only
            # all other transcripts for this gene are coding - we don't report
            # Gene features for ncRNA
            and all(t2.coding for t2 in t.gene.transcripts
                    if (t2.is_canonical or not canon_only))
            # and gene is not already added
            and t.gene not in already_added_gene_features
            # and has one canonical or non-canonical transcript to report
            and (len(t.gene.canonical_transcripts) == 1 or len(t.gene.transcripts) == 1))
        if to_add_gene:
            # skip gene feature for all ncRNA, because there can be multi-domain ncRNA
            # located in different places with the same gene name
            regions.append(t.gene)
            already_added_gene_features.add(t.gene)
        if t.exons:
            if not cds_only:
                regions.append(t)
            for e in t.exons:
                if not cds_only or t.coding:
                    regions.append(e)

    regions = sorted(regions, key=lambda r: r.get_key())
    info('Writing ' + str(len(regions)) + ' regions')
    with open(adjust_path(output_fpath), 'w') as all_out:
        for r in regions:
            if cds_only:
                all_out.write('\t'.join([
                    r.transcript.gene.chrom,
                    '{}'.format(r.start) if r.start is not None else '.',
                    '{}'.format(r.end) if r.end is not None else '.',
                    r.transcript.gene.name or '.']) + '\n')
            else:
                all_out.write(str(r))
def read_approved_genes(synonyms_fpath):
    approved_gene_by_name = dict()
    approved_gnames_by_prev_gname = defaultdict(list)
    approved_gnames_by_synonym = defaultdict(list)

    info('Parsing HGNC database ' + synonyms_fpath + '...')
    with open(synonyms_fpath) as f:
        i = 0
        for l in f:
            if l and not l.startswith('#'):
                approved_gn, prev_names, synonyms, hgnc_chrom, ensembl_id, ucsc_id = \
                    l.replace('\n', '').split('\t')
                if hgnc_chrom:
                    hgnc_chrom = parse_hgnc_chrom(hgnc_chrom)

                approved_gene = ApprovedGene(approved_gn, prev_names, synonyms, hgnc_chrom,
                                             ucsc_id, ensembl_id)
                approved_gene_by_name[approved_gn] = approved_gene

                for gn in prev_names.split(', '):
                    if gn:
                        approved_gnames_by_prev_gname[gn].append(approved_gene)

                for gn in synonyms.split(', '):
                    if gn:
                        approved_gnames_by_synonym[gn].append(approved_gene)
            i += 1
        info('  Processed ' + str(i) + ' lines from ' + synonyms_fpath)
        info()

    return approved_gene_by_name, approved_gnames_by_prev_gname, approved_gnames_by_synonym
def main():
    options = [
        (['-g', '--genome'], dict(
            dest='genome',
            help='Genome build. Accepted values: ' + ', '.join(ba.SUPPORTED_GENOMES),
        )),
    ]
    parser = OptionParser()
    for args, kwargs in options:
        parser.add_option(*args, **kwargs)
    opts, args = parser.parse_args()

    if not opts.genome:
        critical('Error: please, specify genome build name with -g (e.g. `-g hg19`)')
    genome = opts.genome

    debug('Getting features from storage')
    features_bed = ba.get_all_features(genome)
    if features_bed is None:
        critical('Genome ' + genome + ' is not supported. Supported: ' +
                 ', '.join(ba.SUPPORTED_GENOMES))

    info('Extracting features from Ensembl GTF')
    features_bed = features_bed.filter(lambda x: x[ba.BedCols.FEATURE] == 'CDS')
    features_bed = features_bed.filter(ba.get_only_canonical_filter(genome))

    info('Saving CDS regions...')
    output_fpath = adjust_path(join(dirname(__file__), pardir, genome, 'bed',
                                    'CDS-canonical.bed'))
    with file_transaction(None, output_fpath) as tx:
        features_bed.cut(range(6)).saveas(tx)
    info('Done, saved to ' + output_fpath)
def load_bcbio_cnf(config_dir, silent=False):
    all_yamls = [
        abspath(join(config_dir, fname))
        for fname in listdir(config_dir)
        if fname.endswith('.yaml')]
    if len(all_yamls) == 0:
        critical('No YAML file in the config directory.')

    bcbio_yamls = []
    for fpath in all_yamls:
        if not fpath.endswith('-template.yaml'):
            if 'details' in load_yaml_config(fpath):
                bcbio_yamls.append(fpath)
    if len(bcbio_yamls) == 0:
        critical('No bcbio YAMLs found in the config directory: ' + config_dir +
                 ' (only ' + ', '.join(map(basename, all_yamls)) +
                 ' which do not have the "details" section)')
    if len(bcbio_yamls) > 1:
        critical('More than one bcbio YAML file found in the config directory ' +
                 config_dir + ': ' + ' '.join(bcbio_yamls))

    yaml_fpath = bcbio_yamls[0]
    if not silent:
        info('Using bcbio YAML config: ' + yaml_fpath)
    return load_yaml_config(yaml_fpath), yaml_fpath
def detect_bcbio_dir(input_dir, silent=False):
    """
    :param input_dir: `config` dir, or `final` dir, or datestamp dir, or the directory root to `final`
    :return: (config_dir, final_dir, date_dir)
    """
    config_dir, final_dir, date_dir = None, None, None
    input_dir = abspath(input_dir)

    # We are inside `*final*`
    if 'final' in basename(input_dir):  # allow prefixes and postfixes
        final_dir = input_dir
        root_dir = dirname(final_dir)
        config_dir = join(root_dir, 'config')
        if not isdir(config_dir):
            err(f'Are you running on a bcbio output?\n'
                f'The input folder appears to be `final` ({input_dir}), '
                f'however can\'t find the `config` directory at the same level ({config_dir})')
            raise NoConfigDirException('No config dir')

    # We are inside `config`
    elif basename(input_dir) == 'config':
        config_dir = input_dir

    # We are in a parent dir to `config` (and possibly `final`, called otherwise)
    elif isdir(join(input_dir, 'config')):
        config_dir = join(input_dir, 'config')

    # We are inside a date dir
    elif isdir(abspath(join(input_dir, pardir, pardir, 'config'))):
        final_dir = abspath(join(input_dir, pardir))
        root_dir = abspath(join(input_dir, pardir, pardir))
        config_dir = abspath(join(root_dir, 'config'))
        # if 'final' not in basename(final_dir):
        #     err(f'Are you running on a bcbio output?\n'
        #         f'Found config directory 2 levels up at {config_dir}, assuming your input '
        #         f'{input_dir} is a datestamp directory. However, the parent directory is '
        #         f'not called `*final*`')
        #     raise NoConfigDirException('No final dir')

    else:
        if not silent:
            err(f'Are you running on a bcbio output?\n'
                f'{input_dir} is not `config` or `*final*`, and can\'t find a `config` '
                f'directory at {join(input_dir, "config")}, or '
                f'{abspath(join(input_dir, pardir, "config"))}. '
                f'Make sure that you changed to a bcbio root or final directory, '
                f'or provided it as a first argument.')
        raise NoConfigDirException('No config dir')

    if not silent:
        info(f'Bcbio config directory: ' + config_dir)
        if final_dir:
            info('"final" directory: ' + final_dir)
        if date_dir:
            info('"datestamp" directory: ' + date_dir)

    return config_dir, final_dir, date_dir
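# Expected bcbio layout this function navigates (hypothetical project root):
#   my_project/
#     config/                      # bcbio YAML configs
#     final/
#       2020-01-01_my_project/     # the "datestamp" dir
#       sample1/
# Passing my_project/, my_project/config/, my_project/final/, or
# my_project/final/2020-01-01_my_project/ should each resolve config_dir to
# my_project/config/.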
def bam_to_bed_nocnf(bam_fpath, bedtools='bedtools', gzip='gzip'):
    info('Converting the BAM to BED to save some memory.')
    # from here: http://davetang.org/muse/2015/08/05/creating-a-coverage-plot-using-bedtools-and-r/
    bam_bed_fpath = splitext_plus(bam_fpath)[0] + '.bed.gz'
    cmdline = '{bedtools} bamtobed -i {bam_fpath} | {gzip} > {bam_bed_fpath}'.format(**locals())
    info(cmdline)
    os.system(cmdline)
    bam_bed_fpath = verify_file(bam_bed_fpath)
    if bam_bed_fpath:
        info('Done, saved to ' + bam_bed_fpath)
    else:
        err('Error, result is non-existent or empty')
    return bam_bed_fpath
def read_samples(args):
    bam_by_sample = find_bams(args)
    if bam_by_sample:
        info('Found ' + str(len(bam_by_sample)) + ' BAM file' +
             ('s' if len(bam_by_sample) > 1 else ''))

    input_not_bam = [verify_file(fpath) for fpath in args
                     if adjust_path(fpath) not in bam_by_sample]
    input_not_bam = [fpath for fpath in input_not_bam if fpath]
    fastqs_by_sample = dict()
    if not input_not_bam and not bam_by_sample:
        critical('No correct input files')
    if input_not_bam:
        info('Found ' + str(len(input_not_bam)) + ' correct non-BAM input files')
        fastqs_by_sample = find_fastq_pairs(input_not_bam)
        if fastqs_by_sample:
            info('Found ' + str(len(fastqs_by_sample)) + ' FastQ pairs')
        intersection = set(fastqs_by_sample.keys()) & set(bam_by_sample.keys())
        if intersection:
            critical('The following samples both had input BAMs and FastQ: ' +
                     ', '.join(list(intersection)))
    return fastqs_by_sample, bam_by_sample
def check_md5(work_dir, fpath, file_type, silent=False):
    md5_fpath = join(work_dir, file_type + '_md5.txt')
    new_md5 = md5(fpath)
    info('md5 of ' + fpath + ' is ' + str(new_md5))

    prev_md5 = None
    if isfile(md5_fpath):
        with open(md5_fpath) as f:
            prev_md5 = f.read()
    else:
        info('Previous md5 file ' + md5_fpath + ' does not exist')
    info('Checking previous md5 from ' + md5_fpath + ': ' + str(prev_md5))

    if prev_md5 == new_md5:
        if not silent:
            debug('Reusing previous ' + file_type.upper() + ' files.')
        return True
    else:
        if not silent:
            info('Pre-processing input ' + file_type.upper() + ' file')
        if prev_md5:
            if not silent:
                info('Prev ' + file_type.upper() + ' md5: ' + str(prev_md5))
                info('New ' + file_type.upper() + ' md5: ' + str(new_md5))
        with open(md5_fpath, 'w') as f:
            f.write(str(new_md5))
        return False
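# Usage sketch (hypothetical paths): on the first call the md5 stamp is
# written and False is returned (inputs must be processed); a second call
# with an unchanged file returns True, so cached results can be reused.
# >>> check_md5('/work', '/data/targets.bed', 'bed')  # writes /work/bed_md5.txt
# False
# >>> check_md5('/work', '/data/targets.bed', 'bed')
# True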
def _approve(gene_by_name, synonyms_fpath):
    approved_gene_by_name, approved_gnames_by_prev_gname, approved_gnames_by_synonym = \
        read_approved_genes(synonyms_fpath)

    not_approved_gene_names = list()
    gene_after_approving_by_name = OrderedDict()
    total_approved = 0
    total_not_approved = 0
    j = 0
    for g in gene_by_name.values():
        if len(g.exons) == 0:
            continue
        gene_after_approving_by_name[g.name] = g
        if is_approved_symbol(g.name, approved_gene_by_name):
            total_approved += 1
        else:
            not_approved_gene_names.append(g.name)
            total_not_approved += 1
        j += 1
        if j % 1000 == 0:
            info('processed ' + str(j // 1000) + 'k genes...')

    info('-----')
    info('Total: ' + str(j))
    if approved_gene_by_name:
        info('Total approved: ' + str(total_approved))
        info('Total not approved: ' + str(total_not_approved))
    info()

    info('Saving genes...')
    gene_features = 0
    features_counter = defaultdict(int)
    biotypes_counter = defaultdict(int)
    no_exon_gene_num = 0
    filtered_gene_after_approving_by_name = OrderedDict()
    for g in gene_after_approving_by_name.values():
        if len(g.exons) == 0:
            no_exon_gene_num += 1
        else:
            filtered_gene_after_approving_by_name[g.name] = g
            gene_features += 1
            features_counter[g.feature] += 1
            biotypes_counter[g.biotype] += 1
            for e in g.exons:
                features_counter[e.feature] += 1
                if e.feature == 'exon':
                    e.feature = 'Exon'
                elif e.feature == 'stop_codon':
                    e.feature = 'CDS'
                else:
                    e.feature = e.feature[0].upper() + e.feature[1:]

    info('Skipped {} genes with no sub-features.'.format(no_exon_gene_num))
    info('Approved {} genes, including:'.format(gene_features))
    info('    Gene: {}'.format(features_counter['Gene']))
    info('    Multi_Gene: {}'.format(features_counter['Multi_Gene']))
    info('')
    info('Out of total: {} protein coding genes, {} ncRNA genes, including:'.format(
        biotypes_counter['protein_coding'],
        sum(biotypes_counter.values()) - biotypes_counter['protein_coding']))
    for bt, cnt in biotypes_counter.items():
        if bt != 'protein_coding':
            err('    ' + bt + ': ' + str(cnt))
    info()

    if ALL_EXONS:
        info('Found {} exons.'.format(features_counter['exon']))
    else:
        info('Also found {} CDS, {} stop codons, and {} ncRNA exons.'.format(
            features_counter['CDS'], features_counter['stop_codon'], features_counter['exon']))

    return filtered_gene_after_approving_by_name, not_approved_gene_names
def get_approved_gene_symbol(approved_gene_by_name, approved_gnames_by_prev_gname,
                             approved_gnames_by_synonym, gene_symbol,
                             db_id='', db_chrom='', indent=''):
    if gene_symbol in approved_gene_by_name:
        if _check_gene_symbol(approved_gene_by_name[gene_symbol], gene_symbol, db_id, db_chrom):
            return approved_gene_by_name[gene_symbol].name, None

    info(indent + 'Gene name ' + gene_symbol + ' is not approved, searching for an '
         'approved version... ', ending='', print_date=False)

    def _get_approved_genes_by_kind(approved_genes, kind):
        if not approved_genes:
            return 'NOT FOUND'

        if len(approved_genes) > 1:
            approved_genes_same_ucsc = [g for g in approved_genes if g.db_id == db_id]

            if len(approved_genes_same_ucsc) > 1:
                err('  ERROR: multiple approved gene names for ' + gene_symbol + ' (as ' +
                    kind + ') with ucsc_id ' + db_id + ': ' +
                    ', '.join(g.name for g in approved_genes_same_ucsc), print_date=False)
                return 'AMBIGUOUS'

            if len(approved_genes_same_ucsc) == 1:
                if _check_gene_symbol(approved_genes_same_ucsc[0], gene_symbol, db_id, db_chrom):
                    err('  found approved gene for ' + gene_symbol + ' (as ' + kind +
                        ') with ucsc_id ' + db_id, print_date=False)
                    return approved_genes_same_ucsc[0].name

            # Ok, no genes with the same ucsc id, or they are on a different chromosome.
            approved_genes_same_chrom = [g for g in approved_genes if g.chrom == db_chrom]

            if len(approved_genes_same_chrom) > 1:
                err('  ERROR: multiple approved gene names for ' + gene_symbol + ' (as ' +
                    kind + ') with chrom ' + db_chrom + ': ' +
                    ', '.join(g.name for g in approved_genes_same_chrom), print_date=False)
                return 'AMBIGUOUS'

            if len(approved_genes_same_chrom) == 1:
                g = approved_genes_same_chrom[0]
                info('  only ' + g.name + ' for ' + gene_symbol + ' (as ' + kind +
                     ') has the same chrom ' + db_chrom + ', picking it', print_date=False)
                if _check_gene_symbol(g, gene_symbol, db_id, db_chrom):
                    return g.name
                else:
                    return 'NOT FOUND'

            if len(approved_genes_same_chrom) == 0:
                err('  ERROR: no approved gene names for ' + gene_symbol + ' (as ' + kind +
                    ') with the same chrom ' + db_chrom, print_date=False)
                return 'NOT FOUND'

        if len(approved_genes) == 1:
            if _check_gene_symbol(approved_genes[0], gene_symbol, db_id, db_chrom):
                info('  found approved gene symbol for ' + gene_symbol + ': ' +
                     approved_genes[0].name + ' (as ' + kind + ')', print_date=False)
                return approved_genes[0].name

        return 'NOT FOUND'

    res = _get_approved_genes_by_kind(approved_gnames_by_prev_gname.get(gene_symbol), 'prev')
    if res == 'AMBIGUOUS':
        return None, 'AMBIGUOUS\tAS PREV'
    elif res == 'NOT FOUND':
        res = _get_approved_genes_by_kind(approved_gnames_by_synonym.get(gene_symbol), 'synonym')
        if res == 'AMBIGUOUS':
            return None, res + '\tAS SYNONYM'
        if res == 'NOT FOUND':
            err(' not found.', print_date=False)
            return None, res
        info(indent + 'Finally found approved gene for ' + gene_symbol + ' (as synonym): ' + res,
             print_date=False)
        return res, None
    else:
        info(indent + 'Finally found approved gene for ' + gene_symbol + ' (as prev): ' + res,
             print_date=False)
        return res, None
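# Resolution order example (hypothetical lookup): for gene_symbol 'MLL', the
# approved dict is consulted first; failing that, 'MLL' is looked up among
# previous HGNC names (where it would resolve to 'KMT2A'), then among
# synonyms. Ambiguity at either step returns (None, 'AMBIGUOUS\tAS PREV') or
# (None, 'AMBIGUOUS\tAS SYNONYM'); a clean hit returns ('KMT2A', None).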
def main():
    description = '''
The script writes all RefSeq features for the requested genome build, and generates 3 files:

    all_features.{genome}.bed:
        Gene        (protein_coding)
        Transcript  (protein_coding and ncRNA)
        Exon        (ncRNA)
        CDS         (protein_coding)

    all_features.{genome}.canon.bed:
        The same, but taking canonical (or longest) transcripts only

    CDS.{genome}.bed:
        CDS, canonical (or longest) transcripts only

Usage: ''' + __file__ + ''' hg19 [db.gtf]

db.gtf is any of the following:

 - Ensembl GTF
   ftp://ftp.ensembl.org/pub/release-75/gtf/homo_sapiens/Homo_sapiens.GRCh37.75.gtf.gz
     1  pseudogene            gene        11869  14412  .  +  .  gene_id "ENSG00000223972"; gene_name "DDX11L1"; gene_source "ensembl_havana"; gene_biotype "pseudogene";
     1  processed_transcript  transcript  11869  14409  .  +  .  gene_id "ENSG00000223972"; transcript_id "ENST00000456328"; gene_name "DDX11L1"; gene_source "ensembl_havana"; gene_biotype "pseudogene"; transcript_name "DDX11L1-002"; transcript_source "havana";
     ...

 - RefSeq GFF3
   ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/H_sapiens/GFF/ref_GRCh38.p2_top_level.gff3.gz
     NC_000001.10  RefSeq      region      1      249250621  .  +  .  ID=id0;Name=1;Dbxref=taxon:9606;chromosome=1;gbkey=Src;genome=chromosome;mol_type=genomic DNA
     NC_000001.10  BestRefSeq  gene        11874  14409      .  +  .  ID=gene0;Name=DDX11L1;Dbxref=GeneID:100287102,HGNC:37102;description=DEAD%2FH %28Asp-Glu-Ala-Asp%2FHis%29 box helicase 11 like 1;gbkey=Gene;gene=DDX11L1;part=1%2F1;pseudo=true
     NC_000001.10  BestRefSeq  transcript  11874  14409      .  +  .  ID=rna0;Name=NR_046018.2;Parent=gene0;Dbxref=GeneID:100287102,Genbank:NR_046018.2,HGNC:37102;gbkey=misc_RNA;gene=DDX11L1;product=DEAD%2FH %28Asp-Glu-Ala-Asp%2FHis%29 box helicase 11 like 1;transcript_id=NR_046018.2
     NC_000001.10  BestRefSeq  exon        11874  12227      .  +  .  ID=id1;Parent=rna0;Dbxref=GeneID:100287102,Genbank:NR_046018.2,HGNC:37102;gbkey=misc_RNA;gene=DDX11L1;product=DEAD%2FH %28Asp-Glu-Ala-Asp%2FHis%29 box helicase 11 like 1;transcript_id=NR_046018.2
     ...

 - RefSeq_knownGene.txt or UCSC_knownGene.txt (from http://genome.ucsc.edu/cgi-bin/hgTables)
     #hg19.knownGene.name  hg19.knownGene.chrom  hg19.knownGene.strand  hg19.knownGene.txStart  hg19.knownGene.txEnd  hg19.knownGene.exonCount  hg19.knownGene.exonStarts  hg19.knownGene.exonEnds  hg19.kgXref.geneSymbol
     uc001aaa.3            chr1                  +                      11873                   14409                 3                         11873,12612,13220,         12227,12721,14409,       DDX11L1
     ...

See more info in http://wiki.rd.astrazeneca.net/display/NG/SOP+-+Making+the+full+list+of+UCSC+exons+with+approved+HUGO+gene+symbols
'''
    options = [
        # (['--bam'], dict(dest='bam', help='path to the BAM file to analyse')),
    ]
    parser = OptionParser(description=description)
    for args, kwargs in options:
        parser.add_option(*args, **kwargs)
    opts, args = parser.parse_args()

    if len(args) == 0:
        parser.exit(1, 'Please provide genome name as the first argument\n')
    genome_name = args[0]
    chrom_order = ref.get_chrom_order(genome_name)
    canonical_transcripts_ids = ref.get_canonical_transcripts_ids(genome_name)

    if len(args) > 1:
        input_fpath = verify_file(args[1])
    else:
        input_fpath = ba.get_refseq_gene(genome_name)
    output_dirpath = ba.get_refseq_dirpath()
    synonyms_fpath = ba.get_hgnc_gene_synonyms()
    not_approved_fpath = join(output_dirpath, 'not_approved.txt')

    info('Reading the features...')
    with open_gzipsafe(input_fpath) as inp:
        if input_fpath.endswith('.gtf') or input_fpath.endswith('.gtf.gz'):
            gene_by_name_and_chrom = _proc_ensembl_gtf(inp, output_dirpath, chrom_order)
        elif input_fpath.endswith('.gff3') or input_fpath.endswith('.gff3.gz'):
            gene_by_name_and_chrom = _proc_refseq_gff3(inp, output_dirpath, chrom_order)
        else:
            gene_by_name_and_chrom = _proc_ucsc(inp, output_dirpath, chrom_order)

    if synonyms_fpath and DO_APPROVE:
        gene_by_name_and_chrom, not_approved_gene_names = _approve(gene_by_name_and_chrom, synonyms_fpath)
        info('')
        info('Not approved by HGNC - ' + str(len(not_approved_gene_names)) + ' genes.')
        if not_approved_fpath:
            with open(not_approved_fpath, 'w') as f:
                f.write('#Searched as\tStatus\n')
                f.writelines(l + '\n' for l in not_approved_gene_names)
            info('Saved not approved to ' + not_approved_fpath)

    info('Found:')
    info('  ' + str(len(gene_by_name_and_chrom)) + ' genes')

    genes = gene_by_name_and_chrom.values()
    coding_genes = [g for g in genes if any(t.coding for t in g.transcripts)]
    coding_transcripts = [t for g in coding_genes for t in g.transcripts if t.coding]
    rna_genes = [g for g in genes if all(not t.coding for t in g.transcripts)]
    rna_transcripts = [t for g in genes for t in g.transcripts if not t.coding]
    mixed_genes = [g for g in genes
                   if any(not t.coding for t in g.transcripts)
                   and any(t.coding for t in g.transcripts)]
    info('  ' + str(len(coding_genes)) + ' coding genes')
    info('  ' + str(len(coding_transcripts)) + ' coding transcripts')
    info('  ' + str(len(rna_genes)) + ' RNA genes')
    info('  ' + str(len(rna_transcripts)) + ' RNA transcripts')
    info('  ' + str(len(mixed_genes)) + ' genes with both coding and RNA transcripts')

    for g in coding_genes:
        g.coding = True
        g.biotype = 'protein_coding'
    for g in rna_genes:
        g.coding = False
        g.biotype = 'RNA'
    info()

    # info('Choosing genes with exons...')
    # genes = [g for g in genes if any(tx.exons for tx in g.transcripts)]

    info('Choosing canonical...')
    canon_genes = choose_canonical(genes, canonical_transcripts_ids)

    info()
    info('Sorting and printing all regions...')
    all_features_fpath = ba.get_all_features(genome_name)
    write_all_features(genes, all_features_fpath, canon_only=False)
    all_features_fpath = bgzip_and_tabix(all_features_fpath, tabix_parameters='-p bed')

    info()
    info('Sorting and printing canonical regions...')
    canon_output_fpath = ba.get_all_features_canonical(genome_name, gzip=False)
    write_all_features(canon_genes, canon_output_fpath, canon_only=True)
    canon_output_fpath = bgzip_and_tabix(canon_output_fpath, tabix_parameters='-p bed')

    info()
    info('Sorting and printing canonical CDS...')
    cds_output_fpath = ba.get_cds(genome_name)
    write_all_features(canon_genes, cds_output_fpath, canon_only=True, cds_only=True)

    # info()
    # info('Sorting and printing CDS for Seq2C (unique transcript per gene)...')
    # seq2c_output_fpath = ga.get_seq2c_cds(genome_name)
    # write_all_features(canon_genes, seq2c_output_fpath, canon_only=True, cds_only=True, seq2c_cds=True)

    info()
    # Note: seq2c_output_fpath is only assigned in the commented-out Seq2C block
    # above, so it must not be referenced here.
    info('Saved all regions to\n   ' + all_features_fpath +
         '\n   ' + canon_output_fpath +
         '\n   ' + cds_output_fpath)
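# A minimal sanity-check sketch for the files written by main(). The file names
# below are hypothetical (the real paths come from ba.get_all_features(),
# ba.get_all_features_canonical() and ba.get_cds()); the only assumption is that
# the outputs are bgzipped BED files, as produced by bgzip_and_tabix above, and
# bgzip output is gzip-compatible, so the standard library can read it.
def count_bed_records(bed_gz_fpath):
    """Count non-comment records in a bgzipped BED file."""
    import gzip  # bgzip is gzip-compatible
    with gzip.open(bed_gz_fpath, 'rt') as f:
        return sum(1 for line in f if line.strip() and not line.startswith('#'))

# Example (hypothetical file names):
# for fpath in ['all_features.hg19.bed.gz', 'all_features.hg19.canon.bed.gz']:
#     print(fpath, count_bed_records(fpath))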
def _proc_ensembl_gtf(inp, out, chr_order, additional_feature_list=None):
    if additional_feature_list is None:
        additional_feature_list = []
    info('additional_feature_list = ' + str(additional_feature_list))

    gene_by_name = OrderedDict()
    gene_by_id = OrderedDict()

    info('Parsing Ensembl input...')
    total_lines = 0
    total_non_coding_genes = 0

    for l in inp:
        if not l or l.startswith('#'):
            continue
        chrom, _, feature, start, end, _, strand, _, props_line = l.replace('\n', '').split('\t')

        # if is_local():
        #     if chrom != '21':
        #         continue

        total_lines += 1
        if total_lines % 1000 == 0:
            info(str(total_lines // 1000) + 'k lines, ' + str(len(gene_by_name)) + ' genes found')
            sys.stdout.flush()

        try:
            _prop_dict = dict((t.strip().split(' ')[0], ' '.join(t.strip().split(' ')[1:]))
                              for t in props_line.split(';') if t.strip())
        except ValueError:
            sys.stderr.write(format_exc())
            sys.stderr.write(l)
            continue  # malformed attributes field: skip the record rather than crash below

        gene_symbol = _rm_quotes(_prop_dict['gene_name'])
        gene_id = _rm_quotes(_prop_dict['gene_id'])
        gene_biotype = _rm_quotes(_prop_dict['gene_biotype'])
        gene_source = _rm_quotes(_prop_dict['gene_source'])

        # if gene_symbol == 'PTENP1':
        #     sys.stderr.write('PTENP1\n')

        if not ALL_EXONS and gene_biotype not in [
            'protein_coding', 'nonsense_mediated_decay', 'non_stop_decay',
            'processed_transcript', 'polymorphic_pseudogene',
            'sense_intronic', 'sense_overlapping', 'antisense',
        ] and not any(b in gene_biotype for b in ['RNA', 'IG_', 'TR_']):
            total_non_coding_genes += 1
            continue

        full_feature_list = ['gene', 'CDS', 'stop_codon', 'exon'] + additional_feature_list
        if ALL_EXONS:
            full_feature_list = ['gene', 'exon']
        # sys.stderr.write('Full feature list: ' + str(full_feature_list) + '\n')
        if feature not in full_feature_list:
            continue

        # GTF is 1-based inclusive; convert to 0-based half-open
        start, end = int(start) - 1, int(end)
        if end <= start:
            info('Error: start >= end: ' + l)
            continue

        chrom = parse_ensembl_chrom(chrom)
        if not chrom:
            continue

        if feature == 'gene':
            # assert gene_biotype == biotype, 'Gene: gene_biotype "' + gene_biotype + \
            #     '" do not match biotype "' + biotype + '" for ' + gene_symbol
            gene = Gene(chrom, chr_order.get(chrom), start, end, gene_symbol, strand,
                        gene_biotype, gene_id, gene_source)
            if gene.name in gene_by_name:
                prev_gene = gene_by_name[gene.name]
                if gene.source != prev_gene.source:
                    err('  Duplicated gene in different databases:')
                    err('    This: ' + gene.__repr__())
                    err('    Prev: ' + prev_gene.__repr__())

                    # answer = raw_input('Which one to pick? This (1), prev (2), longest (Enter): ')
                    # if answer == '1' or answer == '' and gene.end - gene.start > prev_gene.end - prev_gene.start:
                    #     del gene_by_name[prev_gene.name]
                    #     del gene_by_id[prev_gene.db_id]
                    # else:
                    #     continue

                    # Prefer the record coming from the 'ensembl' source over 'havana'
                    if gene.source == 'ensembl' or prev_gene.source == 'havana':
                        del gene_by_name[prev_gene.name]
                        del gene_by_id[prev_gene.db_id]
                        err('    Picking up this one.')
                    if prev_gene.source == 'ensembl' or gene.source == 'havana':
                        err('    Picking up previous one.')
                        continue
                    err('')
                else:
                    err('  Duplicated gene in ' + gene.source + ':')
                    err('    ' + gene.__repr__())
                    prev_gene.start = min(prev_gene.start, gene.start)
                    prev_gene.end = max(prev_gene.end, gene.end)
                    prev_gene.feature = 'Multi_Gene'
                    continue
            gene_by_name[gene_symbol] = gene
            gene_by_id[gene_id] = gene

        elif feature in ['CDS', 'stop_codon'] \
                or feature == 'exon' and ('RNA' in gene_biotype or ALL_EXONS) \
                or feature in additional_feature_list:
            assert gene_symbol in gene_by_name, \
                'Error: ' + feature + ' record before gene record ' + gene_symbol + ', ' + \
                gene_id + '; gene_by_name: ' + str(gene_by_name.keys())
            gene = gene_by_name[gene_symbol]
            if gene.gene_id == gene_id:
                assert gene_biotype == gene.biotype, \
                    feature + ': gene_biotype "' + gene_biotype + '" does not match biotype "' + \
                    gene.biotype + '" for ' + gene_symbol
                exon = Exon(gene, start, end, gene_biotype, feature)
                gene.exons.append(exon)

    info()
    info('Processed ' + str(total_lines) + ' lines, ' + str(total_non_coding_genes) +
         ' non-coding genes skipped, ' + str(len(gene_by_name)) + ' coding genes found')
    info()
    return gene_by_name
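# A standalone sketch of the attribute-field parsing used in _proc_ensembl_gtf,
# run on the sample Ensembl record shown in main()'s usage text. Only the
# standard library is involved; nothing here is wired into the pipeline itself.
def _demo_parse_gtf_props():
    props_line = ('gene_id "ENSG00000223972"; gene_name "DDX11L1"; '
                  'gene_source "ensembl_havana"; gene_biotype "pseudogene";')
    # Same expression as in _proc_ensembl_gtf: split on ';', take the first
    # space-separated token as the key and the rest as the (still quoted) value.
    prop_dict = dict((t.strip().split(' ')[0], ' '.join(t.strip().split(' ')[1:]))
                     for t in props_line.split(';') if t.strip())
    assert prop_dict['gene_name'] == '"DDX11L1"'           # quotes are kept here...
    assert prop_dict['gene_name'].strip('"') == 'DDX11L1'  # ...and removed later by _rm_quotes()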
def choose_canonical(genes, canonical_transcripts_ids):
    not_found_in_canon_coding_num = 0
    not_found_in_canon_coding_num_one_transcript = 0
    not_found_in_canon_rna_num = 0
    not_found_in_canon_other_num = 0
    many_canon_coding_num = 0
    many_canon_rna_num = 0
    many_canon_other_num = 0

    canon_genes = []
    for g in genes:
        _canon_tx = []
        for t in g.transcripts:
            if t.transcript_id in canonical_transcripts_ids:
                t.is_canonical = True
                _canon_tx.append(t)

        if len(_canon_tx) > 1:
            if any(t.coding for t in g.transcripts):
                many_canon_coding_num += 1
                # Warn about overlapping canonical transcripts
                for i, t1 in enumerate(_canon_tx):
                    for j in range(i + 1, len(_canon_tx)):
                        t2 = _canon_tx[j]
                        if t1.start <= t2.start < t1.end or t1.start <= t2.end < t1.end:
                            err('Transcripts ' + t1.transcript_id + ' (' + str(t1.start) + ':' + str(t1.end) + ')' +
                                ' and ' + t2.transcript_id + ' (' + str(t2.start) + ':' + str(t2.end) + ')' +
                                ' in gene ' + g.name + ' ' + g.chrom + ' overlap')
            elif any(not t.coding for t in g.transcripts):
                many_canon_rna_num += 1
            else:
                many_canon_other_num += 1

        if len(_canon_tx) == 0:
            if any(t.coding for t in g.transcripts):
                not_found_in_canon_coding_num += 1
                if len(g.transcripts) == 1:
                    not_found_in_canon_coding_num_one_transcript += 1
                # longest_t = max(g.transcripts, key=Transcript.length)
                # longest_t.is_canonical = True
            elif any(not t.coding for t in g.transcripts):
                not_found_in_canon_rna_num += 1
            else:
                not_found_in_canon_other_num += 1

        g.canonical_transcripts = [t for t in g.transcripts if t.is_canonical]
        if g.canonical_transcripts:
            canon_genes.append(g)

    info('Coding genes with canonical transcripts: ' +
         str(sum(1 for g in canon_genes if any(t.coding for t in g.canonical_transcripts))))
    info('Coding canonical transcripts: ' +
         str(sum(1 for g in canon_genes for t in g.canonical_transcripts if t.coding)))
    info('RNA genes with canonical transcripts: ' +
         str(sum(1 for g in canon_genes if any(not t.coding for t in g.canonical_transcripts))))
    info('RNA canonical transcripts: ' +
         str(sum(1 for g in canon_genes for t in g.canonical_transcripts if not t.coding)))
    info()
    info('Coding genes with no canonical transcripts (picking longest out of the rest): ' +
         str(not_found_in_canon_coding_num))
    info('RNA genes with no canonical transcripts (skipping all): ' + str(not_found_in_canon_rna_num))
    info('Other genes with no canonical transcripts (skipping all): ' + str(not_found_in_canon_other_num))
    info('Coding genes with many canonical transcripts (picking longest): ' + str(many_canon_coding_num))
    info('RNA genes with many canonical transcripts (keeping all): ' + str(many_canon_rna_num))
    info('Other genes with many canonical transcripts (keeping all): ' + str(many_canon_other_num))
    return canon_genes
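# The warning above flags overlapping canonical transcripts with the asymmetric
# test `t1.start <= t2.start < t1.end or t1.start <= t2.end < t1.end`, which
# misses the case where t2 fully contains t1. For reference, the symmetric
# half-open interval test covers all cases; a minimal sketch (a hypothetical
# helper, not wired into choose_canonical):
def _intervals_overlap(start1, end1, start2, end2):
    """True iff half-open intervals [start1, end1) and [start2, end2) share a base.

    >>> _intervals_overlap(100, 200, 150, 250)   # partial overlap
    True
    >>> _intervals_overlap(150, 160, 100, 200)   # full containment is caught too
    True
    >>> _intervals_overlap(100, 200, 200, 300)   # adjacent intervals do not overlap
    False
    """
    return start1 < end2 and start2 < end1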
def run_snakemake(snakefile, conf, jobs=None, output_dir=None, forcerun=None,
                  unlock=False, dryrun=False, target_rules=None, cluster=None,
                  cluster_cmd=None, log_dir=None, dag=None, report=None,
                  restart_times=None):
    conf['total_cores'] = jobs

    #########################
    #### Setting cluster ####
    #########################

    cluster_param = ''
    cluster_log_dir = ''
    if cluster or cluster_cmd:
        assert log_dir, 'For cluster run, must also specify log_dir'
        if cluster_cmd:
            cluster_param = f' --cluster "{cluster_cmd}"'
        else:
            cluster_log_dir = safe_mkdir(join(log_dir, 'cluster'))
            cluster_param = make_cluster_cmdl(cluster_log_dir, 'umccrise')

    ##########################
    #### Preparing config ####
    ##########################

    if log_dir:
        safe_mkdir(log_dir)
        conf_f = open(join(log_dir, '.conf.yaml'), 'w')
    else:
        conf_f = tempfile.NamedTemporaryFile(mode='wt', delete=False)
    yaml.dump(conf, conf_f)
    conf_f.close()

    ###############################
    #### Building command line ####
    ###############################

    if forcerun:
        forcerun = ' '.join(forcerun.split(','))

    cmd = (
        f'snakemake '
        f'{" ".join(flatten([target_rules])) if target_rules else ""} '
        f'--snakefile {snakefile} '
        f'--printshellcmds '
        f'{"--dryrun " if dryrun else ""}'
        f'{"--dag " if dag else ""}'
        f'{f"--report {report} " if report else ""}'
        f'{f"--directory {output_dir} " if output_dir else ""}'
        f'{f"-j {jobs} " if jobs else ""}'
        f'--rerun-incomplete '
        f'{f"--restart-times {restart_times} " if restart_times else ""}'
        f'{cluster_param} '
        f'--configfile {conf_f.name} '
        f'{f"--forcerun {forcerun}" if forcerun else ""}'
    )

    #################
    #### Running ####
    #################

    if unlock:
        print('* Unlocking previous run... *')
        run_simple(cmd + ' --unlock')
        print('* Now rerunning *')

    try:
        run_simple(cmd)
    except subprocess.CalledProcessError:
        logger.error('--------')
        logger.error(f'Error: snakemake returned a non-zero status. Working directory: {output_dir}')
        if cluster_log_dir:
            run_simple(f'chmod -R a+r {cluster_log_dir}', silent=True)
            logger.error(f'Review cluster job logs in {cluster_log_dir}')
        sys.exit(1)
    except KeyboardInterrupt:
        logger.error('--------')
        logger.error(f'Interrupted. Fixing logs permissions. Working directory: {output_dir}')
        if cluster_log_dir:
            run_simple(f'chmod -R a+r {cluster_log_dir}', silent=True)
            logger.error(f'Review cluster job logs in {cluster_log_dir}')
        sys.exit(1)
    else:
        logger.info('--------')
        if cluster_log_dir:
            run_simple(f'chmod -R a+r {cluster_log_dir}', silent=True)
        logger.info(f'Finished. Output directory: {output_dir}')
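# A minimal usage sketch for run_snakemake. Everything below is hypothetical
# (Snakefile path, config keys, directories come from the calling pipeline);
# it is kept commented out so importing this module stays side-effect-free.
#
# run_snakemake(
#     snakefile='Snakefile',
#     conf={'genome': 'hg19'},
#     jobs=8,
#     output_dir='results',
#     log_dir='results/log',
#     dryrun=True,   # print the shell commands snakemake would run, without executing
# )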