def run_model(self, raw_sig, n_can_state=None):
    if self.model_type == TAI_NAME:
        if any(arg is None for arg in (
                self.chunk_size, self.chunk_overlap,
                self.max_concur_chunks)):
            logger = logging.get_logger()
            logger.error('Must provide chunk_size, chunk_overlap, ' +
                         'max_concur_chunks in order to run the taiyaki ' +
                         'base calling backend.')
        try:
            trans_weights = self.tai_run_model(
                raw_sig, self.model, self.chunk_size, self.chunk_overlap,
                self.max_concur_chunks)
        except AttributeError:
            raise mh.MegaError('Out of date or incompatible model')
        except RuntimeError:
            raise mh.MegaError('Likely out of memory error.')
        if self.device != self.torch.device('cpu'):
            self.torch.cuda.empty_cache()
        if n_can_state is not None:
            trans_weights = (
                np.ascontiguousarray(trans_weights[:, :n_can_state]),
                np.ascontiguousarray(trans_weights[:, n_can_state:]))
    else:
        raise mh.MegaError('Invalid model type.')

    return trans_weights
def snps_validation(args, is_cat_mod, output_size, aligner):
    logger = logging.get_logger()
    if mh.WHATSHAP_MAP_NAME in args.outputs and \
       mh.SNP_NAME not in args.outputs:
        args.outputs.append(mh.SNP_NAME)
    if mh.SNP_NAME in args.outputs and mh.PR_SNP_NAME not in args.outputs:
        args.outputs.append(mh.PR_SNP_NAME)
    if mh.PR_SNP_NAME in args.outputs and args.variant_filename is None:
        logger.error(
            '{} output requested, '.format(mh.PR_SNP_NAME) +
            'but --variant-filename not provided.')
        sys.exit(1)
    if mh.PR_SNP_NAME in args.outputs and not (
            is_cat_mod or mh.nstate_to_nbase(output_size) == 4):
        logger.error(
            'SNP calling from naive modified base flip-flop model is ' +
            'not supported.')
        sys.exit(1)
    snp_calib_fn = mh.get_snp_calibration_fn(
        args.snp_calibration_filename, args.disable_snp_calibration)
    try:
        snps_data = snps.SnpData(
            args.variant_filename, args.max_indel_size, args.snp_all_paths,
            args.write_snps_text, args.variant_context_bases, snp_calib_fn,
            snps.HAPLIOD_MODE if args.haploid else snps.DIPLOID_MODE,
            args.refs_include_snps, aligner)
    except mh.MegaError as e:
        logger.error(str(e))
        sys.exit(1)
    if args.variant_filename is not None and \
       mh.PR_SNP_NAME not in args.outputs:
        logger.warning(
            '--variant-filename provided, but SNP output not requested ' +
            '(via --outputs). Argument will be ignored.')

    return args, snps_data
def parse_pr_ref_output(args):
    logger = logging.get_logger()
    if args.output_per_read_references:
        args.outputs.append(mh.PR_REF_NAME)
        if args.refs_include_snps and args.refs_include_mods:
            logger.error('Cannot output both modified base and SNPs in ' +
                         'per-read references (remove one of ' +
                         '--refs-include-snps or --refs-include-mods).')
            sys.exit(1)
        if args.refs_include_snps and mh.PR_SNP_NAME not in args.outputs:
            args.outputs.append(mh.PR_SNP_NAME)
            logger.warning('--refs-include-snps set, so adding ' +
                           'per_read_snps to --outputs.')
        if args.refs_include_mods and mh.PR_MOD_NAME not in args.outputs:
            args.outputs.append(mh.PR_MOD_NAME)
            logger.warning('--refs-include-mods set, so adding ' +
                           'per_read_mods to --outputs.')
    else:
        if args.refs_include_snps:
            logger.warning(
                '--refs-include-snps but not --output-per-read-references ' +
                'set. Ignoring --refs-include-snps.')
        if args.refs_include_mods:
            logger.warning(
                '--refs-include-mods but not --output-per-read-references ' +
                'set. Ignoring --refs-include-mods.')
    min_len, max_len = (args.refs_length_range
                        if args.refs_length_range is not None else
                        (None, None))
    pr_ref_filts = mh.PR_REF_FILTERS(
        pct_idnt=args.refs_percent_identity_threshold,
        pct_cov=args.refs_percent_coverage_threshold,
        min_len=min_len, max_len=max_len)

    return args, pr_ref_filts
def _fill_files_queue(read_file_q, fast5s_dir, num_reads, read_ids_fn,
                      recursive, num_ps, num_reads_conn):
    logger = logging.get_logger()
    valid_read_ids = None
    if read_ids_fn is not None:
        with open(read_ids_fn) as read_ids_fp:
            valid_read_ids = set(line.strip() for line in read_ids_fp)
    used_read_ids = set()
    # fill queue with read filename and read id tuples
    for fast5_fn, read_id in fast5_io.iterate_fast5_reads(
            fast5s_dir, num_reads, recursive):
        if valid_read_ids is not None and read_id not in valid_read_ids:
            continue
        if read_id in used_read_ids:
            logger.debug(
                ('Read ID ({}) found in previous read and will not ' +
                 'process from {}.').format(read_id, fast5_fn))
            continue
        if fast5_fn is None or read_id is None:
            continue
        read_file_q.put((fast5_fn, read_id))
        used_read_ids.add(read_id)
    # add None to indicate that read processes should return
    for _ in range(num_ps):
        read_file_q.put((None, None))
    num_reads_conn.send(len(used_read_ids))

    return
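
# Standalone sketch (illustrative, not megalodon code) of the sentinel
# pattern used in _fill_files_queue above: one (None, None) tuple is queued
# per worker process so that every worker knows when to stop. Names here
# (_worker, num_ps) are hypothetical.
import multiprocessing as mp

def _worker(q):
    while True:
        fn, read_id = q.get()
        if fn is None:            # sentinel: no more work for this worker
            break
        print('processing', fn, read_id)

if __name__ == '__main__':
    num_ps = 2
    q = mp.Queue()
    for item in (('a.fast5', 'read1'), ('b.fast5', 'read2')):
        q.put(item)
    for _ in range(num_ps):
        q.put((None, None))       # one sentinel per worker
    workers = [mp.Process(target=_worker, args=(q,)) for _ in range(num_ps)]
    for p in workers:
        p.start()
    for p in workers:
        p.join()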
def aligner_validation(args):
    logger = logging.get_logger()
    if len(mh.ALIGN_OUTPUTS.intersection(args.outputs)) > 0:
        if args.reference is None:
            logger.error(
                ('Output(s) requiring reference alignment requested ({}), ' +
                 'but --reference not provided.').format(', '.join(
                     mh.ALIGN_OUTPUTS.intersection(args.outputs))))
            sys.exit(1)
        logger.info('Loading reference.')
        if not (os.path.exists(args.reference) and
                os.path.isfile(args.reference)):
            logger.error('Provided reference file does not exist or is ' +
                         'not a file.')
            sys.exit(1)
        aligner = mapping.alignerPlus(
            str(args.reference), preset=str('map-ont'), best_n=1)
        setattr(aligner, 'out_fmt', args.mappings_format)
        setattr(aligner, 'ref_fn', mh.resolve_path(args.reference))
        aligner.add_ref_lens()
        mapping.test_open_alignment_out_file(
            args.output_directory, aligner.out_fmt,
            aligner.ref_names_and_lens, aligner.ref_fn)
    else:
        aligner = None
        if args.reference is not None:
            logger.warning(
                '[--reference] provided, but no [--outputs] requiring ' +
                'alignment was requested. Argument will be ignored.')

    return aligner
def add_diploid_probs(self, probs, gts):
    # phred scaled likelihoods
    with np.errstate(divide='ignore'):
        gl = np.log10(probs)
    raw_pl = -10 * gl
    # "normalized" PL values stored as described by VCF format
    pl = np.minimum(raw_pl - raw_pl.min(), mh.MAX_PL_VALUE)
    s_pl = np.sort(pl)

    # add sample tags
    self.add_sample_field('GT', gts[np.argmax(probs)])
    try:
        qual = int(np.minimum(np.around(raw_pl[0]), mh.MAX_PL_VALUE))
    except ValueError:
        logger = logging.get_logger()
        logger.debug(
            'NAN quality value encountered. gts:{}, probs:{}'.format(
                str(gts), str(probs)))
        qual = mh.MAX_PL_VALUE
    self.qual = '{:.0f}'.format(qual) if qual > 0 else '.'
    self.add_sample_field('GQ', '{:.0f}'.format(np.around(s_pl[1])))
    self.add_sample_field(
        'GL', ','.join('{:.2f}' for _ in range(probs.shape[0])).format(*gl))
    self.add_sample_field(
        'PL', ','.join('{:.0f}' for _ in range(probs.shape[0])).format(
            *np.around(pl)))

    return
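
# Standalone sketch (illustrative, not part of megalodon): how the VCF GL,
# PL and GQ sample fields written by add_diploid_probs above follow from
# genotype probabilities. The 999 cap stands in for mh.MAX_PL_VALUE, whose
# exact value is assumed here.
import numpy as np

_MAX_PL = 999                           # assumed cap (mh.MAX_PL_VALUE)

probs = np.array([0.02, 0.90, 0.08])    # P(hom ref), P(het), P(hom alt)
gl = np.log10(probs)                    # GL: log10 genotype likelihoods
raw_pl = -10 * gl                       # phred-scaled likelihoods
pl = np.minimum(raw_pl - raw_pl.min(), _MAX_PL)  # normalized PL (min is 0)
gq = np.sort(pl)[1]                     # GQ: second-smallest normalized PL
print(gl.round(2), pl.round(0), int(round(gq)))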
def __init__(self, variant_fn, max_indel_size, all_paths,
             write_snps_txt, context_bases, snps_calib_fn=None,
             call_mode=DIPLOID_MODE, do_pr_ref_snps=False, aligner=None,
             keep_snp_fp_open=False):
    logger = logging.get_logger('snps')
    self.max_indel_size = max_indel_size
    self.all_paths = all_paths
    self.write_snps_txt = write_snps_txt
    self.snps_calib_fn = snps_calib_fn
    self.calib_table = calibration.SnpCalibrator(self.snps_calib_fn)
    self.context_bases = context_bases
    if len(self.context_bases) != 2:
        raise mh.MegaError(
            'Must provide 2 context bases values (for single base SNPs ' +
            'and indels).')
    self.call_mode = call_mode
    self.do_pr_ref_snps = do_pr_ref_snps
    self.variant_fn = variant_fn
    self.variants_idx = None
    if self.variant_fn is None:
        return

    logger.info('Loading variants.')
    vars_idx = pysam.VariantFile(self.variant_fn)
    try:
        contigs = list(vars_idx.header.contigs.keys())
        vars_idx.fetch(next(iter(contigs)), 0, 0)
    except ValueError:
        logger.warning(
            'Variants file must be indexed. Performing indexing now.')
        vars_idx.close()
        self.variant_fn = index_variants(self.variant_fn)
        vars_idx = pysam.VariantFile(self.variant_fn)
    if keep_snp_fp_open:
        self.variants_idx = vars_idx
    else:
        vars_idx.close()
        self.variants_idx = None

    if aligner is None:
        raise mh.MegaError(
            'Must provide aligner if SNP filename is provided')
    if len(set(aligner.ref_names_and_lens[0]).intersection(contigs)) == 0:
        raise mh.MegaError((
            'Reference and variant files contain no chromosomes/contigs ' +
            'in common.\n\t\tFirst 3 reference contigs:\t{}\n\t\tFirst 3 ' +
            'variant file contigs:\t{}').format(
                ', '.join(aligner.ref_names_and_lens[0][:3]),
                ', '.join(contigs[:3])))

    return
def profile_validation(args):
    logger = logging.get_logger()
    if args.processes > 1:
        msg = ('Running profiling with multiple processes is ' +
               'not allowed. Setting to single process.')
        args.processes = 1
    else:
        msg = 'Running profiling. This may slow processing.'
    logger.warning(msg)

    return args
def sort_and_index_mapping(map_fn, out_fn, ref_fn=None, do_index=False):
    sort_args = ['-O', 'BAM', '-o', out_fn, map_fn]
    if ref_fn is not None:
        sort_args.extend(('--reference', ref_fn))
    try:
        pysam.sort(*sort_args)
        if do_index:
            sleep(1)
            pysam.index(out_fn)
    except pysam.utils.SamtoolsError:
        logger = logging.get_logger()
        logger.warning('Sorting and/or indexing mapping failed.')

    return
def mkdir(out_dir, overwrite):
    logger = logging.get_logger()
    if os.path.exists(out_dir):
        if not overwrite:
            sys.stderr.write(
                'ERROR: --output-directory exists and --overwrite is ' +
                'not set.\n')
            sys.exit(1)
        if os.path.isfile(out_dir) or os.path.islink(out_dir):
            os.remove(out_dir)
        else:
            shutil.rmtree(out_dir)
    os.mkdir(out_dir)

    return
def _main(args):
    mh.mkdir(args.output_megalodon_results_dir, args.overwrite)
    logging.init_logger(args.output_megalodon_results_dir)
    logger = logging.get_logger()

    logger.info('Opening new sequence variant statistics database')
    out_vars_db = variants.VarsDb(
        mh.get_megalodon_fn(args.output_megalodon_results_dir,
                            mh.PR_VAR_NAME),
        read_only=False,
        loc_index_in_memory=not args.var_locations_on_disk,
        uuid_index_in_memory=True)

    for mega_dir in args.megalodon_results_dirs:
        logger.info(
            'Adding sequence variant statistics from {}'.format(mega_dir))
        # full read only mode with no indices read into memory
        vars_db = variants.VarsDb(
            mh.get_megalodon_fn(mega_dir, mh.PR_VAR_NAME),
            read_only=True, chrm_index_in_memory=False,
            alt_index_in_memory=False, uuid_index_in_memory=False)
        bar = tqdm(desc=mega_dir, total=vars_db.get_num_uniq_stats(),
                   smoothing=0, dynamic_ncols=True)
        for (score, uuid, strand, alt_seq, ref_seq, pos, var_name,
             test_end, test_start, chrm, chrm_len) in vars_db.iter_data():
            chrm_id = out_vars_db.get_chrm_id_or_insert(chrm, chrm_len)
            loc_id = out_vars_db.get_loc_id_or_insert(
                chrm_id, test_start, test_end, pos, ref_seq, var_name)
            alt_id = out_vars_db.get_alt_id_or_insert(alt_seq)
            read_id = out_vars_db.get_read_id_or_insert(uuid)
            out_vars_db.insert_data(score, loc_id, alt_id, read_id)
            bar.update()
        bar.close()

    logger.info('Creating indices and closing database')
    if out_vars_db.chrm_idx_in_mem:
        out_vars_db.create_chrm_index()
    if out_vars_db.loc_idx_in_mem:
        out_vars_db.create_loc_index()
    if out_vars_db.alt_idx_in_mem:
        out_vars_db.create_alt_index()
    out_vars_db.create_data_covering_index()
    out_vars_db.close()
def main():
    args = get_parser().parse_args()
    log_suffix = ('aggregation' if args.output_suffix is None else
                  'aggregation.' + args.output_suffix)
    logging.init_logger(args.output_directory, out_suffix=log_suffix)
    logger = logging.get_logger()

    mod_agg_info = mods.AGG_INFO(
        mods.BIN_THRESH_NAME, args.mod_binary_threshold)
    mod_names = []
    if mh.MOD_NAME in args.outputs:
        logger.info('Loading model.')
        mod_names = backends.ModelInfo(
            mh.get_model_fn(args.taiyaki_model_filename)).mod_long_names
    if args.reference is not None:
        logger.info('Loading reference.')
        aligner = mapping.alignerPlus(
            str(args.reference), preset=str('map-ont'), best_n=1)
        aligner.add_ref_lens()
    valid_read_ids = None
    if args.read_ids_filename is not None:
        with open(args.read_ids_filename) as read_ids_fp:
            valid_read_ids = set(line.strip() for line in read_ids_fp)
    aggregate.aggregate_stats(
        args.outputs, args.output_directory, args.processes,
        args.write_vcf_log_probs, args.heterozygous_factors,
        snps.HAPLIOD_MODE if args.haploid else snps.DIPLOID_MODE,
        mod_names, mod_agg_info, args.write_mod_log_probs,
        args.mod_output_formats, args.suppress_progress,
        aligner.ref_names_and_lens, valid_read_ids, args.output_suffix)

    # note reference is required in order to annotate contigs for VCF writing
    if mh.SNP_NAME in args.outputs and args.reference is not None:
        logger.info('Sorting output variant file')
        variant_fn = mh.add_fn_suffix(
            mh.get_megalodon_fn(args.output_directory, mh.SNP_NAME),
            args.output_suffix)
        sort_variant_fn = mh.add_fn_suffix(variant_fn, 'sorted')
        snps.sort_variants(variant_fn, sort_variant_fn)
        logger.info('Indexing output variant file')
        index_var_fn = snps.index_variants(sort_variant_fn)

    return
def mods_validation(args, model_info):
    logger = logging.get_logger()
    if args.refs_include_mods and mh.PR_MOD_NAME not in args.outputs:
        # TODO don't really have to output this data, but have to compute it
        # so sort out how to compute the output but not output it
        args.outputs.append(mh.PR_MOD_NAME)
    if mh.PR_MOD_NAME not in args.outputs and mh.MOD_NAME in args.outputs:
        args.outputs.append(mh.PR_MOD_NAME)
    if mh.PR_MOD_NAME in args.outputs and not model_info.is_cat_mod:
        logger.error(
            '{} output requested, '.format(mh.PR_MOD_NAME) +
            'but model provided is not a categorical modified base ' +
            'model.\nNote that modified base calling from a naive ' +
            'modified base model is not currently supported.')
        sys.exit(1)
    if (model_info.is_cat_mod and mh.PR_MOD_NAME not in args.outputs and
            mh.BC_MODS_NAME not in args.outputs):
        logger.warning(
            ('Categorical modifications model provided, but neither {} ' +
             'nor {} requested (via --outputs). Modified base output will ' +
             'not be produced.').format(mh.PR_MOD_NAME, mh.BC_MODS_NAME))
    if args.mod_motif is not None and mh.PR_MOD_NAME not in args.outputs:
        logger.warning((
            '--mod-motif provided, but {} not requested (via --outputs). ' +
            'Argument will be ignored.').format(mh.PR_MOD_NAME))
    if args.refs_include_mods and mh.PR_REF_NAME not in args.outputs:
        logger.warning((
            '--refs-include-mods provided, but {} not requested ' +
            '(via --outputs). Argument will be ignored.').format(
                mh.PR_REF_NAME))
    mod_calib_fn = mh.get_mod_calibration_fn(
        args.mod_calibration_filename, args.disable_mod_calibration)
    mods_info = mods.ModInfo(
        model_info, args.mod_motif, args.mod_all_paths,
        args.write_mods_text, args.mod_context_bases,
        mh.BC_MODS_NAME in args.outputs, args.refs_include_mods,
        mod_calib_fn, args.mod_output_formats)

    return args, mods_info
import os
import sys
from collections import defaultdict

import numpy as np
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages

from megalodon import calibration, logging, megalodon_helper as mh
from ._extras_parsers import get_parser_calibrate_variants


LOGGER = logging.get_logger()

INVALID_CALIB_MSG = (
    'Encountered invalid distributions for calibration. Not saving ' +
    'calibration file, but pdf will be plotted in order to identify ' +
    'potential issues.')


def plot_calib(pdf_fp, var_type, smooth_ls, s_ref, sm_ref, s_alt, sm_alt,
               mono_prob, prob_alt):
    f, axarr = plt.subplots(3, sharex=True, figsize=(11, 7))
    axarr[0].plot(smooth_ls, s_ref, color='orange')
    axarr[0].plot(smooth_ls, sm_ref, color='red')
    axarr[0].plot(smooth_ls, s_alt, color='grey')
    axarr[0].plot(smooth_ls, sm_alt, color='blue')
    axarr[0].set_ylabel(
        'Probability Density\nred/orange=canonical\nblue/grey=modified')
    axarr[0].set_title(var_type + ' Calibration')
    axarr[1].plot(smooth_ls, mono_prob, color='orange')
    axarr[1].plot(smooth_ls, 1 / (np.exp(smooth_ls) + 1), color='purple')
    axarr[1].set_ylabel(
def _get_fail_queue(failed_reads_q, f_conn, getter_num_reads_conn,
                    num_update_errors, suppress_progress):
    def update_prog(reads_called, sig_called, unexp_err_fp):
        if is_err:
            failed_reads[err_type].append(fast5_fn)
            if err_type == _UNEXPECTED_ERROR_CODE:
                if len(failed_reads[_UNEXPECTED_ERROR_CODE]) == 1:
                    unexp_err_fp = open(_UNEXPECTED_ERROR_FN.format(
                        np.random.randint(10000)), 'w')
                if len(failed_reads[err_type]) >= _MAX_NUM_UNEXP_ERRORS:
                    unexp_err_fp.close()
                else:
                    unexp_err_fp.write(
                        fast5_fn + '\n:::\n' + err_tb + '\n\n\n')
                    unexp_err_fp.flush()
        if do_update_prog:
            if not suppress_progress:
                try:
                    bar.set_postfix({
                        'ksample/s': (sig_called / 1000) /
                        bar.format_dict['elapsed']})
                except AttributeError:
                    # sometimes get no format_dict error
                    # so don't include ksample/s if so
                    pass
                bar.update(1)
            reads_called += 1
            if num_update_errors > 0:
                bar.write(prog_prefix + format_fail_summ(
                    bar_header,
                    [(len(fns), err) for err, fns in failed_reads.items()],
                    reads_called, num_update_errors), file=sys.stderr)
        return reads_called, unexp_err_fp

    logger = logging.get_logger()
    logger.info('Processing reads.')
    reads_called, sig_called = 0, 0
    unexp_err_fp = None
    failed_reads = defaultdict(list)
    bar, prog_prefix, bar_header = prep_errors_bar(
        num_update_errors, None, suppress_progress)
    while True:
        try:
            try:
                (is_err, do_update_prog, err_type, fast5_fn, err_tb,
                 n_sig) = failed_reads_q.get(block=False)
                sig_called += n_sig
                reads_called, unexp_err_fp = update_prog(
                    reads_called, sig_called, unexp_err_fp)
            except queue.Empty:
                # get total number of reads once all reads are enumerated
                if bar is not None and bar.total is None:
                    if getter_num_reads_conn.poll():
                        bar.total = getter_num_reads_conn.recv()
                else:
                    # if all reads are done signal was sent from main thread
                    if f_conn.poll():
                        break
                sleep(0.1)
                continue
        except KeyboardInterrupt:
            # exit gracefully on keyboard interrupt
            return

    if not suppress_progress:
        bar.close()

    if len(failed_reads[_UNEXPECTED_ERROR_CODE]) >= 1:
        logger.warning((
            'Unexpected errors occurred. See full ' +
            'error stack traces for first (up to) {0:d} errors in ' +
            '"{1}"').format(_MAX_NUM_UNEXP_ERRORS, unexp_err_fp.name))
    if any(len(fns) > 0 for fns in failed_reads.values()):
        logger.info(format_fail_summ(
            'Unsuccessful processing types:',
            [(len(fns), err) for err, fns in failed_reads.items()
             if len(fns) > 0], reads_called))
    # TODO flag to output failed read names to file

    return
def _main():
    args = get_parser().parse_args()

    mkdir(args.output_directory, args.overwrite)
    logging.init_logger(args.output_directory)
    logger = logging.get_logger()
    logger.debug('Command: """' + ' '.join(sys.argv) + '"""')

    if _DO_PROFILE:
        args = profile_validation(args)

    args, pr_ref_filts = parse_pr_ref_output(args)
    tai_model_fn = mh.get_model_fn(args.taiyaki_model_filename)
    model_info = backends.ModelInfo(
        tai_model_fn, args.devices, args.processes, args.chunk_size,
        args.chunk_overlap, args.max_concurrent_chunks)
    args, mods_info = mods_validation(args, model_info)
    aligner = aligner_validation(args)
    args, snps_data = snps_validation(
        args, model_info.is_cat_mod, model_info.output_size, aligner)

    process_all_reads(
        args.fast5s_dir, not args.not_recursive, args.num_reads,
        args.read_ids_filename, model_info, args.outputs,
        args.output_directory, args.basecalls_format, aligner, snps_data,
        args.processes, args.verbose_read_progress, args.suppress_progress,
        mods_info, args.database_safety, args.edge_buffer, pr_ref_filts)

    if mh.MAP_NAME in args.outputs:
        logger.info('Spawning process to sort mappings')
        map_p = post_process_mapping(
            args.output_directory, aligner.out_fmt, aligner.ref_fn)

    if mh.WHATSHAP_MAP_NAME in args.outputs:
        logger.info('Spawning process to sort whatshap mappings')
        whatshap_sort_fn, whatshap_p = post_process_whatshap(
            args.output_directory, aligner.out_fmt, aligner.ref_fn)

    if mh.SNP_NAME in args.outputs or mh.MOD_NAME in args.outputs:
        post_process_aggregate(
            mods_info, args.outputs, args.mod_binary_threshold,
            args.output_directory, args.processes, args.write_vcf_log_probs,
            args.heterozygous_factors, snps_data, args.write_mod_log_probs,
            args.suppress_progress, aligner.ref_names_and_lens)

    if mh.SNP_NAME in args.outputs:
        logger.info('Sorting output variant file')
        variant_fn = mh.get_megalodon_fn(args.output_directory, mh.SNP_NAME)
        sort_variant_fn = mh.add_fn_suffix(variant_fn, 'sorted')
        snps.sort_variants(variant_fn, sort_variant_fn)
        logger.info('Indexing output variant file')
        index_variant_fn = snps.index_variants(sort_variant_fn)
        if mh.WHATSHAP_MAP_NAME in args.outputs:
            if whatshap_p.is_alive():
                logger.info('Waiting for whatshap mappings sort')
                while whatshap_p.is_alive():
                    sleep(0.1)
            logger.info(snps.get_whatshap_command(
                index_variant_fn, whatshap_sort_fn,
                mh.add_fn_suffix(variant_fn, 'phased')))

    if mh.MAP_NAME in args.outputs:
        if map_p.is_alive():
            logger.info('Waiting for mappings sort')
            while map_p.is_alive():
                sleep(0.1)

    return
def aggregate_stats(outputs, out_dir, num_ps, write_vcf_lp, het_factors,
                    call_mode, mod_names, mod_agg_info, write_mod_lp,
                    mod_output_fmts, suppress_progress, ref_names_and_lens,
                    valid_read_ids=None, out_suffix=None):
    if mh.SNP_NAME in outputs and mh.MOD_NAME in outputs:
        num_ps = max(num_ps // 2, 1)

    logger = logging.get_logger('agg')
    num_snps, num_mods, snp_prog_q, mod_prog_q = (
        0, 0, queue.Queue(), queue.Queue())
    if mh.SNP_NAME in outputs:
        snps_db_fn = mh.get_megalodon_fn(out_dir, mh.PR_SNP_NAME)
        logger.info('Computing number of unique variants.')
        num_snps = snps.AggSnps(snps_db_fn).num_uniq()
        logger.info('Spawning variant aggregation processes.')
        # create process to collect snp stats from workers
        snp_stats_q, snp_stats_p, main_snp_stats_conn = mh.create_getter_q(
            _get_snp_stats_queue, (out_dir, ref_names_and_lens, out_suffix,
                                   write_vcf_lp))
        # create process to fill snp locs queue
        snp_filler_q = mp.Queue(maxsize=mh._MAX_QUEUE_SIZE)
        snp_filler_p = mp.Process(
            target=_fill_locs_queue,
            args=(snp_filler_q, snps_db_fn, snps.AggSnps, num_ps),
            daemon=True)
        snp_filler_p.start()
        # create worker processes to aggregate snps
        snp_prog_q = mp.Queue(maxsize=mh._MAX_QUEUE_SIZE)
        agg_snps_ps = []
        for _ in range(num_ps):
            p = mp.Process(
                target=_agg_snps_worker,
                args=(snp_filler_q, snp_stats_q, snp_prog_q, snps_db_fn,
                      write_vcf_lp, het_factors, call_mode, valid_read_ids),
                daemon=True)
            p.start()
            agg_snps_ps.append(p)

    if mh.MOD_NAME in outputs:
        mods_db_fn = mh.get_megalodon_fn(out_dir, mh.PR_MOD_NAME)
        num_mods = mods.AggMods(mods_db_fn).num_uniq()
        logger.info('Spawning modified base aggregation processes.')
        # create process to collect mods stats from workers
        mod_stats_q, mod_stats_p, main_mod_stats_conn = mh.create_getter_q(
            _get_mod_stats_queue, (out_dir, mod_names, ref_names_and_lens,
                                   out_suffix, write_mod_lp,
                                   mod_output_fmts))
        # create process to fill mod locs queue
        mod_filler_q = mp.Queue(maxsize=100000)
        mod_fill_limit = _N_MOD_PROF if _DO_PROF else None
        mod_filler_p = mp.Process(
            target=_fill_locs_queue,
            args=(mod_filler_q, mods_db_fn, mods.AggMods, num_ps,
                  mod_fill_limit),
            daemon=True)
        mod_filler_p.start()
        # create worker processes to aggregate mods
        mod_prog_q = mp.Queue(maxsize=mh._MAX_QUEUE_SIZE)
        agg_mods_ps = []
        for _ in range(num_ps):
            p = mp.Process(
                target=_agg_mods_worker,
                args=(mod_filler_q, mod_stats_q, mod_prog_q, mods_db_fn,
                      mod_agg_info, valid_read_ids, write_mod_lp),
                daemon=True)
            p.start()
            agg_mods_ps.append(p)

    # create progress process
    logger.info('Aggregating {} SNPs and {} mod sites over reads.'.format(
        num_snps, num_mods))
    main_prog_conn, prog_conn = mp.Pipe()
    prog_p = mp.Process(
        target=_agg_prog_worker,
        args=(snp_prog_q, mod_prog_q, num_snps, num_mods, prog_conn,
              suppress_progress),
        daemon=True)
    prog_p.start()

    # join filler processes first
    if mh.SNP_NAME in outputs:
        snp_filler_p.join()
        for agg_snps_p in agg_snps_ps:
            agg_snps_p.join()
        # send to conn
        if snp_stats_p.is_alive():
            main_snp_stats_conn.send(True)
        snp_stats_p.join()
    if mh.MOD_NAME in outputs:
        for agg_mods_p in agg_mods_ps:
            agg_mods_p.join()
        if mod_stats_p.is_alive():
            main_mod_stats_conn.send(True)
        mod_stats_p.join()
    if prog_p.is_alive():
        main_prog_conn.send(True)
    prog_p.join()

    return
def call_read_snps(snps_data, r_ref_pos, edge_buffer, r_ref_seq, rl_cumsum,
                   r_to_q_poss, r_post, post_mapped_start):
    # call all snps overlapping this read
    r_snp_calls = []
    for (snp_ref_seq, snp_alt_seqs, snp_id,
         snp_ref_pos) in snps_data.iter_overlapping_snps(
             r_ref_pos, edge_buffer):
        if r_ref_pos.strand == 1:
            read_pos = snp_ref_pos - r_ref_pos.start
            read_ref_seq = snp_ref_seq
            read_alt_seqs = snp_alt_seqs
        else:
            read_pos = r_ref_pos.end - snp_ref_pos - len(snp_ref_seq)
            read_ref_seq = mh.revcomp(snp_ref_seq)
            read_alt_seqs = [mh.revcomp(alt_seq) for alt_seq in snp_alt_seqs]

        # select single base SNP or indel context width
        snp_context_bases = (
            snps_data.snp_context
            if all(len(snp_ref_seq) == len(snp_alt_seq)
                   for snp_alt_seq in snp_alt_seqs) else
            snps_data.indel_context)
        pos_bb = min(snp_context_bases, read_pos)
        pos_ab = min(snp_context_bases,
                     r_ref_seq.shape[0] - read_pos - len(read_ref_seq))
        pos_ref_seq = r_ref_seq[read_pos - pos_bb:
                                read_pos + pos_ab + len(read_ref_seq)]
        # TODO move this to an initial check of a small number of variants
        # against the reference
        if any(pos_ref_seq[pos_bb:pos_bb + len(snp_ref_seq)] != np.array(
                [mh.ALPHABET.find(b) for b in read_ref_seq])):
            # variant reference sequence does not match fasta reference
            logger = logging.get_logger()
            logger.debug(
                '*' * 10 +
                'Reference seq at {} expected {}[{}]{} got "{}"'.format(
                    snp_ref_pos,
                    ''.join(mh.ALPHABET[b]
                            for b in pos_ref_seq[pos_bb - 3:pos_bb]),
                    ''.join(mh.ALPHABET[b]
                            for b in pos_ref_seq[
                                pos_bb:pos_bb + len(snp_ref_seq)]),
                    ''.join(mh.ALPHABET[b]
                            for b in pos_ref_seq[
                                pos_bb + len(snp_ref_seq):
                                pos_bb + len(snp_ref_seq) + 3]),
                    read_ref_seq) + '*' * 10)
            continue
        blk_start = rl_cumsum[r_to_q_poss[read_pos - pos_bb]]
        blk_end = rl_cumsum[r_to_q_poss[read_pos + pos_ab] + 1]
        if blk_end - blk_start < max(
                len(pos_ref_seq),
                max(len(read_alt_seq) for read_alt_seq in read_alt_seqs)):
            # no valid mapping over large inserted query bases
            # i.e. need as many "events/strides" as bases for valid mapping
            continue

        loc_ref_score = score_seq(
            r_post, pos_ref_seq, post_mapped_start + blk_start,
            post_mapped_start + blk_end, snps_data.all_paths)
        loc_alt_llrs = []
        for read_alt_seq in read_alt_seqs:
            pos_alt_seq = np.concatenate([
                pos_ref_seq[:pos_bb],
                np.array([mh.ALPHABET.find(b) for b in read_alt_seq],
                         dtype=np.uintp),
                pos_ref_seq[pos_bb + len(snp_ref_seq):]])
            loc_alt_score = score_seq(
                r_post, pos_alt_seq, post_mapped_start + blk_start,
                post_mapped_start + blk_end, snps_data.all_paths)
            # calibrate log probs
            loc_alt_llrs.append(snps_data.calibrate_llr(
                loc_ref_score - loc_alt_score, read_ref_seq, read_alt_seq))

        # due to calibration multi-allelic log likelihoods could result in
        # inferred negative reference likelihood, so re-normalize here
        loc_alt_log_ps = calibration.compute_log_probs(np.array(loc_alt_llrs))

        r_snp_calls.append((snp_ref_pos, loc_alt_log_ps, snp_ref_seq,
                            snp_alt_seqs, snp_id))

    return r_snp_calls
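
# Standalone sketch (an assumption about what calibration.compute_log_probs
# does, not its actual implementation): one way to renormalize the per-allele
# log likelihood ratios computed above (llr_i = log P(ref) - log P(alt_i))
# into alternate-allele log probabilities that, together with the reference
# probability, sum to one.
import numpy as np

def renorm_alt_log_probs(alt_llrs):
    # unnormalized log weights: reference fixed at 0, each alt at -llr
    log_w = np.concatenate([[0.0], -np.asarray(alt_llrs, dtype=float)])
    max_w = log_w.max()
    log_norm = max_w + np.log(np.exp(log_w - max_w).sum())
    return (log_w - log_norm)[1:]

print(np.exp(renorm_alt_log_probs([2.0, -1.0])))  # e.g. two alternate alleles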
def _agg_prog_worker(snp_prog_q, mod_prog_q, num_snps, num_mods, prog_conn,
                     suppress_progress):
    snp_bar, mod_bar = None, None
    if num_snps > 0:
        if num_mods > 0 and not suppress_progress:
            mod_bar = tqdm(desc='Mods', unit=' sites', total=num_mods,
                           position=1, smoothing=0, dynamic_ncols=True)
            snp_bar = tqdm(desc='SNPs', unit=' sites', total=num_snps,
                           position=0, smoothing=0, dynamic_ncols=True)
        elif not suppress_progress:
            snp_bar = tqdm(desc='SNPs', unit=' sites', total=num_snps,
                           position=0, smoothing=0, dynamic_ncols=True)
    elif num_mods > 0 and not suppress_progress:
        mod_bar = tqdm(desc='Mods', unit=' sites', total=num_mods,
                       position=0, smoothing=0, dynamic_ncols=True)

    logger = logging.get_logger()
    while True:
        try:
            snp_prog_q.get(block=False)
            if not suppress_progress:
                if snp_bar is not None:
                    snp_bar.update(1)
                if mod_bar is not None:
                    mod_bar.update(0)
        except queue.Empty:
            try:
                mod_prog_q.get(block=False)
                if not suppress_progress:
                    if snp_bar is not None:
                        snp_bar.update(0)
                    if mod_bar is not None:
                        mod_bar.update(1)
            except queue.Empty:
                sleep(0.001)
                if prog_conn.poll():
                    break
                continue

    while not snp_prog_q.empty():
        snp_prog_q.get(block=False)
        if not suppress_progress:
            snp_bar.update(1)
    while not mod_prog_q.empty():
        mod_prog_q.get(block=False)
        if not suppress_progress:
            mod_bar.update(1)
    if snp_bar is not None:
        snp_bar.close()
    if mod_bar is not None:
        mod_bar.close()
    if num_mods > 0 and num_snps > 0 and not suppress_progress:
        sys.stderr.write('\n\n')

    return
def process_all_reads(fast5s_dir, recursive, num_reads, read_ids_fn,
                      model_info, outputs, out_dir, bc_fmt, aligner,
                      snps_data, num_ps, num_update_errors,
                      suppress_progress, mods_info, db_safety, edge_buffer,
                      pr_ref_filts):
    logger = logging.get_logger()
    logger.info('Preparing workers to process reads.')
    # read filename queue filler
    # Note no maxsize for this queue to compute total number of reads while
    # also not delaying read processing
    read_file_q = mp.Queue()
    num_reads_conn, getter_num_reads_conn = mp.Pipe()
    files_p = mp.Process(
        target=_fill_files_queue,
        args=(read_file_q, fast5s_dir, num_reads, read_ids_fn, recursive,
              num_ps, num_reads_conn),
        daemon=True)
    files_p.start()

    # progress and failed reads getter (no limit on failed reads queue
    # in case error occurs there, don't halt run)
    failed_reads_q, f_p, main_f_conn = mh.create_getter_q(
        _get_fail_queue, (getter_num_reads_conn, num_update_errors,
                          suppress_progress), max_size=None)

    # start output type getters/writers
    (bc_q, bc_p, main_bc_conn, mo_q, mo_p, main_mo_conn, snps_q, snps_p,
     main_snps_conn, mods_q, mods_p, main_mods_conn) = [None, ] * 12
    if mh.BC_NAME in outputs or mh.BC_MODS_NAME in outputs:
        if mh.BC_NAME not in outputs:
            outputs.append(mh.BC_NAME)
        bc_q, bc_p, main_bc_conn = mh.create_getter_q(
            _get_bc_queue, (out_dir, bc_fmt, mods_info.do_output_mods,
                            mods_info.mod_long_names))
    if mh.MAP_NAME in outputs:
        do_output_pr_refs = (mh.PR_REF_NAME in outputs and
                             not mods_info.do_pr_ref_mods and
                             not snps_data.do_pr_ref_snps)
        mo_q, mo_p, main_mo_conn = mh.create_getter_q(
            mapping._get_map_queue, (
                out_dir, aligner.ref_names_and_lens, aligner.out_fmt,
                aligner.ref_fn, do_output_pr_refs, pr_ref_filts))
    if mh.PR_SNP_NAME in outputs:
        pr_refs_fn = mh.get_megalodon_fn(out_dir, mh.PR_REF_NAME) if (
            mh.PR_REF_NAME in outputs and
            snps_data.do_pr_ref_snps) else None
        whatshap_map_fn = (
            mh.get_megalodon_fn(out_dir, mh.WHATSHAP_MAP_NAME) + '.' +
            aligner.out_fmt) if mh.WHATSHAP_MAP_NAME in outputs else None
        snps_txt_fn = (mh.get_megalodon_fn(out_dir, mh.PR_SNP_TXT_NAME)
                       if snps_data.write_snps_txt else None)
        snps_q, snps_p, main_snps_conn = mh.create_getter_q(
            snps._get_snps_queue, (
                mh.get_megalodon_fn(out_dir, mh.PR_SNP_NAME), snps_txt_fn,
                db_safety, pr_refs_fn, pr_ref_filts, whatshap_map_fn,
                aligner.ref_names_and_lens, aligner.ref_fn))
    if mh.PR_MOD_NAME in outputs:
        pr_refs_fn = mh.get_megalodon_fn(out_dir, mh.PR_REF_NAME) if (
            mh.PR_REF_NAME in outputs and
            mods_info.do_pr_ref_mods) else None
        mods_txt_fn = (mh.get_megalodon_fn(out_dir, mh.PR_MOD_TXT_NAME)
                       if mods_info.write_mods_txt else None)
        mods_q, mods_p, main_mods_conn = mh.create_getter_q(
            mods._get_mods_queue, (
                mh.get_megalodon_fn(out_dir, mh.PR_MOD_NAME), mods_txt_fn,
                db_safety, pr_refs_fn, pr_ref_filts))

    proc_reads_ps, map_conns = [], []
    for device in model_info.process_devices:
        if aligner is None:
            map_conn, caller_conn = None, None
        else:
            map_conn, caller_conn = mp.Pipe()
            map_conns.append(map_conn)
        p = mp.Process(
            target=_process_reads_worker,
            args=(read_file_q, bc_q, snps_q, failed_reads_q, mods_q,
                  caller_conn, model_info, snps_data, mods_info,
                  edge_buffer, device))
        p.daemon = True
        p.start()
        proc_reads_ps.append(p)
        sleep(0.1)

    # perform mapping in threads for mappy shared memory interface
    # open threads after all processes have started due to python
    # multiprocess combined with threading instability
    if aligner is None:
        map_read_ts = None
    else:
        map_read_ts = []
        for map_conn in map_conns:
            t = threading.Thread(
                target=mapping._map_read_worker,
                args=(aligner, map_conn, mo_q))
            t.daemon = True
            t.start()
            map_read_ts.append(t)

    try:
        files_p.join()
        for proc_reads_p in proc_reads_ps:
            proc_reads_p.join()
        if map_read_ts is not None:
            for map_t in map_read_ts:
                map_t.join()
        # comm to getter processes to return
        if f_p.is_alive():
            main_f_conn.send(True)
            f_p.join()
        for on, p, main_conn in (
                (mh.BC_NAME, bc_p, main_bc_conn),
                (mh.MAP_NAME, mo_p, main_mo_conn),
                (mh.PR_SNP_NAME, snps_p, main_snps_conn),
                (mh.PR_MOD_NAME, mods_p, main_mods_conn)):
            if on in outputs and p.is_alive():
                main_conn.send(True)
                if on == mh.PR_SNP_NAME:
                    logger.info(
                        'Waiting for snps database to complete indexing.')
                elif on == mh.PR_MOD_NAME:
                    logger.info(
                        'Waiting for mods database to complete indexing.')
                p.join()
    except KeyboardInterrupt:
        logger.error('Exiting due to keyboard interrupt.')
        sys.exit(1)

    return
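
# Standalone sketch (illustrative, not megalodon code) of the getter-queue
# pattern used throughout process_all_reads above: a daemon process drains a
# queue until the main process signals completion over a Pipe, then drains
# whatever remains. mh.create_getter_q is assumed to wrap roughly this.
import multiprocessing as mp
import queue
from time import sleep

def _getter(q, conn):
    while True:
        try:
            print('got', q.get(block=False))
        except queue.Empty:
            if conn.poll():            # main process signalled completion
                break
            sleep(0.01)
    while not q.empty():               # drain anything left after the signal
        print('got', q.get(block=False))

if __name__ == '__main__':
    q = mp.Queue()
    main_conn, getter_conn = mp.Pipe()
    p = mp.Process(target=_getter, args=(q, getter_conn), daemon=True)
    p.start()
    for i in range(3):
        q.put(i)
    main_conn.send(True)               # signal that no more items will come
    p.join()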
def __init__(self, model_info, all_mod_motifs_raw=None, mod_all_paths=False,
             write_mods_txt=None, mod_context_bases=None,
             do_output_mods=False, do_pr_ref_mods=False, mods_calib_fn=None,
             mod_output_fmts=[mh.MOD_BEDMETHYL_NAME]):
    logger = logging.get_logger()
    # this is pretty hacky, but these attributes are stored here as
    # they are generally needed alongside other alphabet info
    # don't want to pass all of these parameters around individually though
    # as this would make function signatures too complicated
    self.mod_all_paths = mod_all_paths
    self.write_mods_txt = write_mods_txt
    self.mod_context_bases = mod_context_bases
    self.do_output_mods = do_output_mods
    self.do_pr_ref_mods = do_pr_ref_mods
    self.mod_long_names = model_info.mod_long_names
    self.calib_table = calibration.ModCalibrator(mods_calib_fn)
    self.mod_output_fmts = mod_output_fmts

    self.alphabet = model_info.can_alphabet
    self.ncan_base = len(self.alphabet)
    try:
        self.alphabet = self.alphabet.decode()
    except AttributeError:
        # alphabet is already a str
        pass
    if model_info.is_cat_mod:
        # TODO also output "(alt to C)" for each mod
        logger.info(
            'Using canonical alphabet {} and modified bases {}.'.format(
                self.alphabet, ' '.join(
                    '{}={}'.format(*mod_b)
                    for mod_b in model_info.mod_long_names)))
    else:
        logger.info('Using canonical alphabet {}.'.format(self.alphabet))

    self.nbase = len(self.alphabet)
    self.n_can_state = (self.ncan_base + self.ncan_base) * (
        self.ncan_base + 1)
    if model_info.is_cat_mod:
        self.nmod_base = model_info.n_mods
        self.can_base_mods = model_info.can_base_mods
        self.can_mods_offsets = model_info.can_indices
        self.str_to_int_mod_labels = model_info.str_to_int_mod_labels
        assert (
            model_info.output_size - self.n_can_state ==
            self.nmod_base + 1), (
                'Alphabet ({}) and model number of modified bases ({}) ' +
                'do not agree.').format(
                    self.alphabet,
                    model_info.output_size - self.n_can_state - 1)
    else:
        self.nmod_base = 0
        self.can_base_mods = {}
        self.can_mods_offsets = None
        self.str_to_int_mod_labels = None

    # parse mod motifs or use "swap" base if no motif provided
    self._parse_mod_motifs(all_mod_motifs_raw)

    return
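
# Quick illustrative check of the flip-flop state count used above: with a
# 4-base canonical alphabet, n_can_state = (nbase + nbase) * (nbase + 1)
# gives 40 canonical transition states.
nbase = 4
assert (nbase + nbase) * (nbase + 1) == 40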
def _load_taiyaki_model(self):
    logger = logging.get_logger()
    if any(arg is None for arg in (
            self.chunk_size, self.chunk_overlap, self.max_concur_chunks)):
        logger.debug('Must provide chunk_size, chunk_overlap, ' +
                     'max_concur_chunks in order to run the taiyaki ' +
                     'base calling backend.')
    self.model_type = TAI_NAME

    if self.devices is None:
        self.devices = ['cpu', ]
    base_proc_per_device = int(np.ceil(self.num_proc / len(self.devices)))
    procs_per_device = np.repeat(base_proc_per_device, len(self.devices))
    if base_proc_per_device * len(self.devices) > self.num_proc:
        procs_per_device[-(base_proc_per_device * len(self.devices) -
                           self.num_proc):] -= 1
    assert sum(procs_per_device) == self.num_proc
    self.process_devices = [
        dv for dv, n_dv in zip(self.devices, procs_per_device)
        for _ in range(n_dv)]

    try:
        # import modules
        from taiyaki.helpers import load_model as load_taiyaki_model
        from taiyaki.basecall_helpers import run_model as tai_run_model
        from taiyaki.layers import GlobalNormFlipFlopCatMod
        import torch
    except ImportError:
        logger.error(
            'Failed to import taiyaki and pytorch. Ensure working ' +
            'installations to run megalodon')
        sys.exit(1)

    # store modules in object
    self.load_taiyaki_model = load_taiyaki_model
    self.tai_run_model = tai_run_model
    self.torch = torch

    tmp_model = self.load_taiyaki_model(self.taiyaki_model_fn)
    ff_layer = tmp_model.sublayers[-1]
    self.is_cat_mod = (
        GlobalNormFlipFlopCatMod is not None and
        isinstance(ff_layer, GlobalNormFlipFlopCatMod))
    self.output_size = ff_layer.size
    if self.is_cat_mod:
        # Modified base model is defined by 3 fixed fields in taiyaki
        # can_nmods, output_alphabet and modified_base_long_names
        self.output_alphabet = ff_layer.output_alphabet
        self.can_nmods = ff_layer.can_nmods
        self.ordered_mod_long_names = ff_layer.ordered_mod_long_names

        # parse these values to more user-friendly data structures
        self.can_alphabet = ''
        self.can_indices = []
        self.mod_long_names = []
        self.str_to_int_mod_labels = {}
        self.can_base_mods = defaultdict(list)
        curr_can_offset = 0
        curr_nmods = 0
        for can_base_nmods in self.can_nmods:
            can_base = self.output_alphabet[curr_can_offset]
            self.can_alphabet += can_base
            self.can_indices.append(curr_can_offset)
            for mod_i, mod_base in enumerate(self.output_alphabet[
                    curr_can_offset + 1:
                    curr_can_offset + can_base_nmods + 1]):
                self.mod_long_names.append((
                    mod_base,
                    self.ordered_mod_long_names[curr_nmods + mod_i]))
                self.str_to_int_mod_labels[mod_base] = mod_i + 1
                self.can_base_mods[can_base].append(mod_base)

            curr_can_offset += can_base_nmods + 1
            curr_nmods += can_base_nmods

        self.can_indices.append(curr_can_offset)
        self.can_indices = np.array(self.can_indices).astype(np.uintp)
        self.can_base_mods = dict(self.can_base_mods)
    else:
        if mh.nstate_to_nbase(ff_layer.size) != 4:
            raise NotImplementedError(
                'Naive modified base flip-flop models are not ' +
                'supported.')
        self.output_alphabet = mh.ALPHABET
        self.can_alphabet = mh.ALPHABET
        self.mod_long_names = []
        self.str_to_int_mod_labels = {}
    self.n_mods = len(self.mod_long_names)

    return
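
# Standalone sketch (illustrative, not megalodon code) of the
# process-to-device assignment performed above: spread num_proc worker
# processes as evenly as possible across the requested devices.
import numpy as np

def assign_devices(devices, num_proc):
    base = int(np.ceil(num_proc / len(devices)))
    per_device = np.repeat(base, len(devices))
    excess = base * len(devices) - num_proc
    if excess > 0:
        per_device[-excess:] -= 1      # take the surplus off the last devices
    assert per_device.sum() == num_proc
    return [dv for dv, n_dv in zip(devices, per_device) for _ in range(n_dv)]

print(assign_devices(['cuda:0', 'cuda:1'], 5))
# ['cuda:0', 'cuda:0', 'cuda:0', 'cuda:1', 'cuda:1']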
def _get_mods_queue(mods_q, mods_conn, mods_db_fn, db_safety,
                    ref_names_and_lens, mods_txt_fn, pr_refs_fn,
                    pr_ref_filts):
    def get_mod_call(been_warned):
        # note strand is +1 for fwd or -1 for rev
        r_mod_scores, (read_id, chrm, strand, r_start, ref_seq, read_len,
                       q_st, q_en, cigar) = mods_q.get(block=False)
        try:
            mods_db.insert_read_scores(r_mod_scores, read_id, chrm, strand)
        except Exception as e:
            if not been_warned:
                logger.warning(
                    'Error inserting modified base scores into database. ' +
                    'See log debug output for error details.')
                been_warned = True
            import traceback
            var = traceback.format_exc()
            logger.debug(
                'Error inserting modified base scores into database: ' +
                str(e) + '\n' + var)

        if mods_txt_fp is not None and len(r_mod_scores) > 0:
            # would involve batching and creating several conversion tables
            # for var strings (read_id and chrms).
            mods_txt_fp.write('\n'.join((
                ('\t'.join('{}' for _ in field_names)).format(
                    read_id, chrm, strand, pos, mod_lp,
                    np.log1p(-np.exp(mod_lps).sum()), mod_base,
                    '{}:{}'.format(raw_motif, rel_pos))
                for pos, mod_lps, mod_bases, ref_motif, rel_pos, raw_motif in
                r_mod_scores
                for mod_lp, mod_base in zip(mod_lps, mod_bases))) + '\n')
        if pr_refs_fn is not None:
            if not mapping.read_passes_filters(
                    pr_ref_filts, read_len, q_st, q_en, cigar):
                return been_warned
            pr_refs_fp.write('>{}\n{}\n'.format(
                read_id, annotate_mods(r_start, ref_seq, r_mod_scores,
                                       strand)))

        return been_warned

    logger = logging.get_logger('mods')
    been_warned = False

    mods_db = ModsDb(mods_db_fn, db_safety=db_safety, read_only=False,
                     pos_index_in_memory=True)
    for ref_name in ref_names_and_lens[0]:
        mods_db.insert_chrm(ref_name)
    mods_db.create_chrm_index()

    if mods_txt_fn is None:
        mods_txt_fp = None
    else:
        mods_txt_fp = open(mods_txt_fn, 'w')
        field_names = ('read_id', 'chrm', 'strand', 'pos', 'mod_log_prob',
                       'can_log_prob', 'mod_base', 'motif')
        mods_txt_fp.write('\t'.join(field_names) + '\n')
    if pr_refs_fn is not None:
        pr_refs_fp = open(pr_refs_fn, 'w')

    while True:
        try:
            been_warned = get_mod_call(been_warned)
        except queue.Empty:
            if mods_conn.poll():
                break
            sleep(0.001)
            continue

    while not mods_q.empty():
        been_warned = get_mod_call(been_warned)
    if mods_txt_fp is not None:
        mods_txt_fp.close()
    if pr_refs_fn is not None:
        pr_refs_fp.close()
    mods_db.create_mod_index()
    mods_db.create_data_covering_index()
    mods_db.close()

    return