def extract_data_worker(
        in_db_fns_q, data_q, out_mods_db_fn, batch_size, force_uint32,
        db_safety):
    """Worker process: stream per-read stats from input databases.

    Pulls input database filenames from ``in_db_fns_q`` (a ``None`` entry
    signals shutdown), converts each record's identifiers to the output
    database's internal ids via ``get_data_dbids`` and pushes batches of
    converted records onto ``data_q`` for the main process to insert.
    """
    # load output database with all in memory indices
    out_mods_db = mods.ModsDb(
        out_mods_db_fn, read_only=True, in_mem_chrm_to_dbid=True,
        in_mem_mod_to_dbid=True, in_mem_uuid_to_dbid=True,
        in_mem_pos_to_dbid=True, force_uint32_pos_to_dbid=force_uint32,
        db_safety=db_safety)
    while True:
        try:
            in_mod_db_fn = in_db_fns_q.get(block=True, timeout=1)
        except queue.Empty:
            sleep(0.001)
            continue
        if in_mod_db_fn is None:
            # sentinel: no more input databases
            break
        mods_db = mods.ModsDb(in_mod_db_fn)
        batch_data = []
        for (score, uuid, mod_base, motif, motif_pos, raw_motif, strand, pos,
             chrm, chrm_len) in mods_db.iter_data():
            batch_data.append((score, *get_data_dbids(
                out_mods_db, chrm, strand, pos,
                (mod_base, motif, motif_pos, raw_motif), uuid)))
            if len(batch_data) >= batch_size:
                data_q.put(batch_data)
                batch_data = []
        # flush any final partial batch
        if len(batch_data) > 0:
            data_q.put(batch_data)
        mods_db.close()
    # NOTE(review): database was opened read_only=True yet is committed here;
    # presumably get_data_dbids may insert missing ids — confirm in ModsDb
    out_mods_db.db.commit()
    out_mods_db.close()
def main():
    """Merge per-read modified base databases record-by-record.

    Single-process merge: every record from each input results directory is
    re-inserted into a new output database, then the requested indices are
    built.
    """
    args = get_parser().parse_args()
    megalodon.mkdir(args.output_megalodon_results_dir, False)
    out_mods_db = mods.ModsDb(
        mh.get_megalodon_fn(args.output_megalodon_results_dir,
                            mh.PR_MOD_NAME),
        read_only=False,
        pos_index_in_memory=not args.mod_positions_on_disk)
    for mega_dir in args.megalodon_results_dirs:
        # full read only mode with no indices read into memory
        mods_db = mods.ModsDb(
            mh.get_megalodon_fn(mega_dir, mh.PR_MOD_NAME), read_only=True,
            chrm_index_in_memory=False, mod_index_in_memory=False,
            uuid_index_in_memory=False)
        for (score, uuid, mod_base, motif, motif_pos, raw_motif, strand, pos,
             chrm, chrm_len) in mods_db.iter_data():
            # look up (or create) each foreign key in the output database
            chrm_id = out_mods_db.get_chrm_id_or_insert(chrm, chrm_len)
            pos_id = out_mods_db.get_pos_id_or_insert(chrm_id, strand, pos)
            mod_base_id = out_mods_db.get_mod_base_id_or_insert(
                mod_base, motif, motif_pos, raw_motif)
            read_id = out_mods_db.get_read_id_or_insert(uuid)
            out_mods_db.insert_data(score, pos_id, mod_base_id, read_id)
    # only build on-disk indices for tables whose lookups were held in memory
    if out_mods_db.chrm_idx_in_mem:
        out_mods_db.create_chrm_index()
    if out_mods_db.pos_idx_in_mem:
        out_mods_db.create_pos_index()
    if out_mods_db.mod_idx_in_mem:
        out_mods_db.create_mod_index()
    out_mods_db.create_data_covering_index()
    out_mods_db.close()
def insert_data(in_mod_db_fns, out_mods_db, batch_size):
    """Copy per-read statistics from input databases (single process).

    Records are converted to the output database's ids and inserted in
    batches of ``batch_size`` with a progress bar sized by a pre-pass over
    the inputs.
    """
    LOGGER.info('Inserting modified base data')
    total_batches = 0
    # pre-pass: count batches per input so the progress bar has a total
    for in_mod_db_fn in in_mod_db_fns:
        in_mods_db = mods.ModsDb(in_mod_db_fn)
        total_batches += (in_mods_db.get_num_uniq_stats() // batch_size) + 1
        in_mods_db.close()
    bar = tqdm(desc='Data Batches', unit='Batches', total=total_batches,
               smoothing=0, dynamic_ncols=True)
    for in_mod_db_fn in in_mod_db_fns:
        in_mods_db = mods.ModsDb(in_mod_db_fn)
        batch_data = []
        for score, uuid, mod_base, in_pos_dbid in in_mods_db.iter_data():
            # translate the input position id to the output database's id
            out_pos_dbid = out_mods_db.get_pos_dbid(
                *in_mods_db.get_pos(in_pos_dbid))
            batch_data.append(
                (score, out_pos_dbid,
                 out_mods_db.get_mod_base_dbid(mod_base),
                 out_mods_db.get_read_dbid(uuid)))
            if len(batch_data) >= batch_size:
                out_mods_db.insert_batch_data(batch_data)
                batch_data = []
                bar.update()
        # flush final partial batch for this input database
        if len(batch_data) > 0:
            out_mods_db.insert_batch_data(batch_data)
            bar.update()
        in_mods_db.close()
    bar.close()
def extract_data_worker(in_db_fns_q, data_conn, out_mods_db_fn, batch_size):
    """Worker process: extract and id-convert per-read stats.

    Pulls input database filenames from ``in_db_fns_q`` (``None`` signals
    shutdown) and sends converted record batches over ``data_conn`` to the
    main process for insertion.
    """
    # load output database with uuid in-memory indices
    out_mods_db = mods.ModsDb(out_mods_db_fn, in_mem_uuid_to_dbid=True)
    while True:
        try:
            in_mod_db_fn = in_db_fns_q.get(block=True, timeout=0.1)
        except queue.Empty:
            sleep(0.001)
            continue
        if in_mod_db_fn is None:
            # sentinel: no more input databases
            break
        in_mods_db = mods.ModsDb(in_mod_db_fn)
        batch_data = []
        for score, uuid, mod_base, in_pos_dbid in in_mods_db.iter_data():
            # translate the input position id to the output database's id
            out_pos_dbid = out_mods_db.get_pos_dbid(
                *in_mods_db.get_pos(in_pos_dbid))
            batch_data.append(
                (score, out_pos_dbid,
                 out_mods_db.get_mod_base_dbid(mod_base),
                 out_mods_db.get_read_dbid(uuid)))
            if len(batch_data) >= batch_size:
                data_conn.put(batch_data)
                batch_data = []
        # flush final partial batch for this input database
        if len(batch_data) > 0:
            data_conn.put(batch_data)
            batch_data = []
        in_mods_db.close()
    out_mods_db.close()
def main():
    """Convert a legacy sqlite mods database to the current ModsDb schema.

    Reads reference names and per-read scores from the old database and
    writes them into a new ModsDb, then builds indices (skipped when DEBUG
    is set).
    """
    args = get_parser().parse_args()
    old_db = sqlite3.connect(args.old_db)
    old_cur = old_db.cursor()
    new_db = mods.ModsDb(args.new_db, read_only=False,
                         pos_index_in_memory=True)
    sys.stderr.write('Reading/loading reference record names.\n')
    fill_refs(old_cur, new_db)
    sys.stderr.write('Reading/loading modified base scores.\n')
    fill_mods(old_cur, new_db)
    # index creation is slow; skip entirely in debug runs
    if not DEBUG:
        new_db.create_mod_index()
        t0 = time()
        sys.stderr.write('Creating positions index.\n')
        new_db.create_pos_index()
        t1 = time()
        sys.stderr.write('Took {} seconds.\n'.format(t1 - t0))
        sys.stderr.write('Creating scores position index.\n')
        new_db.create_data_covering_index()
        sys.stderr.write('Took {} seconds.\n'.format(time() - t1))
    new_db.close()
def _main(args):
    """Convert a legacy sqlite mods database to the current ModsDb schema.

    Same flow as the stderr-based converter but reports progress via the
    module LOGGER.
    """
    logging.init_logger()
    old_db = sqlite3.connect(args.old_db)
    old_cur = old_db.cursor()
    new_db = mods.ModsDb(args.new_db, read_only=False,
                         pos_index_in_memory=True)
    LOGGER.info('Reading/loading reference record names.')
    fill_refs(old_cur, new_db)
    LOGGER.info('Reading/loading modified base scores.')
    fill_mods(old_cur, new_db)
    # index creation is slow; skip entirely in debug runs
    if not DEBUG:
        new_db.create_mod_index()
        t0 = time()
        LOGGER.info('Creating positions index.')
        new_db.create_pos_index()
        t1 = time()
        LOGGER.info('Took {} seconds.'.format(t1 - t0))
        LOGGER.info('Creating scores position index.')
        new_db.create_data_covering_index()
        LOGGER.info('Took {} seconds.'.format(time() - t1))
    new_db.close()
def insert_pos_mp(in_mod_db_fns, out_mods_db, batch_size):
    """Merge position tables using one extraction process per input DB.

    Workers push position batches onto a bounded queue; the main process
    drains the queue and inserts into ``out_mods_db``.
    """
    LOGGER.info('Merging pos tables using multiprocessing')
    total_batches = 0
    pos_q = mp.Queue(maxsize=QUEUE_SIZE_LIMIT)
    pos_ps = []
    for in_mod_db_fn in in_mod_db_fns:
        # count batches for the progress bar total before starting workers
        mods_db = mods.ModsDb(in_mod_db_fn)
        total_batches += (mods_db.get_num_uniq_mod_pos() // batch_size) + 1
        mods_db.close()
        p = mp.Process(
            target=extract_pos_worker,
            args=(in_mod_db_fn, batch_size, pos_q), daemon=True)
        p.start()
        pos_ps.append(p)
    bar = tqdm(desc='Position Batches', total=total_batches, smoothing=0,
               dynamic_ncols=True)
    while any(p.is_alive() for p in pos_ps):
        try:
            pos_batch = pos_q.get(block=True, timeout=1)
        except queue.Empty:
            sleep(0.001)
            continue
        insert_pos_data(pos_batch, out_mods_db)
        bar.update()
    # drain batches that were queued after the last liveness check
    while not pos_q.empty():
        pos_batch = pos_q.get(block=False)
        insert_pos_data(pos_batch, out_mods_db)
        bar.update()
    bar.close()
def insert_reads_mp(in_mod_db_fns, out_mods_db, batch_size):
    """Merge read uuid tables using one extraction process per input DB.

    Workers push uuid batches onto a bounded queue; the main process drains
    the queue and inserts into ``out_mods_db``.
    """
    LOGGER.info('Merging read uuid tables using multiprocessing')
    total_batches = 0
    # fix: bound the queue like insert_pos_mp does; an unbounded mp.Queue
    # lets fast workers buffer every batch in memory at once
    uuids_q = mp.Queue(maxsize=QUEUE_SIZE_LIMIT)
    uuids_ps = []
    for in_mod_db_fn in in_mod_db_fns:
        # count batches for the progress bar total before starting workers
        mods_db = mods.ModsDb(in_mod_db_fn)
        total_batches += (mods_db.get_num_uniq_reads() // batch_size) + 1
        mods_db.close()
        p = mp.Process(
            target=extract_reads_worker,
            args=(in_mod_db_fn, batch_size, uuids_q), daemon=True)
        p.start()
        uuids_ps.append(p)
    bar = tqdm(desc='UUID Batches', total=total_batches, smoothing=0,
               dynamic_ncols=True)
    while any(p.is_alive() for p in uuids_ps):
        try:
            uuids_batch = uuids_q.get(block=True, timeout=1)
        except queue.Empty:
            sleep(0.001)
            continue
        out_mods_db.get_read_dbids_or_insert(uuids_batch)
        bar.update()
    # drain batches that were queued after the last liveness check
    while not uuids_q.empty():
        uuids_batch = uuids_q.get(block=False)
        out_mods_db.get_read_dbids_or_insert(uuids_batch)
        bar.update()
    bar.close()
def _main(args):
    """Dump per-read modified base statistics to a tab-separated text file.

    Iterates all positions in the per-read mods database, groups statistics
    by read, computes the canonical log probability per read and writes one
    line per (read, mod base) pair.
    """
    mods_db = mods.ModsDb(
        mh.get_megalodon_fn(args.megalodon_results_dir, mh.PR_MOD_NAME))
    mods_txt_fp = open(
        mh.get_megalodon_fn(args.megalodon_results_dir, mh.PR_MOD_TXT_NAME)
        if args.out_filename is None else args.out_filename, 'w')
    mods_txt_fp.write('\t'.join(mods_db.text_field_names) + '\n')
    for pos_id, pos_chrm, strand, pos in tqdm(
            mods_db.iter_pos(), total=mods_db.get_num_uniq_mod_pos(),
            smoothing=0):
        pr_mod_stats = mods_db.get_pos_stats(
            (pos_id, pos_chrm, strand, pos), return_uuids=True)
        # group per-read stats by read id, then by modified base
        mod_type_stats = defaultdict(dict)
        for r_stats in pr_mod_stats:
            mod_type_stats[r_stats.read_id][r_stats.mod_base] = (
                r_stats.score, r_stats.raw_motif, r_stats.motif_pos,
                r_stats.chrm)
        mod_out_text = ''
        for read_id, r_mod_stats in mod_type_stats.items():
            mod_lps = np.array(list(zip(*r_mod_stats.values()))[0])
            # canonical prob is 1 minus the sum of all mod probs
            with np.errstate(divide='ignore'):
                can_lp = np.log1p(-np.exp(mod_lps).sum())
            mod_out_text += '\n'.join((
                ('\t'.join('{}' for _ in mods_db.text_field_names)).format(
                    read_id, chrm, strand, pos, mod_lp, can_lp, mod_base,
                    '{}:{}'.format(raw_motif, motif_pos))
                for mod_base, (mod_lp, raw_motif, motif_pos, chrm) in
                r_mod_stats.items())) + '\n'
        mods_txt_fp.write(mod_out_text)
    # fix: close the output file and database (previously leaked)
    mods_txt_fp.close()
    mods_db.close()
def check_matching_attrs(ground_truth_bed, strand_offset, mod_db_fn,
                         target_mod_bases, limit=10000):
    """Validate a ground truth BED methyl file against a mods database.

    Checks that at least one contig name overlaps (using the first ``limit``
    BED sites), that every target modified base exists in the database, and
    that the database's data covering index has been created. Raises
    ``mh.MegaError`` on any mismatch.
    """
    mods_db = mods.ModsDb(mod_db_fn)
    # with a strand offset, BED strands are merged (strand key is None)
    db_strands = (1, -1) if strand_offset is None else (None, )
    db_chrms = set((chrm, strand) for _, chrm, _ in mods_db.iter_chrms()
                   for strand in db_strands)
    cov, mod_cov = mh.parse_bed_methyls(
        [ground_truth_bed, ], strand_offset, show_prog_bar=False, limit=limit)
    if len(db_chrms.intersection(cov.keys())) == 0:
        LOGGER.error(
            ('Using first {} sites from {}, found zero overlapping ' +
             'contig/chromosome names with the mod database.').format(
                 limit, ground_truth_bed))
        LOGGER.info('Database contigs/chromosomes: {}'.format(', '.join(
            map(str, db_chrms))))
        LOGGER.info('BED methyl contigs/chromosomes: {}'.format(', '.join(
            map(str, list(cov.keys())))))
        raise mh.MegaError('No overlapping contigs found.')
    db_mods = set(mod_base for mod_base, _ in mods_db.get_mod_long_names())
    for tmb in target_mod_bases:
        if tmb not in db_mods:
            raise mh.MegaError(
                ('Target modified base, {}, not found in mods database ' +
                 '({}).').format(tmb, ', '.join(db_mods)))
    mods_db.check_data_covering_index_exists()
    mods_db.close()
def _main(args):
    """Placeholder entry point for the version 1 -> 2 database upgrade.

    Immediately raises ``NotImplementedError``; the code below is the
    retained version 0 -> 1 upgrade flow and is intentionally unreachable
    until the version 2 upgrade is implemented.
    """
    # fix: corrected "Updgreade" typo in the user-facing error message
    raise NotImplementedError(
        'The previous version of this script updated version 0 to ' +
        'version 1. Upgrade to version 2 not yet implemented.')
    logging.init_logger()
    old_db = sqlite3.connect(args.old_db)
    old_cur = old_db.cursor()
    new_db = mods.ModsDb(args.new_db, read_only=False)
    LOGGER.info('Reading/loading reference record names.')
    fill_refs(old_cur, new_db)
    LOGGER.info('Reading/loading modified base scores.')
    fill_mods(old_cur, new_db)
    # index creation is slow; skip entirely in debug runs
    if not DEBUG:
        new_db.create_mod_index()
        t0 = time()
        LOGGER.info('Creating positions index.')
        new_db.create_pos_index()
        t1 = time()
        LOGGER.info('Took {} seconds.'.format(t1 - t0))
        LOGGER.info('Creating scores position index.')
        new_db.create_data_covering_index()
        LOGGER.info('Took {} seconds.'.format(time() - t1))
    new_db.close()
def _main(args): logging.init_logger(args.megalodon_directory, out_suffix=args.output_suffix) # parse motifs motifs = parse_motifs(args.motif) # open indexed FASTA reference ref = pysam.FastaFile(args.reference) LOGGER.info('Extracting mods and chrms from input database') in_mods_db = mods.ModsDb( mh.get_megalodon_fn(args.megalodon_directory, mh.PR_MOD_NAME)) alphabet, _, mod_long_names = in_mods_db.get_alphabet_info() ref_names_and_lens = list(zip(*in_mods_db.iter_chrms()))[1:] LOGGER.info('Extracting read uuid table') in_uuids = [uuid for _, uuid in in_mods_db.iter_uuids()] LOGGER.info('Opening new per-read modified base statistics databases') model_info = backends.DetachedModelInfo(alphabet=alphabet, mod_long_names=mod_long_names) out_mods_dbs = [] for motif_info in motifs: out_dir = '{}.{}_{}'.format(args.output_prefix, motif_info.raw_motif, motif_info.bases_before) mh.mkdir(out_dir, overwrite=False) mods_info = mods.ModInfo(model_info, out_dir=out_dir) mods.init_mods_db(mods_info, ref_names_and_lens) out_mods_dbs.append((mods.ModsDb(mods_info.mods_db_fn, read_only=False), motif_info)) out_mods_dbs[-1][0].insert_uuids(in_uuids) out_mods_dbs[-1][0].commit() # commit so read uuids are available to worker processes LOGGER.info('Inserting per-read calls from input databases') split_data(in_mods_db, out_mods_dbs, ref) # TOOD do this in separate processes LOGGER.info( 'Creating data covering indices for efficient iteration by position') for out_mods_db, _ in out_mods_dbs: out_mods_db.create_data_covering_index() out_mods_db.commit() out_mods_db.close() LOGGER.info('Finished indexing {}'.format(out_mods_db.fn))
def insert_reads(in_mod_db_fns, out_mods_db):
    """Merge read uuid tables one record at a time (single process).

    Each uuid from every input database is looked up (or inserted) in the
    output database, with a per-input-file progress bar.
    """
    LOGGER.info('Merging read uuid tables')
    for in_mod_db_fn in in_mod_db_fns:
        mods_db = mods.ModsDb(in_mod_db_fn)
        bar = tqdm(desc=in_mod_db_fn, total=mods_db.get_num_uniq_reads(),
                   smoothing=0, dynamic_ncols=True)
        for read_dbid, uuid in mods_db.iter_uuids():
            # duplicate uuids across inputs resolve to the same output dbid
            out_mods_db.get_read_dbid_or_insert(uuid)
            bar.update()
        mods_db.close()
        bar.close()
def extract_reads_worker(in_mod_db_fn, batch_size, uuids_q):
    """Worker process: stream read uuids from one database in batches.

    Pushes lists of up to ``batch_size`` uuids onto ``uuids_q``. The final
    batch is always sent — even when empty — so the consumer receives
    exactly ``num_reads // batch_size + 1`` batches, matching the progress
    bar total computed in ``insert_reads_mp``.
    """
    mods_db = mods.ModsDb(in_mod_db_fn)
    uuids_batch = []
    for read_dbid, uuid in mods_db.iter_uuids():
        uuids_batch.append(uuid)
        if len(uuids_batch) >= batch_size:
            uuids_q.put(uuids_batch)
            uuids_batch = []
    # fix: replaced the always-true `if len(uuids_batch) >= 0:` with an
    # unconditional put (identical behavior, no misleading dead conditional)
    uuids_q.put(uuids_batch)
    mods_db.close()
def insert_reads(in_mod_db_fns, out_mods_db):
    """Merge read uuid tables by de-duplicating in memory then bulk insert.

    Collects the union of uuids across all input databases into a set and
    inserts them into ``out_mods_db`` in a single call.
    """
    LOGGER.info('Merging read uuid tables')
    in_uuids = set()
    for in_mod_db_fn in tqdm(in_mod_db_fns, desc='Databases', unit='DBs',
                             smoothing=0, dynamic_ncols=True):
        in_mods_db = mods.ModsDb(in_mod_db_fn)
        # set union removes uuids shared between input databases
        in_uuids.update(uuid for _, uuid in in_mods_db.iter_uuids())
        in_mods_db.close()
    out_mods_db.insert_uuids(in_uuids)
def extract_mods(in_mod_db_fns):
    """Collect and validate modified base definitions across databases.

    Ensures every modified base maps to a single (canonical base, long name)
    pair and that long names are unique, then derives a combined alphabet by
    inserting each mod base directly after its canonical base.

    Returns:
        (alphabet, mod_long_names): combined alphabet string and the list of
        modified base long names ordered to match the alphabet.

    Raises:
        mh.MegaError: on conflicting definitions or an invalid canonical
            alphabet.
    """
    LOGGER.info('Merging mod tables')
    # collect modified base definitions from input databases
    mod_base_to_can = dict()
    for in_mod_db_fn in tqdm(in_mod_db_fns, desc='Databases', unit='DBs',
                             smoothing=0, dynamic_ncols=True):
        mods_db = mods.ModsDb(in_mod_db_fn)
        for mod_base, can_base, mln in mods_db.get_full_mod_data():
            if mod_base in mod_base_to_can and \
                    (can_base, mln) != mod_base_to_can[mod_base]:
                # fix: corrected "mutliple" typo in the error message
                raise mh.MegaError(
                    'Modified base associated with multiple canonical '
                    'bases or long names in different databases. '
                    '{} != {}'.format(
                        str((can_base, mln)),
                        str(mod_base_to_can[mod_base])))
            mod_base_to_can[mod_base] = (can_base, mln)
    # check that mod long names are unique
    mlns = [mln for _, mln in mod_base_to_can.values()]
    if len(mlns) != len(set(mlns)):
        raise mh.MegaError(
            'Modified base long name assigned to more than one modified ' +
            'base single letter code.')
    # extract canonical bases associated with modified base
    can_bases = set(can_base for can_base, _ in mod_base_to_can.values())
    # determine first valid canonical alphabet compatible with database
    can_alphabet = None
    for v_alphabet in mh.VALID_ALPHABETS:
        if len(can_bases.difference(v_alphabet)) == 0:
            can_alphabet = v_alphabet
            break
    if can_alphabet is None:
        LOGGER.error(
            'Mods database does not contain valid canonical bases '
            '({})'.format(''.join(sorted(can_bases))))
        raise mh.MegaError('Invalid alphabet.')
    # compute full output alphabet and ordered modified base long names
    can_base_to_mods = dict(
        (can_base,
         [(mod_base, mln)
          for mod_base, (mcan_base, mln) in mod_base_to_can.items()
          if mcan_base == can_base])
        for can_base in can_alphabet)
    alphabet = ''
    mod_long_names = []
    for can_base in can_alphabet:
        alphabet += can_base
        for mod_base, mln in can_base_to_mods[can_base]:
            alphabet += mod_base
            mod_long_names.append(mln)
    return alphabet, mod_long_names
def extract_pos_worker(in_mod_db_fn, batch_size, pos_q):
    """Worker process: stream positions from one database in batches.

    Positions are grouped by (chromosome name, strand) into a dict batch of
    up to ``batch_size`` entries and pushed onto ``pos_q``. The final batch
    is always sent — even when empty — so the consumer receives exactly
    ``num_pos // batch_size + 1`` batches, matching the progress bar total
    computed in ``insert_pos_mp``.
    """
    mods_db = mods.ModsDb(in_mod_db_fn)
    pos_batch = init_pos_dict(mods_db)
    num_pos = 0
    for _, chrm_dbid, strand, pos in mods_db.iter_pos():
        # key batches by chromosome name (not dbid) so they are portable
        # across databases
        pos_batch[(mods_db.get_chrm(chrm_dbid)[0], strand)].append(pos)
        num_pos += 1
        if num_pos >= batch_size:
            pos_q.put(pos_batch)
            pos_batch = init_pos_dict(mods_db)
            num_pos = 0
    # fix: replaced the always-true `if num_pos >= 0:` with an unconditional
    # put (identical behavior, no misleading dead conditional)
    pos_q.put(pos_batch)
    mods_db.close()
def _main(args):
    """Ensure the per-read mods database has a data covering index.

    Idempotent: if the index already exists (check raises ``mh.MegaError``
    when missing) nothing is created.
    """
    logging.init_logger(args.megalodon_directory,
                        out_suffix=args.output_suffix)
    LOGGER.debug('Command: """' + " ".join(sys.argv) + '"""')
    mods_db_fn = mh.get_megalodon_fn(args.megalodon_directory,
                                     mh.PR_MOD_NAME)
    mods_db = mods.ModsDb(mods_db_fn, read_only=False)
    try:
        mods_db.check_data_covering_index_exists()
        LOGGER.info("Modified bases database index already exists")
    except mh.MegaError:
        # MegaError here signals the index is absent, not a failure
        LOGGER.info("Creating modified bases database index")
        mods_db.create_data_covering_index()
    LOGGER.debug("Closing database")
    mods_db.close()
def _main(args):
    """Dump per-read modified base statistics to a tab-separated text file.

    Iterates position-grouped scores, regroups them by read (records are
    sorted so each read's entries are contiguous), computes the canonical
    log probability per read and writes one line per (read, mod base) pair.
    """
    mods_db = mods.ModsDb(
        mh.get_megalodon_fn(args.megalodon_results_dir, mh.PR_MOD_NAME),
        in_mem_dbid_to_uuid=True,
    )
    mods_txt_fp = open(
        mh.get_megalodon_fn(args.megalodon_results_dir, mh.PR_MOD_TXT_NAME)
        if args.out_filename is None
        else args.out_filename,
        "w",
    )
    mods_txt_fp.write("\t".join(mods_db.text_field_names) + "\n")
    rec_tmplt = "\t".join("{}" for _ in mods_db.text_field_names) + "\n"
    bar = tqdm(
        desc="Processing Per-read Data",
        unit="per-read calls",
        total=mods_db.get_num_uniq_stats(),
        smoothing=0,
        dynamic_ncols=True,
    )
    for (chrm, strand, pos), pos_lps in mods_db.iter_pos_scores(
            convert_pos=True):
        bar.update(len(pos_lps))
        str_strand = mh.int_strand_to_str(strand)
        mod_out_text = ""
        prev_dbid = None
        mod_bs, r_lps = [], []
        for read_dbid, mod_dbid, lp in sorted(pos_lps):
            # read id changed: flush the accumulated stats for the previous
            # read before starting the next one
            if prev_dbid != read_dbid and prev_dbid is not None:
                uuid = mods_db.get_uuid(prev_dbid)
                # compute and store log likelihood ratios
                with np.errstate(divide="ignore"):
                    can_lp = np.log1p(-np.exp(r_lps).sum())
                for mod_b, r_lp in zip(mod_bs, r_lps):
                    mod_out_text += rec_tmplt.format(
                        uuid, chrm, str_strand, pos, r_lp, can_lp, mod_b)
                mod_bs, r_lps = [], []
            prev_dbid = read_dbid
            mod_bs.append(mods_db.get_mod_base(mod_dbid))
            r_lps.append(lp)
        # flush the final read at this position
        uuid = mods_db.get_uuid(prev_dbid)
        # compute and store log likelihood ratios
        with np.errstate(divide="ignore"):
            can_lp = np.log1p(-np.exp(r_lps).sum())
        for mod_b, r_lp in zip(mod_bs, r_lps):
            mod_out_text += rec_tmplt.format(
                uuid, chrm, str_strand, pos, r_lp, can_lp, mod_b)
        mods_txt_fp.write(mod_out_text)
    mods_txt_fp.close()
def insert_mods(in_mod_db_fns, out_mods_db):
    """Merge modified base definition tables (single process).

    Inserts each input database's mod base definitions into the output
    database and records the union of mod long names.
    """
    LOGGER.info('Merging mod tables')
    all_mod_long_names = set()
    for in_mod_db_fn in in_mod_db_fns:
        mods_db = mods.ModsDb(in_mod_db_fn)
        all_mod_long_names.update(mods_db.get_mod_long_names())
        bar = tqdm(desc=in_mod_db_fn, total=mods_db.get_num_uniq_mods(),
                   smoothing=0, dynamic_ncols=True)
        for (_, mod_base, motif, motif_pos,
             raw_motif) in mods_db.iter_mod_bases():
            # duplicate definitions across inputs resolve to one output dbid
            out_mods_db.get_mod_base_dbid_or_insert(
                mod_base, motif, motif_pos, raw_motif)
            bar.update()
        mods_db.close()
        bar.close()
    out_mods_db.insert_mod_long_names(list(all_mod_long_names))
def _main(args):
    """Merge several per-read mods databases into one (five-stage merge).

    Stages: chromosomes, mod definitions, read uuids, positions, then the
    per-read statistics; single- or multi-process variants are selected per
    stage from ``args.single_process``.
    """
    mh.mkdir(args.output_megalodon_results_dir, args.overwrite)
    logging.init_logger(args.output_megalodon_results_dir)
    LOGGER.info('Opening new modified base statistics database')
    out_mods_db_fn = mh.get_megalodon_fn(args.output_megalodon_results_dir,
                                         mh.PR_MOD_NAME)
    out_mods_db = mods.ModsDb(
        out_mods_db_fn, read_only=False, init_db_tables=True,
        in_mem_chrm_to_dbid=True, in_mem_mod_to_dbid=True,
        in_mem_uuid_to_dbid=True, in_mem_pos_to_dbid=True,
        force_uint32_pos_to_dbid=args.force_uint32_pos_index,
        db_safety=args.database_safety)
    in_mod_db_fns = [mh.get_megalodon_fn(mega_dir, mh.PR_MOD_NAME)
                     for mega_dir in args.megalodon_results_dirs]
    LOGGER.info(
        'Merging will proceed in five stages:\n\t1) chrmosomes\n\t2) ' +
        'modified base definitions\n\t3) read identifiers\n\t4) reference ' +
        'positions\n\t5) modified base statistics')
    insert_chrms(in_mod_db_fns, out_mods_db)
    insert_mods(in_mod_db_fns, out_mods_db)
    if args.single_process:
        insert_reads(in_mod_db_fns, out_mods_db)
    else:
        insert_reads_mp(in_mod_db_fns, out_mods_db, args.data_batch_size)
    if args.single_process:
        insert_pos(in_mod_db_fns, out_mods_db, args.data_batch_size)
    else:
        insert_pos_mp(in_mod_db_fns, out_mods_db, args.data_batch_size)
    # commit lookup tables so worker processes can read them
    out_mods_db.db.commit()
    if args.single_process:
        insert_data(in_mod_db_fns, out_mods_db, args.data_batch_size)
    else:
        insert_data_mp(
            in_mod_db_fns, out_mods_db, out_mods_db_fn,
            args.data_batch_size, args.max_processes,
            args.force_uint32_pos_index, db_safety=args.database_safety)
    out_mods_db.db.commit()
    LOGGER.info(
        'Creating data covering index for efficient searching by position')
    out_mods_db.create_data_covering_index()
    out_mods_db.db.commit()
    out_mods_db.close()
def insert_chrms(in_mod_db_fns, out_mods_db):
    """Merge chromosome tables from input databases into the output.

    Collects unique chromosome names (with lengths) across inputs, then
    inserts them in one batch and builds the chromosome index.
    """
    LOGGER.info('Merging chrm tables')
    ref_names_and_lens = [[], []]
    for in_mod_db_fn in in_mod_db_fns:
        mods_db = mods.ModsDb(in_mod_db_fn)
        bar = tqdm(desc=in_mod_db_fn, total=mods_db.get_num_uniq_chrms(),
                   smoothing=0, dynamic_ncols=True)
        for _, chrm, chrm_len in mods_db.iter_chrms():
            # keep only the first occurrence of each chromosome name
            if chrm not in ref_names_and_lens[0]:
                ref_names_and_lens[0].append(chrm)
                ref_names_and_lens[1].append(chrm_len)
            bar.update()
        mods_db.close()
        bar.close()
    # insert chrms at the end to avoid errors for in memory position datasets
    out_mods_db.insert_chrms(ref_names_and_lens)
    out_mods_db.create_chrm_index()
def _main(args):
    """Merge several per-read mods databases into one (newer schema).

    Extracts and validates mod/chromosome definitions, initializes the
    output database, merges read uuids, then merges per-read calls (single-
    or multi-process) and builds the data covering index.
    """
    mh.mkdir(args.output_megalodon_results_dir, args.overwrite)
    logging.init_logger(args.output_megalodon_results_dir)
    LOGGER.info('Extracting mods and chrms from input databases')
    in_mod_db_fns = [
        mh.get_megalodon_fn(mega_dir, mh.PR_MOD_NAME)
        for mega_dir in args.megalodon_results_dirs
    ]
    alphabet, mod_long_names = extract_mods(in_mod_db_fns)
    ref_names_and_lens = extract_chrms(in_mod_db_fns)
    LOGGER.info('Opening new per-read modified base statistics database')
    model_info = backends.DetachedModelInfo(
        alphabet=alphabet, mod_long_names=mod_long_names)
    mods_info = mods.ModInfo(model_info,
                             out_dir=args.output_megalodon_results_dir)
    mods.init_mods_db(mods_info, ref_names_and_lens)
    # load uuids in memory in main out db only in single process mode.
    # else worker threads only have to load uuid lookup tables
    out_mods_db = mods.ModsDb(mods_info.mods_db_fn, read_only=False,
                              in_mem_uuid_to_dbid=args.single_process)
    LOGGER.info('Inserting read UUIDs from input databases')
    if args.single_process:
        insert_reads(in_mod_db_fns, out_mods_db)
    else:
        insert_reads_mp(in_mod_db_fns, out_mods_db)
    # commit so read uuids are available to worker processes
    out_mods_db.commit()
    LOGGER.info('Inserting per-read calls from input databases')
    if args.single_process:
        insert_data(in_mod_db_fns, out_mods_db, args.data_batch_size)
    else:
        insert_data_mp(in_mod_db_fns, out_mods_db, mods_info.mods_db_fn,
                       args.data_batch_size, args.max_processes)
    out_mods_db.commit()
    LOGGER.info(
        'Creating data covering index for efficient iteration by position')
    out_mods_db.create_data_covering_index()
    out_mods_db.commit()
    out_mods_db.close()
def insert_pos(in_mod_db_fns, out_mods_db, batch_size):
    """Merge position tables in batches (single process).

    Positions are grouped by (chromosome name, strand) into dict batches of
    ``batch_size`` and inserted via ``insert_pos_data``.
    """
    LOGGER.info('Merging pos tables')
    for in_mod_db_fn in in_mod_db_fns:
        mods_db = mods.ModsDb(in_mod_db_fn)
        num_pos = 0
        pos_batch = init_pos_dict(mods_db)
        bar = tqdm(desc=in_mod_db_fn, total=mods_db.get_num_uniq_mod_pos(),
                   smoothing=0, dynamic_ncols=True)
        for _, chrm_dbid, strand, pos in mods_db.iter_pos():
            # key batches by chromosome name (not dbid) so they are portable
            # across databases
            pos_batch[(mods_db.get_chrm(chrm_dbid)[0], strand)].append(pos)
            num_pos += 1
            if num_pos >= batch_size:
                insert_pos_data(pos_batch, out_mods_db)
                num_pos = 0
                pos_batch = init_pos_dict(mods_db)
            bar.update()
        # flush final partial batch for this input database
        if num_pos > 0:
            insert_pos_data(pos_batch, out_mods_db)
        mods_db.close()
        bar.close()
def main():
    """Estimate a per-read log likelihood ratio calling threshold.

    Collects canonical-vs-target-mod LLRs across positions, estimates the
    modified fraction (unless given) from symmetric percentile tails, and
    prints the LLR threshold at that fraction's percentile.
    """
    args = get_parser().parse_args()
    mods_db = mods.ModsDb(
        mh.get_megalodon_fn(args.megalodon_results_dir, mh.PR_MOD_NAME))
    scores = []
    num_pos = (mods_db.get_num_uniq_mod_pos()
               if args.num_positions is None else args.num_positions)
    for n_pos, (pos_id, pos_chrm, strand, pos) in tqdm(
            enumerate(mods_db.iter_pos()), total=num_pos, smoothing=0):
        pr_mod_stats = mods_db.get_pos_stats(
            (pos_id, pos_chrm, strand, pos), return_uuids=True)
        # group per-read stats by read id, then by modified base
        mod_type_stats = defaultdict(dict)
        for r_stats in pr_mod_stats:
            mod_type_stats[r_stats.read_id][r_stats.mod_base] = r_stats.score
        for r_mod_stats in mod_type_stats.values():
            mod_lps = np.array(list(r_mod_stats.values()))
            # canonical prob is 1 minus the sum of all mod probs
            with np.errstate(divide='ignore'):
                can_lp = np.log1p(-np.exp(mod_lps).sum())
            for mod_base, mod_lp in r_mod_stats.items():
                if mod_base != args.mod_base:
                    continue
                # LLR: positive favors canonical, negative favors modified
                scores.append(can_lp - mod_lp)
        if args.num_positions is not None and n_pos >= args.num_positions:
            break
    scores = np.array(scores)
    frac_mod = args.fraction_modified
    if frac_mod is None:
        # estimate modified fraction from confident calls in the two tails
        thresh_vals = np.percentile(
            scores, (args.mod_percentile, 100 - args.mod_percentile))
        thresh_val = np.abs(thresh_vals).min()
        n_can = np.greater_equal(scores, thresh_val).sum()
        n_mod = np.less_equal(scores, -thresh_val).sum()
        frac_mod = n_mod / (n_mod + n_can)
        print('Fraction mod: {}'.format(frac_mod))
    llr_thresh = np.percentile(scores, frac_mod * 100)
    print('Threshold: {}'.format(llr_thresh))
def insert_data(in_mod_db_fns, out_mods_db, batch_size):
    """Copy per-read statistics from input databases (older full schema).

    Each record's identifiers are converted via ``get_data_dbids`` and
    inserted in batches of ``batch_size`` with a per-input progress bar.
    """
    LOGGER.info('Inserting modified base data')
    for in_mod_db_fn in in_mod_db_fns:
        mods_db = mods.ModsDb(in_mod_db_fn)
        bar = tqdm(
            desc=in_mod_db_fn, total=mods_db.get_num_uniq_stats(),
            smoothing=0, dynamic_ncols=True)
        batch_data = []
        for (score, uuid, mod_base, motif, motif_pos, raw_motif, strand, pos,
             chrm, chrm_len) in mods_db.iter_data():
            batch_data.append((score, *get_data_dbids(
                out_mods_db, chrm, strand, pos,
                (mod_base, motif, motif_pos, raw_motif), uuid)))
            if len(batch_data) >= batch_size:
                out_mods_db.insert_read_data(batch_data)
                batch_data = []
            bar.update()
        # flush final partial batch for this input database
        if len(batch_data) > 0:
            out_mods_db.insert_read_data(batch_data)
        mods_db.close()
        bar.close()
def insert_data_mp(
        in_mod_db_fns, out_mods_db, out_mods_db_fn, batch_size, max_proc,
        force_uint32, db_safety):
    """Merge per-read statistics using a pool of extraction workers.

    Workers take input database filenames from a queue (``None`` sentinels
    signal shutdown), convert records and push batches onto ``data_q``; the
    main process drains the queue and performs all inserts.
    """
    LOGGER.info('Merging modified base data using multiprocessing')
    num_proc = min(max_proc, len(in_mod_db_fns))
    in_db_fns_q = mp.Queue()
    for in_mod_db_fn in in_mod_db_fns:
        in_db_fns_q.put(in_mod_db_fn)
    # one shutdown sentinel per worker
    for _ in range(num_proc):
        in_db_fns_q.put(None)
    data_q = mp.Queue(maxsize=QUEUE_SIZE_LIMIT)
    data_ps = []
    for _ in range(num_proc):
        p = mp.Process(
            target=extract_data_worker,
            args=(in_db_fns_q, data_q, out_mods_db_fn, batch_size,
                  force_uint32, db_safety),
            daemon=True)
        p.start()
        data_ps.append(p)
    total_batches = 0
    # count batches for the progress bar total
    for in_mod_db_fn in in_mod_db_fns:
        mods_db = mods.ModsDb(in_mod_db_fn)
        total_batches += (mods_db.get_num_uniq_stats() // batch_size) + 1
        mods_db.close()
    bar = tqdm(desc='Statistics Batches', total=total_batches, smoothing=0,
               dynamic_ncols=True)
    while any(p.is_alive() for p in data_ps):
        try:
            batch_data = data_q.get(block=True, timeout=1)
        except queue.Empty:
            sleep(0.001)
            continue
        out_mods_db.insert_read_data(batch_data)
        bar.update()
    # drain batches that were queued after the last liveness check
    while not data_q.empty():
        batch_data = data_q.get(block=False)
        out_mods_db.insert_read_data(batch_data)
        bar.update()
    bar.close()
def insert_data_mp(in_mod_db_fns, out_mods_db, out_mods_db_fn, batch_size,
                   max_proc):
    """Merge per-read statistics using workers over a many-to-one pipe.

    Workers take input database filenames from a queue (``None`` sentinels
    signal shutdown) and send batches over per-worker connections of a
    ``SimplexManyToOneQueue``; the main process receives until all worker
    connections close and performs all inserts.
    """
    LOGGER.info('Merging modified base data using multiprocessing')
    num_proc = min(max_proc, len(in_mod_db_fns))
    in_db_fns_q = mp.Queue()
    for in_mod_db_fn in in_mod_db_fns:
        in_db_fns_q.put(in_mod_db_fn)
    # one shutdown sentinel per worker
    for _ in range(num_proc):
        in_db_fns_q.put(None)
    data_q = mega_mp.SimplexManyToOneQueue(max_size=QUEUE_SIZE_LIMIT)
    data_ps = []
    for _ in range(num_proc):
        data_conn = data_q.get_conn()
        p = mp.Process(target=extract_data_worker,
                       args=(in_db_fns_q, data_conn, out_mods_db_fn,
                             batch_size),
                       daemon=True)
        p.start()
        # close the parent's copy so EOF is seen when the worker exits
        data_conn.close()
        del data_conn
        data_ps.append(p)
    total_batches = 0
    # count batches for the progress bar total
    for in_mod_db_fn in in_mod_db_fns:
        in_mods_db = mods.ModsDb(in_mod_db_fn)
        total_batches += (in_mods_db.get_num_uniq_stats() // batch_size) + 1
        in_mods_db.close()
    bar = tqdm(desc='Data Batches', unit='Batches', total=total_batches,
               smoothing=0, dynamic_ncols=True)
    # loop ends once every worker has closed its connection
    while data_q.has_valid_conns:
        for batch_data in data_q.wait_recv():
            out_mods_db.insert_batch_data(batch_data)
            bar.update()
    bar.close()
def extract_chrms(in_mod_db_fns):
    """Collect chromosome names and lengths across input databases.

    Returns:
        [names, lengths]: two parallel lists of unique chromosome names and
        their lengths.

    Raises:
        mh.MegaError: if the same chromosome has differing lengths in two
            databases.
    """
    LOGGER.info('Merging chrm tables')
    ref_names_and_lens = [[], []]
    for in_mod_db_fn in tqdm(in_mod_db_fns, desc='Databases', unit='DBs',
                             smoothing=0, dynamic_ncols=True):
        mods_db = mods.ModsDb(in_mod_db_fn)
        for _, chrm, chrm_len in mods_db.iter_chrms():
            if chrm in ref_names_and_lens[0]:
                # chromosome seen before: lengths must agree
                prev_chrm_len = ref_names_and_lens[1][
                    ref_names_and_lens[0].index(chrm)]
                if prev_chrm_len != chrm_len:
                    raise mh.MegaError(
                        ('Chromosome lengths from databases do not agree ' +
                         'for {}: {} != {}').format(chrm, prev_chrm_len,
                                                    chrm_len))
            else:
                ref_names_and_lens[0].append(chrm)
                ref_names_and_lens[1].append(chrm_len)
        mods_db.close()
    return ref_names_and_lens
def _main(args):
    """Estimate a per-read log likelihood ratio calling threshold.

    Streams LLRs for the target mod base from position-grouped scores,
    estimates the modified fraction (unless given) from symmetric percentile
    tails, and prints the LLR threshold at that fraction's percentile.
    """
    logging.init_logger()
    LOGGER.info('Loading database position statistics')
    mods_db = mods.ModsDb(
        mh.get_megalodon_fn(args.megalodon_results_dir, mh.PR_MOD_NAME))
    db_mods = set(mod_base for mod_base, _ in mods_db.get_mod_long_names())
    if args.mod_base not in db_mods:
        raise mh.MegaError('Target modified base not found in mods database.')
    scores = []
    bar = tqdm(total=args.num_statistics, smoothing=0)
    for (chrm, strand, pos), mod_llrs in mods_db.iter_pos_scores(
            convert_pos=True, compute_llrs=True):
        for mod_base, reads_llrs in mod_llrs.items():
            # only collect statistics for the requested modified base
            if mod_base != args.mod_base:
                continue
            bar.update(len(reads_llrs))
            scores.extend(reads_llrs)
        if args.num_statistics is not None and bar.n >= args.num_statistics:
            break
    LOGGER.info('Esitmating fraction of modified bases')
    scores = np.array(scores)
    frac_mod = args.fraction_modified
    if frac_mod is None:
        # estimate modified fraction from confident calls in the two tails
        thresh_vals = np.percentile(
            scores, (args.mod_percentile, 100 - args.mod_percentile))
        thresh_val = np.abs(thresh_vals).min()
        n_can = np.greater_equal(scores, thresh_val).sum()
        n_mod = np.less_equal(scores, -thresh_val).sum()
        frac_mod = n_mod / (n_mod + n_can)
        print('Fraction mod: {}'.format(frac_mod))
    llr_thresh = np.percentile(scores, frac_mod * 100)
    print('Threshold: {}'.format(llr_thresh))