Example 1
    def run_model(self, raw_sig, n_can_state=None):
        if self.model_type == TAI_NAME:
            if any(arg is None for arg in (self.chunk_size, self.chunk_overlap,
                                           self.max_concur_chunks)):
                logger = logging.get_logger()
                logger.error('Must provide chunk_size, chunk_overlap, ' +
                             'max_concur_chunks in order to run the taiyaki ' +
                             'base calling backend.')
            try:
                trans_weights = self.tai_run_model(raw_sig, self.model,
                                                   self.chunk_size,
                                                   self.chunk_overlap,
                                                   self.max_concur_chunks)
            except AttributeError:
                raise mh.MegaError('Out of date or incompatible model')
            except RuntimeError:
                raise mh.MegaError('Likely out of memory error.')
            if self.device != self.torch.device('cpu'):
                self.torch.cuda.empty_cache()
            if n_can_state is not None:
                trans_weights = (
                    np.ascontiguousarray(trans_weights[:, :n_can_state]),
                    np.ascontiguousarray(trans_weights[:, n_can_state:]))
        else:
            raise mh.MegaError('Invalid model type.')

        return trans_weights
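
A note on that final split: column slices of a C-ordered array are views with a row stride, not contiguous blocks, so np.ascontiguousarray copies each block before it is handed to downstream decoding code. A minimal sketch with made-up shapes (45 output states, of which a hypothetical n_can_state=40 are canonical flip-flop scores):

import numpy as np

# toy stand-in for the taiyaki output: 8 time steps x 45 output states
trans_weights = np.random.rand(8, 45).astype(np.float32)
n_can_state = 40  # assumed value for a 4-base flip-flop model

can_weights = np.ascontiguousarray(trans_weights[:, :n_can_state])
mod_weights = np.ascontiguousarray(trans_weights[:, n_can_state:])

# the copies above make each block safe for C extensions that expect
# contiguous memory
assert can_weights.flags['C_CONTIGUOUS'] and mod_weights.flags['C_CONTIGUOUS']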
Example 2
def snps_validation(args, is_cat_mod, output_size, aligner):
    logger = logging.get_logger()
    if (mh.WHATSHAP_MAP_NAME in args.outputs and
            mh.SNP_NAME not in args.outputs):
        args.outputs.append(mh.SNP_NAME)
    if mh.SNP_NAME in args.outputs and mh.PR_SNP_NAME not in args.outputs:
        args.outputs.append(mh.PR_SNP_NAME)
    if mh.PR_SNP_NAME in args.outputs and args.variant_filename is None:
        logger.error('{} output requested, '.format(mh.PR_SNP_NAME) +
                     'but --variant-filename not provided.')
        sys.exit(1)
    if mh.PR_SNP_NAME in args.outputs and not (
            is_cat_mod or mh.nstate_to_nbase(output_size) == 4):
        logger.error(
            'SNP calling from naive modified base flip-flop model is ' +
            'not supported.')
        sys.exit(1)
    snp_calib_fn = mh.get_snp_calibration_fn(args.snp_calibration_filename,
                                             args.disable_snp_calibration)
    try:
        snps_data = snps.SnpData(
            args.variant_filename, args.max_indel_size, args.snp_all_paths,
            args.write_snps_text, args.variant_context_bases, snp_calib_fn,
            snps.HAPLIOD_MODE if args.haploid else snps.DIPLOID_MODE,
            args.refs_include_snps, aligner)
    except mh.MegaError as e:
        logger.error(str(e))
        sys.exit(1)
    if args.variant_filename is not None and mh.PR_SNP_NAME not in args.outputs:
        logger.warning(
            '--variant-filename provided, but SNP output not requested ' +
            '(via --outputs). Argument will be ignored.')
    return args, snps_data
Example 3
def parse_pr_ref_output(args):
    logger = logging.get_logger()
    if args.output_per_read_references:
        args.outputs.append(mh.PR_REF_NAME)
        if args.refs_include_snps and args.refs_include_mods:
            logger.error('Cannot output both modified base and SNPs in ' +
                         'per-read references (remove one of ' +
                         '--refs-include-snps or --refs-include-mods).')
            sys.exit(1)
        if args.refs_include_snps and mh.PR_SNP_NAME not in args.outputs:
            args.outputs.append(mh.PR_SNP_NAME)
            logger.warning('--refs-include-snps set, so adding ' +
                           'per_read_snps to --outputs.')
        if args.refs_include_mods and mh.PR_MOD_NAME not in args.outputs:
            args.outputs.append(mh.PR_MOD_NAME)
            logger.warning('--refs-include-mods set, so adding ' +
                           'per_read_mods to --outputs.')
    else:
        if args.refs_include_snps:
            logger.warning(
                '--refs-include-snps but not --output-per-read-references ' +
                'set. Ignoring --refs-include-snps.')
        if args.refs_include_mods:
            logger.warning(
                '--refs-include-mods but not --output-per-read-references ' +
                'set. Ignoring --refs-include-mods.')
    min_len, max_len = (args.refs_length_range if args.refs_length_range
                        is not None else (None, None))
    pr_ref_filts = mh.PR_REF_FILTERS(
        pct_idnt=args.refs_percent_identity_threshold,
        pct_cov=args.refs_percent_coverage_threshold,
        min_len=min_len,
        max_len=max_len)

    return args, pr_ref_filts
Example 4
def _fill_files_queue(read_file_q, fast5s_dir, num_reads, read_ids_fn,
                      recursive, num_ps, num_reads_conn):
    logger = logging.get_logger()
    valid_read_ids = None
    if read_ids_fn is not None:
        with open(read_ids_fn) as read_ids_fp:
            valid_read_ids = set(line.strip() for line in read_ids_fp)
    used_read_ids = set()
    # fill queue with read filename and read id tuples
    for fast5_fn, read_id in fast5_io.iterate_fast5_reads(
            fast5s_dir, num_reads, recursive):
        if fast5_fn is None or read_id is None:
            continue
        if valid_read_ids is not None and read_id not in valid_read_ids:
            continue
        if read_id in used_read_ids:
            logger.debug(('Read ID ({}) was already processed and will not ' +
                          'be processed again from {}.').format(
                              read_id, fast5_fn))
            continue
        read_file_q.put((fast5_fn, read_id))
        used_read_ids.add(read_id)
    # add None to indicate that read processes should return
    for _ in range(num_ps):
        read_file_q.put((None, None))
    num_reads_conn.send(len(used_read_ids))

    return
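
The (None, None) items queued at the end are shutdown sentinels: each consumer process exits its loop when it pops one, so exactly num_ps sentinels are enqueued. A self-contained sketch of the pattern (worker logic is hypothetical):

import multiprocessing as mp

def _worker(q):
    while True:
        fast5_fn, read_id = q.get()
        if fast5_fn is None:  # sentinel: no more work
            break
        print('processing', read_id, 'from', fast5_fn)

if __name__ == '__main__':
    num_ps = 2
    q = mp.Queue()
    for item in (('a.fast5', 'r1'), ('b.fast5', 'r2')):
        q.put(item)
    for _ in range(num_ps):  # one sentinel per worker process
        q.put((None, None))
    ps = [mp.Process(target=_worker, args=(q,)) for _ in range(num_ps)]
    for p in ps:
        p.start()
    for p in ps:
        p.join()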
Example 5
def aligner_validation(args):
    logger = logging.get_logger()
    if len(mh.ALIGN_OUTPUTS.intersection(args.outputs)) > 0:
        if args.reference is None:
            logger.error(
                ('Output(s) requiring reference alignment requested ({}), ' +
                 'but --reference not provided.').format(', '.join(
                     mh.ALIGN_OUTPUTS.intersection(args.outputs))))
            sys.exit(1)
        logger.info('Loading reference.')
        if not (os.path.exists(args.reference)
                and os.path.isfile(args.reference)):
            logger.error('Provided reference file does not exist or is ' +
                         'not a file.')
            sys.exit(1)
        aligner = mapping.alignerPlus(str(args.reference),
                                      preset=str('map-ont'),
                                      best_n=1)
        setattr(aligner, 'out_fmt', args.mappings_format)
        setattr(aligner, 'ref_fn', mh.resolve_path(args.reference))
        aligner.add_ref_lens()
        mapping.test_open_alignment_out_file(args.output_directory,
                                             aligner.out_fmt,
                                             aligner.ref_names_and_lens,
                                             aligner.ref_fn)
    else:
        aligner = None
        if args.reference is not None:
            logger.warning(
                '[--reference] provided, but no [--outputs] requiring ' +
                'alignment were requested. Argument will be ignored.')
    return aligner
Example 6
    def add_diploid_probs(self, probs, gts):
        # phred scaled likelihoods
        with np.errstate(divide='ignore'):
            gl = np.log10(probs)
        raw_pl = -10 * gl
        # "normalized" PL values stored as decsribed by VCF format
        pl = np.minimum(raw_pl - raw_pl.min(), mh.MAX_PL_VALUE)
        s_pl = np.sort(pl)

        # add sample tags
        self.add_sample_field('GT', gts[np.argmax(probs)])
        try:
            qual = int(np.minimum(np.around(raw_pl[0]), mh.MAX_PL_VALUE))
        except ValueError:
            logger = logging.get_logger()
            logger.debug(
                'NAN quality value encountered. gts:{}, probs:{}'.format(
                    str(gts), str(probs)))
            qual = mh.MAX_PL_VALUE
        self.qual = '{:.0f}'.format(qual) if qual > 0 else '.'
        self.add_sample_field('GQ', '{:.0f}'.format(np.around(s_pl[1])))
        self.add_sample_field(
            'GL',
            ','.join('{:.2f}' for _ in range(probs.shape[0])).format(*gl))
        self.add_sample_field(
            'PL',
            ','.join('{:.0f}'
                     for _ in range(probs.shape[0])).format(*np.around(pl)))
        return
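
The PL arithmetic here is standard VCF phred scaling: log10 genotype likelihoods are multiplied by -10, shifted so the best genotype sits at 0, and capped. A worked example with made-up probabilities (the 999 cap stands in for mh.MAX_PL_VALUE):

import numpy as np

MAX_PL_VALUE = 999  # assumed cap
probs = np.array([0.01, 0.9, 0.09])  # hypothetical genotype probabilities
with np.errstate(divide='ignore'):
    gl = np.log10(probs)             # [-2.0, -0.046, -1.046]
raw_pl = -10 * gl                    # [20.0, 0.46, 10.46]
pl = np.minimum(raw_pl - raw_pl.min(), MAX_PL_VALUE)
print(np.around(pl))                 # [20. 0. 10.]; best genotype gets PL 0
print(np.around(np.sort(pl)[1]))     # 10: the GQ, gap to the next-best call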
Example 7
    def __init__(self,
                 variant_fn,
                 max_indel_size,
                 all_paths,
                 write_snps_txt,
                 context_bases,
                 snps_calib_fn=None,
                 call_mode=DIPLOID_MODE,
                 do_pr_ref_snps=False,
                 aligner=None,
                 keep_snp_fp_open=False):
        logger = logging.get_logger('snps')
        self.max_indel_size = max_indel_size
        self.all_paths = all_paths
        self.write_snps_txt = write_snps_txt
        self.snps_calib_fn = snps_calib_fn
        self.calib_table = calibration.SnpCalibrator(self.snps_calib_fn)
        self.context_bases = context_bases
        if len(self.context_bases) != 2:
            raise mh.MegaError(
                'Must provide 2 context bases values (for single base SNPs ' +
                'and indels).')
        self.call_mode = call_mode
        self.do_pr_ref_snps = do_pr_ref_snps
        self.variant_fn = variant_fn
        self.variants_idx = None
        if self.variant_fn is None:
            return

        logger.info('Loading variants.')
        vars_idx = pysam.VariantFile(self.variant_fn)
        try:
            contigs = list(vars_idx.header.contigs.keys())
            vars_idx.fetch(next(iter(contigs)), 0, 0)
        except ValueError:
            logger.warning(
                'Variants file must be indexed. Performing indexing now.')
            vars_idx.close()
            self.variant_fn = index_variants(self.variant_fn)
            vars_idx = pysam.VariantFile(self.variant_fn)
        if keep_snp_fp_open:
            self.variants_idx = vars_idx
        else:
            vars_idx.close()
            self.variants_idx = None

        if aligner is None:
            raise mh.MegaError(
                'Must provide aligner if SNP filename is provided')
        if len(set(aligner.ref_names_and_lens[0]).intersection(contigs)) == 0:
            raise mh.MegaError((
                'Reference and variant files contain no chromosomes/contigs ' +
                'in common.\n\t\tFirst 3 reference contigs:\t{}\n\t\tFirst 3 '
                + 'variant file contigs:\t{}').format(
                    ', '.join(aligner.ref_names_and_lens[0][:3]),
                    ', '.join(contigs[:3])))

        return
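
The fetch-then-ValueError probe above is the idiomatic pysam test for a missing index. A sketch of the same check that falls back to pysam.tabix_index rather than megalodon's own index_variants helper (filename handling simplified):

import pysam

def ensure_variant_index(variant_fn):
    vars_idx = pysam.VariantFile(variant_fn)
    try:
        contigs = list(vars_idx.header.contigs.keys())
        # fetch on an unindexed file raises ValueError
        vars_idx.fetch(next(iter(contigs)), 0, 0)
        vars_idx.close()
        return variant_fn
    except ValueError:
        vars_idx.close()
        # bgzip-compresses the file if needed, writes a .tbi index and
        # returns the (possibly .gz-suffixed) indexed filename
        return pysam.tabix_index(variant_fn, preset='vcf', force=True)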
Example 8
def profile_validation(args):
    logger = logging.get_logger()
    if args.processes > 1:
        msg = ('Running profiling with multiple processes is ' +
               'not allowed. Setting to single process.')
        args.processes = 1
    else:
        msg = 'Running profiling. This may slow processing.'
    logger.warning(msg)
    return args
Example 9
def sort_and_index_mapping(map_fn, out_fn, ref_fn=None, do_index=False):
    sort_args = ['-O', 'BAM', '-o', out_fn, map_fn]
    if ref_fn is not None:
        sort_args.extend(('--reference', ref_fn))
    try:
        pysam.sort(*sort_args)
        if do_index:
            sleep(1)
            pysam.index(out_fn)
    except pysam.utils.SamtoolsError:
        logger = logging.get_logger()
        logger.warning('Sorting and/or indexing mapping failed.')

    return
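
pysam.sort and pysam.index forward their arguments directly to the samtools command line, so the call above is equivalent to samtools sort -O BAM -o out.bam in.bam followed by samtools index. A hypothetical invocation:

# filenames are placeholders; writes mappings.sorted.bam and, with
# do_index=True, a mappings.sorted.bam.bai index next to it
sort_and_index_mapping('mappings.bam', 'mappings.sorted.bam', do_index=True)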
Example 10
def mkdir(out_dir, overwrite):
    logger = logging.get_logger()
    if os.path.exists(out_dir):
        if not overwrite:
            sys.stderr.write(
                'ERROR: --output-directory exists and --overwrite is ' +
                'not set.\n')
            sys.exit(1)
        if os.path.isfile(out_dir) or os.path.islink(out_dir):
            os.remove(out_dir)
        else:
            shutil.rmtree(out_dir)
    os.mkdir(out_dir)

    return
Example 11
def _main(args):
    mh.mkdir(args.output_megalodon_results_dir, args.overwrite)
    logging.init_logger(args.output_megalodon_results_dir)
    logger = logging.get_logger()

    logger.info('Opening new sequence variant statistics database')
    out_vars_db = variants.VarsDb(
        mh.get_megalodon_fn(args.output_megalodon_results_dir, mh.PR_VAR_NAME),
        read_only=False,
        loc_index_in_memory=not args.var_locations_on_disk,
        uuid_index_in_memory=True)

    for mega_dir in args.megalodon_results_dirs:
        logger.info(
            'Adding sequence variant statistics from {}'.format(mega_dir))
        # full read only mode with no indices read into memory
        vars_db = variants.VarsDb(mh.get_megalodon_fn(mega_dir,
                                                      mh.PR_VAR_NAME),
                                  read_only=True,
                                  chrm_index_in_memory=False,
                                  alt_index_in_memory=False,
                                  uuid_index_in_memory=False)
        bar = tqdm(desc=mega_dir,
                   total=vars_db.get_num_uniq_stats(),
                   smoothing=0,
                   dynamic_ncols=True)
        for (score, uuid, strand, alt_seq, ref_seq, pos, var_name, test_end,
             test_start, chrm, chrm_len) in vars_db.iter_data():
            chrm_id = out_vars_db.get_chrm_id_or_insert(chrm, chrm_len)
            loc_id = out_vars_db.get_loc_id_or_insert(chrm_id, test_start,
                                                      test_end, pos, ref_seq,
                                                      var_name)
            alt_id = out_vars_db.get_alt_id_or_insert(alt_seq)
            read_id = out_vars_db.get_read_id_or_insert(uuid)
            out_vars_db.insert_data(score, loc_id, alt_id, read_id)
            bar.update()
        bar.close()

    logger.info('Creating indices and closing database')
    if out_vars_db.chrm_idx_in_mem:
        out_vars_db.create_chrm_index()
    if out_vars_db.loc_idx_in_mem:
        out_vars_db.create_loc_index()
    if out_vars_db.alt_idx_in_mem:
        out_vars_db.create_alt_index()
    out_vars_db.create_data_covering_index()
    out_vars_db.close()
Example 12
def main():
    args = get_parser().parse_args()
    log_suffix = ('aggregation' if args.output_suffix is None else
                  'aggregation.' + args.output_suffix)
    logging.init_logger(args.output_directory, out_suffix=log_suffix)
    logger = logging.get_logger()

    mod_agg_info = mods.AGG_INFO(mods.BIN_THRESH_NAME,
                                 args.mod_binary_threshold)
    mod_names = []
    if mh.MOD_NAME in args.outputs:
        logger.info('Loading model.')
        mod_names = backends.ModelInfo(
            mh.get_model_fn(args.taiyaki_model_filename)).mod_long_names
    if args.reference is not None:
        logger.info('Loading reference.')
    aligner = mapping.alignerPlus(str(args.reference),
                                  preset=str('map-ont'),
                                  best_n=1)
    if args.reference is not None:
        aligner.add_ref_lens()
    valid_read_ids = None
    if args.read_ids_filename is not None:
        with open(args.read_ids_filename) as read_ids_fp:
            valid_read_ids = set(line.strip() for line in read_ids_fp)
    aggregate.aggregate_stats(
        args.outputs, args.output_directory, args.processes,
        args.write_vcf_log_probs, args.heterozygous_factors,
        snps.HAPLIOD_MODE if args.haploid else snps.DIPLOID_MODE, mod_names,
        mod_agg_info, args.write_mod_log_probs, args.mod_output_formats,
        args.suppress_progress, aligner.ref_names_and_lens, valid_read_ids,
        args.output_suffix)

    # note reference is required in order to annotate contigs for VCF writing
    if mh.SNP_NAME in args.outputs and args.reference is not None:
        logger.info('Sorting output variant file')
        variant_fn = mh.add_fn_suffix(
            mh.get_megalodon_fn(args.output_directory, mh.SNP_NAME),
            args.output_suffix)
        sort_variant_fn = mh.add_fn_suffix(variant_fn, 'sorted')
        snps.sort_variants(variant_fn, sort_variant_fn)
        logger.info('Indexing output variant file')
        index_var_fn = snps.index_variants(sort_variant_fn)

    return
Example 13
def mods_validation(args, model_info):
    logger = logging.get_logger()
    if args.refs_include_mods and mh.PR_MOD_NAME not in args.outputs:
        # TODO don't really have to output this data, but have to compute it
        # so sort out how to compute the output but not output it
        args.outputs.append(mh.PR_MOD_NAME)
    if mh.PR_MOD_NAME not in args.outputs and mh.MOD_NAME in args.outputs:
        args.outputs.append(mh.PR_MOD_NAME)
    if mh.PR_MOD_NAME in args.outputs and not model_info.is_cat_mod:
        logger.error(
            '{} output requested, '.format(mh.PR_MOD_NAME) +
            'but the model provided is not a categorical modified base ' +
            'model.\nNote that modified base calling from a naive modified ' +
            'base model is not currently supported.')
        sys.exit(1)
    if (model_info.is_cat_mod and mh.PR_MOD_NAME not in args.outputs and
            mh.BC_MODS_NAME not in args.outputs):
        logger.warning(
            ('Categorical modifications model provided, but neither {} nor ' +
             '{} requested (via --outputs). Modified base output will not ' +
             'be produced.').format(mh.PR_MOD_NAME, mh.BC_MODS_NAME))
    if args.mod_motif is not None and mh.PR_MOD_NAME not in args.outputs:
        logger.warning((
            '--mod-motif provided, but {} not requested (via --outputs). ' +
            'Argument will be ignored.').format(mh.PR_MOD_NAME))
    if args.refs_include_mods and mh.PR_REF_NAME not in args.outputs:
        logger.warning((
            '--refs-include-mods provided, but {} not requested ' +
            '(via --outputs). Argument will be ignored.').format(
                mh.PR_REF_NAME))
    mod_calib_fn = mh.get_mod_calibration_fn(
        args.mod_calibration_filename, args.disable_mod_calibration)
    mods_info = mods.ModInfo(
        model_info, args.mod_motif, args.mod_all_paths,
        args.write_mods_text, args.mod_context_bases,
        mh.BC_MODS_NAME in args.outputs, args.refs_include_mods, mod_calib_fn,
        args.mod_output_formats)
    return args, mods_info
Example 14
import os
import sys
from collections import defaultdict

import numpy as np
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages

from megalodon import calibration, logging, megalodon_helper as mh
from ._extras_parsers import get_parser_calibrate_variants

LOGGER = logging.get_logger()
INVALID_CALIB_MSG = (
    'Encountered invalid distributions for calibration. Not saving ' +
    'calibration file, but pdf will be plotted in order to identify ' +
    'potential issues.')


def plot_calib(pdf_fp, var_type, smooth_ls, s_ref, sm_ref, s_alt, sm_alt,
               mono_prob, prob_alt):
    f, axarr = plt.subplots(3, sharex=True, figsize=(11, 7))
    axarr[0].plot(smooth_ls, s_ref, color='orange')
    axarr[0].plot(smooth_ls, sm_ref, color='red')
    axarr[0].plot(smooth_ls, s_alt, color='grey')
    axarr[0].plot(smooth_ls, sm_alt, color='blue')
    axarr[0].set_ylabel(
        'Probability Density\nred/orange=canonical\nblue/grey=modified')
    axarr[0].set_title(var_type + ' Calibration')
    axarr[1].plot(smooth_ls, mono_prob, color='orange')
    axarr[1].plot(smooth_ls, 1 / (np.exp(smooth_ls) + 1), color='purple')
    axarr[1].set_ylabel(
        # the source snippet is truncated here; label text below is assumed
        'Alt Allele Probability\norange=calibrated, purple=raw')
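
The purple reference curve in the second panel is the raw, uncalibrated mapping from a log likelihood ratio to an alternative-allele probability, i.e. a logistic function of -llr. A quick numeric check:

import numpy as np

llr = np.array([-4.0, 0.0, 4.0])   # log(p_ref / p_alt)
p_alt = 1 / (np.exp(llr) + 1)
print(p_alt.round(3))              # [0.982 0.5   0.018]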
Example 15
def _get_fail_queue(failed_reads_q, f_conn, getter_num_reads_conn,
                    num_update_errors, suppress_progress):
    def update_prog(reads_called, sig_called, unexp_err_fp):
        if is_err:
            failed_reads[err_type].append(fast5_fn)
            if err_type == _UNEXPECTED_ERROR_CODE:
                if len(failed_reads[_UNEXPECTED_ERROR_CODE]) == 1:
                    unexp_err_fp = open(
                        _UNEXPECTED_ERROR_FN.format(np.random.randint(10000)),
                        'w')
                if len(failed_reads[err_type]) >= _MAX_NUM_UNEXP_ERRORS:
                    unexp_err_fp.close()
                else:
                    unexp_err_fp.write(fast5_fn + '\n:::\n' + err_tb +
                                       '\n\n\n')
                    unexp_err_fp.flush()
        if do_update_prog:
            if not suppress_progress:
                try:
                    bar.set_postfix({
                        'ksample/s':
                        (sig_called / 1000) / bar.format_dict['elapsed']
                    })
                except AttributeError:
                    # bar.format_dict is sometimes unavailable, so skip
                    # the ksample/s postfix in that case
                    pass
                bar.update(1)
            reads_called += 1
        if num_update_errors > 0:
            bar.write(prog_prefix + format_fail_summ(bar_header, [
                (len(fns), err) for err, fns in failed_reads.items()
            ], reads_called, num_update_errors),
                      file=sys.stderr)

        return reads_called, unexp_err_fp

    logger = logging.get_logger()
    logger.info('Processing reads.')
    reads_called, sig_called = 0, 0
    unexp_err_fp = None
    failed_reads = defaultdict(list)
    bar, prog_prefix, bar_header = prep_errors_bar(num_update_errors, None,
                                                   suppress_progress)
    while True:
        try:
            try:
                (is_err, do_update_prog, err_type, fast5_fn, err_tb,
                 n_sig) = failed_reads_q.get(block=False)
                sig_called += n_sig
                reads_called, unexp_err_fp = update_prog(
                    reads_called, sig_called, unexp_err_fp)
            except queue.Empty:
                # get total number of reads once all reads are enumerated
                if bar is not None and bar.total is None:
                    if getter_num_reads_conn.poll():
                        bar.total = getter_num_reads_conn.recv()
                else:
                    # if all reads are done signal was sent from main thread
                    if f_conn.poll():
                        break
                sleep(0.1)
                continue
        except KeyboardInterrupt:
            # exit gracefully on keyboard interrupt
            return
    if not suppress_progress: bar.close()

    if len(failed_reads[_UNEXPECTED_ERROR_CODE]) >= 1:
        logger.warning(
            ('Unexpected errors occurred. See full ' +
             'error stack traces for first (up to) {0:d} errors in ' +
             '"{1}"').format(_MAX_NUM_UNEXP_ERRORS, unexp_err_fp.name))
    if any(len(fns) > 0 for fns in failed_reads.values()):
        logger.info(
            format_fail_summ(
                'Unsuccessful processing types:',
                [(len(fns), err)
                 for err, fns in failed_reads.items() if len(fns) > 0],
                reads_called))
        # TODO flag to output failed read names to file

    return
Example 16
def _main():
    args = get_parser().parse_args()

    mkdir(args.output_directory, args.overwrite)
    logging.init_logger(args.output_directory)
    logger = logging.get_logger()
    logger.debug('Command: """' + ' '.join(sys.argv) + '"""')

    if _DO_PROFILE:
        args = profile_validation(args)

    args, pr_ref_filts = parse_pr_ref_output(args)
    tai_model_fn = mh.get_model_fn(args.taiyaki_model_filename)
    model_info = backends.ModelInfo(tai_model_fn, args.devices, args.processes,
                                    args.chunk_size, args.chunk_overlap,
                                    args.max_concurrent_chunks)
    args, mods_info = mods_validation(args, model_info)
    aligner = aligner_validation(args)
    args, snps_data = snps_validation(args, model_info.is_cat_mod,
                                      model_info.output_size, aligner)

    process_all_reads(args.fast5s_dir, not args.not_recursive, args.num_reads,
                      args.read_ids_filename, model_info, args.outputs,
                      args.output_directory, args.basecalls_format, aligner,
                      snps_data, args.processes, args.verbose_read_progress,
                      args.suppress_progress, mods_info, args.database_safety,
                      args.edge_buffer, pr_ref_filts)

    if mh.MAP_NAME in args.outputs:
        logger.info('Spawning process to sort mappings')
        map_p = post_process_mapping(args.output_directory, aligner.out_fmt,
                                     aligner.ref_fn)

    if mh.WHATSHAP_MAP_NAME in args.outputs:
        logger.info('Spawning process to sort whatshap mappings')
        whatshap_sort_fn, whatshap_p = post_process_whatshap(
            args.output_directory, aligner.out_fmt, aligner.ref_fn)

    if mh.SNP_NAME in args.outputs or mh.MOD_NAME in args.outputs:
        post_process_aggregate(
            mods_info, args.outputs, args.mod_binary_threshold,
            args.output_directory, args.processes, args.write_vcf_log_probs,
            args.heterozygous_factors, snps_data, args.write_mod_log_probs,
            args.suppress_progress, aligner.ref_names_and_lens)

    if mh.SNP_NAME in args.outputs:
        logger.info('Sorting output variant file')
        variant_fn = mh.get_megalodon_fn(args.output_directory, mh.SNP_NAME)
        sort_variant_fn = mh.add_fn_suffix(variant_fn, 'sorted')
        snps.sort_variants(variant_fn, sort_variant_fn)
        logger.info('Indexing output variant file')
        index_variant_fn = snps.index_variants(sort_variant_fn)

    if mh.WHATSHAP_MAP_NAME in args.outputs:
        if whatshap_p.is_alive():
            logger.info('Waiting for whatshap mappings sort')
            while whatshap_p.is_alive():
                sleep(0.1)
        logger.info(
            snps.get_whatshap_command(index_variant_fn, whatshap_sort_fn,
                                      mh.add_fn_suffix(variant_fn, 'phased')))

    if mh.MAP_NAME in args.outputs:
        if map_p.is_alive():
            logger.info('Waiting for mappings sort')
            while map_p.is_alive():
                sleep(0.1)

    return
Example 17
def aggregate_stats(outputs,
                    out_dir,
                    num_ps,
                    write_vcf_lp,
                    het_factors,
                    call_mode,
                    mod_names,
                    mod_agg_info,
                    write_mod_lp,
                    mod_output_fmts,
                    suppress_progress,
                    ref_names_and_lens,
                    valid_read_ids=None,
                    out_suffix=None):
    if mh.SNP_NAME in outputs and mh.MOD_NAME in outputs:
        num_ps = max(num_ps // 2, 1)

    logger = logging.get_logger('agg')
    num_snps, num_mods, snp_prog_q, mod_prog_q = (0, 0, queue.Queue(),
                                                  queue.Queue())
    if mh.SNP_NAME in outputs:
        snps_db_fn = mh.get_megalodon_fn(out_dir, mh.PR_SNP_NAME)
        logger.info('Computing number of unique variants.')
        num_snps = snps.AggSnps(snps_db_fn).num_uniq()
        logger.info('Spawning variant aggregation processes.')
        # create process to collect snp stats from workers
        snp_stats_q, snp_stats_p, main_snp_stats_conn = mh.create_getter_q(
            _get_snp_stats_queue,
            (out_dir, ref_names_and_lens, out_suffix, write_vcf_lp))
        # create process to fill snp locs queue
        snp_filler_q = mp.Queue(maxsize=mh._MAX_QUEUE_SIZE)
        snp_filler_p = mp.Process(target=_fill_locs_queue,
                                  args=(snp_filler_q, snps_db_fn, snps.AggSnps,
                                        num_ps),
                                  daemon=True)
        snp_filler_p.start()
        # create worker processes to aggregate snps
        snp_prog_q = mp.Queue(maxsize=mh._MAX_QUEUE_SIZE)
        agg_snps_ps = []
        for _ in range(num_ps):
            p = mp.Process(target=_agg_snps_worker,
                           args=(snp_filler_q, snp_stats_q, snp_prog_q,
                                 snps_db_fn, write_vcf_lp, het_factors,
                                 call_mode, valid_read_ids),
                           daemon=True)
            p.start()
            agg_snps_ps.append(p)

    if mh.MOD_NAME in outputs:
        mods_db_fn = mh.get_megalodon_fn(out_dir, mh.PR_MOD_NAME)
        num_mods = mods.AggMods(mods_db_fn).num_uniq()
        logger.info('Spawning modified base aggregation processes.')
        # create process to collect mods stats from workers
        mod_stats_q, mod_stats_p, main_mod_stats_conn = mh.create_getter_q(
            _get_mod_stats_queue, (out_dir, mod_names, ref_names_and_lens,
                                   out_suffix, write_mod_lp, mod_output_fmts))
        # create process to fill mod locs queue
        mod_filler_q = mp.Queue(maxsize=100000)
        mod_fill_limit = _N_MOD_PROF if _DO_PROF else None
        mod_filler_p = mp.Process(target=_fill_locs_queue,
                                  args=(mod_filler_q, mods_db_fn, mods.AggMods,
                                        num_ps, mod_fill_limit),
                                  daemon=True)
        mod_filler_p.start()
        # create worker processes to aggregate mods
        mod_prog_q = mp.Queue(maxsize=mh._MAX_QUEUE_SIZE)
        agg_mods_ps = []
        for _ in range(num_ps):
            p = mp.Process(target=_agg_mods_worker,
                           args=(mod_filler_q, mod_stats_q, mod_prog_q,
                                 mods_db_fn, mod_agg_info, valid_read_ids,
                                 write_mod_lp),
                           daemon=True)
            p.start()
            agg_mods_ps.append(p)

    # create progress process
    logger.info('Aggregating {} SNPs and {} mod sites over reads.'.format(
        num_snps, num_mods))
    main_prog_conn, prog_conn = mp.Pipe()
    prog_p = mp.Process(target=_agg_prog_worker,
                        args=(snp_prog_q, mod_prog_q, num_snps, num_mods,
                              prog_conn, suppress_progress),
                        daemon=True)
    prog_p.start()

    # join filler processes first
    if mh.SNP_NAME in outputs:
        snp_filler_p.join()
        for agg_snps_p in agg_snps_ps:
            agg_snps_p.join()
        # signal the snp stats getter process to finish
        if snp_stats_p.is_alive():
            main_snp_stats_conn.send(True)
        snp_stats_p.join()
    if mh.MOD_NAME in outputs:
        for agg_mods_p in agg_mods_ps:
            agg_mods_p.join()
        if mod_stats_p.is_alive():
            main_mod_stats_conn.send(True)
        mod_stats_p.join()
    if prog_p.is_alive():
        main_prog_conn.send(True)
        prog_p.join()

    return
Example 18
def call_read_snps(snps_data, r_ref_pos, edge_buffer, r_ref_seq, rl_cumsum,
                   r_to_q_poss, r_post, post_mapped_start):
    # call all snps overlapping this read
    r_snp_calls = []
    for (snp_ref_seq, snp_alt_seqs, snp_id,
         snp_ref_pos) in snps_data.iter_overlapping_snps(
             r_ref_pos, edge_buffer):

        if r_ref_pos.strand == 1:
            read_pos = snp_ref_pos - r_ref_pos.start
            read_ref_seq = snp_ref_seq
            read_alt_seqs = snp_alt_seqs
        else:
            read_pos = r_ref_pos.end - snp_ref_pos - len(snp_ref_seq)
            read_ref_seq = mh.revcomp(snp_ref_seq)
            read_alt_seqs = [mh.revcomp(alt_seq) for alt_seq in snp_alt_seqs]

        # select single base SNP or indel context width
        snp_context_bases = snps_data.snp_context if all(
            len(snp_ref_seq) == len(snp_alt_seq)
            for snp_alt_seq in snp_alt_seqs) else snps_data.indel_context
        pos_bb = min(snp_context_bases, read_pos)
        pos_ab = min(snp_context_bases,
                     r_ref_seq.shape[0] - read_pos - len(read_ref_seq))
        pos_ref_seq = r_ref_seq[read_pos - pos_bb:read_pos + pos_ab +
                                len(read_ref_seq)]
        # TODO move this to an initial check of a small number of variants
        # against the reference
        if any(pos_ref_seq[pos_bb:pos_bb + len(snp_ref_seq)] != np.array(
            [mh.ALPHABET.find(b) for b in read_ref_seq])):
            # variant reference sequence does not match fasta reference
            logger = logging.get_logger()
            logger.debug(
                '*' * 10 +
                'Reference seq at {} expected {}[{}]{} got "{}"'.format(
                    snp_ref_pos,
                    ''.join(mh.ALPHABET[b]
                            for b in pos_ref_seq[pos_bb - 3:pos_bb]),
                    ''.join(mh.ALPHABET[b]
                            for b in pos_ref_seq[pos_bb:pos_bb +
                                                 len(snp_ref_seq)]),
                    ''.join(
                        mh.ALPHABET[b]
                        for b in pos_ref_seq[pos_bb + len(snp_ref_seq):pos_bb +
                                             len(snp_ref_seq) + 3]),
                    read_ref_seq,
                ) + '*' * 10)
            continue
        blk_start = rl_cumsum[r_to_q_poss[read_pos - pos_bb]]
        blk_end = rl_cumsum[r_to_q_poss[read_pos + pos_ab] + 1]
        if blk_end - blk_start < max(
                len(pos_ref_seq),
                max(len(read_alt_seq) for read_alt_seq in read_alt_seqs)):
            # no valid mapping over large inserted query bases
            # i.e. need as many "events/strides" as bases for valid mapping
            continue

        loc_ref_score = score_seq(r_post, pos_ref_seq,
                                  post_mapped_start + blk_start,
                                  post_mapped_start + blk_end,
                                  snps_data.all_paths)
        loc_alt_llrs = []
        for read_alt_seq in read_alt_seqs:
            pos_alt_seq = np.concatenate([
                pos_ref_seq[:pos_bb],
                np.array([mh.ALPHABET.find(b) for b in read_alt_seq],
                         dtype=np.uintp),
                pos_ref_seq[pos_bb + len(snp_ref_seq):]
            ])
            loc_alt_score = score_seq(r_post, pos_alt_seq,
                                      post_mapped_start + blk_start,
                                      post_mapped_start + blk_end,
                                      snps_data.all_paths)
            # calibrate log probs
            loc_alt_llrs.append(
                snps_data.calibrate_llr(loc_ref_score - loc_alt_score,
                                        read_ref_seq, read_alt_seq))

        # after calibration, multi-allelic log likelihood ratios can imply
        # a negative reference likelihood, so re-normalize here
        loc_alt_log_ps = calibration.compute_log_probs(np.array(loc_alt_llrs))

        r_snp_calls.append(
            (snp_ref_pos, loc_alt_log_ps, snp_ref_seq, snp_alt_seqs, snp_id))

    return r_snp_calls
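
The reverse-strand branch near the top is worth a worked example: on the minus strand the read carries the reverse complement of the reference, so a variant's start in read coordinates is measured from the reference end. Toy numbers:

# a read covering reference positions [100, 110) and a 2-base variant
# starting at reference position 103 (values are made up)
start, end = 100, 110
snp_ref_pos, snp_ref_len = 103, 2

fwd_read_pos = snp_ref_pos - start              # 3 on the forward strand
rev_read_pos = end - snp_ref_pos - snp_ref_len  # 5 on the reverse strand

# the variant's last reference base becomes its first read base, so the
# two offsets mirror each other across the 10-base read
assert rev_read_pos == (end - start) - fwd_read_pos - snp_ref_len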
Example 19
def _agg_prog_worker(snp_prog_q, mod_prog_q, num_snps, num_mods, prog_conn,
                     suppress_progress):
    snp_bar, mod_bar = None, None
    if num_snps > 0:
        if num_mods > 0 and not suppress_progress:
            mod_bar = tqdm(desc='Mods',
                           unit=' sites',
                           total=num_mods,
                           position=1,
                           smoothing=0,
                           dynamic_ncols=True)
            snp_bar = tqdm(desc='SNPs',
                           unit=' sites',
                           total=num_snps,
                           position=0,
                           smoothing=0,
                           dynamic_ncols=True)
        elif not suppress_progress:
            snp_bar = tqdm(desc='SNPs',
                           unit=' sites',
                           total=num_snps,
                           position=0,
                           smoothing=0,
                           dynamic_ncols=True)
    elif num_mods > 0 and not suppress_progress:
        mod_bar = tqdm(desc='Mods',
                       unit=' sites',
                       total=num_mods,
                       position=0,
                       smoothing=0,
                       dynamic_ncols=True)

    logger = logging.get_logger()
    while True:
        try:
            snp_prog_q.get(block=False)
            if not suppress_progress:
                if snp_bar is not None: snp_bar.update(1)
                if mod_bar is not None: mod_bar.update(0)
        except queue.Empty:
            try:
                mod_prog_q.get(block=False)
                if not suppress_progress:
                    if snp_bar is not None: snp_bar.update(0)
                    if mod_bar is not None: mod_bar.update(1)
            except queue.Empty:
                sleep(0.001)
                if prog_conn.poll():
                    break
                continue

    while not snp_prog_q.empty():
        snp_prog_q.get(block=False)
        if not suppress_progress: snp_bar.update(1)
    while not mod_prog_q.empty():
        mod_prog_q.get(block=False)
        if not suppress_progress: mod_bar.update(1)
    if snp_bar is not None:
        snp_bar.close()
    if mod_bar is not None:
        mod_bar.close()
    if num_mods > 0 and num_snps > 0 and not suppress_progress:
        sys.stderr.write('\n\n')

    return
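
The position argument is what keeps the two bars from overwriting each other: each bar is pinned to its own terminal line. A minimal standalone demo:

from time import sleep
from tqdm import tqdm

snp_bar = tqdm(desc='SNPs', total=50, position=0, smoothing=0)
mod_bar = tqdm(desc='Mods', total=50, position=1, smoothing=0)
for _ in range(50):
    sleep(0.01)
    snp_bar.update(1)  # update(0) would just refresh the display
    mod_bar.update(1)
snp_bar.close()
mod_bar.close()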
Example 20
def process_all_reads(fast5s_dir, recursive, num_reads, read_ids_fn,
                      model_info, outputs, out_dir, bc_fmt, aligner, snps_data,
                      num_ps, num_update_errors, suppress_progress, mods_info,
                      db_safety, edge_buffer, pr_ref_filts):
    logger = logging.get_logger()
    logger.info('Preparing workers to process reads.')
    # read filename queue filler
    # Note no maxsize for this queue to compute total number of reads while
    # also not delaying read processing
    read_file_q = mp.Queue()
    num_reads_conn, getter_num_reads_conn = mp.Pipe()
    files_p = mp.Process(target=_fill_files_queue,
                         args=(read_file_q, fast5s_dir, num_reads, read_ids_fn,
                               recursive, num_ps, num_reads_conn),
                         daemon=True)
    files_p.start()
    # progress and failed reads getter (no maxsize on the failed reads
    # queue so that an error there cannot halt the run)
    failed_reads_q, f_p, main_f_conn = mh.create_getter_q(
        _get_fail_queue,
        (getter_num_reads_conn, num_update_errors, suppress_progress),
        max_size=None)

    # start output type getters/writers
    (bc_q, bc_p, main_bc_conn, mo_q, mo_p, main_mo_conn, snps_q, snps_p,
     main_snps_conn, mods_q, mods_p, main_mods_conn) = [None] * 12
    if mh.BC_NAME in outputs or mh.BC_MODS_NAME in outputs:
        if mh.BC_NAME not in outputs:
            outputs.append(mh.BC_NAME)
        bc_q, bc_p, main_bc_conn = mh.create_getter_q(
            _get_bc_queue, (out_dir, bc_fmt, mods_info.do_output_mods,
                            mods_info.mod_long_names))
    if mh.MAP_NAME in outputs:
        do_output_pr_refs = (mh.PR_REF_NAME in outputs
                             and not mods_info.do_pr_ref_mods
                             and not snps_data.do_pr_ref_snps)
        mo_q, mo_p, main_mo_conn = mh.create_getter_q(
            mapping._get_map_queue,
            (out_dir, aligner.ref_names_and_lens, aligner.out_fmt,
             aligner.ref_fn, do_output_pr_refs, pr_ref_filts))
    if mh.PR_SNP_NAME in outputs:
        pr_refs_fn = mh.get_megalodon_fn(out_dir, mh.PR_REF_NAME) if (
            mh.PR_REF_NAME in outputs and snps_data.do_pr_ref_snps) else None
        whatshap_map_fn = (
            mh.get_megalodon_fn(out_dir, mh.WHATSHAP_MAP_NAME) + '.' +
            aligner.out_fmt) if mh.WHATSHAP_MAP_NAME in outputs else None
        snps_txt_fn = (mh.get_megalodon_fn(out_dir, mh.PR_SNP_TXT_NAME)
                       if snps_data.write_snps_txt else None)
        snps_q, snps_p, main_snps_conn = mh.create_getter_q(
            snps._get_snps_queue,
            (mh.get_megalodon_fn(out_dir, mh.PR_SNP_NAME), snps_txt_fn,
             db_safety, pr_refs_fn, pr_ref_filts, whatshap_map_fn,
             aligner.ref_names_and_lens, aligner.ref_fn))
    if mh.PR_MOD_NAME in outputs:
        pr_refs_fn = mh.get_megalodon_fn(out_dir, mh.PR_REF_NAME) if (
            mh.PR_REF_NAME in outputs and mods_info.do_pr_ref_mods) else None
        mods_txt_fn = (mh.get_megalodon_fn(out_dir, mh.PR_MOD_TXT_NAME)
                       if mods_info.write_mods_txt else None)
        mods_q, mods_p, main_mods_conn = mh.create_getter_q(
            mods._get_mods_queue,
            (mh.get_megalodon_fn(out_dir, mh.PR_MOD_NAME), mods_txt_fn,
             db_safety, pr_refs_fn, pr_ref_filts))

    proc_reads_ps, map_conns = [], []
    for device in model_info.process_devices:
        if aligner is None:
            map_conn, caller_conn = None, None
        else:
            map_conn, caller_conn = mp.Pipe()
        map_conns.append(map_conn)
        p = mp.Process(target=_process_reads_worker,
                       args=(read_file_q, bc_q, snps_q, failed_reads_q, mods_q,
                             caller_conn, model_info, snps_data, mods_info,
                             edge_buffer, device))
        p.daemon = True
        p.start()
        proc_reads_ps.append(p)
    sleep(0.1)

    # perform mapping in threads for mappy shared memory interface
    # open threads after all processes have started due to python
    # multiprocess combined with threading instability
    if aligner is None:
        map_read_ts = None
    else:
        map_read_ts = []
        for map_conn in map_conns:
            t = threading.Thread(target=mapping._map_read_worker,
                                 args=(aligner, map_conn, mo_q))
            t.daemon = True
            t.start()
            map_read_ts.append(t)

    try:
        files_p.join()
        for proc_reads_p in proc_reads_ps:
            proc_reads_p.join()
        if map_read_ts is not None:
            for map_t in map_read_ts:
                map_t.join()
        # comm to getter processes to return
        if f_p.is_alive():
            main_f_conn.send(True)
            f_p.join()
        for on, p, main_conn in ((mh.BC_NAME, bc_p, main_bc_conn),
                                 (mh.MAP_NAME, mo_p, main_mo_conn),
                                 (mh.PR_SNP_NAME, snps_p, main_snps_conn),
                                 (mh.PR_MOD_NAME, mods_p, main_mods_conn)):
            if on in outputs and p.is_alive():
                main_conn.send(True)
                if on == mh.PR_SNP_NAME:
                    logger.info(
                        'Waiting for snps database to complete indexing.')
                elif on == mh.PR_MOD_NAME:
                    logger.info(
                        'Waiting for mods database to complete indexing.')
                p.join()
    except KeyboardInterrupt:
        logger.error('Exiting due to keyboard interrupt.')
        sys.exit(1)

    return
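
The threads-after-processes comment reflects mappy's design: a single Aligner index can be shared by the threads of one process, which is why alignment runs in threads fed by per-worker pipes rather than in separate processes. A minimal sketch of thread-shared mappy use ('ref.fa' and the read sequences are placeholders):

import threading
import mappy

aligner = mappy.Aligner('ref.fa', preset='map-ont')  # one shared index

def map_read(name, seq):
    for hit in aligner.map(seq):
        print(name, hit.ctg, hit.r_st, hit.r_en, hit.strand)

reads = [('r1', 'ACGTACGTAA' * 20), ('r2', 'TTGCATTGCA' * 20)]
threads = [threading.Thread(target=map_read, args=rd) for rd in reads]
for t in threads:
    t.start()
for t in threads:
    t.join()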
Example 21
    def __init__(self,
                 model_info,
                 all_mod_motifs_raw=None,
                 mod_all_paths=False,
                 write_mods_txt=None,
                 mod_context_bases=None,
                 do_output_mods=False,
                 do_pr_ref_mods=False,
                 mods_calib_fn=None,
                 mod_output_fmts=[mh.MOD_BEDMETHYL_NAME]):
        logger = logging.get_logger()
        # this is pretty hacky, but these attributes are stored here as
        # they are generally needed alongside other alphabet info
        # don't want to pass all of these parameters around individually though
        # as this would make function signatures too complicated
        self.mod_all_paths = mod_all_paths
        self.write_mods_txt = write_mods_txt
        self.mod_context_bases = mod_context_bases
        self.do_output_mods = do_output_mods
        self.do_pr_ref_mods = do_pr_ref_mods
        self.mod_long_names = model_info.mod_long_names
        self.calib_table = calibration.ModCalibrator(mods_calib_fn)
        self.mod_output_fmts = mod_output_fmts

        self.alphabet = model_info.can_alphabet
        self.ncan_base = len(self.alphabet)
        try:
            self.alphabet = self.alphabet.decode()
        except AttributeError:
            # alphabet was already a str
            pass
        if model_info.is_cat_mod:
            # TODO also output "(alt to C)" for each mod
            logger.info(
                'Using canonical alphabet {} and modified bases {}.'.format(
                    self.alphabet,
                    ' '.join('{}={}'.format(*mod_b)
                             for mod_b in model_info.mod_long_names)))
        else:
            logger.info('Using canonical alphabet {}.'.format(self.alphabet))

        self.nbase = len(self.alphabet)
        self.n_can_state = (self.ncan_base +
                            self.ncan_base) * (self.ncan_base + 1)
        if model_info.is_cat_mod:
            self.nmod_base = model_info.n_mods
            self.can_base_mods = model_info.can_base_mods
            self.can_mods_offsets = model_info.can_indices
            self.str_to_int_mod_labels = model_info.str_to_int_mod_labels
            assert (
                model_info.output_size - self.n_can_state == self.nmod_base +
                1), ('Alphabet ({}) and model number of modified bases ({}) ' +
                     'do not agree.').format(
                         self.alphabet,
                         model_info.output_size - self.n_can_state - 1)
        else:
            self.nmod_base = 0
            self.can_base_mods = {}
            self.can_mods_offsets = None
            self.str_to_int_mod_labels = None

        # parse mod motifs or use "swap" base if no motif provided
        self._parse_mod_motifs(all_mod_motifs_raw)

        return
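
The n_can_state expression is the flip-flop transition count: each canonical base has a flip and a flop state (2 * ncan_base rows), and each row scores moves into the ncan_base flip states plus staying in its own flop state. A worked check for the standard 4-base alphabet:

ncan_base = 4
n_can_state = (ncan_base + ncan_base) * (ncan_base + 1)
assert n_can_state == 40
# a categorical mod model then appends one canonical column plus one
# column per modified base, matching the assert in the code above:
# output_size == n_can_state + 1 + nmod_base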
Example 22
    def _load_taiyaki_model(self):
        # the logger is also needed by the import error handling below
        logger = logging.get_logger()
        if any(arg is None for arg in (self.chunk_size, self.chunk_overlap,
                                       self.max_concur_chunks)):
            logger.debug('Must provide chunk_size, chunk_overlap, ' +
                         'max_concur_chunks in order to run the taiyaki ' +
                         'base calling backend.')
        self.model_type = TAI_NAME

        if self.devices is None:
            self.devices = ['cpu']
        base_proc_per_device = int(np.ceil(self.num_proc / len(self.devices)))
        procs_per_device = np.repeat(base_proc_per_device, len(self.devices))
        if base_proc_per_device * len(self.devices) > self.num_proc:
            procs_per_device[-(base_proc_per_device * len(self.devices) -
                               self.num_proc):] -= 1
        assert sum(procs_per_device) == self.num_proc
        self.process_devices = [
            dv for dv, n_dv in zip(self.devices, procs_per_device)
            for _ in range(n_dv)
        ]

        try:
            # import modules
            from taiyaki.helpers import load_model as load_taiyaki_model
            from taiyaki.basecall_helpers import run_model as tai_run_model
            from taiyaki.layers import GlobalNormFlipFlopCatMod
            import torch
        except ImportError:
            logger.error(
                'Failed to import taiyaki and pytorch. Ensure working ' +
                'installations to run megalodon')
            sys.exit(1)

        # store modules in object
        self.load_taiyaki_model = load_taiyaki_model
        self.tai_run_model = tai_run_model
        self.torch = torch

        tmp_model = self.load_taiyaki_model(self.taiyaki_model_fn)
        ff_layer = tmp_model.sublayers[-1]
        self.is_cat_mod = (GlobalNormFlipFlopCatMod is not None
                           and isinstance(ff_layer, GlobalNormFlipFlopCatMod))
        self.output_size = ff_layer.size
        if self.is_cat_mod:
            # Modified base model is defined by 3 fixed fields in taiyaki
            # can_nmods, output_alphabet and modified_base_long_names
            self.output_alphabet = ff_layer.output_alphabet
            self.can_nmods = ff_layer.can_nmods
            self.ordered_mod_long_names = ff_layer.ordered_mod_long_names

            # parse these values to more user-friendly data structures
            self.can_alphabet = ''
            self.can_indices = []
            self.mod_long_names = []
            self.str_to_int_mod_labels = {}
            self.can_base_mods = defaultdict(list)
            curr_can_offset = 0
            curr_nmods = 0
            for can_base_nmods in self.can_nmods:
                can_base = self.output_alphabet[curr_can_offset]
                self.can_alphabet += can_base
                self.can_indices.append(curr_can_offset)
                for mod_i, mod_base in enumerate(
                        self.output_alphabet[curr_can_offset +
                                             1:curr_can_offset +
                                             can_base_nmods + 1]):
                    self.mod_long_names.append(
                        (mod_base,
                         self.ordered_mod_long_names[curr_nmods + mod_i]))
                    self.str_to_int_mod_labels[mod_base] = mod_i + 1
                    self.can_base_mods[can_base].append(mod_base)

                curr_can_offset += can_base_nmods + 1
                curr_nmods += can_base_nmods

            self.can_indices.append(curr_can_offset)
            self.can_indices = np.array(self.can_indices).astype(np.uintp)
            self.can_base_mods = dict(self.can_base_mods)
        else:
            if mh.nstate_to_nbase(ff_layer.size) != 4:
                raise NotImplementedError(
                    'Naive modified base flip-flop models are not ' +
                    'supported.')
            self.output_alphabet = mh.ALPHABET
            self.can_alphabet = mh.ALPHABET
            self.mod_long_names = []
            self.str_to_int_mod_labels = {}
        self.n_mods = len(self.mod_long_names)

        return
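
The device-allocation arithmetic above distributes num_proc workers as evenly as possible: every device starts at ceil(num_proc / n_devices), then one process is trimmed from as many tail devices as needed. Extracted as a standalone sketch:

import numpy as np

def alloc_process_devices(num_proc, devices):
    base = int(np.ceil(num_proc / len(devices)))
    per_dev = np.repeat(base, len(devices))
    overshoot = base * len(devices) - num_proc
    if overshoot > 0:
        per_dev[-overshoot:] -= 1  # trim the tail devices
    assert per_dev.sum() == num_proc
    return [dv for dv, n_dv in zip(devices, per_dev) for _ in range(n_dv)]

print(alloc_process_devices(5, ['cuda:0', 'cuda:1']))
# ['cuda:0', 'cuda:0', 'cuda:0', 'cuda:1', 'cuda:1']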
Example 23
def _get_mods_queue(mods_q, mods_conn, mods_db_fn, db_safety,
                    ref_names_and_lens, mods_txt_fn, pr_refs_fn, pr_ref_filts):
    def get_mod_call(been_warned):
        # note strand is +1 for fwd or -1 for rev
        r_mod_scores, (read_id, chrm, strand, r_start, ref_seq, read_len, q_st,
                       q_en, cigar) = mods_q.get(block=False)
        try:
            mods_db.insert_read_scores(r_mod_scores, read_id, chrm, strand)
        except Exception as e:
            if not been_warned:
                logger.warning(
                    'Error inserting modified base scores into database. ' +
                    'See log debug output for error details.')
                been_warned = True
            import traceback
            var = traceback.format_exc()
            logger.debug(
                'Error inserting modified base scores into database: ' +
                str(e) + '\n' + var)

        if mods_txt_fp is not None and len(r_mod_scores) > 0:
            # batching these writes would require building conversion
            # tables for repeated strings (read_id and chrm values)
            mods_txt_fp.write('\n'.join((
                ('\t'.join('{}' for _ in field_names)
                 ).format(read_id, chrm, strand, pos, mod_lp,
                          np.log1p(-np.exp(mod_lps).sum()), mod_base,
                          '{}:{}'.format(raw_motif, rel_pos)) for pos, mod_lps,
                mod_bases, ref_motif, rel_pos, raw_motif in r_mod_scores
                for mod_lp, mod_base in zip(mod_lps, mod_bases))) + '\n')
        if pr_refs_fn is not None:
            if not mapping.read_passes_filters(pr_ref_filts, read_len, q_st,
                                               q_en, cigar):
                return been_warned

            pr_refs_fp.write('>{}\n{}\n'.format(
                read_id, annotate_mods(r_start, ref_seq, r_mod_scores,
                                       strand)))

        return been_warned

    logger = logging.get_logger('mods')
    been_warned = False

    mods_db = ModsDb(mods_db_fn,
                     db_safety=db_safety,
                     read_only=False,
                     pos_index_in_memory=True)
    for ref_name in ref_names_and_lens[0]:
        mods_db.insert_chrm(ref_name)
    mods_db.create_chrm_index()

    if mods_txt_fn is None:
        mods_txt_fp = None
    else:
        mods_txt_fp = open(mods_txt_fn, 'w')
        field_names = ('read_id', 'chrm', 'strand', 'pos', 'mod_log_prob',
                       'can_log_prob', 'mod_base', 'motif')
        mods_txt_fp.write('\t'.join(field_names) + '\n')

    if pr_refs_fn is not None:
        pr_refs_fp = open(pr_refs_fn, 'w')

    while True:
        try:
            been_warned = get_mod_call(been_warned)
        except queue.Empty:
            if mods_conn.poll():
                break
            sleep(0.001)
            continue
    while not mods_q.empty():
        been_warned = get_mod_call(been_warned)

    if mods_txt_fp is not None: mods_txt_fp.close()
    if pr_refs_fn is not None: pr_refs_fp.close()
    mods_db.create_mod_index()
    mods_db.create_data_covering_index()
    mods_db.close()

    return
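
The canonical log probability written to the text output is derived from the modified-base log probabilities with a numerically stable complement: log(1 - sum(exp(mod_lps))) via log1p. A toy check:

import numpy as np

mod_lps = np.log([0.7, 0.2])            # made-up per-mod log probabilities
can_lp = np.log1p(-np.exp(mod_lps).sum())
assert np.isclose(np.exp(can_lp), 0.1)  # the remaining mass is canonical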