Example #1
0
def _agg_mods_worker(pos_q, mod_stats_q, mod_prog_q, mods_db_fn, mod_agg_info,
                     valid_read_ids, write_mod_lp):
    """Worker process aggregating per-read modified base statistics.

    Pulls position records from ``pos_q`` until a ``None`` sentinel is
    received, computes aggregate statistics for each position and forwards
    them to ``mod_stats_q``.  One unit of progress is reported on
    ``mod_prog_q`` per position, whether or not aggregation succeeded.
    """
    # trivial named wrappers so queue traffic is attributable in profiles
    def _next_pos():
        return pos_q.get(block=False)

    def _emit(site):
        mod_stats_q.put(site)

    def _backoff():
        sleep(0.0001)

    agg_mods = mods.AggMods(mods_db_fn, mod_agg_info, write_mod_lp)

    while True:
        try:
            pos_data = _next_pos()
        except queue.Empty:
            # nothing queued yet; brief pause before polling again
            _backoff()
            continue
        if pos_data is None:
            # sentinel from the filler process: all positions dispatched
            return
        try:
            _emit(agg_mods.compute_mod_stats(
                pos_data, valid_read_ids=valid_read_ids))
        except mh.MegaError:
            # no valid reads cover this location
            pass
        mod_prog_q.put(1)
Example #2
0
def _agg_mods_worker(pos_q, mod_stats_q, mod_prog_q, mods_db_fn, mod_agg_info,
                     valid_read_ids, write_mod_lp):
    """Worker process aggregating per-read modified base statistics.

    Consumes position records from ``pos_q`` (a ``None`` sentinel
    terminates the loop), sends aggregated sites to ``mod_stats_q`` and
    reports one unit of progress per position on ``mod_prog_q``.
    """
    # named wrappers kept for profiling purposes
    def _next_pos():
        # short timeout keeps the loop responsive without busy-waiting
        return pos_q.get(block=True, timeout=0.01)

    def _emit(site):
        mod_stats_q.put(site)

    # only load the read UUID index when read-id filtering is requested
    agg_mods = mods.AggMods(
        mods_db_fn, mod_agg_info, write_mod_lp,
        load_uuid_index_in_memory=valid_read_ids is not None)

    while True:
        try:
            pos_data = _next_pos()
        except queue.Empty:
            continue
        if pos_data is None:
            # sentinel from the filler process: no more positions
            break
        try:
            _emit(agg_mods.compute_mod_stats(
                pos_data, valid_read_ids=valid_read_ids))
        except mh.MegaError:
            # no valid reads cover this location
            pass
        mod_prog_q.put(1)
Example #3
0
def _agg_mods_worker(locs_q, mod_stats_q, mod_prog_q, mods_db_fn, mod_agg_info,
                     valid_read_ids, write_mod_lp):
    """Worker process aggregating per-read modified base statistics.

    Pulls location records from ``locs_q`` until a ``None`` sentinel is
    received, computes aggregate statistics for each location and forwards
    them to ``mod_stats_q``.  One unit of progress is reported on
    ``mod_prog_q`` per location, whether or not aggregation succeeded.

    Note: a previous revision carried dead helpers (``get_pos_id`` /
    ``get_loc_data_from_id``) that referenced a ``locs_iter`` defined only
    in commented-out code and would have raised ``NameError`` if enabled;
    they have been removed.
    """
    # trivial named wrappers retained for profiling purposes
    def get_loc_data():
        return locs_q.get(block=False)

    def put_mod_site(mod_site):
        mod_stats_q.put(mod_site)

    def do_sleep():
        sleep(0.0001)

    agg_mods = mods.AggMods(mods_db_fn, mod_agg_info, write_mod_lp)

    while True:
        try:
            loc_data = get_loc_data()
        except queue.Empty:
            # queue momentarily empty; back off briefly and retry
            do_sleep()
            continue
        if loc_data is None:
            # sentinel from the filler process: all locations dispatched
            break

        try:
            mod_site = agg_mods.compute_mod_stats(
                loc_data, valid_read_ids=valid_read_ids)
            put_mod_site(mod_site)
        except mh.MegaError:
            # no valid reads cover location
            pass
        mod_prog_q.put(1)
Example #4
0
def _agg_mods_worker(
    pos_q,
    mod_stats_q,
    mod_prog_q,
    mods_db_fn,
    mod_agg_info,
    valid_read_dbids,
    write_mod_lp,
):
    """Worker process aggregating modified base statistics in batches.

    Pulls batches of position records from ``pos_q`` (a ``None`` sentinel
    terminates the worker), aggregates each position, then ships the whole
    batch of computed sites to ``mod_stats_q``.  Progress is reported once
    per batch as the total number of per-read statistics covered.
    """
    # named wrappers kept for profiling purposes
    def _next_batch():
        return pos_q.get(block=True, timeout=0.01)

    def _ship(sites):
        mod_stats_q.put(sites)

    # load the read UUID index only when database read-id filtering applies
    agg_mods = mods.AggMods(
        mods_db_fn,
        mod_agg_info,
        write_mod_lp,
        load_uuid_index_in_memory=valid_read_dbids is not None,
    )

    while True:
        try:
            batch = _next_batch()
        except queue.Empty:
            continue
        if batch is None:
            break

        sites, n_stats = [], 0
        for pos_data in batch:
            try:
                sites.append(agg_mods.compute_mod_stats(
                    pos_data, valid_read_dbids=valid_read_dbids))
            except mh.MegaError:
                # no valid reads cover this location
                pass
            # pos_data[1] appears to hold the per-read stats at this
            # position — confirm against the filler process
            n_stats += len(pos_data[1])
        mod_prog_q.put(n_stats)
        _ship(sites)
Example #5
0
def _agg_mods_worker(locs_q, mod_stats_q, mod_prog_q, mods_db_fn, mod_agg_info,
                     valid_read_ids, write_mod_lp):
    """Worker process aggregating per-read modified base statistics.

    Consumes locations from ``locs_q`` until a ``None`` sentinel arrives,
    pushing each aggregated site onto ``mod_stats_q`` and one unit of
    progress onto ``mod_prog_q`` per location processed.
    """
    agg_mods = mods.AggMods(mods_db_fn, mod_agg_info, write_mod_lp)

    while True:
        try:
            loc = locs_q.get(block=False)
        except queue.Empty:
            # nothing queued yet; pause before polling again
            sleep(0.1)
            continue
        if loc is None:
            # sentinel: all locations have been dispatched
            return
        try:
            mod_stats_q.put(agg_mods.compute_mod_stats(
                loc, valid_read_ids=valid_read_ids))
        except mh.MegaError:
            # no valid reads cover this location
            pass
        mod_prog_q.put(1)
Example #6
0
def aggregate_stats(outputs,
                    out_dir,
                    num_ps,
                    write_vcf_lp,
                    het_factors,
                    call_mode,
                    mod_names,
                    mod_agg_info,
                    write_mod_lp,
                    mod_output_fmts,
                    suppress_progress,
                    ref_names_and_lens,
                    valid_read_ids=None,
                    out_suffix=None):
    """Aggregate per-read SNP and modified base statistics over all reads.

    For each requested output type this spawns a filler process (loads
    per-location records from the database into a queue), ``num_ps`` daemon
    worker processes (aggregate statistics per location) and a getter
    process (collects and writes results), plus one progress-reporting
    process.  Worker processes are joined before the getters are signalled
    to shut down, so no results are dropped.

    Args:
        outputs: output type names; checked for ``mh.SNP_NAME`` and
            ``mh.MOD_NAME``.
        out_dir: megalodon output directory containing the per-read DBs.
        num_ps: worker processes per output type (halved, min 1, when both
            SNP and mod outputs are requested).
        write_vcf_lp, het_factors, call_mode: variant aggregation settings
            forwarded to ``_agg_snps_worker``.
        mod_names, mod_agg_info, write_mod_lp, mod_output_fmts: modified
            base settings forwarded to the mod workers and getter.
        suppress_progress: disable the progress display.
        ref_names_and_lens: reference contig names/lengths for output
            headers.
        valid_read_ids: optional read-id filter forwarded to workers.
        out_suffix: optional suffix for output file names.
    """
    if mh.SNP_NAME in outputs and mh.MOD_NAME in outputs:
        # both pipelines run concurrently; split the process budget
        num_ps = max(num_ps // 2, 1)

    logger = logging.get_logger('agg')
    # plain queue.Queue placeholders so the progress process always
    # receives a queue even when an output type is disabled
    num_snps, num_mods, snp_prog_q, mod_prog_q = (0, 0, queue.Queue(),
                                                  queue.Queue())
    if mh.SNP_NAME in outputs:
        snps_db_fn = mh.get_megalodon_fn(out_dir, mh.PR_SNP_NAME)
        logger.info('Computing number of unique variants.')
        num_snps = snps.AggSnps(snps_db_fn).num_uniq()
        logger.info('Spawning variant aggregation processes.')
        # create process to collect snp stats from workers
        snp_stats_q, snp_stats_p, main_snp_stats_conn = mh.create_getter_q(
            _get_snp_stats_queue,
            (out_dir, ref_names_and_lens, out_suffix, write_vcf_lp))
        # create process to fill snp locs queue
        snp_filler_q = mp.Queue(maxsize=mh._MAX_QUEUE_SIZE)
        snp_filler_p = mp.Process(target=_fill_locs_queue,
                                  args=(snp_filler_q, snps_db_fn, snps.AggSnps,
                                        num_ps),
                                  daemon=True)
        snp_filler_p.start()
        # create worker processes to aggregate snps
        snp_prog_q = mp.Queue(maxsize=mh._MAX_QUEUE_SIZE)
        agg_snps_ps = []
        for _ in range(num_ps):
            p = mp.Process(target=_agg_snps_worker,
                           args=(snp_filler_q, snp_stats_q, snp_prog_q,
                                 snps_db_fn, write_vcf_lp, het_factors,
                                 call_mode, valid_read_ids),
                           daemon=True)
            p.start()
            agg_snps_ps.append(p)

    if mh.MOD_NAME in outputs:
        mods_db_fn = mh.get_megalodon_fn(out_dir, mh.PR_MOD_NAME)
        num_mods = mods.AggMods(mods_db_fn).num_uniq()
        logger.info('Spawning modified base aggregation processes.')
        # create process to collect mods stats from workers
        mod_stats_q, mod_stats_p, main_mod_stats_conn = mh.create_getter_q(
            _get_mod_stats_queue, (out_dir, mod_names, ref_names_and_lens,
                                   out_suffix, write_mod_lp, mod_output_fmts))
        # create process to fill mod locs queue
        mod_filler_q = mp.Queue(maxsize=100000)
        # when profiling, cap the number of filled mod locations
        mod_fill_limit = _N_MOD_PROF if _DO_PROF else None
        mod_filler_p = mp.Process(target=_fill_locs_queue,
                                  args=(mod_filler_q, mods_db_fn, mods.AggMods,
                                        num_ps, mod_fill_limit),
                                  daemon=True)
        mod_filler_p.start()
        # create worker processes to aggregate mods
        mod_prog_q = mp.Queue(maxsize=mh._MAX_QUEUE_SIZE)
        agg_mods_ps = []
        for _ in range(num_ps):
            p = mp.Process(target=_agg_mods_worker,
                           args=(mod_filler_q, mod_stats_q, mod_prog_q,
                                 mods_db_fn, mod_agg_info, valid_read_ids,
                                 write_mod_lp),
                           daemon=True)
            p.start()
            agg_mods_ps.append(p)

    # create progress process
    logger.info('Aggregating {} SNPs and {} mod sites over reads.'.format(
        num_snps, num_mods))
    main_prog_conn, prog_conn = mp.Pipe()
    prog_p = mp.Process(target=_agg_prog_worker,
                        args=(snp_prog_q, mod_prog_q, num_snps, num_mods,
                              prog_conn, suppress_progress),
                        daemon=True)
    prog_p.start()

    # join filler processes first
    if mh.SNP_NAME in outputs:
        snp_filler_p.join()
        for agg_snps_p in agg_snps_ps:
            agg_snps_p.join()
        # send to conn
        if snp_stats_p.is_alive():
            main_snp_stats_conn.send(True)
        snp_stats_p.join()
    if mh.MOD_NAME in outputs:
        for agg_mods_p in agg_mods_ps:
            agg_mods_p.join()
        if mod_stats_p.is_alive():
            main_mod_stats_conn.send(True)
        mod_stats_p.join()
    if prog_p.is_alive():
        main_prog_conn.send(True)
        prog_p.join()

    return
Example #7
0
def aggregate_stats(outputs,
                    out_dir,
                    num_ps,
                    write_vcf_lp,
                    het_factors,
                    call_mode,
                    mod_agg_info,
                    write_mod_lp,
                    mod_output_fmts,
                    suppress_progress,
                    valid_read_ids=None,
                    out_suffix=None,
                    batch_size=mh.DEFAULT_AGG_BATCH_SIZE):
    """Aggregate per-read variant and modified base statistics over reads.

    For each requested output type this spawns a filler process (loads
    batches of location records into a queue), ``num_ps`` daemon worker
    processes (aggregate statistics per location) and a getter process
    (collects and writes results), plus one progress-reporting process.
    Returns early with a warning when neither output has any statistics.

    Args:
        outputs: output type names; checked for ``mh.VAR_NAME`` and
            ``mh.MOD_NAME``.
        out_dir: megalodon output directory containing the per-read DBs.
        num_ps: worker processes per output type (halved, min 1, when both
            variant and mod outputs are requested).
        write_vcf_lp, het_factors, call_mode: variant aggregation settings
            forwarded to ``_agg_vars_worker``.
        mod_agg_info, write_mod_lp, mod_output_fmts: modified base settings
            forwarded to the mod workers and getter.
        suppress_progress: disable the progress display.
        valid_read_ids: optional read-id filter; for mods these are first
            converted to database read ids.
        out_suffix: optional suffix for output file names.
        batch_size: number of locations per queued batch.
    """
    if mh.VAR_NAME in outputs and mh.MOD_NAME in outputs:
        # both pipelines run concurrently; split the process budget
        num_ps = max(num_ps // 2, 1)

    # plain queue.Queue placeholders so the progress process always
    # receives a queue even when an output type is disabled
    num_vars, num_mods, var_prog_q, mod_prog_q = (0, 0, queue.Queue(),
                                                  queue.Queue())
    if mh.VAR_NAME in outputs:
        vars_db_fn = mh.get_megalodon_fn(out_dir, mh.PR_VAR_NAME)
        agg_vars = variants.AggVars(vars_db_fn, no_indices_in_mem=True)
        num_vars = agg_vars.num_uniq()
        ref_names_and_lens = agg_vars.vars_db.get_all_chrm_and_lens()
        agg_vars.close()
        LOGGER.info('Spawning variant aggregation processes')
        # create process to collect var stats from workers
        var_stats_q, var_stats_p, m_var_stats_conn = mega_mp.create_getter_qpc(
            _get_var_stats_queue,
            (out_dir, ref_names_and_lens, out_suffix, write_vcf_lp))
        # create process to fill variant locs queue
        var_filler_q = mp.Queue(maxsize=mh._MAX_QUEUE_SIZE)
        var_filler_p = mp.Process(target=_fill_locs_queue,
                                  args=(var_filler_q, vars_db_fn,
                                        variants.AggVars, num_ps, batch_size),
                                  daemon=True)
        var_filler_p.start()
        # create worker processes to aggregate variants
        var_prog_q = mp.Queue(maxsize=mh._MAX_QUEUE_SIZE)
        agg_vars_ps = []
        for _ in range(num_ps):
            p = mp.Process(target=_agg_vars_worker,
                           args=(var_filler_q, var_stats_q, var_prog_q,
                                 vars_db_fn, write_vcf_lp, het_factors,
                                 call_mode, valid_read_ids),
                           daemon=True)
            p.start()
            agg_vars_ps.append(p)

    if mh.MOD_NAME in outputs:
        mods_db_fn = mh.get_megalodon_fn(out_dir, mh.PR_MOD_NAME)
        valid_read_dbids = None
        if valid_read_ids is not None:
            # workers filter by database read ids; translate the UUIDs once
            # here rather than in every worker
            mods_db = mods.ModsDb(mods_db_fn, in_mem_uuid_to_dbid=True)
            valid_read_dbids = set()
            for read_id in valid_read_ids:
                valid_read_dbids.add(mods_db.get_read_dbid(read_id))
        agg_mods = mods.AggMods(mods_db_fn)
        mod_long_names = agg_mods.get_mod_long_names()
        num_mods = agg_mods.num_uniq()
        ref_names_and_lens = agg_mods.mods_db.get_all_chrm_and_lens()
        agg_mods.close()
        LOGGER.info('Spawning modified base aggregation processes')
        # create process to collect mods stats from workers
        mod_stats_q, mod_stats_p, m_mod_stats_conn = mega_mp.create_getter_qpc(
            _get_mod_stats_queue, (out_dir, mod_long_names, ref_names_and_lens,
                                   out_suffix, write_mod_lp, mod_output_fmts))
        # create process to fill mod locs queue
        mod_filler_q = mp.Queue(maxsize=mh._MAX_QUEUE_SIZE)
        # when profiling, cap the number of filled mod locations
        mod_fill_limit = _N_MOD_PROF if _DO_PROF else None
        mod_filler_p = mp.Process(target=_fill_locs_queue,
                                  args=(mod_filler_q, mods_db_fn, mods.AggMods,
                                        num_ps, batch_size, mod_fill_limit),
                                  daemon=True)
        mod_filler_p.start()
        # create worker processes to aggregate mods
        mod_prog_q = mp.Queue(maxsize=mh._MAX_QUEUE_SIZE)
        agg_mods_ps = []
        for _ in range(num_ps):
            p = mp.Process(target=_agg_mods_worker,
                           args=(mod_filler_q, mod_stats_q, mod_prog_q,
                                 mods_db_fn, mod_agg_info, valid_read_dbids,
                                 write_mod_lp),
                           daemon=True)
            p.start()
            agg_mods_ps.append(p)

    if num_vars == 0 and num_mods == 0:
        # nothing to aggregate; spawned processes are daemonic so they will
        # not block interpreter exit
        LOGGER.warning('No per-read variants or modified base statistics ' +
                       'found for aggregation.')
        return
    if num_vars == 0:
        LOGGER.info('Aggregating {} per-read modified base statistics'.format(
            num_mods))
    elif num_mods == 0:
        LOGGER.info('Aggregating {} variants'.format(num_vars))
    else:
        LOGGER.info(('Aggregating {} variants and {} per-read modified base ' +
                     'statistics').format(num_vars, num_mods))
    LOGGER.info(
        'NOTE: If this step is very slow, ensure the output directory is ' +
        'located on a fast read disk (e.g. local SSD). Aggregation can be ' +
        'restarted using the `megalodon_extras aggregate run` command')

    # create progress process
    main_prog_conn, prog_conn = mp.Pipe()
    prog_p = mp.Process(target=_agg_prog_worker,
                        args=(var_prog_q, mod_prog_q, num_vars, num_mods,
                              prog_conn, suppress_progress),
                        daemon=True)
    prog_p.start()

    # join filler processes first
    if mh.VAR_NAME in outputs:
        var_filler_p.join()
        for agg_vars_p in agg_vars_ps:
            agg_vars_p.join()
        # send to conn
        if var_stats_p.is_alive():
            m_var_stats_conn.send(True)
        var_stats_p.join()
    if mh.MOD_NAME in outputs:
        for agg_mods_p in agg_mods_ps:
            agg_mods_p.join()
        if mod_stats_p.is_alive():
            m_mod_stats_conn.send(True)
        mod_stats_p.join()
    if prog_p.is_alive():
        main_prog_conn.send(True)
        prog_p.join()
Example #8
0
def aggregate_stats(outputs,
                    out_dir,
                    num_ps,
                    write_vcf_lp,
                    het_factors,
                    call_mode,
                    mod_agg_info,
                    write_mod_lp,
                    mod_output_fmts,
                    suppress_progress,
                    valid_read_ids=None,
                    out_suffix=None):
    """Aggregate per-read variant and modified base statistics over reads.

    For each requested output type this spawns a filler process (loads
    location records from the database into a queue), ``num_ps`` daemon
    worker processes (aggregate statistics per location) and a getter
    process (collects and writes results), plus one progress-reporting
    process.  Worker processes are joined before the getters are signalled
    to shut down, so no results are dropped.

    Args:
        outputs: output type names; checked for ``mh.VAR_NAME`` and
            ``mh.MOD_NAME``.
        out_dir: megalodon output directory containing the per-read DBs.
        num_ps: worker processes per output type (halved, min 1, when both
            variant and mod outputs are requested).
        write_vcf_lp, het_factors, call_mode: variant aggregation settings
            forwarded to ``_agg_vars_worker``.
        mod_agg_info, write_mod_lp, mod_output_fmts: modified base settings
            forwarded to the mod workers and getter.
        suppress_progress: disable the progress display.
        valid_read_ids: optional read-id filter forwarded to workers.
        out_suffix: optional suffix for output file names.
    """
    if mh.VAR_NAME in outputs and mh.MOD_NAME in outputs:
        # both pipelines run concurrently; split the process budget
        num_ps = max(num_ps // 2, 1)

    # plain queue.Queue placeholders so the progress process always
    # receives a queue even when an output type is disabled
    num_vars, num_mods, var_prog_q, mod_prog_q = (0, 0, queue.Queue(),
                                                  queue.Queue())
    if mh.VAR_NAME in outputs:
        vars_db_fn = mh.get_megalodon_fn(out_dir, mh.PR_VAR_NAME)
        agg_vars = variants.AggVars(vars_db_fn, load_in_mem_indices=False)
        num_vars = agg_vars.num_uniq()
        ref_names_and_lens = agg_vars.vars_db.get_all_chrm_and_lens()
        agg_vars.close()
        LOGGER.info('Spawning variant aggregation processes.')
        # create process to collect var stats from workers
        var_stats_q, var_stats_p, main_var_stats_conn = mh.create_getter_q(
            _get_var_stats_queue,
            (out_dir, ref_names_and_lens, out_suffix, write_vcf_lp))
        # create process to fill variant locs queue
        var_filler_q = mp.Queue(maxsize=mh._MAX_QUEUE_SIZE)
        var_filler_p = mp.Process(target=_fill_locs_queue,
                                  args=(var_filler_q, vars_db_fn,
                                        variants.AggVars, num_ps),
                                  daemon=True)
        var_filler_p.start()
        # create worker processes to aggregate variants
        var_prog_q = mp.Queue(maxsize=mh._MAX_QUEUE_SIZE)
        agg_vars_ps = []
        for _ in range(num_ps):
            p = mp.Process(target=_agg_vars_worker,
                           args=(var_filler_q, var_stats_q, var_prog_q,
                                 vars_db_fn, write_vcf_lp, het_factors,
                                 call_mode, valid_read_ids),
                           daemon=True)
            p.start()
            agg_vars_ps.append(p)

    if mh.MOD_NAME in outputs:
        mods_db_fn = mh.get_megalodon_fn(out_dir, mh.PR_MOD_NAME)
        agg_mods = mods.AggMods(mods_db_fn, load_in_mem_indices=False)
        mod_long_names = agg_mods.get_mod_long_names()
        num_mods = agg_mods.num_uniq()
        ref_names_and_lens = agg_mods.mods_db.get_all_chrm_and_lens()
        agg_mods.close()
        LOGGER.info('Spawning modified base aggregation processes.')
        # create process to collect mods stats from workers
        mod_stats_q, mod_stats_p, main_mod_stats_conn = mh.create_getter_q(
            _get_mod_stats_queue, (out_dir, mod_long_names, ref_names_and_lens,
                                   out_suffix, write_mod_lp, mod_output_fmts))
        # create process to fill mod locs queue
        mod_filler_q = mp.Queue(maxsize=mh._MAX_QUEUE_SIZE)
        # when profiling, cap the number of filled mod locations
        mod_fill_limit = _N_MOD_PROF if _DO_PROF else None
        mod_filler_p = mp.Process(target=_fill_locs_queue,
                                  args=(mod_filler_q, mods_db_fn, mods.AggMods,
                                        num_ps, mod_fill_limit),
                                  daemon=True)
        mod_filler_p.start()
        # create worker processes to aggregate mods
        mod_prog_q = mp.Queue(maxsize=mh._MAX_QUEUE_SIZE)
        agg_mods_ps = []
        for _ in range(num_ps):
            p = mp.Process(target=_agg_mods_worker,
                           args=(mod_filler_q, mod_stats_q, mod_prog_q,
                                 mods_db_fn, mod_agg_info, valid_read_ids,
                                 write_mod_lp),
                           daemon=True)
            p.start()
            agg_mods_ps.append(p)

    # create progress process
    LOGGER.info(
        ('Aggregating {} variants and {} modified base sites over reads.\n' +
         '\t\tNOTE: If this step is very slow, ensure the output directory ' +
         'is located on a fast read disk (e.g. local SSD). Aggregation can ' +
         'be restarted using the megalodon/scripts/run_aggregation.py ' +
         'script.').format(num_vars, num_mods))
    main_prog_conn, prog_conn = mp.Pipe()
    prog_p = mp.Process(target=_agg_prog_worker,
                        args=(var_prog_q, mod_prog_q, num_vars, num_mods,
                              prog_conn, suppress_progress),
                        daemon=True)
    prog_p.start()

    # join filler processes first
    if mh.VAR_NAME in outputs:
        var_filler_p.join()
        for agg_vars_p in agg_vars_ps:
            agg_vars_p.join()
        # send to conn
        if var_stats_p.is_alive():
            main_var_stats_conn.send(True)
        var_stats_p.join()
    if mh.MOD_NAME in outputs:
        for agg_mods_p in agg_mods_ps:
            agg_mods_p.join()
        if mod_stats_p.is_alive():
            main_mod_stats_conn.send(True)
        mod_stats_p.join()
    if prog_p.is_alive():
        main_prog_conn.send(True)
        prog_p.join()

    return