Example #1
def make_basecall_input_multi(
        fast5_files, section='template', window=[-1, 0, 1], trim=10,
        min_len=1000, max_len=9000, event_detect=True,
        ed_params={'window_lengths': [3, 6], 'thresholds': [1.4, 1.1],
                   'peak_height': 0.2},
        sloika_model=False):
    """Like the above, but doesn't yields directly events. The point here is to
    be fully consistent with the currennt interface but allow use of the python
    library
    """
    for f in fast5_files:
        with Fast5(f) as fh:
            if event_detect:
                # These parameters make no sense to me, but hey-ho
                # TODO: expose to user
                events = minknow_event_detect(
                    fh.get_read(raw=True), fh.sample_rate, **ed_params
                )
            else:
                events = fh.get_read()
            events, _ = segment(events, section=section)
        try:
            X = events_to_features(events, window=window, sloika_model=sloika_model)
        except TypeError:
            continue
        try:
            X = X[trim:-trim]
            events = events[trim:-trim]
        except Exception:
            continue
        else:
            if len(X) < min_len or len(X) > max_len:
                continue
        yield f, X, events
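A minimal driver sketch (not from the source): assuming Fast5, minknow_event_detect, segment and events_to_features come from the same nanopore basecalling library as the generator above, iterating it yields one (filename, features, events) triple per read that survives trimming and length filtering.

# Hypothetical usage; the glob pattern is illustrative.
import glob

for fname, X, events in make_basecall_input_multi(glob.glob('reads/*.fast5')):
    print(fname, len(X), len(events))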
Example #2
def create_minknow_event_table(signal,
                               sampling_freq,
                               start_time,
                               window_lengths=(16, 40),
                               thresholds=(8.0, 4.0),
                               peak_height=1.0):
    """Create new event table using minknow_event_detect event detection

    :param signal: list or array of signal in pA for finding events
    :param sampling_freq: sampling frequency of ADC in Hz
    :param start_time: start time from fast5 file (time in seconds * sampling frequency)
    :param window_lengths: t-test windows for minknow_event_detect
    :param thresholds: t-test thresholds for minknow_event_detect
    :param peak_height: peak height param for minknow_event_detect
    :return: Table of events without model state or move information
    """
    assert np.sign(start_time) == 1, \
        "Start time has to be positive: {}".format(start_time)
    assert type(signal[0]) is np.float64, \
        "Signal needs to be in pA, not ADC counts"
    events = minknow_event_detect(np.asarray(signal, dtype=float),
                                  sample_rate=sampling_freq,
                                  get_peaks=False,
                                  window_lengths=window_lengths,
                                  thresholds=thresholds,
                                  peak_height=peak_height)
    num_events = len(events)
    event_table = np.empty(num_events,
                           dtype=[('start', float), ('length', float),
                                  ('mean', float), ('stdv', float),
                                  ('model_state', 'S5'), ('move', '<i4'),
                                  ('raw_start', int), ('raw_length', int),
                                  ('p_model_state', float)])
    for i, event in enumerate(events):
        event_table['start'][i] = event["start"] + (start_time / sampling_freq)
        event_table['length'][i] = event["length"]
        event_table['mean'][i] = event["mean"]
        event_table['stdv'][i] = event["stdv"]
        event_table['raw_start'][i] = np.round(event["start"] * sampling_freq)
        event_table['raw_length'][i] = np.round(event["length"] *
                                                sampling_freq)
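
    # model_state, move and p_model_state are left unset: the table was
    # created with np.empty and, per the docstring, carries no model or
    # move information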

    return event_table
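A short usage sketch (values are illustrative, not from the source): the signal must already be calibrated to picoamperes as float64, and start_time is in ADC samples, as the start_time / sampling_freq conversion above implies.

# Hypothetical example; raw_pA stands in for a calibrated read.
import numpy as np

raw_pA = np.random.normal(100.0, 10.0, size=20000)  # float64 pA samples
table = create_minknow_event_table(raw_pA, sampling_freq=4000.0, start_time=4000)
print(table['mean'][:5], table['raw_start'][:5])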
Example #3
def basecall_file(fname=None, event_detect=True):
    """Read event data from file and print scrappie basecall.

    :param fname: filename to read data from (if not given, assumed
        to be given on the command line).
    :param event_detect: perform event detection?

    :returns: tuple (basecall score, sequence).
    """
    is_main = False
    if fname is None:  # called as an entry point
        fname = sys.argv[1]
        is_main = True

    # magic numbers
    ed_params = {
        'window_lengths':[4, 8],
        'thresholds': [1.5, 9.0],
        'peak_height': 0.2,
    }

    with Fast5(fname) as fh:
        if event_detect:
            events = minknow_event_detect(
                fh.get_read(raw=True), fh.sample_rate, **ed_params
            )
        else:
            events = fh.get_read()
    events, _ = segment(events, section='template')

    results = basecall_events(events)
    if results is None:
        return None
    if is_main:
        print("{} score={}\n{}".format(fname, *results))
    else:
        return results
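A hedged usage note: called with an explicit filename the function returns the (score, sequence) tuple (or None on failure); invoked as a script entry point it prints the result instead.

# Hypothetical call; 'read.fast5' is illustrative.
result = basecall_file('read.fast5')
if result is not None:
    score, sequence = result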
Example #4
    def poll_data(port):
        # align_client = yield from bwa.align_client(align_port)
        print("POLL DATA")
        replay_client = yield from replayfast5.replay_client(replay_port)
        yield from asyncio.sleep(5)
        start_time = now()
        target_count = 0
        flag_array = []
        # initialize the flag and left-over-events arrays
        # (one slot per channel; channel indices appear to be 1-based, hence 513)
        for i in range(513):
            flag_array.append(0)
            left_over_events.append([])
        print("Before while loop")
        while True:
            time_saved = yield from replay_client.call.time_saved()
            total_pore_time = (now() - start_time) * len(channels)
            total_strand_time = yield from replay_client.call.cumulative_good_read_time()
            try:
                pore_time_saved = time_saved / total_pore_time
            except ZeroDivisionError:
                pore_time_saved = 0
            try:
                strand_time_saved = time_saved / total_strand_time
            except ZeroDivisionError:
                strand_time_saved = 0
            logger.info(
                "Total pore time saved: {:.2f}% [{:.2f}/{:.2f}]".format(
                    100 * pore_time_saved, time_saved, total_pore_time))
            logger.info(
                "Total strand time saved: {:.2f}% [{:.2f}/{:.2f}]".format(
                    100 * strand_time_saved, time_saved, total_strand_time))
            reads_analysed = set(identified_reads.keys()) | set(
                unidentified_reads.keys())
            all_reads = yield from replay_client.call.total_good_reads()
            ided = len(identified_reads)
            unided = len(unidentified_reads)
            missed = all_reads - len(reads_analysed)
            logger.info(
                "identified/unidentified/missed reads: {}/{}/{}.".format(
                    ided, unided, missed))
            logger.info("Unblocks (timely/late): {}/{}.".format(
                unblocks[True], unblocks[False]))
            logger.info("Total good reads: {}".format(target_count))

            print("Before channel loop")

            for channel in channels:
                channel_num = int(channel)
                # print(channel_num)
                read_block = yield from replay_client.call.get_raw(channel)
                if read_block is None:
                    logger.debug("Channel not in '{}' classification".format(
                        good_class))
                    # reset this channel's state: no more data is coming from
                    # this pore, so mark its flag as Empty
                    flag_array[channel_num] = flag.Empty.value
                    num_blocks_read[channel_num] = 0
                    num_query_read[channel_num] = 0
                    left_over_events[channel_num] = []
                # elif read_block.info in identified_reads:
                #     logger.debug("Skipping because I've seen before.")
                #     continue
                else:
                    logger.debug("Analysing {} samples".format(
                        len(read_block)))
                    sample_rate = read_block.sample_rate

                    #pico amperage data
                    events = minknow_event_detect(
                        read_block, read_block.sample_rate, **{
                            'window_lengths': [3, 6],
                            'thresholds': [1.4, 1.1],
                            'peak_height': 0.2
                        })

                    # flatten the detected events to a plain list of floats
                    # (column 2 of each event record)
                    events = [float(element[2]) for element in events.tolist()]

                    # store any leftover events
                    left_over_events[channel_num].extend(events)
                    total_events = left_over_events[channel_num]
                    # print("Type of total events")
                    # print(type(total_events))
                    try:
                        len(total_events) > block_size
                    except Exception as e:
                        print("Exception!")
                        print(e)
                        total_events = []

                    # check if the total events read in are greater than the block size
                    if len(total_events) > block_size:

                        # run while the length of the events is greater than the block_size
                        while len(total_events) > block_size:
                            # take one block's worth of events off the front
                            block_events = total_events[:block_size]
                            # keep the remainder for the next pass
                            total_events = total_events[block_size:]
                            # if the channel was empty before
                            if flag_array[channel_num] == flag.Empty.value:
                                flag_array[channel_num] = flag.Instrand_check.value
                                num_blocks_read[channel_num] = 1
                            # if the channel is supposed to be checked
                            elif flag_array[channel_num] == flag.Instrand_check.value:
                                num_blocks_read[channel_num] += 1
                            # if the channel is supposed to be ignored
                            elif flag_array[channel_num] == flag.Instrand_ignore.value:
                                logger.info("Reading data but ignoring pore: {}".format(channel_num))
                                continue
                            # if the channel is supposed to be cleared
                            elif flag_array[channel_num] == flag.Clearing.value:
                                logger.info("Clearing pore: {}".format(channel_num))
                                continue
                            # add a task with the correct block size
                            dtw_queue.add_task(dtwjob.dtw_job, block_events,
                                               warp, channel_num,
                                               len(block_events), disc_rate,
                                               logger, replay_client,
                                               num_blocks_read[channel_num],
                                               max_num_blocks, selection_type,
                                               channel, read_block,
                                               num_query_read[channel_num],
                                               max_dev)
                            num_query_read[channel_num] += block_size
                        # stash any unused remainder back in this channel's buffer
                        left_over_events[channel_num] = total_events
                    # if there is the correct number of events in the block
                    elif len(total_events) == block_size:
                        if flag_array[channel_num] == flag.Empty.value:
                            flag_array[channel_num] = flag.Instrand_check.value
                            num_blocks_read[channel_num] = 1
                        elif flag_array[channel_num] == flag.Instrand_check.value:
                            num_blocks_read[channel_num] += 1
                        elif flag_array[channel_num] == flag.Instrand_ignore.value:
                            logger.info("Reading data but ignoring pore: {}".format(channel_num))
                            continue
                        elif flag_array[channel_num] == flag.Clearing.value:
                            logger.info("Clearing pore: {}".format(channel_num))
                            continue

                        dtw_queue.add_task(
                            dtwjob.dtw_job, total_events, warp, channel_num,
                            len(total_events), disc_rate, logger,
                            replay_client, num_blocks_read[channel_num],
                            max_num_blocks, selection_type, channel,
                            read_block, num_query_read[channel_num], max_dev)
                        num_query_read[channel_num] += block_size
                        left_over_events[channel_num] = []
                    # if there are fewer events than a full block
                    elif len(total_events) < block_size:
                        left_over_events[channel_num] = total_events
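This and the next example use the pre-async/await coroutine style (yield from), so in the enclosing scope poll_data is presumably wrapped as an asyncio coroutine and run on an event loop. A minimal scheduling sketch, with illustrative names not taken from the source:

import asyncio

@asyncio.coroutine
def poll_data(port):
    yield from asyncio.sleep(5)  # stand-in for the body above

loop = asyncio.get_event_loop()
loop.run_until_complete(poll_data(5555))  # the port value is hypothetical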
Example #5
    def poll_data(port):
        align_client = yield from bwa.align_client(align_port)
        replay_client = yield from replayfast5.replay_client(replay_port)
        yield from asyncio.sleep(5)
        start_time = now()
        target_count = 0
        while True:
            time_saved = yield from replay_client.call.time_saved()
            total_pore_time = (now() - start_time) * len(channels)
            total_strand_time = yield from replay_client.call.cumulative_good_read_time()
            try:
                pore_time_saved = time_saved / total_pore_time
            except ZeroDivisionError:
                pore_time_saved = 0
            try:
                strand_time_saved = time_saved / total_strand_time
            except ZeroDivisionError:
                strand_time_saved = 0
            logger.info(
                "Total pore time saved: {:.2f}% [{:.2f}/{:.2f}]".format(
                    100 * pore_time_saved, time_saved, total_pore_time))
            logger.info(
                "Total strand time saved: {:.2f}% [{:.2f}/{:.2f}]".format(
                    100 * strand_time_saved, time_saved, total_strand_time))
            reads_analysed = set(identified_reads.keys()) | set(
                unidentified_reads.keys())
            all_reads = yield from replay_client.call.total_good_reads()
            ided = len(identified_reads)
            unided = len(unidentified_reads)
            missed = all_reads - len(reads_analysed)
            logger.info(
                "identified/unidentified/missed reads: {}/{}/{}.".format(
                    ided, unided, missed))
            logger.info("Unblocks (timely/late): {}/{}.".format(
                unblocks[True], unblocks[False]))
            logger.info("Total good reads: {}".format(target_count))

            for channel in channels:
                read_block = yield from replay_client.call.get_raw(channel)
                if read_block is None:
                    logger.debug("Channel not in '{}' classification".format(
                        good_class))
                elif read_block.info in identified_reads:
                    logger.debug("Skipping because I've seen before.")
                    continue
                else:
                    logger.debug("Analysing {} samples".format(
                        len(read_block)))
                    sample_rate = read_block.sample_rate
                    events = minknow_event_detect(
                        read_block, read_block.sample_rate, **{
                            'window_lengths': [3, 6],
                            'thresholds': [1.4, 1.1],
                            'peak_height': 0.2
                        })
                    if len(events) < 100:
                        continue

                    #TODO: do this in a process pool
                    score, basecall = pyscrap.basecall_events(events)
                    #TODO: check sanity of basecall
                    if len(basecall) < 100:
                        continue

                    alignment, returncode = yield from align_client.call.align(
                        basecall)
                    hits = []
                    if returncode != 0:
                        logger.warning('Alignment failed for {}'.format(
                            read_block.info))
                    else:
                        recs = [
                            x for x in alignment.split('\n')
                            if len(x) > 0 and x[0] != '@'
                        ]
                        for r in recs:
                            fields = r.split('\t')
                            if fields[2] != '*':
                                hits.append(fields[2])
                    logger.debug('{} aligns to {}'.format(
                        read_block.info, hits))

                    if len(hits) == 1:
                        identified_reads[read_block.info] = hits[0]
                        # maybe got 0 or >1 previously
                        #TODO: there are some edge cases here
                        try:
                            del unidentified_reads[read_block.info]
                        except KeyError:
                            pass
                    else:
                        unidentified_reads[read_block.info].extend(hits)

                    if read_block.info in identified_reads:
                        good_read = whitelist
                        if identified_reads[read_block.info] not in targets:
                            good_read = not whitelist

                        if not good_read:
                            logger.info(
                                'Attempting to unblock channel {} due to contaminant.'
                                .format(channel))
                            _, good_unblock = yield from replay_client.call.unblock(
                                channel, read_block.info, read_block.end)
                            unblocks[good_unblock] += 1
                        else:
                            target_count += 1
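The alignment handling above parses SAM text by hand: header lines start with '@', and column 3 (RNAME) of each record names the reference hit, with '*' meaning unmapped. The same logic as a small standalone helper, for clarity (a sketch, not part of the source):

def sam_hits(alignment_text):
    """Collect reference names (RNAME, column 3) from SAM-formatted text."""
    hits = []
    for line in alignment_text.splitlines():
        if not line or line.startswith('@'):
            continue  # skip header records and blank lines
        rname = line.split('\t')[2]
        if rname != '*':  # '*' marks an unmapped record
            hits.append(rname)
    return hits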