def make_basecall_input_multi(fast5_files, section='template', window=(-1, 0, 1),
                              trim=10, min_len=1000, max_len=9000,
                              event_detect=True, ed_params=None,
                              sloika_model=False):
    """Yield feature matrices (and their events) for a batch of .fast5 files.

    Like the single-file variant, but yields feature matrices rather than raw
    events. The point here is to be fully consistent with the currennt
    interface while allowing use of the python library.

    :param fast5_files: iterable of .fast5 filenames.
    :param section: read section to segment ('template' by default).
    :param window: event offsets used to build features (passed through to
        ``events_to_features``).
    :param trim: number of events to drop from each end of the read; 0 means
        no trimming.
    :param min_len: minimum number of (trimmed) feature rows to yield a read.
    :param max_len: maximum number of (trimmed) feature rows to yield a read.
    :param event_detect: run event detection on raw data; otherwise use the
        events already present in the file.
    :param ed_params: dict of ``minknow_event_detect`` keyword arguments;
        defaults to ``{'window_lengths': [3, 6], 'thresholds': [1.4, 1.1],
        'peak_height': 0.2}``.
    :param sloika_model: passed through to ``events_to_features``.

    :yields: tuples ``(filename, X, events)``.
    """
    # Avoid a shared mutable default argument.
    if ed_params is None:
        # These parameters make no sense to me, but hey-ho
        # TODO: expose to user
        ed_params = {'window_lengths': [3, 6], 'thresholds': [1.4, 1.1],
                     'peak_height': 0.2}
    for f in fast5_files:
        with Fast5(f) as fh:
            if event_detect:
                events = minknow_event_detect(
                    fh.get_read(raw=True), fh.sample_rate, **ed_params
                )
            else:
                events = fh.get_read()
            events, _ = segment(events, section=section)
            try:
                X = events_to_features(events, window=window,
                                       sloika_model=sloika_model)
            except TypeError:
                # Feature building failed for this read; skip it.
                continue
            # NOTE: the original sliced unconditionally with X[trim:-trim],
            # which for trim == 0 produced an empty slice (X[0:-0]) and
            # silently discarded every read.
            if trim:
                X = X[trim:-trim]
                events = events[trim:-trim]
            if len(X) < min_len or len(X) > max_len:
                continue
            yield f, X, events
def create_minknow_event_table(signal, sampling_freq, start_time,
                               window_lengths=(16, 40), thresholds=(8.0, 4.0),
                               peak_height=1.0):
    """Create new event table using minknow_event_detect event detection

    :param signal: list or array of signal in pA for finding events
    :param sampling_freq: sampling frequency of ADC in Hz
    :param start_time: start time from fast5 file (time in seconds * sampling
        frequency)
    :param window_lengths: t-test windows for minknow_event_detect
    :param thresholds: t-test thresholds for minknow_event_detect
    :param peak_height: peak height param for minknow_event_detect
    :return: Table of events without model state or move information
    """
    # NOTE(review): asserts are kept for interface compatibility, but they
    # vanish under `python -O`; consider raising ValueError/TypeError instead.
    assert np.sign(
        start_time) == 1, "Start time has to be positive: {}".format(
        start_time)
    assert type(
        signal[0]) is np.float64, "Signal needs to be in pA. Not ADC counts"

    events = minknow_event_detect(np.asarray(signal, dtype=float),
                                  sample_rate=sampling_freq, get_peaks=False,
                                  window_lengths=window_lengths,
                                  thresholds=thresholds,
                                  peak_height=peak_height)
    num_events = len(events)
    # Use zeros, not empty: 'model_state', 'move' and 'p_model_state' are
    # never filled below, and np.empty would leave them as uninitialized
    # garbage in the returned table.
    event_table = np.zeros(num_events, dtype=[('start', float),
                                              ('length', float),
                                              ('mean', float),
                                              ('stdv', float),
                                              ('model_state', 'S5'),
                                              ('move', '<i4'),
                                              ('raw_start', int),
                                              ('raw_length', int),
                                              ('p_model_state', float)])
    # Column-wise assignment instead of a per-event Python loop.
    start_offset = start_time / sampling_freq
    event_table['start'] = [e["start"] + start_offset for e in events]
    event_table['length'] = [e["length"] for e in events]
    event_table['mean'] = [e["mean"] for e in events]
    event_table['stdv'] = [e["stdv"] for e in events]
    # Event times are in seconds; convert back to raw sample indices.
    event_table['raw_start'] = np.round(
        [e["start"] * sampling_freq for e in events])
    event_table['raw_length'] = np.round(
        [e["length"] * sampling_freq for e in events])

    return event_table
def basecall_file(fname=None, event_detect=True):
    """Read event data from file and print scrappie basecall.

    :param fname: filename to read data from (if not given, assumed to be
        given on the command line).
    :param event_detect: do event detection?

    :returns: tuple (basecall score, sequence).
    """
    called_as_script = fname is None
    if called_as_script:
        # Invoked as an entry point: the filename comes from argv.
        fname = sys.argv[1]

    # magic numbers
    detection_opts = {
        'window_lengths': [4, 8],
        'thresholds': [1.5, 9.0],
        'peak_height': 0.2,
    }

    with Fast5(fname) as fh:
        events = (
            minknow_event_detect(fh.get_read(raw=True), fh.sample_rate,
                                 **detection_opts)
            if event_detect
            else fh.get_read()
        )
        events, _ = segment(events, section='template')
        results = basecall_events(events)

    if results is None:
        return None
    if called_as_script:
        print("{} score={}\n{}".format(fname, *results))
    else:
        return results
def basecall_file(fname=None, event_detect=True):
    """Read event data from file and print scrappie basecall.

    :param fname: filename to read data from (if not given, assumed to be
        given on the command line).
    :param event_detect: do event detection?

    :returns: tuple (basecall score, sequence).
    """
    run_from_cli = fname is None
    if run_from_cli:
        # Entry-point usage: take the target file from the command line.
        fname = sys.argv[1]

    # magic numbers
    ed_params = dict(
        window_lengths=[4, 8],
        thresholds=[1.5, 9.0],
        peak_height=0.2,
    )

    with Fast5(fname) as fh:
        if not event_detect:
            events = fh.get_read()
        else:
            raw = fh.get_read(raw=True)
            events = minknow_event_detect(raw, fh.sample_rate, **ed_params)
        events, _ = segment(events, section='template')
        results = basecall_events(events)

    if results is None:
        return None
    if not run_from_cli:
        return results
    print("{} score={}\n{}".format(fname, *results))
def poll_data(port):
    """Poll the replay client for raw data and feed fixed-size event blocks
    to the DTW worker queue.

    Maintains per-channel state in the module-level ``flag_array``-style
    globals (``left_over_events``, ``num_blocks_read``, ``num_query_read``)
    and logs running time-saved / read statistics each iteration.

    :param port: unused here; the replay client connects via the module-level
        ``replay_port``.  # NOTE(review): confirm whether `port` should be used.
    """
    # align_client = yield from bwa.align_client(align_port)
    print("POLL DATA")
    replay_client = yield from replayfast5.replay_client(replay_port)
    yield from asyncio.sleep(5)
    start_time = now()
    target_count = 0
    flag_array = []
    # Initialize the per-channel flag and left-over event stores
    # (channels are 1-indexed, hence 513 slots).
    for i in range(0, 513):
        flag_array.append(0)
        left_over_events.append([])
    print("Before while loop")
    while True:
        time_saved = yield from replay_client.call.time_saved()
        total_pore_time = (now() - start_time) * len(channels)
        total_strand_time = yield from replay_client.call.cumulative_good_read_time(
        )
        # Best-effort ratios: zero denominators simply report 0% saved.
        try:
            pore_time_saved = time_saved / total_pore_time
        except ZeroDivisionError:
            pore_time_saved = 0
        try:
            strand_time_saved = time_saved / total_strand_time
        except ZeroDivisionError:
            strand_time_saved = 0
        logger.info(
            "Total pore time saved: {:.2f}% [{:.2f}/{:.2f}]".format(
                100 * pore_time_saved, time_saved, total_pore_time))
        logger.info(
            "Total strand time saved: {:.2f}% [{:.2f}/{:.2f}]".format(
                100 * strand_time_saved, time_saved, total_strand_time))
        reads_analysed = set(identified_reads.keys()) | set(
            unidentified_reads.keys())
        all_reads = yield from replay_client.call.total_good_reads()
        ided = len(identified_reads)
        unided = len(unidentified_reads)
        missed = all_reads - len(reads_analysed)
        logger.info(
            "identified/unidentified/missed reads: {}/{}/{}.".format(
                ided, unided, missed))
        logger.info("Unblocks (timely/late): {}/{}.".format(
            unblocks[True], unblocks[False]))
        logger.info("Total good reads: {}".format(target_count))
        print("Before channel loop")
        for channel in channels:
            channel_num = int(channel)
            read_block = yield from replay_client.call.get_raw(channel)
            if read_block is None:
                logger.debug("Channel not in '{}' classification".format(
                    good_class))
                # Pore is no longer producing data of interest: reset all
                # per-channel state so a fresh strand starts cleanly.
                flag_array[channel_num] = flag.Empty.value
                num_blocks_read[channel_num] = 0
                num_query_read[channel_num] = 0
                left_over_events[channel_num] = []
            else:
                logger.debug("Analysing {} samples".format(
                    len(read_block)))
                # pico-amperage data -> event table
                events = minknow_event_detect(
                    read_block, read_block.sample_rate, **{
                        'window_lengths': [3, 6],
                        'thresholds': [1.4, 1.1],
                        'peak_height': 0.2
                    })
                # Keep only the event means (column 2) for DTW.
                events = [float(element[2]) for element in events.tolist()]
                # Prepend any events left over from the previous poll.
                left_over_events[channel_num].extend(events)
                total_events = left_over_events[channel_num]
                # (A no-op `try: len(total_events) > block_size` statement was
                # removed here; it compared and discarded the result.)
                if len(total_events) > block_size:
                    # Dispatch as many full blocks as are available.
                    while len(total_events) > block_size:
                        block_events = total_events[0:block_size]
                        # BUGFIX: was total_events[block_size + 1:], which
                        # silently dropped one event per dispatched block.
                        total_events = total_events[block_size:]
                        if flag_array[channel_num] == flag.Empty.value:
                            # First block of a new strand.
                            flag_array[channel_num] = flag.Instrand_check.value
                            num_blocks_read[channel_num] = 1
                        elif flag_array[channel_num] == flag.Instrand_check.value:
                            num_blocks_read[channel_num] += 1
                        elif flag_array[channel_num] == flag.Instrand_ignore.value:
                            logger.info(
                                "Reading data but ignoring pore: {}".format(
                                    channel_num))
                            continue
                        elif flag_array[channel_num] == flag.Clearing.value:
                            logger.info(
                                "Clearing Pore: {}".format(channel_num))
                            continue
                        # Hand one full block to the DTW worker pool.
                        dtw_queue.add_task(
                            dtwjob.dtw_job, block_events, warp, channel_num,
                            len(block_events), disc_rate, logger,
                            replay_client, num_blocks_read[channel_num],
                            max_num_blocks, selection_type, channel,
                            read_block, num_query_read[channel_num], max_dev)
                        num_query_read[channel_num] += block_size
                    # Stash whatever did not fill a block for the next poll.
                    left_over_events[channel_num] = total_events
                elif len(total_events) == block_size:
                    # Exactly one full block available.
                    if flag_array[channel_num] == flag.Empty.value:
                        flag_array[channel_num] = flag.Instrand_check.value
                        num_blocks_read[channel_num] = 1
                    elif flag_array[channel_num] == flag.Instrand_check.value:
                        num_blocks_read[channel_num] += 1
                    elif flag_array[channel_num] == flag.Instrand_ignore.value:
                        logger.info(
                            "Reading data but ignoring pore: {}".format(
                                channel_num))
                        continue
                    elif flag_array[channel_num] == flag.Clearing.value:
                        logger.info(
                            "Clearing Pore: {}".format(channel_num))
                        continue
                    # BUGFIX: was len(block_events), a NameError on this path
                    # (block_events is only bound in the > block_size branch).
                    dtw_queue.add_task(
                        dtwjob.dtw_job, total_events, warp, channel_num,
                        len(total_events), disc_rate, logger, replay_client,
                        num_blocks_read[channel_num], max_num_blocks,
                        selection_type, channel, read_block,
                        num_query_read[channel_num], max_dev)
                    num_query_read[channel_num] += block_size
                    left_over_events[channel_num] = []
                elif len(total_events) < block_size:
                    # Not enough for a block yet; keep for the next poll.
                    left_over_events[channel_num] = total_events
def poll_data(port):
    """Poll the replay client for reads, basecall and align them, and unblock
    channels carrying contaminant reads.

    Updates the module-level ``identified_reads`` / ``unidentified_reads`` /
    ``unblocks`` bookkeeping and logs running time-saved statistics.

    :param port: unused here; clients connect via the module-level
        ``align_port`` / ``replay_port``.  # NOTE(review): confirm intent.
    """
    align_client = yield from bwa.align_client(align_port)
    replay_client = yield from replayfast5.replay_client(replay_port)
    yield from asyncio.sleep(5)
    start_time = now()
    target_count = 0
    while True:
        time_saved = yield from replay_client.call.time_saved()
        total_pore_time = (now() - start_time) * len(channels)
        total_strand_time = yield from replay_client.call.cumulative_good_read_time(
        )
        # Best-effort ratios: zero denominators simply report 0% saved.
        try:
            pore_time_saved = time_saved / total_pore_time
        except ZeroDivisionError:
            pore_time_saved = 0
        try:
            strand_time_saved = time_saved / total_strand_time
        except ZeroDivisionError:
            strand_time_saved = 0
        logger.info(
            "Total pore time saved: {:.2f}% [{:.2f}/{:.2f}]".format(
                100 * pore_time_saved, time_saved, total_pore_time))
        logger.info(
            "Total strand time saved: {:.2f}% [{:.2f}/{:.2f}]".format(
                100 * strand_time_saved, time_saved, total_strand_time))
        reads_analysed = set(identified_reads.keys()) | set(
            unidentified_reads.keys())
        all_reads = yield from replay_client.call.total_good_reads()
        ided = len(identified_reads)
        unided = len(unidentified_reads)
        missed = all_reads - len(reads_analysed)
        logger.info(
            "identified/unidentified/missed reads: {}/{}/{}.".format(
                ided, unided, missed))
        logger.info("Unblocks (timely/late): {}/{}.".format(
            unblocks[True], unblocks[False]))
        logger.info("Total good reads: {}".format(target_count))
        for channel in channels:
            read_block = yield from replay_client.call.get_raw(channel)
            if read_block is None:
                logger.debug("Channel not in '{}' classification".format(
                    good_class))
            elif read_block.info in identified_reads:
                logger.debug("Skipping because I've seen before.")
                continue
            else:
                logger.debug("Analysing {} samples".format(
                    len(read_block)))
                events = minknow_event_detect(
                    read_block, read_block.sample_rate, **{
                        'window_lengths': [3, 6],
                        'thresholds': [1.4, 1.1],
                        'peak_height': 0.2
                    })
                # Too few events to basecall usefully.
                if len(events) < 100:
                    continue
                #TODO: do this in a process pool
                score, basecall = pyscrap.basecall_events(events)
                #TODO: check sanity of basecall
                if len(basecall) < 100:
                    continue
                alignment, returncode = yield from align_client.call.align(
                    basecall)
                hits = []
                if returncode != 0:
                    logger.warning('Alignment failed for {}'.format(
                        read_block.info))
                else:
                    # Collect reference names from non-header SAM records.
                    recs = [
                        x for x in alignment.split('\n')
                        if len(x) > 0 and x[0] != '@'
                    ]
                    for r in recs:
                        fields = r.split('\t')
                        if fields[2] != '*':
                            hits.append(fields[2])
                logger.debug('{} aligns to {}'.format(
                    read_block.info, hits))
                if len(hits) == 1:
                    # Unambiguous alignment: promote to identified.
                    identified_reads[read_block.info] = hits[0]
                    # maybe got 0 or >1 previously
                    #TODO: there are some edge cases here
                    try:
                        del unidentified_reads[read_block.info]
                    except KeyError:
                        pass
                else:
                    unidentified_reads[read_block.info].extend(hits)

                if read_block.info in identified_reads:
                    # Whitelist mode: on-target reads are "good"; otherwise
                    # the polarity is inverted.
                    good_read = whitelist
                    if identified_reads[read_block.info] not in targets:
                        good_read = not whitelist
                    if not good_read:
                        logger.info(
                            'Attempting to unblock channel {} due to contaminant.'
                            .format(channel))
                        _, good_unblock = yield from replay_client.call.unblock(
                            channel, read_block.info, read_block.end)
                        unblocks[good_unblock] += 1
                    else:
                        target_count += 1