def processfiles(self): self.logger.info("Process Files Inititated") self.counter = 0 self.targets = [] self.masterdf = pd.DataFrame(columns=['seqid', 'position', 'coverage']) while self.running: currenttime = time.time() #for fastqfile, createtime in tqdm(sorted(self.creates.items(), key=lambda x: x[1])): fastqfilelist = list() for fastqfile, createtime in sorted(self.creates.items(), key=lambda x: x[1]): delaytime = 0 # file created 5 sec ago, so should be complete. For simulations we make the time longer. if (int(createtime) + delaytime < time.time()): self.logger.info(fastqfile) del self.creates[fastqfile] self.counter += 1 fastqfilelist.append(fastqfile) #print (fastqfile,md5Checksum(fastqfile), "\n\n\n\n") targets, self.masterdf = parse_fastq_file(fastqfilelist, self.args, logging, self.masterdf) print(targets) print(self.targets) if len(targets) > len(self.targets): updated_targets = set(targets) - set(self.targets) update_message = "Updating targets with {}".format( nice_join(updated_targets, conjunction="and")) self.logger.info(update_message) if not self.args.simulation: send_message(self.connection, update_message, Severity.WARN) write_new_toml(self.args, targets) self.targets = [] self.targets = targets.copy() if self.masterdf.shape[0] > 0 and self.masterdf.shape[0] == len( self.targets): # Every target is covered at the desired coverage level. self.logger.info( "Every target is covered at at least {}x".format( self.args.depth)) if not self.args.simulation: self.connection.protocol.stop_protocol() send_message( self.connection, "Iter Align has stopped the run as all targets should be covered by at least {}x" .format(self.args.depth), Severity.WARN, ) #parse_fastq_file(fastqfile, self.rundict, self.fastqdict, self.args, self.header, self.MinotourConnection) #self.args.files_processed += 1 if currenttime + 5 > time.time(): time.sleep(5)
def simple_analysis(client, batch_size=512, throttle=0.1, unblock_duration=0.1):
    """Analysis function

    Parameters
    ----------
    client : read_until.ReadUntilClient
        An instance of the ReadUntilClient object
    batch_size : int
        The number of reads to be retrieved from the ReadUntilClient at a time
    throttle : int or float
        The interval, in seconds, between requests to the ReadUntilClient
    unblock_duration : int or float
        Time, in seconds, to apply unblock voltage

    Returns
    -------
    None
    """
    logger = logging.getLogger(__name__)
    send_message(
        client.connection,
        "Read Until sending Unblock All Messages. All reads will be prematurely truncated. This will affect a live sequencing run.",
        Severity.WARN,
    )
    while client.is_running:
        r = 0
        t0 = timer()

        for r, (channel, read) in enumerate(
                client.get_read_chunks(
                    batch_size=batch_size,
                    last=True,
                ),
                start=1,
        ):
            client.unblock_read(channel, read.number, read_id=read.id, duration=unblock_duration)
            client.stop_receiving_read(channel, read.number)

        t1 = timer()
        if r:
            logger.info("Took {:.6f}s for {} reads".format(t1 - t0, r))
        # limit the rate at which we make requests
        if t0 + throttle > t1:
            time.sleep(throttle + t0 - t1)
    else:
        send_message(client.connection, "Read Until Unblock All Disconnected.", Severity.WARN)
        logger.info("Finished analysis of reads as client stopped.")
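
# For context, a minimal sketch of how this unblock-all analysis might be attached to a
# client, mirroring the ReadUntilClient construction and run_workflow() call used in
# main() further down; the worker count, run time and channel range are placeholders.
def _run_unblock_all_sketch(client, workers=1, run_time=60):
    import functools

    analysis_worker = functools.partial(
        simple_analysis, client, batch_size=512, throttle=0.1, unblock_duration=0.1
    )
    run_workflow(
        client,
        analysis_worker,
        workers,
        run_time,
        runner_kwargs={"first_channel": 1, "last_channel": 512},
    )
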
def run(parser, args):
    args.tomlfile = args.toml
    args.toml = toml.load(args.toml)
    print(args)

    # TODO: Move logging config to separate configuration file
    # set up logging to file
    logging.basicConfig(
        level=logging.DEBUG,
        format='%(levelname)s::%(asctime)s::%(name)s::%(message)s',
        filename=args.log_file,
        filemode='w')

    # define a Handler that writes INFO messages or higher to the sys.stderr
    console = logging.StreamHandler()
    console.setLevel(logging.INFO)

    # set a format which is simpler for console use
    formatter = logging.Formatter('%(name)-15s: %(levelname)-8s %(message)s')
    console.setFormatter(formatter)

    # add the handler to the root logger
    logging.getLogger('').addHandler(console)

    # Start by logging sys.argv and the parameters used
    logger = logging.getLogger("Manager")
    logger.info(" ".join(sys.argv))
    print_args(args, logger=logger)

    logger.info("Initialising iterAlign.")

    logger.info("Setting up FastQ monitoring.")

    #### Check if a run is active - if not, wait.

    args.simulation = True
    connection = None

    if args.watch is None:
        args.simulation = False
        logger.info("Creating rpc connection for device {}.".format(args.device))
        try:
            connection, messageport = get_rpc_connection(args.device)
        except ValueError as e:
            print(e)
            sys.exit(1)

        send_message(connection, "Iteralign Connected to MinKNOW", Severity.WARN)

        logger.info("Loaded RPC")
        while parse_message(connection.acquisition.current_status())['status'] != "PROCESSING":
            time.sleep(1)

        #### Check if we know where data is being written to, if not... wait
        args.watch = parse_message(
            connection.acquisition.get_acquisition_info())['config_summary']['reads_directory']

    else:
        messageport = ""

    event_handler = FastqHandler(args, logging, messageport, connection)
    # This block handles the fastq
    observer = Observer()
    observer.schedule(event_handler, path=args.watch, recursive=True)
    observer.daemon = True

    try:
        observer.start()
        logger.info("FastQ Monitoring Running.")
        while 1:
            time.sleep(1)

    except KeyboardInterrupt:
        logger.info("Exiting - Will take a few seconds to clean up!")
        observer.stop()
        observer.join()
        os._exit(0)
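
# FastqHandler is defined elsewhere in the project. The class below is a rough,
# hypothetical sketch of the watchdog handler that processfiles() above depends on: it
# records the creation time of every new fastq file in self.creates (which
# processfiles() drains) and runs processfiles() on a background thread. The
# file-extension filter and threading details are assumptions, not the project's actual
# implementation.
import threading

from watchdog.events import FileSystemEventHandler


class _FastqHandlerSketch(FileSystemEventHandler):
    # Reuse the module-level processfiles() defined above as the worker method.
    processfiles = processfiles

    def __init__(self, args, logging, messageport, connection):
        self.args = args
        self.connection = connection
        self.messageport = messageport
        self.logger = logging.getLogger("FastqHandler")
        self.running = True
        self.creates = {}
        # Drain self.creates continuously while watchdog keeps feeding it.
        self.t = threading.Thread(target=self.processfiles)
        self.t.daemon = True
        self.t.start()

    def on_created(self, event):
        # Record when each fastq file appeared; processfiles() waits for the
        # configured delay before reading it.
        if event.src_path.endswith((".fastq", ".fastq.gz", ".fq", ".fq.gz")):
            self.creates[event.src_path] = time.time()
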
def run(parser, args):
    # new code block: change the reference path within the args.toml file into the args.mindex path
    d = toml.load(args.toml)

    print(d["conditions"]["reference"])

    args.tomlfile = args.toml
    args.toml = toml.load(args.toml)
    print(args)

    # TODO: Move logging config to separate configuration file
    # set up logging to file
    logging.basicConfig(
        level=logging.DEBUG,
        format='%(levelname)s::%(asctime)s::%(name)s::%(message)s',
        filename=args.log_file,
        filemode='w')

    # define a Handler that writes INFO messages or higher to the sys.stderr
    console = logging.StreamHandler()
    console.setLevel(logging.INFO)

    # set a format which is simpler for console use
    formatter = logging.Formatter('%(name)-15s: %(levelname)-8s %(message)s')
    console.setFormatter(formatter)

    # add the handler to the root logger
    logging.getLogger('').addHandler(console)

    # Start by logging sys.argv and the parameters used
    logger = logging.getLogger("Manager")
    logger.info(" ".join(sys.argv))
    print_args(args, logger=logger)

    logger.info("Initialising iterAlign.")

    logger.info("Setting up FastQ monitoring.")

    #### Check if a run is active - if not, wait.

    args.simulation = True
    connection = None

    # set default message severity level.
    severity = 2

    if args.watch is None:
        args.simulation = False
        logger.info("Creating rpc connection for device {}.".format(args.device))
        try:
            connection, messageport = get_rpc_connection(args.device)
        except ValueError as e:
            print(e)
            sys.exit(1)

        #send_message_port("Iteralign Connected to MinKNOW", args.host, messageport)
        send_message(connection, "Iteralign Connected to MinKNOW.", Severity.WARN)

        logger.info("Loaded RPC")
        while parse_message(connection.acquisition.current_status())['status'] != "PROCESSING":
            time.sleep(1)

        ### Check if we know where data is being written to, if not... wait
        args.watch = parse_message(
            connection.acquisition.get_acquisition_info())['config_summary']['reads_directory']

    else:
        messageport = ""

    event_handler = FastqHandler(args, logging, messageport, connection)
    # This block handles the fastq
    observer = Observer()
    observer.schedule(event_handler, path=args.watch, recursive=True)
    observer.daemon = True

    try:
        observer.start()
        logger.info("FastQ Monitoring Running.")
        while 1:
            time.sleep(1)

    except KeyboardInterrupt:
        logger.info("Exiting - Will take a few seconds to clean up!")
        observer.stop()
        observer.join()

        if args.keepfiles:
            logging.info("The 'keepfiles' argument was set, files generated by the classifier have been retained")
        else:
            if os.path.isdir(args.path):
                for path, dirs, files in os.walk(args.path):
                    for f in files:
                        if f.startswith(args.prefix):
                            os.unlink(os.path.join(path, f))
                            logging.info("file removed: {}".format(f))

            if os.path.isdir("./"):
                for path, dirs, files in os.walk("./"):
                    for f in files:
                        if f.endswith(args.creport):
                            os.unlink(os.path.join(path, f))
                            logging.info("file removed: {}".format(f))

            logging.info("All files generated by the classifier have been removed.")

        os._exit(0)
def processfiles(self): self.logger.info("Process Files Inititated") self.counter = 1 self.targets = [] self.masterdf = pd.DataFrame(columns=['seqid', 'position', 'coverage']) self.taxid_entries = 0 self.downloaded_set = set() self.length_dict = {} self.coverage_sum = {} if self.args.references: logging.info("References argument provided. Will download references genomes.") self.downloaded_set = set(self.args.references) logging.info(self.downloaded_set) self.url_list = url_list_generation(self.args, self.args.references) self.length_dict.update(download_references(self.args, self.url_list, self.downloaded_set)) generate_mmi(self.args, self.counter) while self.running: currenttime = time.time() # for fastqfile, createtime in tqdm(sorted(self.creates.items(), key=lambda x: x[1])): fastqfilelist = list() for fastqfile, createtime in sorted(self.creates.items(), key=lambda x: x[1]): delaytime = 0 # file created 5 sec ago, so should be complete. For simulations we make the time longer. if (int(createtime) + delaytime < time.time()): self.logger.info(fastqfile) del self.creates[fastqfile] self.counter += 1 fastqfilelist.append(fastqfile) # print (fastqfile,md5Checksum(fastqfile), "\n\n\n\n") # as long as there are files within the args.watch directory to parse if fastqfilelist: print(self.downloaded_set) targets, self.downloaded_set, self.taxid_entries, self.coverage_sum = parse_fastq_file(fastqfilelist, self.args, logging, self.length_dict, self.downloaded_set, self.taxid_entries, self.coverage_sum, self.connection) print(targets) print(self.targets) if len(targets) > len(self.targets): updated_targets = set(targets) - set(self.targets) update_message = "Updating targets with {}".format(nice_join(updated_targets, conjunction="and")) self.logger.info(update_message) if not self.args.simulation: #send_message_port(update_message, self.args.host, self.messageport) send_message(self.connection, update_message, Severity.WARN) write_new_toml(self.args, targets) self.targets = [] self.targets = targets.copy() if self.masterdf.shape[0] > 0 and self.masterdf.shape[0] == len(self.targets): # Every target is covered at the desired coverage level. self.logger.info("Every target is covered at at least {}x".format(self.args.depth)) if not self.args.simulation: self.connection.protocol.stop_protocol() #send_message_port( # "Iter Align has stopped the run as all targets should be covered by at least {}x".format( # self.args.depth), self.args.host, self.messageport) send_message(self.connection, "Iter Align has stopped the run as all targets should be covered by at least {}x".format( self.args.depth), Severity.WARN) # parse_fastq_file(fastqfile, self.rundict, self.fastqdict, self.args, self.header, self.MinotourConnection) # self.args.files_processed += 1 if currenttime + 5 > time.time(): time.sleep(5)
def parse_fastq_file(fastqfileList, args, logging, length_dict, taxID_set, counter, coverage_sum, connection):

    logger = logging.getLogger("ParseFastq")
    logger.info(fastqfileList)
    logger.info(args.toml['conditions']['reference'])

    with open(os.devnull, 'w') as devnull:

        # convert the 'fastqfileList' into a string valid for the list of fastq files to be read by centrifuge
        fastq_str = ",".join(fastqfileList)

        # centrifuge command to classify reads in the fastq files found by watchdog
        centrifuge_cmd = "centrifuge -p {} -x {} -q {}".format(args.threads, args.cindex, fastq_str)

        # show the centrifuge command in the terminal
        logging.info(centrifuge_cmd)

        # start time of centrifuge to track the time centrifuge requires to classify reads
        centrifuge_start_time = time.time()

        # subprocess for 'centrifuge_cmd'
        proc = subprocess.Popen(
            centrifuge_cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            stdin=subprocess.PIPE,
            shell=True,
            # Aliased by `text=True` in 3.7
            universal_newlines=True,
        )
        out, err = proc.communicate()
        proc.stdout.close()

        # confirm that the centrifuge pipeline has finished and show the time of execution
        logging.info("Post centrifuge run: {} seconds".format(time.time() - centrifuge_start_time))

        output_fields = ["readID", "seqID", "taxID", "hitLength", "queryLength", "numMatches"]

        # create the DataFrame from the centrifuge output using 'output_fields' as the column headers
        out_df = pd.read_csv(StringIO(out), sep="\t", usecols=output_fields)

        # create a dataframe from the args.creport 'centrifuge_report.tsv'
        report_df = pd.read_csv(args.creport, sep="\t", usecols=["name", "genomeSize", "taxID"])

        # merge both dataframes together
        new_df = pd.merge(out_df, report_df, on="taxID")

        # only reads that uniquely align to a species will be used; reads with numMatches != 1 are filtered out
        reject_df = new_df[new_df.numMatches != 1]
        reject_df["reason"] = "Multiple reference genome alignments"
        # reject_df.loc[:, "reason"] = "Multiple reference genome alignments"

        # genomeSize == 0 indicates a read classified above the species taxon, so these reads are removed
        intermediate_df = new_df[new_df.genomeSize == 0]
        intermediate_df = intermediate_df[intermediate_df.numMatches == 1]
        intermediate_df["reason"] = "Read aligns to non-species taxon"

        # log the reads that were removed into a separate file
        reject_df = reject_df.append(intermediate_df)

        if os.path.isfile(args.path + args.prefix + args.reject):
            reject_df.to_csv(args.path + args.prefix + args.reject, sep="\t", mode="a", index=False, header=None)
        else:
            reject_df.to_csv(args.path + args.prefix + args.reject, sep="\t", mode="a", index=False, header=True)

        # reads that uniquely align to one species are logged into a file
        new_df = new_df[new_df.numMatches == 1]

        if os.path.isfile(args.path + args.prefix + args.readfile):
            new_df.to_csv(args.path + args.prefix + args.readfile, sep="\t", mode="a", index=None, header=None)
        else:
            new_df.to_csv(args.path + args.prefix + args.readfile, sep="\t", mode="a", index=None, header=True)

        logging.info("reject file made")

        # read all valid reads from all iterations so far back into memory
        all_reads_df = pd.read_csv(args.path + args.prefix + args.readfile, sep="\t")

        # count the number of reads classified to each taxID found within 'all_reads_df' and only keep the taxIDs with a count above args.threshold
        taxid_count = all_reads_df.groupby("taxID").agg({"taxID": "count"})
        taxid_result_set = set(taxid_count[taxid_count["taxID"].ge(args.threshold)].index.values.tolist())

        # call in the taxIDs that already have a reference genome in the mmi
        downloaded_set = taxID_set

        # if taxIDs were found above args.threshold
        if taxid_result_set:
            logging.info("taxIDs found above specified threshold")

            # locate new taxIDs that were not found in previous iterations
            difference_set = taxid_result_set - downloaded_set
            downloaded_set |= taxid_result_set

            # if novel taxIDs were found
            if difference_set:
                # load any existing taxid record first so earlier iterations are preserved,
                # then rewrite the file with the new iteration added
                tidfile_path = args.path + args.prefix + args.tidfile
                if os.path.isfile(tidfile_path):
                    d = toml.load(tidfile_path)
                    d.setdefault("taxid", {})["iteration.{}".format(counter)] = list(difference_set)
                else:
                    d = {"taxid": {"iteration.{}".format(counter): list(difference_set)}}
                with open(tidfile_path, "w") as f:
                    toml.dump(d, f)

                logging.info("new taxids found: {}".format(difference_set))
                logging.info("Downloading reference genomes")

                url_list = url_list_generation(args, difference_set)

                # download the reference genomes into a single file
                length_dict.update(download_references(args, url_list, difference_set))

                logging.info("Generating mmi")
                generate_mmi(args, counter)

                update_message = "Updated the minimap MMI to {}".format(args.toml['conditions']['reference'])
                logging.info(update_message)
                if not args.simulation:
                    #send_message_port(update_message, args.host, messageport)
                    send_message(connection, update_message, Severity.WARN)
            else:
                # show in terminal that no new taxIDs were found
                logging.info("No new taxIDs were identified")
        else:
            logging.info("No taxIDs found above threshold.")

        # all of the new code has been parsed, the rest follows the standard 'iteralign.py' script
        logging.info("new code parsing completed\n\ncommencing iteralign")

        minimapcmd = ["minimap2", "-ax", "map-ont", "-t {}".format(args.threads), args.toml['conditions']['reference']]  # " ".join(fastqfilelist)]
        minimapcmd.extend(fastqfileList)
        logging.info(" ".join(minimapcmd))
        minimapoutput = subprocess.Popen(minimapcmd, stdout=subprocess.PIPE, stderr=devnull)

        samcmd = ["samtools", "view", "-bS"]
        samoutput = subprocess.Popen(samcmd, stdin=minimapoutput.stdout, stdout=subprocess.PIPE, stderr=devnull)
        #samsortcmd = ["samtools", "sort", "-@2", "-o", "sortedbam.bam"]
        samsortcmd = ["samtools", "sort", "-@{}".format(args.threads)]
        samsortoutput = subprocess.Popen(samsortcmd, stdin=samoutput.stdout, stdout=subprocess.PIPE, stderr=devnull)
        samdepthcmd = ["samtools", "depth", "/dev/stdin"]
        samdepthoutput = subprocess.Popen(samdepthcmd, stdin=samsortoutput.stdout, stdout=subprocess.PIPE, stderr=devnull, universal_newlines=True)
        minimapoutput.stdout.close()
        samoutput.stdout.close()
        samsortoutput.stdout.close()

        iter_depth = (l.strip().split("\t") for l in samdepthoutput.stdout)
        parse_iter = ((x[0], int(x[-1])) for x in iter_depth)

        d = defaultdict(int)
        d.update(coverage_sum)

        for name, depth in parse_iter:
            d[name] += depth

        depth_dict = {k: d[k] / length_dict[k] for k in length_dict.keys() & d}

        with open(args.path + args.prefix + args.coveragefile, "a") as fh:
            for k, v in depth_dict.items():
                fh.write("{}\t{}\t{}\n".format(counter, k, v))

        targets = [k for k, v in depth_dict.items() if v > args.depth]
        logging.info(targets)

        counter += 1

    logging.info("Finished processing {}.".format(" ".join(fastqfileList)))

    return targets, downloaded_set, counter, d
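
# To make the coverage arithmetic above concrete: samtools depth emits one
# "name<TAB>position<TAB>depth" line per covered base, the generator pipeline sums the
# depth column per reference sequence, and dividing by the genome length from
# length_dict gives mean coverage. A small self-contained example with made-up numbers:
def _coverage_arithmetic_example():
    from collections import defaultdict

    # Three made-up depth lines for a 10 bp reference called "chr_tiny".
    depth_lines = ["chr_tiny\t1\t4", "chr_tiny\t2\t6", "chr_tiny\t3\t5"]
    length_dict = {"chr_tiny": 10}

    d = defaultdict(int)
    for line in depth_lines:
        name, _pos, depth = line.strip().split("\t")
        d[name] += int(depth)

    # (4 + 6 + 5) / 10 = 1.5x mean coverage, so "chr_tiny" would only become a target
    # once this value exceeds args.depth.
    depth_dict = {k: d[k] / length_dict[k] for k in length_dict.keys() & d}
    assert depth_dict["chr_tiny"] == 1.5
    return depth_dict
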
def simple_analysis(
    client,
    batch_size=512,
    throttle=0.1,
    unblock_duration=0.5,
    cl=None,
    pf=None,
    live_toml_path=None,
    flowcell_size=512,
    dry_run=False,
    run_info=None,
    conditions=None,
    mapper=None,
    caller_kwargs=None,
):
    """Analysis function

    Parameters
    ----------
    client : read_until.ReadUntilClient
        An instance of the ReadUntilClient object
    batch_size : int
        The number of reads to be retrieved from the ReadUntilClient at a time
    throttle : int or float
        The number of seconds interval between requests to the ReadUntilClient
    unblock_duration : int or float
        Time, in seconds, to apply unblock voltage
    cl : logging.Logger
        Log file to log chunk data to
    pf : logging.Logger
        Log file to log alignments to
    live_toml_path : str
        Path to a `live` TOML configuration file for read until. If this exists when
        the run starts it will be deleted
    flowcell_size : int
        The number of channels on the flowcell, 512 for MinION and 3000 for PromethION
    dry_run : bool
        If True unblocks are replaced with `stop_receiving` commands
    run_info : dict
        Dictionary of {channel: index} where index corresponds to an index in `conditions`
    conditions : list
        Experimental conditions as List of namedtuples.
    mapper : mappy.Aligner
    caller_kwargs : dict

    Returns
    -------
    None
    """
    # Init logger for this function
    logger = logging.getLogger(__name__)

    # Delete live TOML file if it exists
    live_toml_path = Path(live_toml_path)
    if live_toml_path.is_file():
        live_toml_path.unlink()

    # TODO: test this
    # Write channels.toml
    d = {
        "conditions": {
            str(v): {
                "channels": [],
                "name": conditions[v].name
            }
            for k, v in run_info.items()
        }
    }
    for k, v in run_info.items():
        d["conditions"][str(v)]["channels"].append(k)

    channels_out = str(client.mk_run_dir / "channels.toml")
    with open(channels_out, "w") as fh:
        fh.write("# This file is written as a record of the condition each channel is assigned.\n")
        fh.write("# It may be changed or overwritten if you restart Read Until.\n")
        fh.write("# In the future this file may become a CSV file.\n")
        toml.dump(d, fh)

    caller = Caller(**caller_kwargs)
    # What if there is no reference or an empty MMI

    # DefaultDict[int: collections.deque[Tuple[str, ndarray]]]
    #  tuple is (read_id, previous_signal)
    # TODO: tuple should use read_number instead
    previous_signal = defaultdict(functools.partial(deque, maxlen=1))
    # count how often a read is seen
    tracker = defaultdict(Counter)
    # decided
    decided_reads = {}
    strand_converter = {1: "+", -1: "-"}

    read_id = ""

    # TODO: partial-ise / lambda unblock to take the unblock duration
    if dry_run:
        decision_dict = {
            "stop_receiving": client.stop_receiving_read,
            "proceed": None,
            "unblock": client.stop_receiving_read,
        }
        send_message(client.connection, "This is a test run. No unblocks will occur.", Severity.WARN)
    else:
        decision_dict = {
            "stop_receiving": client.stop_receiving_read,
            "proceed": None,
            "unblock": lambda c, n: client.unblock_read(c, n, unblock_duration, read_id),
        }
        send_message(client.connection, "This is a live run. Unblocks will occur.", Severity.WARN)
    decision_str = ""
    below_threshold = False
    exceeded_threshold = False

    l_string = (
        "client_iteration",
        "read_in_loop",
        "read_id",
        "channel",
        "read_number",
        "seq_len",
        "counter",
        "mode",
        "decision",
        "condition",
        "min_threshold",
        "count_threshold",
        "start_analysis",
        "end_analysis",
        "timestamp",
    )
    cl.debug("\t".join(l_string))
    l_string = "\t".join(("{}" for _ in l_string))
    loop_counter = 0
    while client.is_running:
        if live_toml_path.is_file():
            # Reload the TOML config from the *_live file
            run_info, conditions, new_reference, _ = get_run_info(live_toml_path, flowcell_size)

            # Check the reference path if different from the loaded mapper
            if new_reference != mapper.index:
                old_reference = mapper.index
                # Log to file and MinKNOW interface
                logger.info("Reloading mapper")
                send_message(client.connection, "Reloading mapper. Read Until paused.", Severity.INFO)

                # Update mapper client.
                mapper = CustomMapper(new_reference)
                # Log on success
                logger.info("Reloaded mapper")

                # If we've reloaded a reference, delete the previous one
                if old_reference:
                    logger.info("Deleting old mmi {}".format(old_reference))
                    # We now delete the old mmi file.
                    Path(old_reference).unlink()
                    logger.info("Old mmi deleted.")

        # TODO: Fix the logging to just one of the two in use
        if not mapper.initialised:
            time.sleep(throttle)
            continue

        loop_counter += 1
        t0 = timer()
        r = 0

        for read_info, read_id, seq_len, results in mapper.map_reads_2(
                caller.basecall_minknow(
                    reads=client.get_read_chunks(batch_size=batch_size, last=True),
                    signal_dtype=client.signal_dtype,
                    prev_signal=previous_signal,
                    decided_reads=decided_reads,
                )):
            r += 1
            read_start_time = timer()
            channel, read_number = read_info
            if read_number not in tracker[channel]:
                tracker[channel].clear()
            tracker[channel][read_number] += 1

            mode = ""
            exceeded_threshold = False
            below_threshold = False

            log_decision = lambda: cl.debug(
                l_string.format(
                    loop_counter,
                    r,
                    read_id,
                    channel,
                    read_number,
                    seq_len,
                    tracker[channel][read_number],
                    mode,
                    getattr(conditions[run_info[channel]], mode, mode),
                    conditions[run_info[channel]].name,
                    below_threshold,
                    exceeded_threshold,
                    read_start_time,
                    timer(),
                    time.time(),
                ))

            # Control channels
            if conditions[run_info[channel]].control:
                mode = "control"
                log_decision()
                client.stop_receiving_read(channel, read_number)
                continue

            # This is an analysis channel
            # Below minimum chunks
            if tracker[channel][read_number] <= conditions[run_info[channel]].min_chunks:
                below_threshold = True

            # Greater than or equal to maximum chunks
            if tracker[channel][read_number] >= conditions[run_info[channel]].max_chunks:
                exceeded_threshold = True

            # No mappings
            if not results:
                mode = "no_map"

            hits = set()
            for result in results:
                pf.debug("{}\t{}\t{}".format(read_id, seq_len, result))
                hits.add(result.ctg)

            if hits & conditions[run_info[channel]].targets:
                # Mappings and targets overlap
                coord_match = any(
                    between(r.r_st, c)
                    for r in results
                    for c in conditions[run_info[channel]].coords.get(
                        strand_converter.get(r.strand), {}).get(r.ctg, []))
                if len(hits) == 1:
                    if coord_match:
                        # Single match that is within coordinate range
                        mode = "single_on"
                    else:
                        # Single match to a target outside coordinate range
                        mode = "single_off"
                elif len(hits) > 1:
                    if coord_match:
                        # Multiple matches with at least one in the correct region
                        mode = "multi_on"
                    else:
                        # Multiple matches to targets outside the coordinate range
                        mode = "multi_off"
            else:
                # No matches in mappings
                if len(hits) > 1:
                    # More than one, off-target, mapping
                    mode = "multi_off"
                elif len(hits) == 1:
                    # Single off-target mapping
                    mode = "single_off"

            # This is where we make our decision:
            # Get the associated action for this condition
            decision_str = getattr(conditions[run_info[channel]], mode)
            # decision is an alias for the functions "unblock" or "stop_receiving"
            decision = decision_dict[decision_str]

            # If max_chunks has been exceeded AND we don't want to keep sequencing we unblock
            if exceeded_threshold and decision_str != "stop_receiving":
                mode = "exceeded_max_chunks_unblocked"
                client.unblock_read(channel, read_number, unblock_duration, read_id)

            # TODO: WHAT IS GOING ON?!
            # I think that this needs to change between enrichment and depletion
            # If under min_chunks AND any mapping mode seen we unblock
            # if below_threshold and mode in {"single_off", "multi_off"}:
            if below_threshold and mode in {
                    "single_on",
                    "single_off",
                    "multi_on",
                    "multi_off",
            }:
                mode = "below_min_chunks_unblocked"
                client.unblock_read(channel, read_number, unblock_duration, read_id)

            # proceed returns None, so we send no decision; otherwise unblock or stop_receiving
            elif decision is not None:
                decided_reads[channel] = read_id
                decision(channel, read_number)

            log_decision()

        t1 = timer()
        if r > 0:
            s1 = "{}R/{:.5f}s"
            logger.info(s1.format(r, t1 - t0))
        # limit the rate at which we make requests
        if t0 + throttle > t1:
            time.sleep(throttle + t0 - t1)
    else:
        send_message(client.connection, "Read Until Client Stopped.", Severity.WARN)
        caller.disconnect()
        logger.info("Finished analysis of reads as client stopped.")
def main(): extra_args = ( ( "--toml", dict( metavar="TOML", required=True, help="TOML file specifying experimental parameters", ), ), ("--paf-log", dict( help="PAF log", default="paflog.log", )), ("--chunk-log", dict( help="Chunk log", default="chunk_log.log", )), ) parser, args = get_parser(extra_args=extra_args, file=__file__) # set up logging to file for DEBUG messages and above logging.basicConfig( level=logging.DEBUG, format="%(asctime)s %(name)s %(message)s", filename=args.log_file, filemode="w", ) # define a Handler that writes INFO messages or higher to the sys.stderr console = logging.StreamHandler() console.setLevel(logging.INFO) # set a format which is simpler for console use formatter = logging.Formatter(args.log_format) console.setFormatter(formatter) # add the handler to the root logger logging.getLogger("").addHandler(console) # Start by logging sys.argv and the parameters used logger = logging.getLogger("Manager") logger.info(" ".join(sys.argv)) print_args(args, logger=logger) # Setup chunk and paf logs chunk_logger = setup_logger("DEC", log_file=args.chunk_log) paf_logger = setup_logger("PAF", log_file=args.paf_log) # Parse configuration TOML # TODO: num_channels is not configurable here, should be inferred from client run_info, conditions, reference, caller_kwargs = get_run_info( args.toml, num_channels=512) live_toml = Path("{}_live".format(args.toml)) # Load Minimap2 index logger.info("Initialising minimap2 mapper") mapper = CustomMapper(reference) logger.info("Mapper initialised") read_until_client = read_until.ReadUntilClient( mk_host=args.host, mk_port=args.port, device=args.device, # one_chunk=args.one_chunk, filter_strands=True, # TODO: test cache_type by passing a function here cache_type=args.read_cache, cache_size=args.cache_size, ) send_message( read_until_client.connection, "Read Until is controlling sequencing on this device. You use it at your own risk.", Severity.WARN, ) for message, sev in describe_experiment(conditions, mapper): logger.info(message) send_message( read_until_client.connection, message, sev, ) """ This experiment has N regions on the flowcell. using reference: /path/to/ref.mmi Region i:NAME (control=bool) has X targets of which Y are found in the reference. reads will be unblocked when [u,v], sequenced when [w,x] and polled for more data when [y,z]. """ # FIXME: currently flowcell size is not included, this should be pulled from # the read_until_client analysis_worker = functools.partial( simple_analysis, read_until_client, unblock_duration=args.unblock_duration, throttle=args.throttle, batch_size=args.batch_size, cl=chunk_logger, pf=paf_logger, live_toml_path=live_toml, dry_run=args.dry_run, run_info=run_info, conditions=conditions, mapper=mapper, caller_kwargs=caller_kwargs, ) results = run_workflow( read_until_client, analysis_worker, args.workers, args.run_time, runner_kwargs={ # "min_chunk_size": args.min_chunk_size, "first_channel": min(args.channels), "last_channel": max(args.channels), }, ) # No results returned send_message( read_until_client.connection, "Read Until is disconnected from this device. Sequencing will proceed normally.", Severity.WARN, )
def main(): extra_args = ( ( "--toml", dict( metavar="TOML", required=True, help="TOML file specifying experimental parameters", ), ), ("--paf-log", dict( help="PAF log", default="paflog.log", )), ("--chunk-log", dict( help="Chunk log", default="chunk_log.log", )), ) parser, args = get_parser(extra_args=extra_args, file=__file__) # TODO: Move logging config to separate configuration file # set up logging to file for DEBUG messages and above logging.basicConfig( level=logging.DEBUG, format="%(asctime)s %(name)s %(message)s", filename=args.log_file, filemode="w", ) # define a Handler that writes INFO messages or higher to the sys.stderr console = logging.StreamHandler() console.setLevel(logging.INFO) # set a format which is simpler for console use formatter = logging.Formatter(args.log_format) console.setFormatter(formatter) # add the handler to the root logger logging.getLogger("").addHandler(console) # Start by logging sys.argv and the parameters used logger = logging.getLogger("Manager") logger.info(" ".join(sys.argv)) print_args(args, logger=logger) read_until_client = read_until.ReadUntilClient( mk_host=args.host, mk_port=args.port, device=args.device, # one_chunk=args.one_chunk, filter_strands=True, # TODO: test cache_type by passing a function here cache_type=args.read_cache, cache_size=args.cache_size, ) send_message( read_until_client.connection, "Read Until is controlling sequencing on this device. You use it at your own risk.", Severity.WARN, ) # FIXME: currently flowcell size is not included, this should be pulled from # the read_until_client analysis_worker = functools.partial( simple_analysis, read_until_client, unblock_duration=args.unblock_duration, throttle=args.throttle, batch_size=args.batch_size, chunk_log=args.chunk_log, paf_log=args.paf_log, toml_path=args.toml, dry_run=args.dry_run, ) results = run_workflow( read_until_client, analysis_worker, args.workers, args.run_time, runner_kwargs={ # "min_chunk_size": args.min_chunk_size, "first_channel": min(args.channels), "last_channel": max(args.channels), }, ) # No results returned send_message( read_until_client.connection, "Read Until is disconnected from this device. Sequencing will proceed normally.", Severity.WARN, )