Example #1
    def processfiles(self):
        self.logger.info("Process Files Inititated")
        self.counter = 0
        self.targets = []
        self.masterdf = pd.DataFrame(columns=['seqid', 'position', 'coverage'])

        while self.running:
            currenttime = time.time()
            #for fastqfile, createtime in tqdm(sorted(self.creates.items(), key=lambda x: x[1])):
            fastqfilelist = list()
            for fastqfile, createtime in sorted(self.creates.items(),
                                                key=lambda x: x[1]):

                delaytime = 0

                # only process files at least `delaytime` seconds old, so they should be complete;
                # for simulations the delay can be made longer.
                if int(createtime) + delaytime < time.time():
                    self.logger.info(fastqfile)
                    del self.creates[fastqfile]
                    self.counter += 1
                    fastqfilelist.append(fastqfile)

                    #print (fastqfile,md5Checksum(fastqfile), "\n\n\n\n")
            targets, self.masterdf = parse_fastq_file(fastqfilelist, self.args,
                                                      logging, self.masterdf)
            print(targets)
            print(self.targets)
            if len(targets) > len(self.targets):
                updated_targets = set(targets) - set(self.targets)
                update_message = "Updating targets with {}".format(
                    nice_join(updated_targets, conjunction="and"))
                self.logger.info(update_message)
                if not self.args.simulation:
                    send_message(self.connection, update_message,
                                 Severity.WARN)
                write_new_toml(self.args, targets)
                self.targets = []
                self.targets = targets.copy()

            if self.masterdf.shape[0] > 0 and self.masterdf.shape[0] == len(
                    self.targets):
                # Every target is covered at the desired coverage level.
                self.logger.info(
                    "Every target is covered to at least {}x".format(
                        self.args.depth))
                if not self.args.simulation:
                    self.connection.protocol.stop_protocol()
                    send_message(
                        self.connection,
                        "Iter Align has stopped the run as all targets should be covered by at least {}x"
                        .format(self.args.depth),
                        Severity.WARN,
                    )

            #parse_fastq_file(fastqfile, self.rundict, self.fastqdict, self.args, self.header, self.MinotourConnection)

            #self.args.files_processed += 1

            if currenttime + 5 > time.time():
                time.sleep(5)
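
The loop above drains self.creates, a dict of {fastq path: creation time} that a watchdog handler (the FastqHandler referenced elsewhere, not shown in this example) fills in as files appear. A minimal sketch of such a handler, assuming the watchdog package is installed; the class name and file extensions are illustrative only:

import time
from watchdog.events import FileSystemEventHandler

class MinimalFastqHandler(FileSystemEventHandler):
    """Hypothetical stand-in for FastqHandler: record when each fastq file appears."""

    def __init__(self):
        self.creates = {}   # {path: creation time}, consumed by processfiles
        self.running = True

    def on_created(self, event):
        # watchdog calls this for every new file below the watched path
        if not event.is_directory and event.src_path.endswith((".fastq", ".fastq.gz", ".fq")):
            self.creates[event.src_path] = time.time()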
Example #2
def simple_analysis(client,
                    batch_size=512,
                    throttle=0.1,
                    unblock_duration=0.1):
    """Analysis function

    Parameters
    ----------
    client : read_until.ReadUntilClient
        An instance of the ReadUntilClient object
    batch_size : int
        The number of reads to be retrieved from the ReadUntilClient at a time
    throttle : int or float
        The interval, in seconds, between requests to the ReadUntilClient
    unblock_duration : int or float
        Time, in seconds, to apply unblock voltage

    Returns
    -------
    None
    """
    logger = logging.getLogger(__name__)
    send_message(
        client.connection,
        "Read Until sending Unblock All Messages. All reads will be prematurely truncated. This will affect a live sequencing run.",
        Severity.WARN)
    while client.is_running:

        r = 0
        t0 = timer()

        for r, (channel, read) in enumerate(
                client.get_read_chunks(
                    batch_size=batch_size,
                    last=True,
                ),
                start=1,
        ):
            # pass
            client.unblock_read(channel,
                                read.number,
                                read_id=read.id,
                                duration=unblock_duration)
            client.stop_receiving_read(channel, read.number)

        t1 = timer()
        if r:
            logger.info("Took {:.6f} for {} reads".format(t1 - t0, r))
        # limit the rate at which we make requests
        if t0 + throttle > t1:
            time.sleep(throttle + t0 - t1)
    else:
        send_message(client.connection, "Read Until Unblock All Disconnected.",
                     Severity.WARN)
        logger.info("Finished analysis of reads as client stopped.")
Example #3
def run(parser, args):
    args.tomlfile = args.toml
    args.toml = toml.load(args.toml)
    print(args)

    # TODO: Move logging config to separate configuration file
    # set up logging to file
    logging.basicConfig(
        level=logging.DEBUG,
        format='%(levelname)s::%(asctime)s::%(name)s::%(message)s',
        filename=args.log_file,
        filemode='w')

    # define a Handler that writes INFO messages or higher to the sys.stderr
    console = logging.StreamHandler()
    console.setLevel(logging.INFO)

    # set a format which is simpler for console use
    formatter = logging.Formatter('%(name)-15s: %(levelname)-8s %(message)s')
    console.setFormatter(formatter)

    # add the handler to the root logger
    logging.getLogger('').addHandler(console)

    # Start by logging sys.argv and the parameters used
    logger = logging.getLogger("Manager")
    logger.info(" ".join(sys.argv))
    print_args(args, logger=logger)

    logger.info("Initialising iterAlign.")

    logger.info("Setting up FastQ monitoring.")

    #### Check if a run is active - if not, wait.

    args.simulation = True
    connection = None
    if args.watch is None:
        args.simulation = False
        logger.info("Creating rpc connection for device {}.".format(
            args.device))
        try:
            connection, messageport = get_rpc_connection(args.device)
        except ValueError as e:
            print(e)
            sys.exit(1)

        send_message(connection, "Iteralign Connected to MinKNOW",
                     Severity.WARN)

        logger.info("Loaded RPC")
        while parse_message(connection.acquisition.current_status()
                            )['status'] != "PROCESSING":
            time.sleep(1)
        #### Check if we know where data is being written to; if not, wait.
        args.watch = parse_message(connection.acquisition.get_acquisition_info(
        ))['config_summary']['reads_directory']

    else:
        messageport = ""

    event_handler = FastqHandler(args, logging, messageport, connection)
    # This block handles the fastq
    observer = Observer()
    observer.schedule(event_handler, path=args.watch, recursive=True)
    observer.daemon = True

    try:

        observer.start()
        logger.info("FastQ Monitoring Running.")
        while 1:
            time.sleep(1)

    except KeyboardInterrupt:

        logger.info("Exiting - Will take a few seconds to clean up!")

        observer.stop()
        observer.join()

        os._exit(0)
def run(parser, args):
    # new code block: change the reference path within the args.toml file into the args.mindex path
    d = toml.load(args.toml)

    print(d["conditions"]["reference"])
    args.tomlfile = args.toml
    args.toml = toml.load(args.toml)
    print(args)

    # TODO: Move logging config to separate configuration file
    # set up logging to file
    logging.basicConfig(level=logging.DEBUG,
                        format='%(levelname)s::%(asctime)s::%(name)s::%(message)s',
                        filename=args.log_file,
                        filemode='w')

    # define a Handler that writes INFO messages or higher to the sys.stderr
    console = logging.StreamHandler()
    console.setLevel(logging.INFO)

    # set a format which is simpler for console use
    formatter = logging.Formatter('%(name)-15s: %(levelname)-8s %(message)s')
    console.setFormatter(formatter)

    # add the handler to the root logger
    logging.getLogger('').addHandler(console)

    # Start by logging sys.argv and the parameters used
    logger = logging.getLogger("Manager")
    logger.info(" ".join(sys.argv))
    print_args(args, logger=logger)

    logger.info("Initialising iterAlign.")

    logger.info("Setting up FastQ monitoring.")

    #### Check if a run is active - if not, wait.

    args.simulation = True
    connection = None

    #set default message severity level.
    severity = 2

    if args.watch is None:
        args.simulation = False
        logger.info("Creating rpc connection for device {}.".format(args.device))
        try:
            connection, messageport = get_rpc_connection(args.device)
        except ValueError as e:
            print(e)
            sys.exit(1)

        #send_message_port("Iteralign Connected to MinKNOW", args.host, messageport)
        send_message(connection, "Iteralign Connected to MinKNOW.", Severity.WARN)

        logger.info("Loaded RPC")
        while parse_message(connection.acquisition.current_status())['status'] != "PROCESSING":
            time.sleep(1)
        ### Check if we know where data is being written to; if not, wait.
        args.watch = parse_message(connection.acquisition.get_acquisition_info())['config_summary'][
            'reads_directory']

    else:
        messageport = ""

    event_handler = FastqHandler(args, logging, messageport, connection)
    # This block handles the fastq
    observer = Observer()
    observer.schedule(event_handler, path=args.watch, recursive=True)
    observer.daemon = True

    try:

        observer.start()
        logger.info("FastQ Monitoring Running.")
        while 1:
            time.sleep(1)

    except KeyboardInterrupt:

        logger.info("Exiting - Will take a few seconds to clean up!")

        observer.stop()
        observer.join()

        if args.keepfiles:
            logging.info("The 'keepfiles' argument was set, files generated by classifier have been retained")
        else:
            if os.path.isdir(args.path):
                for path, dirs, files in os.walk(args.path):
                    for f in files:
                        if f.startswith(args.prefix):
                            # os.walk yields names relative to `path`, so join before unlinking
                            os.unlink(os.path.join(path, f))
                            logging.info("file removed: {}".format(f))

            if os.path.isdir("./"):
                for path, dirs, files in os.walk("./"):
                    for f in files:
                        if f.endswith(args.creport):
                            os.unlink(os.path.join(path, f))
                            logging.info("file removed: {}".format(f))

            logging.info("All files generated by classifier have been removed.")

        os._exit(0)
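
Both run() variants only ever read the conditions.reference key from the TOML file, and write_new_toml/generate_mmi later rewrite that path. A minimal sketch of a configuration containing just that key; the real file used by iteralign may carry additional sections not shown in these examples:

import toml

minimal_toml = toml.loads("""
[conditions]
reference = "/path/to/reference.mmi"
""")
print(minimal_toml["conditions"]["reference"])   # -> /path/to/reference.mmi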
    def processfiles(self):
        self.logger.info("Process Files Inititated")
        self.counter = 1
        self.targets = []
        self.masterdf = pd.DataFrame(columns=['seqid', 'position', 'coverage'])
        self.taxid_entries = 0
        self.downloaded_set = set()
        self.length_dict = {}
        self.coverage_sum = {}

        if self.args.references:
            logging.info("References argument provided. Will download references genomes.")
            self.downloaded_set = set(self.args.references)
            logging.info(self.downloaded_set)
            self.url_list = url_list_generation(self.args, self.args.references)
            self.length_dict.update(download_references(self.args, self.url_list, self.downloaded_set))
            generate_mmi(self.args, self.counter)


        while self.running:
            currenttime = time.time()
            # for fastqfile, createtime in tqdm(sorted(self.creates.items(), key=lambda x: x[1])):
            fastqfilelist = list()
            for fastqfile, createtime in sorted(self.creates.items(), key=lambda x: x[1]):

                delaytime = 0

                # only process files at least `delaytime` seconds old, so they should be complete;
                # for simulations the delay can be made longer.
                if int(createtime) + delaytime < time.time():
                    self.logger.info(fastqfile)
                    del self.creates[fastqfile]
                    self.counter += 1
                    fastqfilelist.append(fastqfile)

                    # print (fastqfile,md5Checksum(fastqfile), "\n\n\n\n")
            # as long as there are files within the args.watch directory to parse
            if fastqfilelist:
                print(self.downloaded_set)
                targets, self.downloaded_set, self.taxid_entries, self.coverage_sum = parse_fastq_file(fastqfilelist, self.args, logging, self.length_dict, self.downloaded_set, self.taxid_entries, self.coverage_sum, self.connection)
                print(targets)
                print(self.targets)

                if len(targets) > len(self.targets):
                    updated_targets = set(targets) - set(self.targets)
                    update_message = "Updating targets with {}".format(nice_join(updated_targets, conjunction="and"))
                    self.logger.info(update_message)
                    if not self.args.simulation:
                        #send_message_port(update_message, self.args.host, self.messageport)
                        send_message(self.connection, update_message, Severity.WARN)
                    write_new_toml(self.args, targets)
                    self.targets = []
                    self.targets = targets.copy()

                if self.masterdf.shape[0] > 0 and self.masterdf.shape[0] == len(self.targets):
                    # Every target is covered at the desired coverage level.
                    self.logger.info("Every target is covered at at least {}x".format(self.args.depth))
                    if not self.args.simulation:
                        self.connection.protocol.stop_protocol()
                        #send_message_port(
                        #    "Iter Align has stopped the run as all targets should be covered by at least {}x".format(
                        #        self.args.depth), self.args.host, self.messageport)
                        send_message(self.connection, "Iter Align has stopped the run as all targets should be covered by at least {}x".format(
                                self.args.depth), Severity.WARN)

                # parse_fastq_file(fastqfile, self.rundict, self.fastqdict, self.args, self.header, self.MinotourConnection)

                # self.args.files_processed += 1

            # throttle the loop even when no new fastq files were found
            if currenttime + 5 > time.time():
                time.sleep(5)
def parse_fastq_file(fastqfileList, args, logging, length_dict, taxID_set, counter, coverage_sum, connection):
    logger = logging.getLogger("ParseFastq")
    logger.info(fastqfileList)
    logger.info(args.toml['conditions']['reference'])
    with open(os.devnull, 'w') as devnull:

        # join 'fastqfileList' into the comma-separated string of fastq files that centrifuge expects
        fastq_str = ",".join(fastqfileList)

        # centrifuge command to classify reads in the fastq files found by watchdog
        centrifuge_cmd = "centrifuge -p {} -x {} -q {}".format(args.threads,
                                                               args.cindex,
                                                               fastq_str
                                                               )

        # show the centrifuge command in the terminal
        logging.info(centrifuge_cmd)

        # start time of centrifuge to track the time centrifuge requires to classify reads
        centrifuge_start_time = time.time()

        # subprocess for 'centrifuge_cmd'
        proc = subprocess.Popen(
            centrifuge_cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            stdin=subprocess.PIPE,
            shell=True,
            # Aliased by `text=True` in 3.7
            universal_newlines=True,
        )
        out, err = proc.communicate()
        proc.stdout.close()

        # confirm that the centrifuge pipeline has finished and show the time of execution
        logging.info("Post centrifuge run: {} seconds".format(time.time()-centrifuge_start_time))

        output_fields = ["readID", "seqID", "taxID", "hitLength", "queryLength", "numMatches"]

        # create the DataFrame from the centrifuge output using 'output_fields' as the column headers
        out_df = pd.read_csv(StringIO(out),
                             sep="\t",
                             usecols=output_fields
                             )

        # create a dataframe from the args.creport 'centrifuge_report.tsv'
        report_df = pd.read_csv(args.creport,
                                sep="\t",
                                usecols=["name", "genomeSize", "taxID"],
                                )

        # merge both dataframes together
        new_df = pd.merge(out_df, report_df, on="taxID")

        # only reads that uniquely align to a single species are used; reads with numMatches != 1 are filtered out
        reject_df = new_df[new_df.numMatches != 1].copy()
        reject_df["reason"] = "Multiple reference genome alignments"

        # genomeSize == 0 implies a classification above the species taxon, so these reads are also removed
        intermediate_df = new_df[new_df.genomeSize == 0]
        intermediate_df = intermediate_df[intermediate_df.numMatches == 1].copy()
        intermediate_df["reason"] = "Read aligns to non-species taxon"

        # log the reads that were removed into a separate file
        reject_df = reject_df.append(intermediate_df)
        if os.path.isfile(args.path + args.prefix + args.reject):
            reject_df.to_csv(args.path + args.prefix + args.reject,
                             sep="\t",
                             mode="a",
                             index=False,
                             header=None,
                             )
        else:
            reject_df.to_csv(args.path + args.prefix + args.reject,
                             sep="\t",
                             mode="a",
                             index=False,
                             header=True,
                             )

        # reads that uniquely align to one species are logged into a file
        new_df = new_df[new_df.numMatches == 1]
        if os.path.isfile(args.path + args.prefix + args.readfile):
            new_df.to_csv(args.path + args.prefix + args.readfile,
                          sep="\t",
                          mode="a",
                          index=None,
                          header=None,
                          )
        else:
            new_df.to_csv(args.path + args.prefix + args.readfile,
                          sep="\t",
                          mode="a",
                          index=None,
                          header=True,
                          )
        logging.info("reject file made")

        # read in all valid reads back into memory from all current iterations
        all_reads_df = pd.read_csv(args.path + args.prefix + args.readfile,
                                   sep="\t"
                                   )

        # count the number of reads classified to each taxID found within 'all_reads_df' and keep only the taxIDs with a count above args.threshold
        taxid_count = all_reads_df.groupby("taxID").agg({"taxID": "count"})
        taxid_result_set = set(taxid_count[taxid_count["taxID"].ge(args.threshold)].index.values.tolist())
        # call in the taxIDs that already have a reference genome in the mmi
        downloaded_set = taxID_set

        # if taxIDs were found above args.threshold
        if taxid_result_set:
            logging.info("taxIDs found above specified threshold")
            # locate new taxIDs that were not found in previous iterations
            difference_set = taxid_result_set - downloaded_set
            downloaded_set |= taxid_result_set

            # if novel taxIDs were found
            if difference_set:
                tidfile_path = args.path + args.prefix + args.tidfile
                # load taxids recorded in previous iterations (if any) before rewriting the file
                if os.path.isfile(tidfile_path):
                    d = toml.load(tidfile_path)
                    d.setdefault("taxid", {})["iteration.{}".format(counter)] = difference_set
                else:
                    d = {"taxid": {"iteration.{}".format(counter): difference_set}}
                with open(tidfile_path, "w") as f:
                    toml.dump(d, f)

                logging.info("new taxids found: {}".format(difference_set))
                logging.info("Downloading reference genomes")

                url_list = url_list_generation(args, difference_set)

                # download the reference genomes into a single file

                length_dict.update(download_references(args, url_list, difference_set))

                logging.info("Generating mmi")

                generate_mmi(args, counter)

                update_message = "Updated the minimap MMI to {}".format(args.toml['conditions']['reference'])
                logging.info(update_message)
                if not args.simulation:
                    #send_message_port(update_message, args.host, messageport)
                    send_message(connection, update_message, Severity.WARN)


            else:
                # show in terminal that no new taxIDs were found
                logging.info("No new taxIDs were identified")
        else:
            logging.info("No taxIDs found above threshold.")
        # all of the new code has been parsed, the rest follows standard 'iteralign.py' script
        logging.info("new code parsing completed\n\ncommencing iteralign")

        minimapcmd = ["minimap2","-ax","map-ont","-t {}".format(args.threads),args.toml['conditions']['reference']] #" ".join(fastqfilelist)]
        minimapcmd.extend(fastqfileList)
        logging.info(" ".join(minimapcmd))
        minimapoutput = subprocess.Popen(minimapcmd, stdout=subprocess.PIPE,stderr=devnull)

        samcmd = ["samtools","view", "-bS"]
        samoutput = subprocess.Popen(samcmd, stdin=minimapoutput.stdout, stdout=subprocess.PIPE, stderr=devnull)
        #samsortcmd = ["samtools", "sort", "-@2", "-o", "sortedbam.bam"]
        samsortcmd = ["samtools", "sort", "-@{}".format(args.threads)]
        samsortoutput = subprocess.Popen(samsortcmd, stdin=samoutput.stdout, stdout=subprocess.PIPE, stderr=devnull)
        samdepthcmd = ["samtools", "depth", "/dev/stdin"]
        samdepthoutput = subprocess.Popen(samdepthcmd, stdin=samsortoutput.stdout,stdout=subprocess.PIPE, stderr=devnull, universal_newlines=True)
        minimapoutput.stdout.close()
        samoutput.stdout.close()
        samsortoutput.stdout.close()

        iter_depth = (l.strip().split("\t") for l in samdepthoutput.stdout)
        parse_iter = ((x[0], int(x[-1])) for x in iter_depth)

        d = defaultdict(int)
        d.update(coverage_sum)
        for name, depth in parse_iter:
            d[name] += depth

        depth_dict = {k: d[k]/length_dict[k] for k in length_dict.keys() & d}

        with open(args.path + args.prefix + args.coveragefile, "a") as fh:
            for k, v in depth_dict.items():
                fh.write("{}\t{}\t{}\n".format(counter, k, v))

        targets = [k for k, v in depth_dict.items() if v > args.depth]

        logging.info(targets)

        counter += 1

        logging.info("Finished processing {}.".format(" ".join(fastqfileList)))

    return targets, downloaded_set, counter, d
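
A tiny worked example of the coverage arithmetic at the end of parse_fastq_file: per-position depths from samtools depth are summed per reference and divided by that reference's length to give mean coverage, which is then compared against args.depth. The numbers below are made up for illustration:

from collections import defaultdict

rows = [("NC_000913.3", 10), ("NC_000913.3", 12), ("NC_000913.3", 8)]   # (seqid, depth) rows from samtools depth
length_dict = {"NC_000913.3": 3}                                        # toy reference length

d = defaultdict(int)
for name, depth in rows:
    d[name] += depth                                                    # sum of per-position depths

depth_dict = {k: d[k] / length_dict[k] for k in length_dict.keys() & d}
print(depth_dict)                                                       # {'NC_000913.3': 10.0}, i.e. mean coverage 10x

targets = [k for k, v in depth_dict.items() if v > 5]                   # 5 plays the role of args.depth here
print(targets)                                                          # ['NC_000913.3']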
Example #7
def simple_analysis(
    client,
    batch_size=512,
    throttle=0.1,
    unblock_duration=0.5,
    cl=None,
    pf=None,
    live_toml_path=None,
    flowcell_size=512,
    dry_run=False,
    run_info=None,
    conditions=None,
    mapper=None,
    caller_kwargs=None,
):
    """Analysis function

    Parameters
    ----------
    client : read_until.ReadUntilClient
        An instance of the ReadUntilClient object
    batch_size : int
        The number of reads to be retrieved from the ReadUntilClient at a time
    throttle : int or float
        The interval, in seconds, between requests to the ReadUntilClient
    unblock_duration : int or float
        Time, in seconds, to apply unblock voltage
    cl : logging.Logger
        Logger to which chunk data is written
    pf : logging.Logger
        Logger to which alignments are written
    live_toml_path : str
        Path to a `live` TOML configuration file for read until. If this exists when
        the run starts it will be deleted
    flowcell_size : int
        The number of channels on the flowcell, 512 for MinION and 3000 for PromethION
    dry_run : bool
        If True unblocks are replaced with `stop_receiving` commands
    run_info : dict
        Dictionary of {channel: index} where index corresponds to an index in `conditions`
    conditions : list
        Experimental conditions as List of namedtuples.
    mapper : mappy.Aligner
    caller_kwargs : dict

    Returns
    -------
    None
    """
    # Init logger for this function
    logger = logging.getLogger(__name__)

    # Delete live TOML file if it exists
    live_toml_path = Path(live_toml_path)
    if live_toml_path.is_file():
        live_toml_path.unlink()

    # TODO: test this
    # Write channels.toml
    d = {
        "conditions": {
            str(v): {
                "channels": [],
                "name": conditions[v].name
            }
            for k, v in run_info.items()
        }
    }
    for k, v in run_info.items():
        d["conditions"][str(v)]["channels"].append(k)

    channels_out = str(client.mk_run_dir / "channels.toml")
    with open(channels_out, "w") as fh:
        fh.write(
            "# This file is written as a record of the condition each channel is assigned.\n"
        )
        fh.write(
            "# It may be changed or overwritten if you restart Read Until.\n")
        fh.write("# In the future this file may become a CSV file.\n")
        toml.dump(d, fh)

    caller = Caller(**caller_kwargs)
    # What if there is no reference or an empty MMI

    # DefaultDict[int: collections.deque[Tuple[str, ndarray]]]
    #  tuple is (read_id, previous_signal)
    # TODO: tuple should use read_number instead
    previous_signal = defaultdict(functools.partial(deque, maxlen=1))
    # count how often a read is seen
    tracker = defaultdict(Counter)
    # most recently decided read_id per channel
    decided_reads = {}
    strand_converter = {1: "+", -1: "-"}

    read_id = ""

    # TODO: partial-ise / lambda unblock to take the unblock duration
    if dry_run:
        decision_dict = {
            "stop_receiving": client.stop_receiving_read,
            "proceed": None,
            "unblock": client.stop_receiving_read,
        }
        send_message(client.connection,
                     "This is a test run. No unblocks will occur.",
                     Severity.WARN)
    else:
        decision_dict = {
            "stop_receiving": client.stop_receiving_read,
            "proceed": None,
            "unblock": lambda c, n: client.unblock_read(c, n, unblock_duration, read_id),
        }
        send_message(client.connection,
                     "This is a live run. Unblocks will occur.", Severity.WARN)
    decision_str = ""
    below_threshold = False
    exceeded_threshold = False

    l_string = (
        "client_iteration",
        "read_in_loop",
        "read_id",
        "channel",
        "read_number",
        "seq_len",
        "counter",
        "mode",
        "decision",
        "condition",
        "min_threshold",
        "count_threshold",
        "start_analysis",
        "end_analysis",
        "timestamp",
    )
    cl.debug("\t".join(l_string))
    l_string = "\t".join(("{}" for _ in l_string))
    loop_counter = 0
    while client.is_running:
        if live_toml_path.is_file():
            # Reload the TOML config from the *_live file
            run_info, conditions, new_reference, _ = get_run_info(
                live_toml_path, flowcell_size)

            # Check the reference path if different from the loaded mapper
            if new_reference != mapper.index:
                old_reference = mapper.index
                # Log to file and MinKNOW interface
                logger.info("Reloading mapper")
                send_message(client.connection,
                             "Reloading mapper. Read Until paused.",
                             Severity.INFO)

                # Update mapper client.
                mapper = CustomMapper(new_reference)
                # Log on success
                logger.info("Reloaded mapper")

                # If we've reloaded a reference, delete the previous one
                if old_reference:
                    logger.info("Deleting old mmi {}".format(old_reference))
                    # We now delete the old mmi file.
                    Path(old_reference).unlink()
                    logger.info("Old mmi deleted.")

        # TODO: Fix the logging to just one of the two in use

        if not mapper.initialised:
            time.sleep(throttle)
            continue

        loop_counter += 1
        t0 = timer()
        r = 0

        for read_info, read_id, seq_len, results in mapper.map_reads_2(
                caller.basecall_minknow(
                    reads=client.get_read_chunks(batch_size=batch_size,
                                                 last=True),
                    signal_dtype=client.signal_dtype,
                    prev_signal=previous_signal,
                    decided_reads=decided_reads,
                )):
            r += 1
            read_start_time = timer()
            channel, read_number = read_info
            if read_number not in tracker[channel]:
                tracker[channel].clear()
            tracker[channel][read_number] += 1

            mode = ""
            exceeded_threshold = False
            below_threshold = False

            log_decision = lambda: cl.debug(
                l_string.format(
                    loop_counter,
                    r,
                    read_id,
                    channel,
                    read_number,
                    seq_len,
                    tracker[channel][read_number],
                    mode,
                    getattr(conditions[run_info[channel]], mode, mode),
                    conditions[run_info[channel]].name,
                    below_threshold,
                    exceeded_threshold,
                    read_start_time,
                    timer(),
                    time.time(),
                ))

            # Control channels
            if conditions[run_info[channel]].control:
                mode = "control"
                log_decision()
                client.stop_receiving_read(channel, read_number)
                continue

            # This is an analysis channel
            # Below minimum chunks
            if tracker[channel][read_number] <= conditions[
                    run_info[channel]].min_chunks:
                below_threshold = True

            # Greater than or equal to maximum chunks
            if tracker[channel][read_number] >= conditions[
                    run_info[channel]].max_chunks:
                exceeded_threshold = True

            # No mappings
            if not results:
                mode = "no_map"

            hits = set()
            for result in results:
                pf.debug("{}\t{}\t{}".format(read_id, seq_len, result))
                hits.add(result.ctg)

            if hits & conditions[run_info[channel]].targets:
                # Mappings and targets overlap
                coord_match = any(
                    between(r.r_st, c) for r in results
                    for c in conditions[run_info[channel]].coords.get(
                        strand_converter.get(r.strand), {}).get(r.ctg, []))
                if len(hits) == 1:
                    if coord_match:
                        # Single match that is within coordinate range
                        mode = "single_on"
                    else:
                        # Single match to a target outside coordinate range
                        mode = "single_off"
                elif len(hits) > 1:
                    if coord_match:
                        # Multiple matches with at least one in the correct region
                        mode = "multi_on"
                    else:
                        # Multiple matches to targets outside the coordinate range
                        mode = "multi_off"

            else:
                # No matches in mappings
                if len(hits) > 1:
                    # More than one, off-target, mapping
                    mode = "multi_off"
                elif len(hits) == 1:
                    # Single off-target mapping
                    mode = "single_off"

            # This is where we make our decision:
            # Get the associated action for this condition
            decision_str = getattr(conditions[run_info[channel]], mode)
            # decision is an alias for the functions "unblock" or "stop_receiving"
            decision = decision_dict[decision_str]

            # If max_chunks has been exceeded AND we don't want to keep sequencing we unblock
            if exceeded_threshold and decision_str != "stop_receiving":
                mode = "exceeded_max_chunks_unblocked"
                client.unblock_read(channel, read_number, unblock_duration,
                                    read_id)

            # TODO: WHAT IS GOING ON?!
            #  I think that this needs to change between enrichment and depletion
            # If under min_chunks AND any mapping mode seen we unblock
            # if below_threshold and mode in {"single_off", "multi_off"}:
            if below_threshold and mode in {
                    "single_on",
                    "single_off",
                    "multi_on",
                    "multi_off",
            }:
                mode = "below_min_chunks_unblocked"
                client.unblock_read(channel, read_number, unblock_duration,
                                    read_id)

            # proceed returns None, so we send no decision; otherwise unblock or stop_receiving
            elif decision is not None:
                decided_reads[channel] = read_id
                decision(channel, read_number)

            log_decision()

        t1 = timer()
        if r > 0:
            s1 = "{}R/{:.5f}s"
            logger.info(s1.format(r, t1 - t0))
        # limit the rate at which we make requests
        if t0 + throttle > t1:
            time.sleep(throttle + t0 - t1)
    else:
        send_message(client.connection, "Read Until Client Stopped.",
                     Severity.WARN)
        caller.disconnect()
        logger.info("Finished analysis of reads as client stopped.")
Example #8
def main():
    extra_args = (
        (
            "--toml",
            dict(
                metavar="TOML",
                required=True,
                help="TOML file specifying experimental parameters",
            ),
        ),
        ("--paf-log", dict(
            help="PAF log",
            default="paflog.log",
        )),
        ("--chunk-log", dict(
            help="Chunk log",
            default="chunk_log.log",
        )),
    )
    parser, args = get_parser(extra_args=extra_args, file=__file__)

    # set up logging to file for DEBUG messages and above
    logging.basicConfig(
        level=logging.DEBUG,
        format="%(asctime)s %(name)s %(message)s",
        filename=args.log_file,
        filemode="w",
    )

    # define a Handler that writes INFO messages or higher to the sys.stderr
    console = logging.StreamHandler()
    console.setLevel(logging.INFO)

    # set a format which is simpler for console use
    formatter = logging.Formatter(args.log_format)
    console.setFormatter(formatter)

    # add the handler to the root logger
    logging.getLogger("").addHandler(console)

    # Start by logging sys.argv and the parameters used
    logger = logging.getLogger("Manager")
    logger.info(" ".join(sys.argv))
    print_args(args, logger=logger)

    # Setup chunk and paf logs
    chunk_logger = setup_logger("DEC", log_file=args.chunk_log)
    paf_logger = setup_logger("PAF", log_file=args.paf_log)

    # Parse configuration TOML
    # TODO: num_channels is not configurable here, should be inferred from client
    run_info, conditions, reference, caller_kwargs = get_run_info(
        args.toml, num_channels=512)
    live_toml = Path("{}_live".format(args.toml))

    # Load Minimap2 index
    logger.info("Initialising minimap2 mapper")
    mapper = CustomMapper(reference)
    logger.info("Mapper initialised")

    read_until_client = read_until.ReadUntilClient(
        mk_host=args.host,
        mk_port=args.port,
        device=args.device,
        # one_chunk=args.one_chunk,
        filter_strands=True,
        # TODO: test cache_type by passing a function here
        cache_type=args.read_cache,
        cache_size=args.cache_size,
    )

    send_message(
        read_until_client.connection,
        "Read Until is controlling sequencing on this device. You use it at your own risk.",
        Severity.WARN,
    )

    for message, sev in describe_experiment(conditions, mapper):
        logger.info(message)

        send_message(
            read_until_client.connection,
            message,
            sev,
        )
    """
    This experiment has N regions on the flowcell.

    using reference: /path/to/ref.mmi

    Region i:NAME (control=bool) has X targets of which Y are found in the reference.
    reads will be unblocked when [u,v], sequenced when [w,x] and polled for more data when [y,z].
    """

    # FIXME: currently flowcell size is not included, this should be pulled from
    #  the read_until_client
    analysis_worker = functools.partial(
        simple_analysis,
        read_until_client,
        unblock_duration=args.unblock_duration,
        throttle=args.throttle,
        batch_size=args.batch_size,
        cl=chunk_logger,
        pf=paf_logger,
        live_toml_path=live_toml,
        dry_run=args.dry_run,
        run_info=run_info,
        conditions=conditions,
        mapper=mapper,
        caller_kwargs=caller_kwargs,
    )

    results = run_workflow(
        read_until_client,
        analysis_worker,
        args.workers,
        args.run_time,
        runner_kwargs={
            # "min_chunk_size": args.min_chunk_size,
            "first_channel": min(args.channels),
            "last_channel": max(args.channels),
        },
    )

    # No results returned
    send_message(
        read_until_client.connection,
        "Read Until is disconnected from this device. Sequencing will proceed normally.",
        Severity.WARN,
    )
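
get_parser() is not shown in these examples; each entry in extra_args pairs a flag with its argparse keyword arguments. A minimal sketch, assuming get_parser simply forwards each pair to ArgumentParser.add_argument on top of its own common flags:

import argparse

extra_args = (
    ("--toml", dict(metavar="TOML", required=True,
                    help="TOML file specifying experimental parameters")),
    ("--paf-log", dict(help="PAF log", default="paflog.log")),
    ("--chunk-log", dict(help="Chunk log", default="chunk_log.log")),
)

parser = argparse.ArgumentParser()
for flag, kwargs in extra_args:
    parser.add_argument(flag, **kwargs)

args = parser.parse_args(["--toml", "experiment.toml"])
print(args.toml, args.paf_log, args.chunk_log)   # experiment.toml paflog.log chunk_log.log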
Example #9
File: ru_gen.py Project: svennd/ru
def main():
    extra_args = (
        (
            "--toml",
            dict(
                metavar="TOML",
                required=True,
                help="TOML file specifying experimental parameters",
            ),
        ),
        ("--paf-log", dict(
            help="PAF log",
            default="paflog.log",
        )),
        ("--chunk-log", dict(
            help="Chunk log",
            default="chunk_log.log",
        )),
    )
    parser, args = get_parser(extra_args=extra_args, file=__file__)

    # TODO: Move logging config to separate configuration file
    # set up logging to file for DEBUG messages and above
    logging.basicConfig(
        level=logging.DEBUG,
        format="%(asctime)s %(name)s %(message)s",
        filename=args.log_file,
        filemode="w",
    )

    # define a Handler that writes INFO messages or higher to the sys.stderr
    console = logging.StreamHandler()
    console.setLevel(logging.INFO)

    # set a format which is simpler for console use
    formatter = logging.Formatter(args.log_format)
    console.setFormatter(formatter)

    # add the handler to the root logger
    logging.getLogger("").addHandler(console)

    # Start by logging sys.argv and the parameters used
    logger = logging.getLogger("Manager")
    logger.info(" ".join(sys.argv))
    print_args(args, logger=logger)

    read_until_client = read_until.ReadUntilClient(
        mk_host=args.host,
        mk_port=args.port,
        device=args.device,
        # one_chunk=args.one_chunk,
        filter_strands=True,
        # TODO: test cache_type by passing a function here
        cache_type=args.read_cache,
        cache_size=args.cache_size,
    )

    send_message(
        read_until_client.connection,
        "Read Until is controlling sequencing on this device. You use it at your own risk.",
        Severity.WARN,
    )

    # FIXME: currently flowcell size is not included, this should be pulled from
    #  the read_until_client
    analysis_worker = functools.partial(
        simple_analysis,
        read_until_client,
        unblock_duration=args.unblock_duration,
        throttle=args.throttle,
        batch_size=args.batch_size,
        chunk_log=args.chunk_log,
        paf_log=args.paf_log,
        toml_path=args.toml,
        dry_run=args.dry_run,
    )

    results = run_workflow(
        read_until_client,
        analysis_worker,
        args.workers,
        args.run_time,
        runner_kwargs={
            # "min_chunk_size": args.min_chunk_size,
            "first_channel": min(args.channels),
            "last_channel": max(args.channels),
        },
    )

    # No results returned
    send_message(
        read_until_client.connection,
        "Read Until is disconnected from this device. Sequencing will proceed normally.",
        Severity.WARN,
    )