Example #1
    def execute(self, args):
        # Iterate samples (pairs or singles, depending on pairing mode)
        for sample, name in iter_fastx_inputs(args.inputs,
                                              args.pairing_mode,
                                              names=args.names):
            self.status.start_sample(name, sample)
            # chunked_process yields a progress tuple at each interval
            for n_seqs, stream_time, n_skipped in self.processor.chunked_process(
                    *sample):
                self.status.update(n_seqs)
            self.status.finish_sample()
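The `self.status` object above only needs a small reporting interface. A minimal sketch of a compatible reporter (hypothetical; goetia's real status display is richer) might look like:

    class MinimalStatus:
        # Hypothetical stand-in for the status reporter used in Example #1
        def start_sample(self, name, files):
            print(f'Starting sample {name}: {files}')

        def update(self, n_seqs):
            print(f'  processed {n_seqs:,} sequences')

        def finish_sample(self):
            print('  done.')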
Example #2
    def setup(self, args):
        # Build either a full de Bruijn graph or a bare streaming hasher,
        # depending on whether --dbg was passed
        if args.dbg:
            self.hasher = args.hasher_t(args.ksize)
            self.storage = args.storage.build(*args.storage_args)
            self.streamer_t = args.graph_t
            self.streamer = args.graph_t.build(self.storage, self.hasher)
        else:
            self.streamer_t = libgoetia.StreamHasher[args.hasher_t]
            self.streamer = self.streamer_t.Hasher.build(args.ksize)

        # Low-level processor that drives the streamer, reporting every
        # args.interval sequences
        self.ll_processor = self.streamer_t.Processor.build(
            self.streamer, args.interval)

        # Iterator over samples (pairs or singles, depending on pairing mode)
        sample_iter = iter_fastx_inputs(args.inputs,
                                        args.pairing_mode,
                                        names=args.names)
        self.processor = AsyncSequenceProcessor(self.ll_processor, sample_iter)

        # Subscribe a listener to the worker queue
        self.worker_listener = self.processor.add_listener(
            'worker_q', 'consumer')

        if args.metrics:
            self.metrics_stream = AsyncJSONStreamWriter(args.metrics)
            self.worker_listener.on_message(
                Interval, StreamHasherRunner.write_metrics_callback,
                self.metrics_stream)

        self.worker_listener.on_message(
            Interval,
            lambda msg, status:
                status.update(msg.t, msg.sequence, msg.seconds_elapsed_interval),
            self.status
        )

        self.worker_listener.on_message(
            SampleStarted, lambda msg, status: status.start_sample(
                msg.sample_name, msg.file_names), self.status)

        self.worker_listener.on_message(
            SampleFinished, lambda msg, status: status.finish_sample(
                msg.seconds_elapsed_sample), self.status)

        self.worker_listener.on_message(
            Error,
            lambda msg, status: status.message(
                f'ERROR: {msg.sample_name}->{msg.file_names} at time={msg.t}, sequence={msg.sequence}'
                f'\n-- BEGIN EXCEPTION --\n{msg.error}\n-- END EXCEPTION --'
            ),
            self.status
        )
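Example #2 registers `StreamHasherRunner.write_metrics_callback` with the listener; `on_message` hands the matched message plus any extra positional arguments to the callback. A plausible sketch of such a callback (hypothetical; the field names are borrowed from the metrics dict in Example #4):

    def write_metrics_callback(msg, stream):
        # Receives the Interval message plus the extra argument
        # (the AsyncJSONStreamWriter) passed to on_message()
        stream.write({
            't': msg.t,
            'seq_t': msg.sequence,
            'rt_elapsed_interval': msg.seconds_elapsed_interval,
        })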
Example #3
    def setup(self, args):
        # Create the signature and the libgoetia sequence processor: both implemented by the subclass
        self.signature = self._make_signature(args)
        signature_processor = self._make_processor(self.signature, args)

        # Hand the libgoetia processor to the async wrapper
        self.processor = AsyncSequenceProcessor(signature_processor,
                                                iter_fastx_inputs(
                                                    args.inputs,
                                                    args.pairing_mode,
                                                    names=args.names),
                                                echo=args.echo)

        # Signatures are held in a RollingPairwise, which stores the signatures
        # themselves (if desired) and calls the distance function on successive pairs
        self.sigs = RollingPairwise(self._distance_func, history=1)

        # Smooths the distances over a window and checks the smoothed values against
        # a cutoff function to trigger saturation
        self.cutoff = SlidingCutoff(args.window_size,
                                    args.smoothing_function_func,
                                    args.cutoff_function_func)

        #
        # Set up the event handlers. We set up two listeners:
        #     - The worker listener: registered on the worker queue of the async processor;
        #       this is the producer fed by the libgoetia processor loop.
        #     - The events listener: fed by the other handlers. It receives everything from
        #       the worker queue plus any events triggered by its subscribers.
        #

        self.worker_listener = self.processor.add_listener(
            'worker_q', 'signature.consumer')
        self.worker_listener.on_message(Interval, self._on_interval,
                                        self.processor.events_q, args, self)

        if args.save_stream:
            self.signature_stream = AsyncJSONStreamWriter(args.save_stream)
            self.worker_listener.on_message(
                Interval,
                every_n_intervals(self._stream_write, n=args.save_stream_tick),
                args, self)

        if args.term_graph:
            self.init_term_graph_io(args)
        else:
            self.init_std_io(args)
Ejemplo n.º 4
0
    def execute(self, args):
        warned_fp = False
        for (sample, name), sample_start_time, _ in \
            time_iterable(iter_fastx_inputs(args.inputs, args.pairing_mode, names=args.names)):

            self.status.start_sample(name, sample)
            for (n_seqs, stream_time, n_skipped), \
                 interval_start_time, interval_elapsed_time in \
                 time_iterable(self.processor.chunked_process(*sample)):

                n_passed = self.processor.n_passed()
                p_passed = n_passed / n_seqs
                passed = f'{n_passed:,} ({p_passed * 100.0:.1f}%)'
                fp_rate = self.dbg.estimated_fp()
                self.status.update(stream_time, n_seqs, interval_elapsed_time,
                                   self.dbg.n_unique(), passed, fp_rate)

                if args.metrics:
                    self.metrics_stream.write({
                        't': stream_time,
                        'seq_t': n_seqs,
                        'rt_elapsed_interval': interval_elapsed_time,
                        'n_seqs_passed': n_passed,
                        'p_seqs_passed': p_passed,
                        'n_unique_kmers': self.dbg.n_unique(),
                        'estimated_fp': fp_rate
                    })

                if fp_rate >= 0.8 and not warned_fp:
                    self.status.message(
                        f'WARNING: false positive rate has surpassed 0.8 ({fp_rate:.7f})'
                    )
                    warned_fp = True

            sample_elapsed_time = time.perf_counter() - sample_start_time
            self.status.finish_sample(sample_elapsed_time)
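`time_iterable` wraps an iterable and, per the unpacking in Example #4, yields each item along with a start timestamp and the elapsed time to produce it. A minimal sketch under that assumption (hypothetical; not necessarily goetia's actual implementation):

    import time

    def time_iterable(iterable):
        # Yield (item, start_time, elapsed_time) for each item; start_time
        # is a perf_counter timestamp, matching how Example #4 computes
        # sample_elapsed_time
        iterator = iter(iterable)
        while True:
            start = time.perf_counter()
            try:
                item = next(iterator)
            except StopIteration:
                return
            yield item, start, time.perf_counter() - start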
Example #5
    def setup(self, args):
        os.makedirs(args.results_dir, exist_ok=True)

        self.dbg_t = args.graph_t
        self.hasher = args.hasher_t(args.ksize)
        self.storage = args.storage.build(*args.storage_args)
        self.dbg = args.graph_t.build(self.storage, self.hasher)

        self.cdbg_t = libgoetia.cDBG[type(self.dbg)]

        self.compactor_t = libgoetia.StreamingCompactor[type(self.dbg)]

        self.compactor = self.compactor_t.Compactor.build(self.dbg)

        if args.normalize:
            self.file_processor = self.compactor_t.NormalizingCompactor[
                FastxReader].build(self.compactor, args.normalize,
                                   args.interval)
        else:
            self.file_processor = self.compactor_t.Processor.build(
                self.compactor, args.interval)

        # Iterator over samples (pairs or singles, depending on pairing-mode)
        sample_iter = iter_fastx_inputs(args.inputs,
                                        args.pairing_mode,
                                        names=args.names)
        # AsyncSequenceProcessor does event management and callback for the FileProcessors
        self.processor = AsyncSequenceProcessor(self.file_processor,
                                                sample_iter, args.echo)
        # Subscribe a listener to the FileProcessor producer
        self.worker_listener = self.processor.add_listener(
            'worker_q', 'cdbg.consumer')

        #
        # Register callbacks for data outputs.
        # Track a list of output streams that need to be closed with a
        # terminating ] (to finish the JSON array) when we're done.
        #
        self.to_close = []

        if args.track_cdbg_metrics:
            self.metrics_stream = AsyncJSONStreamWriter(
                args.track_cdbg_metrics)
            self.worker_listener.on_message(Interval,
                                            write_cdbg_metrics_callback,
                                            self.compactor,
                                            self.metrics_stream)

        if args.track_unitig_bp:
            if args.unitig_bp_bins is None:
                bins = [args.ksize, 100, 200, 500, 1000]
            else:
                bins = args.unitig_bp_bins

            self.unitig_bp_stream = AsyncJSONStreamWriter(args.track_unitig_bp)
            self.worker_listener.on_message(
                Interval,
                every_n_intervals(compute_unitig_fragmentation_callback,
                                  n=args.unitig_bp_tick),
                self.cdbg_t,
                self.compactor.cdbg,
                self.unitig_bp_stream,
                bins,
                verbose=args.verbose)

        if args.track_cdbg_components:
            comp_callback = None
            if args.cdbg_components_tick == 'fib':
                comp_callback = every_fib_intervals(
                    compute_connected_component_callback)
            elif args.cdbg_components_tick == 'exp':
                comp_callback = every_exp_intervals(
                    compute_connected_component_callback)
            else:
                comp_callback = every_n_intervals(
                    compute_connected_component_callback,
                    n=int(args.cdbg_components_tick))

            self.components_stream = AsyncJSONStreamWriter(
                args.track_cdbg_components)
            self.worker_listener.on_message(Interval,
                                            comp_callback,
                                            self.cdbg_t,
                                            self.compactor.cdbg,
                                            self.components_stream,
                                            args.component_sample_size,
                                            verbose=args.verbose)

        if args.save_cdbg:
            for cdbg_format in args.save_cdbg_format:
                self.worker_listener.on_message(Interval,
                                                every_n_intervals(
                                                    write_cdbg_callback,
                                                    n=args.cdbg_tick),
                                                self.compactor.cdbg,
                                                args.save_cdbg,
                                                cdbg_format,
                                                time_component='',
                                                verbose=args.verbose)
                self.worker_listener.on_message(SampleFinished,
                                                write_cdbg_callback,
                                                self.compactor.cdbg,
                                                args.save_cdbg,
                                                cdbg_format,
                                                verbose=args.verbose)

        if args.validate:
            self.worker_listener.on_message(SampleFinished,
                                            validate_cdbg_callback,
                                            self.compactor.cdbg, args.validate)

        #
        # Regular diagnostics output
        #

        self.worker_listener.on_message(
            Interval,
            lambda msg, status, compactor:
                status.update(msg.t, msg.sequence, msg.seconds_elapsed_interval, compactor),
            self.status,
            self.compactor
        )

        self.worker_listener.on_message(
            SampleStarted, lambda msg, status: status.start_sample(
                msg.sample_name, msg.file_names), self.status)

        self.worker_listener.on_message(
            SampleFinished, lambda msg, status: status.finish_sample(
                msg.seconds_elapsed_sample), self.status)

        self.worker_listener.on_message(
            Error,
            lambda msg, status: status.message(
                f'ERROR: {msg.sample_name}->{msg.file_names} at time={msg.t}, sequence={msg.sequence}'
                f'\n-- BEGIN EXCEPTION --\n{msg.error}\n-- END EXCEPTION --'
            ),
            self.status
        )
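Example #5 also references `every_fib_intervals` and `every_exp_intervals`, which gate a callback on a Fibonacci or exponential schedule of interval counts rather than a fixed tick. A sketch of the Fibonacci variant (hypothetical; it follows the same wrapper pattern assumed for `every_n_intervals` above):

    def every_fib_intervals(callback):
        # Fire the callback on the 1st, 2nd, 3rd, 5th, 8th, ... Interval
        state = {'count': 0, 'a': 1, 'b': 2}

        def wrapped(msg, *args, **kwargs):
            state['count'] += 1
            if state['count'] == state['a']:
                state['a'], state['b'] = state['b'], state['a'] + state['b']
                return callback(msg, *args, **kwargs)

        return wrapped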
Example #6
import argparse
import curio

from goetia import libgoetia
from goetia.hashing import StrandAware
from goetia.parsing import iter_fastx_inputs, get_fastx_args
from goetia.processors import AsyncSequenceProcessor
from goetia.storage import HLLStorage
from goetia.timer import measure_time

parser = argparse.ArgumentParser()
group = get_fastx_args(parser)
group.add_argument('-i', dest='inputs', nargs='+', required=True)
args = parser.parse_args()

for storage_t in [HLLStorage]:
    sig_t = libgoetia.sketches.UnikmerSketch[storage_t, StrandAware]
    # Build the sketch (k-mer size 31; the second parameter is
    # presumably the unikmer size)
    sig = sig_t.Sketch.build(31, 10)
    # Report an Interval every 100000 sequences
    proc = AsyncSequenceProcessor(
        sig_t.Processor.build(sig, 100000),
        iter_fastx_inputs(args.inputs, args.pairing_mode, names=args.names))

    with measure_time():
        print(f'Storage: {storage_t}')
        curio.run(proc.start)

    print(sig.to_numpy())
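`measure_time` from `goetia.timer` is used here as a context manager around the run. A minimal sketch of an equivalent timer (hypothetical; the real helper may format its output differently):

    import time
    from contextlib import contextmanager

    @contextmanager
    def measure_time():
        # Report wall-clock time for the enclosed block
        start = time.perf_counter()
        try:
            yield
        finally:
            print(f'Elapsed: {time.perf_counter() - start:.2f}s')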