Example 1
def runloop(file_checkers, feeder, poll_time):
    """ Main program loop. This will check for new files in the passed input directories using file_check_generator,
    push any new files found into the passed Feeder subclass via its feed() method, wait for poll_time,
    and repeat forever.
    """
    last_time = time.time()
    while True:
        for file_checker in file_checkers:
            # this should never throw StopIteration, will just yield an empty list if nothing is avail:
            filebatch = feeder.feed(next(file_checker))
            if filebatch:
                global_logger.get().info("Pushed %d files, last: %s",
                                         len(filebatch),
                                         os.path.basename(filebatch[-1]))

        removedfiles = feeder.clean()
        if removedfiles:
            global_logger.get().info("Removed %d temp files, last: %s",
                                     len(removedfiles),
                                     os.path.basename(removedfiles[-1]))

        next_time = last_time + poll_time
        try:
            time.sleep(next_time - time.time())
        except IOError as e:
            if e.errno == errno.EINVAL:
                # passed a negative number, which is fine, just don't sleep
                pass
            else:
                raise e
        last_time = next_time
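From the calls made in runloop, a feeder only needs a feed() method that takes a batch of filenames and returns the ones it pushed, and a clean() method that returns any temp files it removed. A minimal sketch of driving the loop with a hypothetical no-op feeder (the class and the directory path below are invented for illustration, not part of the real code base):

class NoOpFeeder(object):
    """Hypothetical stand-in for a real Feeder subclass."""
    def feed(self, filenames):
        # pretend every file in the batch was pushed downstream
        return filenames

    def clean(self):
        # nothing to clean up in this stub
        return []

checkers = [file_check_generator("/data/incoming", mod_buffer_time=5.0)]
runloop(checkers, NoOpFeeder(), poll_time=1.0)  # loops forever; interrupt to stop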
Example 2
def file_check_generator(source_dir, mod_buffer_time, max_files=-1, filename_predicate=None):
    """Generator function that polls the passed directory tree for new files, using the updating_walk.py logic.

    This generator will restart the underlying updating_walk at the last seen file if the updating walk runs
    out of available files.
    """
    next_batch_file, walker_restart_file = None, None
    walker = uw(source_dir, filefilterfunc=filename_predicate)
    while True:
        filebatch = []
        files_left = max_files
        try:
            if not next_batch_file:
                next_batch_file = next(walker)
                walker_restart_file = next_batch_file

            delta = time.time() - os.stat(next_batch_file).st_mtime
            while delta > mod_buffer_time and files_left:
                filebatch.append(next_batch_file)
                files_left -= 1
                next_batch_file = None  # reset in case of exception on next line
                next_batch_file = next(walker)
                delta = time.time() - os.stat(next_batch_file).st_mtime
                walker_restart_file = next_batch_file

        except StopIteration:
            # no files left, restart after polling interval
            if not filebatch:
                global_logger.get().info("Out of files, waiting...")
            walker = uw(source_dir, walker_restart_file, filefilterfunc=filename_predicate)
        yield filebatch
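Each next() on this generator returns a (possibly empty) list of files whose modification time is at least mod_buffer_time seconds in the past, and it never raises StopIteration, so a caller can poll it indefinitely. A small consumption sketch (the directory path is made up for illustration):

import time

checker = file_check_generator("/data/incoming", mod_buffer_time=5.0, max_files=10)
while True:
    batch = next(checker)      # [] when no new files have settled yet
    for path in batch:
        print "stable file:", path
    time.sleep(1.0)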
Example 3
def runloop(file_checkers, feeder, poll_time):
    """ Main program loop. This will check for new files in the passed input directories using file_check_generator,
    push any new files found into the passed Feeder subclass via its feed() method, wait for poll_time,
    and repeat forever.
    """
    last_time = time.time()
    while True:
        for file_checker in file_checkers:
            # this should never throw StopIteration, will just yield an empty list if nothing is avail:
            filebatch = feeder.feed(next(file_checker))
            if filebatch:
                global_logger.get().info("Pushed %d files, last: %s", len(filebatch), os.path.basename(filebatch[-1]))

        removedfiles = feeder.clean()
        if removedfiles:
            global_logger.get().info("Removed %d temp files, last: %s", len(removedfiles), os.path.basename(removedfiles[-1]))

        next_time = last_time + poll_time
        try:
            time.sleep(next_time - time.time())
        except IOError as e:
            if e.errno == errno.EINVAL:
                # passed a negative number, which is fine, just don't sleep
                pass
            else:
                raise e
        last_time = next_time
def main():
    _handler = logging.StreamHandler(sys.stdout)
    _handler.setFormatter(
        logging.Formatter('%(levelname)s:%(name)s:%(asctime)s:%(message)s'))
    global_logger.get().addHandler(_handler)
    global_logger.get().setLevel(logging.INFO)

    opts = parse_options()

    fname_to_qname_fcn, fname_to_timepoint_fcn = get_parsing_functions(opts)
    feeder = SyncSeriesFeeder(opts.outdir,
                              opts.linger_time, (opts.imgprefix, ),
                              shape=opts.shape,
                              dtype=opts.dtype,
                              linear=opts.linear,
                              indtype=opts.indtype,
                              fname_to_qname_fcn=fname_to_qname_fcn,
                              fname_to_timepoint_fcn=fname_to_timepoint_fcn)

    file_checkers = build_filecheck_generators(
        opts.imgdatadir,
        opts.mod_buffer_time,
        max_files=opts.max_files,
        filename_predicate=fname_to_qname_fcn)
    runloop(file_checkers, feeder, opts.poll_time)
Example 5
    def check_sequence(self, timepoint_string):
        if self.last_timepoint is None:
            self.last_timepoint = int(timepoint_string)
            return
        cur_timepoint = int(timepoint_string)
        if cur_timepoint != self.last_timepoint + 1:
            global_logger.get().warn("Missing timepoints detected, went from '%d' to '%d'",
                                     self.last_timepoint, cur_timepoint)
        self.last_timepoint = cur_timepoint
Example 6
    def check_and_pop_mismatches(self, first_elts):
        """Checks for a mismatched first elements across queues.

        If the first mismatched element has remained the same for longer than
        self.mismatch_wait_time, then start popping out mismatching elements.

        Updates self.last_mismatch and self.last_mismatch_time
        """
        comp_elt = first_elts[0]
        # the below list comprehension Does The Right Thing for first_elts of length 1
        # and returns [], and all([]) is True.
        if comp_elt is not None and all([elt == comp_elt for elt in first_elts[1:]]):
            matched = comp_elt
            self.last_mismatch = None
            self.last_mismatch_time = None
        else:
            matched = None
            # this returns None if there are any Nones in the list:
            cur_mismatch = reduce(min, first_elts)
            if cur_mismatch is None:
                # if there is at least one None, then there is some empty queue
                # we don't consider it a mismatch unless there are first elements in each queue, and
                # they don't match - so if a queue is empty, we don't have a mismatch
                self.last_mismatch = None
                self.last_mismatch_time = None
            else:
                now = time.time()
                if self.last_mismatch:  # we already had a mismatch last time through, presumably on the same elt
                    if self.last_mismatch != cur_mismatch:
                        # blow up
                        raise Exception("Current mismatch '%s' doesn't match last mismatch '%s' " %
                                        (cur_mismatch, self.last_mismatch) + "- this shouldn't happen")
                    if now - self.last_mismatch_time > self.mismatch_wait_time:
                        # we have been stuck on this element for longer than mismatch_wait_time
                        # find the next-lowest element - this is not None, since the other queues are not empty
                        next_elts = first_elts[:]  # copy
                        next_elts.remove(cur_mismatch)
                        next_elt = reduce(min, next_elts)
                        # cycle through *all* queues, removing any elts less than next_elt
                        popping = True
                        while popping:
                            popping = False
                            for qname, q in self.qname_to_queue.iteritems():
                                if q and q[0] < next_elt:
                                    discard = q.popleft()
                                    popping = True
                                    global_logger.get().warn("Discarding item '%s' from queue '%s'; " % (discard, qname) +
                                                       "waited for match for more than %g s" % self.mismatch_wait_time)
                        # finished popping all mismatching elements less than next_elt
                        # we might have a match at this point, but wait for next iteration to pick up
                        self.last_mismatch = None
                        self.last_mismatch_time = None
                else:
                    self.last_mismatch = cur_mismatch
                    self.last_mismatch_time = now
        return matched
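The reduce(min, first_elts) idiom above works because under Python 2 None compares less than any string, so a single empty queue (a None head element) forces the result to None; under Python 3 the same comparison raises TypeError. A quick Python 2 illustration of the two cases, with made-up timepoint strings:

heads = ['0004', None, '0003']
print reduce(min, heads)      # None -> some queue is empty, so no mismatch is declared
heads = ['0004', '0005', '0003']
print reduce(min, heads)      # '0003' -> the lowest head, the mismatch candidate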
Example 7
    def check_sequence(self, timepoint_string):
        if self.last_timepoint is None:
            self.last_timepoint = int(timepoint_string)
            return
        cur_timepoint = int(timepoint_string)
        if cur_timepoint != self.last_timepoint + 1:
            global_logger.get().warn(
                "Missing timepoints detected, went from '%d' to '%d'",
                self.last_timepoint, cur_timepoint)
        self.last_timepoint = cur_timepoint
Example 8
    def match_filenames(self, filenames):
        """Update internal queues with passed filenames. Returns names that match across the head of all queues if
        any are found, or an empty list otherwise.
        """
        # insert
        # we assume that usually we'll just be appending to the end - other options
        # include heapq and bisect, but it probably doesn't really matter
        for filename in filenames:
            qname = self.fname_to_qname_fcn(filename)
            if qname is None:
                global_logger.get().warn(
                    "Could not get queue name for file '%s', skipping" %
                    filename)
                continue
            tpname = self.fname_to_timepoint_fcn(filename)
            if tpname is None:
                global_logger.get().warn(
                    "Could not get timepoint for file '%s', skipping" %
                    filename)
                continue
            self.qname_to_queue[qname].append(tpname)
            self.keys_to_fullnames[(qname, tpname)] = filename

        # maintain sorting and dedup:
        for qname, queue in self.qname_to_queue.iteritems():
            if not is_sorted(queue):
                self.qname_to_queue[qname] = deque(
                    unique_justseen(sorted(list(queue))))

        # all queues are now sorted and unique-ified

        # check for matching first entries across queues
        matching = self.get_matching_first_entry()
        matches = []
        dcs = self.do_check_sequence
        while matching:
            if dcs:
                self.check_sequence(matching)
            matches.append(matching)
            matching = self.get_matching_first_entry()

        # convert matches back to full filenames
        fullnamekeys = list(iproduct(self.qname_to_queue.iterkeys(), matches))
        fullnames = [self.keys_to_fullnames.pop(key) for key in fullnamekeys]
        fullnames.sort()

        # filter out files that are smaller than the first file to be added to the queue, if requested
        # this attempts to check for and work around an error state where some files are incompletely
        # transferred
        if self.qname_to_expected_size is not None:
            fullnames = self.filter_size_mismatch_files(fullnames)

        return fullnames
def main():
    _handler = logging.StreamHandler(sys.stdout)
    _handler.setFormatter(
        logging.Formatter('%(levelname)s:%(name)s:%(asctime)s:%(message)s'))
    global_logger.get().addHandler(_handler)
    global_logger.get().setLevel(logging.INFO)

    opts = parse_options()

    global_logger.get().info("Reading images from: %s", opts.imgdatadir)
    global_logger.get().info("Reading behavioral/ephys data from: %s",
                             opts.behavdatadir)

    fname_to_qname_fcn, fname_to_timepoint_fcn = get_parsing_functions(opts)
    feeder = SyncSeriesFeeder(opts.outdir,
                              opts.linger_time,
                              (opts.imgprefix, opts.behavprefix),
                              shape=opts.shape,
                              dtype=opts.dtype,
                              indtype=opts.indtype,
                              fname_to_qname_fcn=fname_to_qname_fcn,
                              fname_to_timepoint_fcn=fname_to_timepoint_fcn,
                              check_file_size=opts.check_size,
                              check_skip_in_sequence=opts.check_skip)
    file_checkers = build_filecheck_generators(
        (opts.imgdatadir, opts.behavdatadir),
        opts.mod_buffer_time,
        max_files=opts.max_files,
        filename_predicate=fname_to_qname_fcn)
    runloop(file_checkers, feeder, opts.poll_time)
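match_filenames above only assumes that fname_to_qname_fcn and fname_to_timepoint_fcn can carve a queue name and a timepoint string out of each filename, returning None when they cannot. The real functions come from get_parsing_functions(opts); the pair below is a purely illustrative sketch for a made-up naming scheme such as 'behav_000123.bin':

import os

def fname_to_qname(filename):
    # queue name: the prefix before the first underscore, e.g. 'behav'
    base = os.path.basename(filename)
    return base.split('_', 1)[0] if '_' in base else None

def fname_to_timepoint(filename):
    # timepoint: the zero-padded counter between the underscore and the extension
    base = os.path.basename(filename)
    if '_' not in base:
        return None
    return base.split('_', 1)[1].split('.', 1)[0]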
Example 10
    def match_filenames(self, filenames):
        """Update internal queues with passed filenames. Returns names that match across the head of all queues if
        any are found, or an empty list otherwise.
        """
        # insert
        # we assume that usually we'll just be appending to the end - other options
        # include heapq and bisect, but it probably doesn't really matter
        for filename in filenames:
            qname = self.fname_to_qname_fcn(filename)
            if qname is None:
                global_logger.get().warn("Could not get queue name for file '%s', skipping" % filename)
                continue
            tpname = self.fname_to_timepoint_fcn(filename)
            if tpname is None:
                global_logger.get().warn("Could not get timepoint for file '%s', skipping" % filename)
                continue
            self.qname_to_queue[qname].append(tpname)
            self.keys_to_fullnames[(qname, tpname)] = filename

        # maintain sorting and dedup:
        for qname, queue in self.qname_to_queue.iteritems():
            if not is_sorted(queue):
                self.qname_to_queue[qname] = deque(unique_justseen(sorted(list(queue))))

        # all queues are now sorted and unique-ified

        # check for matching first entries across queues
        matching = self.get_matching_first_entry()
        matches = []
        dcs = self.do_check_sequence
        while matching:
            if dcs:
                self.check_sequence(matching)
            matches.append(matching)
            matching = self.get_matching_first_entry()

        # convert matches back to full filenames
        fullnamekeys = list(iproduct(self.qname_to_queue.iterkeys(), matches))
        fullnames = [self.keys_to_fullnames.pop(key) for key in fullnamekeys]
        fullnames.sort()

        # filter out files that are smaller than the first file to be added to the queue, if requested
        # this attempts to check for and work around an error state where some files are incompletely
        # transferred
        if self.qname_to_expected_size is not None:
            fullnames = self.filter_size_mismatch_files(fullnames)

        return fullnames
def main():
    _handler = logging.StreamHandler(sys.stdout)
    _handler.setFormatter(logging.Formatter('%(levelname)s:%(name)s:%(asctime)s:%(message)s'))
    global_logger.get().addHandler(_handler)
    global_logger.get().setLevel(logging.INFO)

    opts = parse_options()

    fname_to_qname_fcn, fname_to_timepoint_fcn = get_parsing_functions(opts)
    feeder = SyncSeriesFeeder(opts.outdir, opts.linger_time, (opts.imgprefix,),
                              shape=opts.shape, dtype=opts.dtype, linear=opts.linear, indtype=opts.indtype,
                              fname_to_qname_fcn=fname_to_qname_fcn, fname_to_timepoint_fcn=fname_to_timepoint_fcn)

    file_checkers = build_filecheck_generators(opts.imgdatadir, opts.mod_buffer_time,
                                               max_files=opts.max_files, filename_predicate=fname_to_qname_fcn)
    runloop(file_checkers, feeder, opts.poll_time)
Example 12
def main():
    _handler = logging.StreamHandler(sys.stdout)
    _handler.setFormatter(logging.Formatter('%(levelname)s:%(name)s:%(asctime)s:%(message)s'))
    global_logger.get().addHandler(_handler)
    global_logger.get().setLevel(logging.INFO)

    opts = parse_options()

    if opts.filter_regex_file:
        pred_fcn = RegexMatchToPredicate.fromFile(opts.filter_regex_file).predicate
    else:
        pred_fcn = None

    feeder = CopyAndMoveFeeder.fromOptions(opts)
    file_checkers = build_filecheck_generators(opts.indir, opts.mod_buffer_time,
                                               max_files=opts.max_files, filename_predicate=pred_fcn)
    runloop(file_checkers, feeder, opts.poll_time)
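filename_predicate only has to be a callable that returns a truthy value for files worth keeping. Independent of the project's RegexMatchToPredicate helper (whose interface is not shown here), a plain regex-based predicate could look like the following sketch; the pattern and directory are invented for illustration:

import re

_tif_pattern = re.compile(r".*\.tif$")   # made-up filter: keep only TIFF files

def tif_only(filename):
    return _tif_pattern.match(filename) is not None

file_checkers = build_filecheck_generators("/data/incoming", 5.0,
                                           max_files=-1,
                                           filename_predicate=tif_only)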
Example 13
    def filter_size_mismatch_files(self, filenames):
        filtered_timepoints = []
        for filename in filenames:
            size = os.path.getsize(filename)
            bname = os.path.basename(filename)
            queuename = self.fname_to_qname_fcn(bname)
            timepoint = self.fname_to_timepoint_fcn(bname)
            expected_size = self.qname_to_expected_size.setdefault(queuename, size)
            if size != expected_size:
                filtered_timepoints.append(timepoint)
                global_logger.get().warn(
                    "Size mismatch on '%s', discarding timepoint '%s'. (Expected %d bytes, got %d bytes.)",
                    filename, timepoint, expected_size, size)
        if filtered_timepoints:
            return [filename for filename in filenames if
                    self.fname_to_timepoint_fcn(os.path.basename(filename)) not in filtered_timepoints]
        else:
            return filenames
def main():
    _handler = logging.StreamHandler(sys.stdout)
    _handler.setFormatter(
        logging.Formatter('%(levelname)s:%(name)s:%(asctime)s:%(message)s'))
    global_logger.get().addHandler(_handler)
    global_logger.get().setLevel(logging.INFO)

    opts = parse_options()

    if opts.filter_regex_file:
        pred_fcn = RegexMatchToPredicate.fromFile(
            opts.filter_regex_file).predicate
    else:
        pred_fcn = None

    feeder = CopyAndMoveFeeder.fromOptions(opts)
    file_checkers = build_filecheck_generators(opts.indir,
                                               opts.mod_buffer_time,
                                               max_files=opts.max_files,
                                               filename_predicate=pred_fcn)
    runloop(file_checkers, feeder, opts.poll_time)
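The setdefault call in filter_size_mismatch_files means the expected size for a queue is simply the size of the first file observed on that queue; any later file of a different size has its whole timepoint dropped. A small, self-contained demonstration of that setdefault pattern with invented sizes:

qname_to_expected_size = {}
observed = [('img', 1024), ('img', 1024), ('img', 512), ('behav', 64)]
for qname, size in observed:
    expected = qname_to_expected_size.setdefault(qname, size)
    if size != expected:
        print "size mismatch in queue '%s': expected %d bytes, got %d" % (qname, expected, size)
# prints: size mismatch in queue 'img': expected 1024 bytes, got 512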
Example 15
    def filter_size_mismatch_files(self, filenames):
        filtered_timepoints = []
        for filename in filenames:
            size = os.path.getsize(filename)
            bname = os.path.basename(filename)
            queuename = self.fname_to_qname_fcn(bname)
            timepoint = self.fname_to_timepoint_fcn(bname)
            expected_size = self.qname_to_expected_size.setdefault(
                queuename, size)
            if size != expected_size:
                filtered_timepoints.append(timepoint)
                global_logger.get().warn(
                    "Size mismatch on '%s', discarding timepoint '%s'. (Expected %d bytes, got %d bytes.)",
                    filename, timepoint, expected_size, size)
        if filtered_timepoints:
            return [
                filename
                for filename in filenames if self.fname_to_timepoint_fcn(
                    os.path.basename(filename)) not in filtered_timepoints
            ]
        else:
            return filenames
Example 16
def file_check_generator(source_dir,
                         mod_buffer_time,
                         max_files=-1,
                         filename_predicate=None):
    """Generator function that polls the passed directory tree for new files, using the updating_walk.py logic.

    This generator will restart the underlying updating_walk at the last seen file if the updating walk runs
    out of available files.
    """
    next_batch_file, walker_restart_file = None, None
    walker = uw(source_dir, filefilterfunc=filename_predicate)
    while True:
        filebatch = []
        files_left = max_files
        try:
            if not next_batch_file:
                next_batch_file = next(walker)
                walker_restart_file = next_batch_file

            delta = time.time() - os.stat(next_batch_file).st_mtime
            while delta > mod_buffer_time and files_left:
                filebatch.append(next_batch_file)
                files_left -= 1
                next_batch_file = None  # reset in case of exception on next line
                next_batch_file = next(walker)
                delta = time.time() - os.stat(next_batch_file).st_mtime
                walker_restart_file = next_batch_file

        except StopIteration:
            # no files left, restart after polling interval
            if not filebatch:
                global_logger.get().info("Out of files, waiting...")
            walker = uw(source_dir,
                        walker_restart_file,
                        filefilterfunc=filename_predicate)
        yield filebatch
def main():
    _handler = logging.StreamHandler(sys.stdout)
    _handler.setFormatter(logging.Formatter('%(levelname)s:%(name)s:%(asctime)s:%(message)s'))
    global_logger.get().addHandler(_handler)
    global_logger.get().setLevel(logging.INFO)

    opts = parse_options()

    global_logger.get().info("Reading images from: %s", opts.imgdatadir)
    global_logger.get().info("Reading behavioral/ephys data from: %s", opts.behavdatadir)

    fname_to_qname_fcn, fname_to_timepoint_fcn = get_parsing_functions(opts)
    feeder = SyncSeriesFeeder(opts.outdir, opts.linger_time, (opts.imgprefix, opts.behavprefix),
                              shape=opts.shape, dtype=opts.dtype, indtype=opts.indtype,
                              fname_to_qname_fcn=fname_to_qname_fcn,
                              fname_to_timepoint_fcn=fname_to_timepoint_fcn,
                              check_file_size=opts.check_size,
                              check_skip_in_sequence=opts.check_skip)
    file_checkers = build_filecheck_generators((opts.imgdatadir, opts.behavdatadir), opts.mod_buffer_time,
                                               max_files=opts.max_files,
                                               filename_predicate=fname_to_qname_fcn)
    runloop(file_checkers, feeder, opts.poll_time)
Example 18
    def check_and_pop_mismatches(self, first_elts):
        """Checks for a mismatched first elements across queues.

        If the first mismatched element has remained the same for longer than
        self.mismatch_wait_time, then start popping out mismatching elements.

        Updates self.last_mismatch and self.last_mismatch_time
        """
        comp_elt = first_elts[0]
        # the below list comprehension Does The Right Thing for first_elts of length 1
        # and returns [], and all([]) is True.
        if comp_elt is not None and all(
            [elt == comp_elt for elt in first_elts[1:]]):
            matched = comp_elt
            self.last_mismatch = None
            self.last_mismatch_time = None
        else:
            matched = None
            # this returns None if there are any Nones in the list:
            cur_mismatch = reduce(min, first_elts)
            if cur_mismatch is None:
                # if there is at least one None, then there is some empty queue
                # we don't consider it a mismatch unless there are first elements in each queue, and
                # they don't match - so if a queue is empty, we don't have a mismatch
                self.last_mismatch = None
                self.last_mismatch_time = None
            else:
                now = time.time()
                if self.last_mismatch:  # we already had a mismatch last time through, presumably on the same elt
                    if self.last_mismatch != cur_mismatch:
                        # blow up
                        raise Exception(
                            "Current mismatch '%s' doesn't match last mismatch '%s' "
                            % (cur_mismatch, self.last_mismatch) +
                            "- this shouldn't happen")
                    if now - self.last_mismatch_time > self.mismatch_wait_time:
                        # we have been stuck on this element for longer than mismatch_wait_time
                        # find the next-lowest element - this is not None, since the other queues are not empty
                        next_elts = first_elts[:]  # copy
                        next_elts.remove(cur_mismatch)
                        next_elt = reduce(min, next_elts)
                        # cycle through *all* queues, removing any elts less than next_elt
                        popping = True
                        while popping:
                            popping = False
                            for qname, q in self.qname_to_queue.iteritems():
                                if q and q[0] < next_elt:
                                    discard = q.popleft()
                                    popping = True
                                    global_logger.get().warn(
                                        "Discarding item '%s' from queue '%s'; "
                                        % (discard, qname) +
                                        "waited for match for more than %g s" %
                                        self.mismatch_wait_time)
                        # finished popping all mismatching elements less than next_elt
                        # we might have a match at this point, but wait for next iteration to pick up
                        self.last_mismatch = None
                        self.last_mismatch_time = None
                else:
                    self.last_mismatch = cur_mismatch
                    self.last_mismatch_time = now
        return matched