class ChangeListenerNode(Node):
    """
    Filter source files and detect changes.

    It has two outputs, the default and 'all'. The default output contains
    only the changed files. The 'all' edge will contain all files from the
    source.

    Parameters
    ----------
    stop : bool, optional
        If True, stop processing the graph if no changes are detected at this
        node (default True)
    cache : str, optional
        Name of the file to cache data in. By default will cache data in
        memory.
    key : str, optional
        Table name to use inside the ``cache`` file. Must be present if
        ``cache`` is non-None.
    fingerprint : str or callable, optional
        Function that takes a file and returns a fingerprint. May also be the
        strings 'md5' or 'mtime', which will md5sum the file or check the
        modification time respectively. (default 'md5')

    Raises
    ------
    ValueError
        If ``cache`` is provided without a ``key``, or if ``fingerprint`` is
        neither 'md5', 'mtime', nor a callable.

    """

    name = "change_listener"
    outputs = ("default", "all")

    def __init__(self, stop=True, cache=None, key=None, fingerprint="md5"):
        super(ChangeListenerNode, self).__init__()
        self.stop = stop

        # Validate every argument before touching the cache file so that a
        # bad argument never leaves a half-initialized, open database behind.
        if cache is not None and key is None:
            raise ValueError("If cache is provided, must provide a key")

        if fingerprint == "md5":
            self.fingerprint = self._md5
        elif fingerprint == "mtime":
            self.fingerprint = self._mtime
        elif callable(fingerprint):
            self.fingerprint = fingerprint
        else:
            # Fail fast: otherwise the bad value would only surface inside
            # process() as "'str' object is not callable".
            raise ValueError(
                "fingerprint must be 'md5', 'mtime' or a callable, got %r"
                % (fingerprint,)
            )

        if cache is None:
            # No cache file requested: keep fingerprints in a plain dict.
            self.checksums = {}
        else:
            # autocommit is off; process() commits explicitly once per run.
            self.checksums = SqliteDict(cache, key, autocommit=False, synchronous=0)

    def _md5(self, item):
        """ md5sum a file """
        with item.data.open() as filestream:
            return md5stream(filestream)

    def _mtime(self, item):
        """ Get the modification time of a file """
        return os.path.getmtime(item.fullpath)

    def process(self, stream):
        """
        Split the incoming items into changed files and all files.

        An item counts as changed when its current fingerprint differs from
        the one stored under ``item.fullpath``; the stored value is updated
        as soon as the difference is seen.

        Parameters
        ----------
        stream : iterable
            Items to inspect. Each item must expose ``fullpath`` and whatever
            the configured fingerprint function reads from it.

        Returns
        -------
        dict
            ``{"default": <changed items>, "all": <every item>}``, both lists
            in the order the items were received.

        Raises
        ------
        StopProcessing
            If no item changed and ``stop`` is true.

        """
        changed = []
        all_items = []
        for item in stream:
            fingerprint = self.fingerprint(item)
            if fingerprint != self.checksums.get(item.fullpath):
                self.checksums[item.fullpath] = fingerprint
                changed.append(item)
            all_items.append(item)
        if not changed and self.stop:
            # Nothing was written to the checksums in this case, so there is
            # nothing to commit before bailing out.
            raise StopProcessing
        if isinstance(self.checksums, SqliteDict):
            # Only the file-backed store needs an explicit commit.
            self.checksums.commit()
        return {"default": changed, "all": all_items}