def _prototypes_from_dir(self, dir_path):
    # For each directory of CSVs, we store an index of its files and
    # individual, per-file pkls.
    # Since the data source processes the whole directory by itself, the
    # cached and uncached code paths behave differently here!
    cache_path = self._cache_path(dir_path)
    try:
        if self.force_refresh:
            raise OSError  # fall through to rebuilding the cache
        # get the list of files
        with open(cache_path, 'rb') as cache_pkl:
            job_csv_paths = pickle.load(cache_pkl)
        # load the job files individually to allow refreshing any of them
        for job_csv_path in job_csv_paths:
            for prototype in self._prototypes_from_csv(job_csv_path):
                yield prototype
    except (OSError, IOError, EOFError):
        dir_prototype_lock = filelock.FileLock(
            os.path.splitext(cache_path)[0] + '.lock')
        try:
            with dir_prototype_lock.acquire(timeout=0):
                # clean up broken pickles
                if os.path.exists(cache_path):
                    os.unlink(cache_path)
                    self._logger.warning(
                        'Refreshing existing cache %r', cache_path)
        except filelock.Timeout:
            pass
        data_source = self.data_source
        job_files = []
        for job in data_source.jobs(path=dir_path):
            job.prepare_traffic()
            prototype = Prototype.from_job(job)
            yield prototype
            assert job.path not in job_files, \
                "Job file may not contain multiple jobs (%r)" % job.path
            job_cache_path = self._cache_path(job.path)
            cache_prototype_lock = filelock.FileLock(
                os.path.splitext(job_cache_path)[0] + '.lock')
            try:
                with cache_prototype_lock.acquire(timeout=0):
                    # store the job individually, just remember its file
                    with open(job_cache_path, 'wb') as job_cache_pkl:
                        pickle.dump(
                            [prototype], job_cache_pkl,
                            pickle.HIGHEST_PROTOCOL)
            except filelock.Timeout:
                pass
            job_files.append(job.path)
        try:
            with dir_prototype_lock.acquire(timeout=0):
                with open(cache_path, 'wb') as cache_pkl:
                    pickle.dump(
                        job_files, cache_pkl, pickle.HIGHEST_PROTOCOL)
        except filelock.Timeout:
            pass
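# A minimal, self-contained sketch of the two-level cache layout assumed
# above: the directory-level pickle only stores the list of job CSV paths,
# while every CSV gets its own pickle holding a list of prototypes. The
# cache_root default and the sha1-based file naming are illustrative
# assumptions, not the mapping that self._cache_path actually implements.
import hashlib
import os
import pickle


def _cache_path(source_path, cache_root='/tmp/prototype_cache'):
    digest = hashlib.sha1(source_path.encode()).hexdigest()
    return os.path.join(cache_root, digest + '.pkl')


def cached_prototypes_for_dir(dir_path):
    # read the directory index, then resolve every job CSV individually
    with open(_cache_path(dir_path), 'rb') as cache_pkl:
        job_csv_paths = pickle.load(cache_pkl)
    for job_csv_path in job_csv_paths:
        with open(_cache_path(job_csv_path), 'rb') as job_pkl:
            for prototype in pickle.load(job_pkl):
                yield prototype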
def test_from_job(self):
    file_path = os.path.join(
        os.path.dirname(assess_tests.__file__),
        "data/c01-007-102/1/1-process.csv"
    )
    data_source = FileDataSource()
    for job in data_source.jobs(path=file_path):
        prototype = Prototype.from_job(job)
        self.assertIsNotNone(prototype)
        self.assertEqual(prototype.node_count(), 9109)
        last_tme = 0
        for node in prototype.nodes(order_first=True):
            self.assertTrue(last_tme <= node.tme)
            last_tme = node.tme
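# A small sketch of the ordering invariant the test checks, pulled out as a
# standalone helper: it walks a prototype's nodes in first-order and verifies
# that the tme timestamps never decrease. The helper name is hypothetical and
# not part of the project; it only relies on the prototype API used above.
def is_tme_ordered(prototype):
    last_tme = 0
    for node in prototype.nodes(order_first=True):
        if node.tme < last_tme:
            return False
        last_tme = node.tme
    return True

# possible usage inside the test: self.assertTrue(is_tme_ordered(prototype))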
def _prototypes_from_csv(self, csv_path):
    # For each individual CSV, we store *all* of its content in one pkl.
    # That content may consist of multiple prototypes, so we yield them
    # one by one!
    cache_path = self._cache_path(csv_path)
    try:
        if self.force_refresh:
            raise OSError  # fall through to rebuilding the cache
        with open(cache_path, 'rb') as cache_pkl:
            prototypes = pickle.load(cache_pkl)
    except (OSError, IOError, EOFError):
        if self.preloaded_only:
            # only cached prototypes may be served - do not build new ones
            yield None
            return
        # serialize pickle creation in case multiple processes use the
        # same prototype
        cache_prototype_lock = filelock.FileLock(
            os.path.splitext(cache_path)[0] + '.lock')
        try:
            # try to become the writer and create the pickle
            with cache_prototype_lock.acquire(timeout=0):
                # clean up broken pickles
                if os.path.exists(cache_path):
                    os.unlink(cache_path)
                    self._logger.warning(
                        'Refreshing existing cache %r', cache_path)
                data_source = self.data_source
                prototypes = []
                for job in data_source.jobs(path=csv_path):
                    job.prepare_traffic()
                    prototype = Prototype.from_job(job)
                    prototypes.append(prototype)
                if prototypes:
                    with open(cache_path, 'wb') as cache_pkl:
                        pickle.dump(
                            prototypes, cache_pkl, pickle.HIGHEST_PROTOCOL)
        except filelock.Timeout:
            # we are NOT the writer - acquire the lock to see when the
            # writer is done
            with cache_prototype_lock:
                with open(cache_path, 'rb') as cache_pkl:
                    prototypes = pickle.load(cache_pkl)
    for prototype in prototypes:
        yield prototype
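# A stripped-down sketch of the writer-election pattern used in both caching
# methods above: each process races for the file lock with timeout=0, exactly
# one becomes the writer and (re)creates the pickle, and everybody else blocks
# on the lock until the writer is done and then just reads the result. The
# function and the build_items callable are hypothetical illustrations.
import os
import pickle

import filelock


def cache_or_read(cache_path, build_items):
    lock = filelock.FileLock(os.path.splitext(cache_path)[0] + '.lock')
    try:
        with lock.acquire(timeout=0):
            # we are the writer: drop any stale cache and rebuild it
            if os.path.exists(cache_path):
                os.unlink(cache_path)
            items = list(build_items())
            with open(cache_path, 'wb') as cache_pkl:
                pickle.dump(items, cache_pkl, pickle.HIGHEST_PROTOCOL)
            return items
    except filelock.Timeout:
        # another process is writing; wait for it, then read its result
        with lock:
            with open(cache_path, 'rb') as cache_pkl:
                return pickle.load(cache_pkl)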