def test_end_filter(self):
    """Collector with an END_FILTER keeps only files whose names end in 'txt'."""
    _collector = file_collector.Collector({helper.END_FILTER: ['txt']})
    expected = ['b.txt']
    actual = [
        os.path.basename(x) for x in _collector.collect(self.test_dir)
    ]
    # assertEquals is a deprecated alias (removed in Python 3.12); use assertEqual.
    self.assertEqual(expected, actual)
def test_in_filter(self):
    """Collector with an IN_FILTER keeps only files whose paths contain 'a.'."""
    _collector = file_collector.Collector({helper.IN_FILTER: ['a.']})
    expected = ['a.csv']
    actual = [
        os.path.basename(x) for x in _collector.collect(self.test_dir)
    ]
    # assertEquals is a deprecated alias (removed in Python 3.12); use assertEqual.
    self.assertEqual(expected, actual)
def test_simple(self):
    """Collector with an empty config collects every file in the directory."""
    _collector = file_collector.Collector({})
    expected = ['a.csv', 'b.txt']
    actual = [
        os.path.basename(x) for x in _collector.collect(self.test_dir)
    ]
    # assertEquals is a deprecated alias (removed in Python 3.12); use assertEqual.
    self.assertEqual(expected, actual)
def test_file(self):
    """Collecting a single file path (not a directory) yields that file."""
    _collector = file_collector.Collector({helper.END_FILTER: ['csv']})
    expected = ['a.csv']
    actual = [
        os.path.basename(x)
        for x in _collector.collect(os.path.join(self.test_dir, 'a.csv'))
    ]
    # assertEquals is a deprecated alias (removed in Python 3.12); use assertEqual.
    self.assertEqual(expected, actual)
def test_negate(self):
    """NEGATE inverts the filter: END_FILTER 'txt' negated keeps the csv."""
    _collector = file_collector.Collector({
        helper.END_FILTER: ['txt'],
        helper.NEGATE: 1
    })
    expected = ['a.csv']
    actual = [
        os.path.basename(x) for x in _collector.collect(self.test_dir)
    ]
    # assertEquals is a deprecated alias (removed in Python 3.12); use assertEqual.
    self.assertEqual(expected, actual)
def boot(self, injector, config, match_path):
    """
    Initialize workers and queue.

    Find all matching documents and add these to the pipeline, then wait
    until every worker process has exited.

    :param injector: Object from which to fetch dependencies.
    :param config: Configuration object.
    :param match_path: Path passed to ``glob()`` to identify folders and
        files.
    :type config: ``dict``
    :type match_path: ``unicode``
    :returns: Processed paths.
    """
    logger = logging.getLogger('boot')
    # Ensure the data-root directory exists before workers start using it.
    data_root = config.get(helper.DATA_ROOT, 'local_data')
    config[helper.DATA_ROOT] = data_root
    if not os.path.exists(config[helper.DATA_ROOT]):
        os.makedirs(config[helper.DATA_ROOT])
    # Small queue bound: producers block instead of buffering every path.
    queue = Queue(maxsize=2)
    workers = []
    for worker_id in range(config.get(helper.WORKERS, 1)):
        proc = Process(target=self.init,
                       args=(config, queue, worker_id, injector))
        workers.append(proc)
        proc.start()
    # Gather every path the collector yields for each glob match.
    # extend() consumes the iterable directly -- no throwaway list needed.
    paths = []
    for path in glob.glob(match_path):
        paths.extend(
            collector.Collector(config.get(helper.COLLECTOR, {})).collect(path))
    for path in tqdm(paths):
        queue.put(path)
    # Poll until no worker process is alive any more.
    running = 1
    while running > 0:
        running = 0
        for worker in workers:
            if worker.is_alive():
                running += 1
        time.sleep(0.01)
    logger.info('Stopping')
    return paths