Beispiel #1
0
 def test_end_filter(self):
     """An END_FILTER of ``['txt']`` must yield only ``b.txt``."""
     _collector = file_collector.Collector({helper.END_FILTER: ['txt']})
     expected = ['b.txt']
     # Compare base names only so the assertion does not depend on where
     # the fixture directory lives.
     actual = [
         os.path.basename(x) for x in _collector.collect(self.test_dir)
     ]
     # assertEqual replaces the deprecated assertEquals alias, which was
     # removed in Python 3.12.
     self.assertEqual(expected, actual)
Beispiel #2
0
 def test_in_filter(self):
     """An IN_FILTER of ``['a.']`` must yield only ``a.csv``."""
     _collector = file_collector.Collector({helper.IN_FILTER: ['a.']})
     expected = ['a.csv']
     # Compare base names only so the assertion does not depend on where
     # the fixture directory lives.
     actual = [
         os.path.basename(x) for x in _collector.collect(self.test_dir)
     ]
     # assertEqual replaces the deprecated assertEquals alias, which was
     # removed in Python 3.12.
     self.assertEqual(expected, actual)
Beispiel #3
0
 def test_simple(self):
     """A collector with an empty configuration must yield every file."""
     _collector = file_collector.Collector({})
     expected = ['a.csv', 'b.txt']
     # NOTE(review): this list comparison relies on collect() returning the
     # paths in a stable (apparently alphabetical) order -- confirm.
     actual = [
         os.path.basename(x) for x in _collector.collect(self.test_dir)
     ]
     # assertEqual replaces the deprecated assertEquals alias, which was
     # removed in Python 3.12.
     self.assertEqual(expected, actual)
Beispiel #4
0
 def test_file(self):
     """Collecting a single file path must yield that file itself."""
     _collector = file_collector.Collector({helper.END_FILTER: ['csv']})
     expected = ['a.csv']
     # The collector is pointed at a file here, not at a directory.
     target = os.path.join(self.test_dir, 'a.csv')
     actual = [os.path.basename(x) for x in _collector.collect(target)]
     # assertEqual replaces the deprecated assertEquals alias, which was
     # removed in Python 3.12.
     self.assertEqual(expected, actual)
Beispiel #5
0
 def test_negate(self):
     """With NEGATE set, an END_FILTER of ``['txt']`` must yield ``a.csv``."""
     _collector = file_collector.Collector({
         helper.END_FILTER: ['txt'],
         helper.NEGATE: 1
     })
     expected = ['a.csv']
     # Compare base names only so the assertion does not depend on where
     # the fixture directory lives.
     actual = [
         os.path.basename(x) for x in _collector.collect(self.test_dir)
     ]
     # assertEqual replaces the deprecated assertEquals alias, which was
     # removed in Python 3.12.
     self.assertEqual(expected, actual)
Beispiel #6
0
    def boot(self, injector, config, match_path):
        """
        Initialize workers and queue. Find all matching documents and add
        these to the pipeline, then wait for the workers to finish.

        The effective data root (``'local_data'`` when none is configured) is
        written back into ``config`` and the directory is created if missing.

        :param injector: Object from which to fetch dependencies.
        :param config: Configuration object; updated in place with the
            effective data root.
        :param match_path: Path passed to ``glob()`` to identify folders and
            files.
        :type config: ``dict``
        :type match_path: ``unicode``
        :returns: Processed paths.
        """
        logger = logging.getLogger('boot')

        # Store the default back so code receiving ``config`` later (the
        # workers get it as an argument) sees the same data root.
        data_root = config.get(helper.DATA_ROOT, 'local_data')
        config[helper.DATA_ROOT] = data_root

        if not os.path.exists(data_root):
            os.makedirs(data_root)

        # Small bounded queue: with the default blocking ``put`` the producer
        # below waits whenever two items are already pending.
        queue = Queue(maxsize=2)
        workers = []

        for worker_id in range(config.get(helper.WORKERS, 1)):
            proc = Process(target=self.init,
                           args=(config, queue, worker_id, injector))

            workers.append(proc)
            proc.start()

        paths = []

        # Each glob hit (folder or file) is expanded by a collector built from
        # the optional collector section of the configuration.
        for path in glob.glob(match_path):
            paths.extend(
                collector.Collector(config.get(helper.COLLECTOR,
                                               {})).collect(path))

        for path in tqdm(paths):
            queue.put(path)

        # Block until every worker process has exited; this replaces a manual
        # is_alive()/sleep polling loop.
        # NOTE(review): nothing here tells the workers to stop, so they are
        # presumably expected to exit on their own inside ``self.init`` --
        # confirm.
        for worker in workers:
            worker.join()

        logger.info('Stopping')

        return paths