Beispiel #1
0
 def generated_chunked_parallelized_results(
     self,
     partially_bound_function,
     tasks,
     n_processes,
     chunksize=1,
 ):
     """Yield the results of running a function over batches of tasks in a pool.

     Args:
         partially_bound_function: callable invoked once per batch; it
             receives a list of tasks as its only argument.
         tasks: iterable of tasks, handed to ``Batch`` for grouping.
         n_processes: number of worker processes passed to ``Pool``.
         chunksize: second argument to ``Batch`` (presumably the maximum
             number of tasks per batch -- confirm against ``Batch``).

     Yields:
         One return value of ``partially_bound_function`` per batch, in the
         order returned by ``pool.map``.
     """
     # NOTE(review): Pool is assumed to be multiprocessing.Pool, where
     # maxtasksperchild=1 recycles a worker after a single task -- confirm
     # against the file's imports.
     with Pool(n_processes, maxtasksperchild=1) as worker_pool:
         # Each batch is materialised as a plain list before being sent
         # to the pool.
         batches = list(map(list, Batch(tasks, chunksize)))
         batch_results = worker_pool.map(partially_bound_function, batches)
         for batch_result in batch_results:
             yield batch_result
Beispiel #2
0
    def process_query_tasks(self, query_tasks):
        """Run each table's queries: prepare, batched inserts, then finalize.

        The 'prepare' (e.g. create table) and 'finalize' (e.g. create index)
        queries are run in the main process through the feature generator.
        The 'inserts' queries are grouped into batches of 25 and each batch
        is enqueued on ``self.queue`` as a separate (rq) job;
        ``self.wait_for`` is called on those jobs before the finalize
        queries are run.

        Args:
            query_tasks (dict): keys are table names and values are dicts.
                Each inner dict may have up to three keys, each holding a
                list of queries:
                'prepare' (setting up the table),
                'inserts' (insert commands to populate the table),
                'finalize' (finishing table setup after all inserts have run).
                A missing key is treated as an empty list.

            Example: {
                'table_one': {
                    'prepare': ['create table table_one (col1 varchar)'],
                    'inserts': [
                        "insert into table_one values ('a')",
                        "insert into table_one values ('b')",
                    ],
                    'finalize': ['create index on table_one (col1)'],
                }
            }
        """
        for table_name, table_queries in query_tasks.items():
            logger.spam(f"Processing features for {table_name}")

            # Table setup runs in this process, before any insert is queued.
            self.feature_generator.run_commands(table_queries.get("prepare", []))

            insert_batches = list(
                map(list, Batch(table_queries.get("inserts", []), 25))
            )

            # The same timeout value bounds job runtime, result retention
            # and queue lifetime.
            enqueue_options = {
                "job_timeout": DEFAULT_TIMEOUT,
                "result_ttl": DEFAULT_TIMEOUT,
                "ttl": DEFAULT_TIMEOUT,
            }
            jobs = []
            for insert_batch in insert_batches:
                job = self.queue.enqueue(
                    self.feature_generator.run_commands,
                    insert_batch,
                    **enqueue_options,
                )
                jobs.append(job)

            # presumably blocks until every queued job is done -- confirm
            # against wait_for
            self.wait_for(jobs)

            self.feature_generator.run_commands(table_queries.get("finalize", []))
            logger.debug(f"{table_name} completed")
Beispiel #3
0
    def process_query_tasks(self, query_tasks):
        """Run each table's queries: prepare, parallelized inserts, finalize.

        For every table, the 'prepare' queries are run through the feature
        generator, the 'inserts' queries are grouped into batches of 25 and
        handed to ``parallelize`` with ``self.n_db_processes`` processes,
        and finally the 'finalize' queries are run through the feature
        generator.

        Args:
            query_tasks (dict): maps table names to dicts that may contain
                the keys 'prepare', 'inserts' and 'finalize', each holding a
                list of queries. A missing key is treated as an empty list.
        """
        logging.info(
            "Processing query tasks with %s processes", self.n_db_processes
        )
        for table_name, table_queries in query_tasks.items():
            logging.info("Processing features for %s", table_name)

            # Table setup is run directly, before any insert is dispatched.
            self.feature_generator.run_commands(table_queries.get("prepare", []))

            # Bind the feature generator so each call only needs a batch.
            insert_batch = partial(
                insert_into_table, feature_generator=self.feature_generator
            )
            batched_inserts = list(
                map(list, Batch(table_queries.get("inserts", []), 25))
            )
            parallelize(
                insert_batch,
                batched_inserts,
                n_processes=self.n_db_processes,
            )

            self.feature_generator.run_commands(table_queries.get("finalize", []))
            logging.info("%s completed", table_name)
Beispiel #4
0
 def generated_chunked_parallelized_results(
     self, partially_bound_function, tasks, n_processes, chunksize=1
 ):
     """Yield results of running a function over batches of tasks in a pool.

     A batch whose result cannot be retrieved is logged as 'Child failure'
     and skipped; iteration then continues with the next batch.

     Args:
         partially_bound_function: callable invoked once per batch; it
             receives a list of tasks as its only argument.
         tasks: iterable of tasks, handed to ``Batch`` for grouping.
         n_processes: number of worker processes passed to ``ProcessPool``.
         chunksize: second argument to ``Batch`` (presumably the maximum
             number of tasks per batch -- confirm against ``Batch``).

     Yields:
         The return value of ``partially_bound_function`` for each batch
         whose result was retrieved without raising.
     """
     # NOTE(review): ProcessPool is assumed to be pebble's ProcessPool,
     # whose map() returns a future and whose result iterator raises a
     # failed task's exception from next() while remaining usable afterwards
     # -- confirm against the file's imports.
     with ProcessPool(n_processes, max_tasks=1) as pool:
         future = pool.map(
             partially_bound_function,
             [list(task_batch) for task_batch in Batch(tasks, chunksize)],
         )
         iterator = future.result()
         while True:
             # Guard only the fetch. With the yield inside the try, an
             # exception thrown into this generator by its consumer would be
             # misreported as a child failure and swallowed.
             try:
                 result = next(iterator)
             except StopIteration:
                 break
             except Exception:
                 logging.exception('Child failure')
             else:
                 yield result