Example #1
0
 def test_get_job_name(self) -> None:
     """Job names are '<mode>-<table>' with dots/underscores turned into dashes."""
     # (table name, incremental_load, expected job name)
     cases = [
         ('base.scan_echo', False, 'write-base-scan-echo'),
         ('base.scan_discard', True, 'append-base-scan-discard'),
         ('laplante.scan_http', False, 'write-laplante-scan-http'),
         ('laplante.scan_https', True, 'append-laplante-scan-https'),
     ]
     for table_name, incremental_load, expected_job_name in cases:
         self.assertEqual(
             beam_tables.get_job_name(table_name, incremental_load),
             expected_job_name)
Example #2
0
def run_parallel_pipelines(runner: beam_tables.ScanDataBeamPipelineRunner,
                           dataset: str,
                           scan_types: List[str],
                           incremental_load: bool,
                           start_date: Optional[datetime.date] = None,
                           end_date: Optional[datetime.date] = None) -> bool:
    """Runs beam pipelines for different scan types in parallel.

    Args:
      runner: ScanDataBeamPipelineRunner to run pipelines
      dataset: dataset name to write to like 'prod' or 'laplante'
      scan_types: list of scan types to run like ['echo', 'http']
      incremental_load: boolean. If true, only load the latest new data, if
        false reload all data.
      start_date: date object, only files after or at this date will be read.
        Mostly only used during development.
      end_date: date object, only files at or before this date will be read.
        Mostly only used during development.

    Returns:
      True on success

    Raises:
      Exception: if any of the pipelines fail or don't finish.
    """
    with concurrent.futures.ThreadPoolExecutor() as pool:
        futures = []
        for scan_type in scan_types:
            table_name = beam_tables.get_table_name(
                dataset, scan_type, beam_tables.BASE_TABLE_NAME)
            job_name = beam_tables.get_job_name(table_name, incremental_load)

            # Submit each scan type as an independent task so the pipelines
            # run concurrently rather than one after another.
            future = pool.submit(runner.run_beam_pipeline, scan_type,
                                 incremental_load, job_name, table_name,
                                 start_date, end_date)
            futures.append(future)

        # Block until every pipeline completes, or return early as soon as
        # one of them raises.
        finished, pending = concurrent.futures.wait(
            futures, return_when=concurrent.futures.FIRST_EXCEPTION)

        # Re-raise the first pipeline exception, if any future failed.
        for future in finished:
            future.result()

        if pending:
            # A single formatted message instead of passing multiple
            # positional args to Exception (which renders as a tuple).
            raise Exception(
                f'Some pipelines failed to finish: {pending}, '
                f'finished: {finished}')
        return True