Example #1
    def __init__(self,
                 region_name: str,
                 system_level: SystemLevel,
                 ingest_directory_path: Optional[str] = None,
                 storage_directory_path: Optional[str] = None,
                 max_delay_sec_between_files: Optional[int] = None):
        super().__init__(region_name, system_level)
        self.fs = GcsfsFactory.build()
        self.max_delay_sec_between_files = max_delay_sec_between_files

        if not ingest_directory_path:
            ingest_directory_path = \
                gcsfs_direct_ingest_directory_path_for_region(region_name,
                                                              system_level)
        self.ingest_directory_path = \
            GcsfsDirectoryPath.from_absolute_path(ingest_directory_path)

        if not storage_directory_path:
            storage_directory_path = \
                gcsfs_direct_ingest_storage_directory_path_for_region(
                    region_name, system_level)

        self.storage_directory_path = \
            GcsfsDirectoryPath.from_absolute_path(storage_directory_path)

        self.file_prioritizer = \
            GcsfsDirectIngestJobPrioritizer(
                self.fs,
                self.ingest_directory_path,
                self._get_file_tag_rank_list())

        self.file_split_line_limit = self._FILE_SPLIT_LINE_LIMIT
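
A minimal usage sketch for this constructor. UsXxController is a hypothetical concrete subclass, and the base class is assumed to define _get_file_tag_rank_list() and _FILE_SPLIT_LINE_LIMIT, as the body above implies:

# Hedged usage sketch -- UsXxController is hypothetical, not from the source.
controller = UsXxController(
    region_name='us_xx',
    system_level=SystemLevel.STATE)
# With no explicit paths, the ingest and storage directories default to the
# region-specific GCS locations computed by the gcsfs_* helper functions.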
Example #2
    def __init__(
        self,
        region_code: str,
        dry_run: bool,
    ):
        self.region_code = region_code
        self.file_type = GcsfsDirectIngestFileType.UNSPECIFIED
        self.dry_run = dry_run
        self.project_id = 'recidiviz-123'
        self.region_ingest_bucket_dir_path = GcsfsDirectoryPath.from_absolute_path(
            gcsfs_direct_ingest_directory_path_for_region(
                region_code, SystemLevel.STATE, project_id=self.project_id))
        self.region_storage_raw_dir_path = GcsfsDirectoryPath.from_absolute_path(
            gcsfs_direct_ingest_storage_directory_path_for_region(
                region_code,
                SystemLevel.STATE,
                GcsfsDirectIngestFileType.RAW_DATA,
                project_id=self.project_id))
        self.log_output_path = os.path.join(
            os.path.dirname(__file__),
            f'move_prod_ingest_files_to_raw_start_bound_{self.region_code}_region_dry_run_{dry_run}_'
            f'{datetime.datetime.now().isoformat()}.txt')
        self.mutex = threading.Lock()
        self.move_list: List[Tuple[str, str]] = []
        self.move_progress: Optional[Bar] = None
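
This constructor follows the dry_run convention shared by the mover scripts in these examples: when dry_run is true, GCS mutations are skipped, but every intended move is still recorded under the mutex and written to log_output_path. A distilled sketch of that guard (the method name is illustrative, not from the source):

def _record_move(self, from_path: str, to_path: str) -> None:
    # Skip the real gsutil call on a dry run, but always log the intent so
    # the run can be audited from log_output_path afterwards.
    if not self.dry_run:
        gsutil_mv(from_path=from_path, to_path=to_path)
    with self.mutex:
        self.move_list.append((from_path, to_path))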
Example #3
    def __init__(self,
                 region_name: str,
                 system_level: SystemLevel,
                 ingest_directory_path: Optional[str] = None,
                 storage_directory_path: Optional[str] = None,
                 max_delay_sec_between_files: Optional[int] = None):
        super().__init__(region_name, system_level)
        self.fs = GcsfsFactory.build()
        self.max_delay_sec_between_files = max_delay_sec_between_files

        if not ingest_directory_path:
            ingest_directory_path = \
                gcsfs_direct_ingest_directory_path_for_region(region_name,
                                                              system_level)
        self.ingest_directory_path = \
            GcsfsDirectoryPath.from_absolute_path(ingest_directory_path)

        if not storage_directory_path:
            storage_directory_path = \
                gcsfs_direct_ingest_storage_directory_path_for_region(
                    region_name, system_level)

        self.storage_directory_path = \
            GcsfsDirectoryPath.from_absolute_path(storage_directory_path)

        self.temp_output_directory_path = \
            GcsfsDirectoryPath.from_absolute_path(gcsfs_direct_ingest_temporary_output_directory_path())

        ingest_job_file_type_filter = \
            GcsfsDirectIngestFileType.INGEST_VIEW \
            if self.region.is_raw_vs_ingest_file_name_detection_enabled() else None
        self.file_prioritizer = \
            GcsfsDirectIngestJobPrioritizer(
                self.fs,
                self.ingest_directory_path,
                self.get_file_tag_rank_list(),
                ingest_job_file_type_filter)

        self.ingest_file_split_line_limit = self._INGEST_FILE_SPLIT_LINE_LIMIT

        self.file_metadata_manager = PostgresDirectIngestFileMetadataManager(
            region_code=self.region.region_code)

        self.raw_file_import_manager = DirectIngestRawFileImportManager(
            region=self.region,
            fs=self.fs,
            ingest_directory_path=self.ingest_directory_path,
            temp_output_directory_path=self.temp_output_directory_path,
            big_query_client=BigQueryClientImpl())

        self.ingest_view_export_manager = DirectIngestIngestViewExportManager(
            region=self.region,
            fs=self.fs,
            ingest_directory_path=self.ingest_directory_path,
            file_metadata_manager=self.file_metadata_manager,
            big_query_client=BigQueryClientImpl(),
            view_collector=DirectIngestPreProcessedIngestViewCollector(
                self.region, self.get_file_tag_rank_list()))
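
The ingest_job_file_type_filter expression above is easier to read spelled out: regions that distinguish raw files from ingest views only prioritize INGEST_VIEW files, while regions without that detection (a filter of None) prioritize everything:

# Equivalent, more explicit form of the filter assignment above.
if self.region.is_raw_vs_ingest_file_name_detection_enabled():
    ingest_job_file_type_filter = GcsfsDirectIngestFileType.INGEST_VIEW
else:
    ingest_job_file_type_filter = None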
Example #4
    def __init__(self, file_type: GcsfsDirectIngestFileType, region_code: str,
                 start_date_bound: Optional[str],
                 end_date_bound: Optional[str], dry_run: bool, project_id: str,
                 file_filter: Optional[str]):
        self.file_type = file_type
        self.region_code = region_code
        self.start_date_bound = start_date_bound
        self.end_date_bound = end_date_bound
        self.dry_run = dry_run
        self.file_filter = file_filter
        self.project_id = project_id
        self.region_storage_dir_path = GcsfsDirectoryPath.from_absolute_path(
            gcsfs_direct_ingest_storage_directory_path_for_region(
                region_code,
                SystemLevel.STATE,
                self.file_type,
                project_id=self.project_id))
        self.log_output_path = os.path.join(
            os.path.dirname(__file__),
            f'move_storage_files_to_deprecated_start_bound_{self.region_code}_region_{self.start_date_bound}'
            f'_end_bound_{self.end_date_bound}_dry_run_{dry_run}_{datetime.datetime.now().isoformat()}.txt'
        )
        self.mutex = threading.Lock()
        self.move_list: List[Tuple[str, str]] = []
        self.move_progress: Optional[Bar] = None
Example #5
    def create_export_manager(self, region):
        metadata_manager = PostgresDirectIngestFileMetadataManager(region.region_code)
        return DirectIngestIngestViewExportManager(
            region=region,
            fs=FakeDirectIngestGCSFileSystem(),
            ingest_directory_path=GcsfsDirectoryPath.from_absolute_path('ingest_bucket'),
            big_query_client=self.mock_client,
            file_metadata_manager=metadata_manager,
            view_collector=_ViewCollector(region, controller_file_tags=['ingest_view']))
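
A hedged sketch of how a test case might call this factory; fake_region is hypothetical, and self.mock_client is assumed to be created in the test's setUp, as the body above implies:

# Hypothetical test usage of the factory above.
export_manager = self.create_export_manager(fake_region)
self.assertIsNotNone(export_manager)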
Example #6
    def _copy_files_for_date(self, subdir_path_str: str):
        """Copies every file in the given dated storage subdirectory from the
        prod storage bucket to the corresponding path in the staging bucket."""
        dir_path = GcsfsDirectoryPath.from_absolute_path(subdir_path_str.rstrip('/'))

        from_path = f'gs://{self.prod_region_storage_dir_path.bucket_name}/{dir_path.relative_path}*'
        to_path = f'gs://{self.staging_region_storage_dir_path.bucket_name}/{dir_path.relative_path}'

        if not self.dry_run:
            gsutil_cp(from_path=from_path, to_path=to_path)
        with self.mutex:
            self.copy_list.append((from_path, to_path))
            if self.copy_progress:
                self.copy_progress.next()
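
Because from_path ends in '*', a single gsutil_cp call copies every file in the dated subdirectory to the same relative path in the staging bucket. A sketch of the resulting path shapes (the bucket names and subdirectory are hypothetical):

# Hypothetical values showing the shapes of the copy paths built above.
bucket_name = 'prod-storage-bucket'   # hypothetical prod bucket
relative_path = 'us_xx/2019/01/02/'   # hypothetical dated subdirectory
from_path = f'gs://{bucket_name}/{relative_path}*'         # wildcard: all files for the date
to_path = f'gs://staging-storage-bucket/{relative_path}'   # hypothetical destination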
Example #7
    def __init__(self, project_id: str, region: str,
                 file_type_to_move: GcsfsDirectIngestFileType,
                 destination_file_type: GcsfsDirectIngestFileType,
                 start_date_bound: Optional[str],
                 end_date_bound: Optional[str], dry_run: bool,
                 file_filter: Optional[str]):

        self.project_id = project_id
        self.region = region
        self.file_type_to_move = file_type_to_move
        self.destination_file_type = destination_file_type

        if self.file_type_to_move != self.destination_file_type and \
                self.file_type_to_move != GcsfsDirectIngestFileType.UNSPECIFIED:
            raise ValueError(
                'Args file_type_to_move and destination_file_type must match unless the type to move is UNSPECIFIED'
            )

        self.start_date_bound = start_date_bound
        self.end_date_bound = end_date_bound
        self.dry_run = dry_run
        self.file_filter = file_filter

        self.storage_bucket = GcsfsDirectoryPath.from_absolute_path(
            gcsfs_direct_ingest_storage_directory_path_for_region(
                region, SystemLevel.STATE, project_id=self.project_id))
        self.ingest_bucket = GcsfsDirectoryPath.from_absolute_path(
            gcsfs_direct_ingest_directory_path_for_region(
                region, SystemLevel.STATE, project_id=self.project_id))

        self.mutex = threading.Lock()
        self.collect_progress: Optional[Bar] = None
        self.move_progress: Optional[Bar] = None
        self.moves_list: List[Tuple[str, str]] = []
        self.log_output_path = os.path.join(
            os.path.dirname(__file__),
            f'move_result_{region}_{self.project_id}_start_bound_{self.start_date_bound}_end_bound_'
            f'{self.end_date_bound}_dry_run_{self.dry_run}_{datetime.datetime.now().isoformat()}.txt'
        )
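
The guard above permits a change of file type only when the files being moved are UNSPECIFIED. A small standalone check mirroring that logic (the helper name is illustrative, not from the source):

# Illustrative check mirroring the guard above.
def _types_are_compatible(to_move: GcsfsDirectIngestFileType,
                          destination: GcsfsDirectIngestFileType) -> bool:
    # A concrete file type may only be "moved" to itself; UNSPECIFIED files
    # may be assigned any destination type during the move.
    return (to_move == destination
            or to_move == GcsfsDirectIngestFileType.UNSPECIFIED)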
Example #8
    def __init__(self, region_code: str, file_type: GcsfsDirectIngestFileType,
                 start_date_bound: Optional[str],
                 end_date_bound: Optional[str], dry_run: bool):
        self.file_type = file_type
        self.prod_region_storage_dir_path = GcsfsDirectoryPath.from_absolute_path(
            gcsfs_direct_ingest_storage_directory_path_for_region(
                region_code, SystemLevel.STATE, project_id='recidiviz-123'))
        self.staging_region_storage_dir_path = GcsfsDirectoryPath.from_absolute_path(
            gcsfs_direct_ingest_storage_directory_path_for_region(
                region_code, SystemLevel.STATE,
                project_id='recidiviz-staging'))
        self.dry_run = dry_run
        self.start_date_bound = start_date_bound
        self.end_date_bound = end_date_bound

        self.log_output_path = os.path.join(
            os.path.dirname(__file__),
            f'copy_prod_to_staging_result_{region_code}_start_bound_{self.start_date_bound}_end_bound_'
            f'{self.end_date_bound}_dry_run_{dry_run}_{datetime.datetime.now().isoformat()}.txt'
        )
        self.mutex = threading.Lock()
        self.copy_list: List[Tuple[str, str]] = []
        self.copy_progress: Optional[Bar] = None
Example #9
    def __init__(self, paths: str, project_id: str, region: str, date: str,
                 dry_run: bool):

        self.paths = paths
        self.project_id = project_id
        self.region = region.lower()
        self.datetime = datetime.datetime.fromisoformat(date)
        self.dry_run = dry_run

        self.ingest_bucket = GcsfsDirectoryPath.from_absolute_path(
            gcsfs_direct_ingest_directory_path_for_region(
                region, SystemLevel.STATE, project_id=self.project_id))

        self.mutex = threading.Lock()
        self.move_progress: Optional[Bar] = None
        self.copies_list: List[Tuple[str, str]] = []
        self.log_output_path = os.path.join(
            os.path.dirname(__file__),
            f'upload_to_ingest_result_{region}_{self.project_id}_date_{self.datetime.date().isoformat()}'
            f'_dry_run_{self.dry_run}_{datetime.datetime.now().isoformat()}.txt'
        )
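
The date argument is parsed with datetime.datetime.fromisoformat, which accepts ISO-8601 strings and raises ValueError for anything else:

import datetime

# fromisoformat accepts ISO-8601 strings; a bare date parses to midnight.
datetime.datetime.fromisoformat('2019-01-02')           # 2019-01-02 00:00:00
datetime.datetime.fromisoformat('2019-01-02T03:04:05')  # 2019-01-02 03:04:05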
Example #10
    def _move_files_for_date(self, subdir_path_str: str):
        """Moves every CSV file in the given dated subdirectory to its new
        raw-data path, rewriting the flat ISO date directory into the nested
        YYYY/MM/DD storage layout."""

        from_dir_path = GcsfsDirectoryPath.from_absolute_path(
            subdir_path_str.rstrip('/'))

        previous_date_format = from_dir_path.relative_path.rstrip('/').split(
            '/')[-1]
        new_date_format = date.fromisoformat(previous_date_format).strftime(
            "%Y/%m/%d/")

        from_paths = gsutil_ls(f'{subdir_path_str}*.csv')
        for from_path in from_paths:
            file_name = GcsfsFilePath(
                bucket_name=self.region_storage_dir_path.bucket_name,
                blob_name=from_path).file_name

            to_file_path = os.path.join(
                'gs://', self.region_storage_dir_path.bucket_name,
                self.region_code, GcsfsDirectIngestFileType.RAW_DATA.value,
                new_date_format, file_name)

            normalized_to_file_path = to_normalized_processed_file_path_from_normalized_path(
                to_file_path,
                file_type_override=GcsfsDirectIngestFileType.RAW_DATA)

            to_path = normalized_to_file_path

            if not self.dry_run:
                gsutil_mv(from_path=from_path, to_path=to_path)
            with self.mutex:
                self.move_list.append((from_path, to_path))

        if self.move_progress:
            self.move_progress.next()
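
The date-path rewrite above turns a flat ISO date directory into the nested year/month/day layout used for raw storage. A self-contained illustration:

from datetime import date

# '2019-01-02' (flat ISO directory name) -> '2019/01/02/' (nested layout).
assert date.fromisoformat('2019-01-02').strftime("%Y/%m/%d/") == '2019/01/02/'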
Example #11
class TestGcsfsDirectIngestJobPrioritizer(unittest.TestCase):
    """Tests for the GcsfsDirectIngestJobPrioritizer."""

    _DAY_1_TIME_1 = datetime.datetime(year=2019,
                                      month=1,
                                      day=2,
                                      hour=3,
                                      minute=4,
                                      second=5,
                                      microsecond=6789,
                                      tzinfo=datetime.timezone.utc)

    _DAY_1_TIME_2 = datetime.datetime(year=2019,
                                      month=1,
                                      day=2,
                                      hour=3,
                                      minute=4,
                                      second=5,
                                      microsecond=7789,
                                      tzinfo=datetime.timezone.utc)

    _DAY_1_TIME_3 = datetime.datetime(year=2019,
                                      month=1,
                                      day=2,
                                      hour=10,
                                      minute=4,
                                      second=5,
                                      microsecond=678,
                                      tzinfo=datetime.timezone.utc)

    _DAY_2_TIME_1 = datetime.datetime(year=2019,
                                      month=1,
                                      day=3,
                                      hour=3,
                                      minute=4,
                                      second=5,
                                      microsecond=6789,
                                      tzinfo=datetime.timezone.utc)

    _DAY_1 = _DAY_1_TIME_1.date()
    _DAY_2 = _DAY_2_TIME_1.date()

    _INGEST_BUCKET_PATH = \
        GcsfsDirectoryPath.from_absolute_path('direct/regions/us_nd/fixtures')

    def setUp(self) -> None:
        self.fs = FakeDirectIngestGCSFileSystem()
        self.prioritizer = GcsfsDirectIngestJobPrioritizer(
            self.fs, self._INGEST_BUCKET_PATH, ['tagA', 'tagB'])

    FIXTURE_PATH_PREFIX = 'direct/regions/us_nd/fixtures'

    def _normalized_path_for_filename(self, filename: str,
                                      dt: datetime.datetime) -> GcsfsFilePath:
        normalized_path = \
            to_normalized_unprocessed_file_path(
                os.path.join(self._INGEST_BUCKET_PATH.abs_path(),
                             filename), dt)
        return GcsfsFilePath.from_absolute_path(normalized_path)

    def _process_jobs_for_paths_with_no_gaps_in_expected_order(
            self, paths: List[GcsfsFilePath]):
        for path in paths:
            date_str = filename_parts_from_path(path).date_str
            next_job_args = self.prioritizer.get_next_job_args()
            self.assertIsNotNone(next_job_args)
            if next_job_args is None:
                # Make mypy happy
                self.fail()
            self.assertEqual(next_job_args.file_path, path)
            self.assertTrue(
                self.prioritizer.are_next_args_expected(next_job_args))

            self.assertTrue(
                self.prioritizer.are_more_jobs_expected_for_day(date_str))

            # ... job runs ...

            self.fs.mv_path_to_processed_path(path)

    def test_empty_fs(self):
        self.assertTrue(
            self.prioritizer.are_more_jobs_expected_for_day(
                self._DAY_1_TIME_1.date().isoformat()))
        self.assertIsNone(self.prioritizer.get_next_job_args())

    def test_single_expected_file(self):
        path = self._normalized_path_for_filename('tagA.csv',
                                                  self._DAY_1_TIME_1)

        self.fs.test_add_path(path)

        self._process_jobs_for_paths_with_no_gaps_in_expected_order([path])

        self.assertIsNone(self.prioritizer.get_next_job_args())

        # We still expect a file for tagB
        self.assertTrue(
            self.prioritizer.are_more_jobs_expected_for_day(
                self._DAY_1.isoformat()))

    def test_multiple_files(self):

        paths = [
            self._normalized_path_for_filename('tagA.csv', self._DAY_1_TIME_1),
            self._normalized_path_for_filename('tagB.csv', self._DAY_1_TIME_2)
        ]

        for path in paths:
            self.fs.test_add_path(path)

        self._process_jobs_for_paths_with_no_gaps_in_expected_order(paths)

        self.assertIsNone(self.prioritizer.get_next_job_args())
        self.assertFalse(
            self.prioritizer.are_more_jobs_expected_for_day(
                self._DAY_1.isoformat()))

    def test_unexpected_file(self):
        # Only file is out of order
        path = self._normalized_path_for_filename('tagB.csv',
                                                  self._DAY_1_TIME_1)

        self.fs.test_add_path(path)

        self.assertTrue(
            self.prioritizer.are_more_jobs_expected_for_day(
                self._DAY_1.isoformat()))

        next_job_args = self.prioritizer.get_next_job_args()
        self.assertIsNotNone(next_job_args)
        self.assertEqual(next_job_args.file_path, path)
        self.assertFalse(
            self.prioritizer.are_next_args_expected(next_job_args))

        # ... job runs eventually even though unexpected...

        self.fs.mv_path_to_processed_path(path)

        self.assertIsNone(self.prioritizer.get_next_job_args())

        # We still expect a file for tagA
        self.assertTrue(
            self.prioritizer.are_more_jobs_expected_for_day(
                self._DAY_1.isoformat()))

    def test_files_on_multiple_days(self):
        paths = [
            self._normalized_path_for_filename('tagA.csv', self._DAY_1_TIME_1),
            self._normalized_path_for_filename('tagB.csv', self._DAY_1_TIME_2),
            self._normalized_path_for_filename('tagA.csv', self._DAY_2_TIME_1),
        ]
        for path in paths:
            self.fs.test_add_path(path)

        self._process_jobs_for_paths_with_no_gaps_in_expected_order(paths)

        self.assertIsNone(self.prioritizer.get_next_job_args())
        self.assertFalse(
            self.prioritizer.are_more_jobs_expected_for_day(
                self._DAY_1.isoformat()))
        self.assertTrue(
            self.prioritizer.are_more_jobs_expected_for_day(
                self._DAY_2.isoformat()))

    def test_files_on_multiple_days_with_gap(self):
        """Runs a test where there are files on multiple days and there is a gap
        in the expected files for the first day.
        """
        paths = [
            self._normalized_path_for_filename('tagB.csv', self._DAY_1_TIME_2),
            self._normalized_path_for_filename('tagA.csv', self._DAY_2_TIME_1),
        ]
        for path in paths:
            self.fs.test_add_path(path)

        for i, path in enumerate(paths):
            date_str = filename_parts_from_path(path).date_str
            next_job_args = self.prioritizer.get_next_job_args()
            self.assertIsNotNone(next_job_args)
            self.assertEqual(next_job_args.file_path, path)

            are_args_expected = \
                self.prioritizer.are_next_args_expected(next_job_args)
            if i == 0:
                self.assertFalse(are_args_expected)
            else:
                self.assertTrue(are_args_expected)

            self.assertTrue(
                self.prioritizer.are_more_jobs_expected_for_day(date_str))

            # ... job runs ...

            self.fs.mv_path_to_processed_path(path)

        self.assertIsNone(self.prioritizer.get_next_job_args())
        self.assertTrue(
            self.prioritizer.are_more_jobs_expected_for_day(
                self._DAY_1.isoformat()))
        self.assertTrue(
            self.prioritizer.are_more_jobs_expected_for_day(
                self._DAY_2.isoformat()))

    def test_multiple_files_same_tag(self):
        paths = [
            self._normalized_path_for_filename('tagA.csv', self._DAY_1_TIME_1),
            self._normalized_path_for_filename('tagA.csv', self._DAY_1_TIME_2),
            self._normalized_path_for_filename('tagB.csv', self._DAY_1_TIME_3),
        ]
        for path in paths:
            self.fs.test_add_path(path)

        self._process_jobs_for_paths_with_no_gaps_in_expected_order(paths)

        self.assertIsNone(self.prioritizer.get_next_job_args())
        self.assertFalse(
            self.prioritizer.are_more_jobs_expected_for_day(
                self._DAY_1.isoformat()))

    def test_multiple_files_times_out_of_order(self):
        """Runs a test where there are no gaps but the files have been added
        (i.e. have creation times) out of order.
        """
        paths = [
            self._normalized_path_for_filename('tagA.csv', self._DAY_1_TIME_2),
            self._normalized_path_for_filename('tagB.csv', self._DAY_1_TIME_1),
            self._normalized_path_for_filename('tagB.csv', self._DAY_1_TIME_3),
        ]
        for path in paths:
            self.fs.test_add_path(path)

        for i, path in enumerate(paths):
            date_str = filename_parts_from_path(path).date_str
            next_job_args = self.prioritizer.get_next_job_args()
            self.assertIsNotNone(next_job_args)
            self.assertEqual(next_job_args.file_path, path)
            self.assertTrue(
                self.prioritizer.are_next_args_expected(next_job_args))

            are_more_jobs_expected = \
                self.prioritizer.are_more_jobs_expected_for_day(date_str)
            if i == 2:
                self.assertFalse(are_more_jobs_expected)
            else:
                self.assertTrue(are_more_jobs_expected)

            # ... job runs ...

            self.fs.mv_path_to_processed_path(path)

        self.assertIsNone(self.prioritizer.get_next_job_args())
        self.assertFalse(
            self.prioritizer.are_more_jobs_expected_for_day(
                self._DAY_1.isoformat()))

    def test_run_multiple_copies_of_same_tag(self):
        paths = [
            self._normalized_path_for_filename('tagA.csv', self._DAY_1_TIME_2),
            self._normalized_path_for_filename('tagA_2.csv',
                                               self._DAY_1_TIME_1),
            self._normalized_path_for_filename('tagB.csv', self._DAY_1_TIME_3),
        ]
        for path in paths:
            self.fs.test_add_path(path)

        self._process_jobs_for_paths_with_no_gaps_in_expected_order(paths)

        self.assertIsNone(self.prioritizer.get_next_job_args())
        self.assertFalse(
            self.prioritizer.are_more_jobs_expected_for_day(
                self._DAY_1.isoformat()))
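
A conventional entry point for running the suite above directly (standard unittest boilerplate, not from the source):

if __name__ == '__main__':
    unittest.main()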