def test_move_to_storage_with_conflict(self):
        """Processing a file whose name already exists in storage keeps both
        copies, renaming the second one with a "-(1)" suffix."""
        test_fs = FakeDirectIngestGCSFileSystem()
        upload_time = datetime.datetime.now()
        self.fully_process_file(
            test_fs, upload_time,
            GcsfsFilePath(bucket_name='my_bucket', blob_name='test_file.csv'))

        # Second upload of the same name collides with the copy already
        # moved to storage.
        self.fully_process_file(
            test_fs, upload_time,
            GcsfsFilePath(bucket_name='my_bucket', blob_name='test_file.csv'))

        # pylint: disable=protected-access
        storage_paths = test_fs._ls_with_file_prefix(self.STORAGE_DIR_PATH, '')
        self.assertEqual(len(storage_paths), 2)

        abs_paths = [stored.abs_path() for stored in storage_paths]
        self.assertTrue(
            any(p.endswith('test_file.csv') for p in abs_paths))
        self.assertTrue(
            any(p.endswith('test_file-(1).csv') for p in abs_paths))
# --- Example 2 (scraped-snippet separator; original marker: "Esempio n. 2", 0 votes) ---
    def test_move_to_storage_with_conflict_with_file_types(self):
        """With file-type differentiation on, uploading the same file name
        twice stores both copies, renaming the second with a "-(1)" suffix."""
        upload_time = datetime.datetime.now()
        self.fully_process_file(upload_time,
                                GcsfsFilePath(bucket_name='my_bucket',
                                              blob_name='test_file.csv'),
                                file_type_differentiation_on=True)

        # Second upload of the same name collides with the copy already
        # moved to storage.
        self.fully_process_file(upload_time,
                                GcsfsFilePath(bucket_name='my_bucket',
                                              blob_name='test_file.csv'),
                                file_type_differentiation_on=True)

        # pylint: disable=protected-access
        storage_paths = self.fs._ls_with_file_prefix(self.STORAGE_DIR_PATH,
                                                     '',
                                                     file_type_filter=None)
        # Four stored paths total — presumably two per upload when file types
        # are differentiated (depends on fully_process_file; confirm there).
        self.assertEqual(len(storage_paths), 4)

        abs_paths = [stored.abs_path() for stored in storage_paths]
        self.assertTrue(
            any(p.endswith('test_file.csv') for p in abs_paths))
        self.assertTrue(
            any(p.endswith('test_file-(1).csv') for p in abs_paths))
# --- Example 3 (scraped-snippet separator; original marker: "Esempio n. 3", 0 votes) ---
    def test_direct_ingest_multiple_file_moves(self):
        """Two distinct files can be fully processed back to back."""
        for blob in ('test_file.csv', 'test_file_2.csv'):
            self.fully_process_file(
                datetime.datetime.now(),
                GcsfsFilePath(bucket_name='my_bucket', blob_name=blob))
    def test_direct_ingest_multiple_file_moves(self):
        """Two distinct files move through the fake filesystem back to back."""
        test_fs = FakeDirectIngestGCSFileSystem()
        for blob in ('test_file.csv', 'test_file_2.csv'):
            self.fully_process_file(
                test_fs, datetime.datetime.now(),
                GcsfsFilePath(bucket_name='my_bucket', blob_name=blob))
# --- Example 5 (scraped-snippet separator; original marker: "Esempio n. 5", 0 votes) ---
    def test_direct_ingest_multiple_file_moves_with_file_types(self):
        """Two distinct files can be fully processed back to back with
        file-type differentiation enabled."""
        for blob in ('test_file.csv', 'test_file_2.csv'):
            self.fully_process_file(datetime.datetime.now(),
                                    GcsfsFilePath(bucket_name='my_bucket',
                                                  blob_name=blob),
                                    file_type_differentiation_on=True)
    def _move_files_for_date(self, subdir_path_str: str):
        """Moves every CSV in one dated subdirectory to its raw-data storage
        location, recording each (from, to) pair in self.move_list.

        Args:
            subdir_path_str: gs:// path of the subdirectory; its final path
                component is expected to be an ISO date (YYYY-MM-DD).
        """
        source_dir = GcsfsDirectoryPath.from_absolute_path(
            subdir_path_str.rstrip('/'))

        # Reformat the trailing ISO-date component into a YYYY/MM/DD/ prefix
        # for the destination layout.
        iso_date_str = source_dir.relative_path.rstrip('/').split('/')[-1]
        date_subdir = date.fromisoformat(iso_date_str).strftime("%Y/%m/%d/")

        for source_path in gsutil_ls(f'{subdir_path_str}*.csv'):
            base_name = GcsfsFilePath(
                bucket_name=self.region_storage_dir_path.bucket_name,
                blob_name=source_path).file_name

            destination = os.path.join(
                'gs://', self.region_storage_dir_path.bucket_name,
                self.region_code, GcsfsDirectIngestFileType.RAW_DATA.value,
                date_subdir, base_name)

            to_path = to_normalized_processed_file_path_from_normalized_path(
                destination,
                file_type_override=GcsfsDirectIngestFileType.RAW_DATA)

            # Dry runs only record the planned move; real runs execute it too.
            if not self.dry_run:
                gsutil_mv(from_path=source_path, to_path=to_path)
            # move_list is shared across worker threads — guard the append.
            with self.mutex:
                self.move_list.append((source_path, to_path))

        if self.move_progress:
            self.move_progress.next()