def test_01_push_and_checkout(self):
    """Commit and push 'dataset-ex' using the 'gdriveh' store, then check it out again.

    Steps:
      1. Initialise the dataset repository with the 'gdriveh' store type,
         add new files with ``--bumpversion`` and commit them.
      2. Push, and verify through ``git describe --tags`` (run inside the
         metadata directory) that the version-2 tag is present.
      3. Clear the workspace and the ml-git directory, re-initialise, check
         the tag out and assert that the objects/refs/cache directories and
         the entity files exist.
    """
    cpath = 'credentials-json'
    init_repository('dataset', self, store_type='gdriveh', profile=cpath)
    add_file(self, 'dataset', '--bumpversion', 'new')

    ml_git_root = os.path.join(self.tmp_dir, ML_GIT_DIR)
    metadata_path = os.path.join(ml_git_root, 'dataset', 'metadata')
    self.assertIn(
        messages[17] % (metadata_path, os.path.join('computer-vision', 'images', 'dataset-ex')),
        check_output('ml-git dataset commit dataset-ex'))

    head_path = os.path.join(ml_git_root, 'dataset', 'refs', 'dataset-ex', 'HEAD')
    self.assertTrue(os.path.exists(head_path))
    self.assertNotIn(ERROR_MESSAGE, check_output('ml-git dataset push dataset-ex'))

    tag = 'computer-vision__images__dataset-ex__2'
    os.chdir(metadata_path)
    try:
        self.assertIn(tag, check_output('git describe --tags'))
    finally:
        # Always return to the test directory, even when the assertion
        # fails, so a failure here does not leak the changed cwd.
        os.chdir(self.tmp_dir)

    workspace = os.path.join(self.tmp_dir, 'dataset')
    clear(workspace)
    clear(ml_git_root)
    init_repository('dataset', self, store_type='gdriveh', profile=cpath)
    self.assertNotIn(ERROR_MESSAGE, check_output('ml-git dataset checkout %s' % tag))

    entity_dir = os.path.join(workspace, 'computer-vision', 'images', 'dataset-ex')
    expected_paths = (
        os.path.join(ml_git_root, 'dataset', 'objects'),
        os.path.join(ml_git_root, 'dataset', 'refs'),
        os.path.join(ml_git_root, 'dataset', 'cache'),
        os.path.join(entity_dir, 'newfile0'),
        os.path.join(entity_dir, 'dataset-ex.spec'),
    )
    for path in expected_paths:
        # Pass the path as the failure message so a missing artefact is
        # identifiable from the test report.
        self.assertTrue(os.path.exists(path), path)
def add_storage_type(self, bucket, profile, storage_type):
    """Initialise an ml-git project, add a storage of the given type and return the config.

    Asserts the init message for ``self.tmp_dir``, then runs the
    storage-add command. When ``storage_type`` equals ``STORAGE_TYPE`` the
    expected message includes the profile; otherwise the
    "without profile" message is expected.

    Returns:
        The content of ``ML_GIT_DIR/config.yaml`` as loaded by
        ``yaml_processor.load``.
    """
    expected_init = output_messages['INFO_INITIALIZED_PROJECT_IN'] % self.tmp_dir
    self.assertIn(expected_init, check_output(MLGIT_INIT))

    add_output = check_output(MLGIT_STORAGE_ADD_WITH_TYPE % (bucket, profile, storage_type))
    if storage_type == STORAGE_TYPE:
        expected_add = output_messages['INFO_ADD_STORAGE'] % (storage_type, bucket, profile)
    else:
        expected_add = output_messages['INFO_ADD_STORAGE_WITHOUT_PROFILE'] % (storage_type, bucket)
    self.assertIn(expected_add, add_output)

    config_path = os.path.join(ML_GIT_DIR, 'config.yaml')
    with open(config_path, 'r') as config_file:
        return yaml_processor.load(config_file)
def _checkout_entity(self, entity_type, tag=DATASET_TAG, bare=True):
    """Initialise ``entity_type``, update its metadata and check out ``tag``.

    Args:
        entity_type: entity kind passed to the init, update and checkout commands.
        tag: tag to check out; defaults to ``DATASET_TAG``.
        bare: when true, ``' --bare'`` is appended to the tag and the
            bare-mode info message is expected in the output.
    """
    init_repository(entity_type, self)

    metadata_dir = os.path.join(self.tmp_dir, ML_GIT_DIR, entity_type, 'metadata')
    self.assertIn(
        output_messages['INFO_MLGIT_PULL'] % metadata_dir,
        check_output(MLGIT_UPDATE % entity_type))

    if bare:
        bare_output = check_output(MLGIT_CHECKOUT % (entity_type, tag + ' --bare'))
        self.assertIn(output_messages['INFO_CHECKOUT_BARE_MODE'], bare_output)
        return

    self.assertNotIn(ERROR_MESSAGE, check_output(MLGIT_CHECKOUT % (entity_type, tag)))
    # NOTE(review): the file check is treated as part of the non-bare branch;
    # presumably a bare checkout does not materialise 'data/file1' - confirm.
    checked_out_file = os.path.join(self.tmp_dir, DATASETS, DATASET_NAME, 'data', 'file1')
    self.assertTrue(os.path.exists(checked_out_file))
def test_03_initialize_dataset_from_subfolder(self):
    """Run the dataset entity-init command from inside the ml-git directory.

    After the shared init set-up, the working directory is changed to
    ``<tmp_dir>/ML_GIT_DIR`` and the init output must contain
    ``messages[8]`` formatted with the git path and the metadata path.
    """
    self.set_up_init('dataset', os.path.join(self.tmp_dir, GIT_PATH))

    ml_git_dir = os.path.join(self.tmp_dir, ML_GIT_DIR)
    os.chdir(ml_git_dir)

    git_remote = os.path.join(self.tmp_dir, GIT_PATH)
    metadata_dir = os.path.join(ml_git_dir, 'dataset', 'metadata')
    expected = messages[8] % (git_remote, metadata_dir)
    self.assertIn(expected, check_output(MLGIT_ENTITY_INIT % 'dataset'))
def test_03_checkout(self):
    """Push 'dataset-ex' to the configured store and check version 1 out again.

    Builds the project from scratch (init, remote add, store add, entity
    init), commits new files, pushes them, clears the workspace and the
    dataset ml-git directory, re-initialises the entity and checks the tag
    out. ``AZURE_STORAGE_CONNECTION_STRING`` must equal
    ``self.dev_store_account_`` before both the push and the checkout.
    """
    os.makedirs(self.workspace)
    create_spec(self, self.repo_type, self.tmp_dir, version=1,
                mutability='strict', store_type=self.store_type)

    self.assertIn(messages[0], check_output(MLGIT_INIT))

    remote_added = messages[2] % (GIT_PATH, self.repo_type)
    self.assertIn(remote_added, check_output(MLGIT_REMOTE_ADD % (self.repo_type, GIT_PATH)))

    store_added = messages[87] % (self.store_type, self.bucket)
    self.assertIn(
        store_added,
        check_output('ml-git repository store add %s --type=%s' % (self.bucket, self.store_type)))

    self.assertNotIn(ERROR_MESSAGE, check_output(MLGIT_ENTITY_INIT % 'dataset'))
    add_file(self, self.repo_type, '', 'new')

    metadata_path = os.path.join(ML_GIT_DIR, 'dataset', 'metadata')
    entity_path = os.path.join('computer-vision', 'images', 'dataset-ex')
    committed = messages[17] % (os.path.join(self.tmp_dir, metadata_path), entity_path)
    self.assertIn(committed, check_output(MLGIT_COMMIT % (self.repo_type, 'dataset-ex', '')))

    head_path = os.path.join(ML_GIT_DIR, 'dataset', 'refs', 'dataset-ex', 'HEAD')
    self.assertTrue(os.path.exists(head_path))

    self.assertEqual(os.getenv('AZURE_STORAGE_CONNECTION_STRING'), self.dev_store_account_)
    self.assertNotIn(ERROR_MESSAGE, check_output(MLGIT_PUSH % (self.repo_type, 'dataset-ex')))

    # Wipe the local state so the checkout has to restore everything.
    clear(self.workspace)
    clear(os.path.join(ML_GIT_DIR, 'dataset'))
    self.assertNotIn(ERROR_MESSAGE, check_output(MLGIT_ENTITY_INIT % self.repo_type))

    self.assertEqual(os.getenv('AZURE_STORAGE_CONNECTION_STRING'), self.dev_store_account_)
    checkout_command = MLGIT_CHECKOUT % (self.repo_type, 'computer-vision__images__dataset-ex__1')
    self.assertNotIn(ERROR_MESSAGE, check_output(checkout_command))

    ws_path = os.path.join(self.tmp_dir, 'dataset', 'computer-vision', 'images', 'dataset-ex')
    for index in range(5):
        # newfile0 .. newfile4 must all be regular files after checkout.
        self.assertTrue(os.path.isfile(os.path.join(ws_path, 'newfile' + str(index))))
def test_26_adding_data_based_in_older_tag(self):
    """Create a version 3 on top of the older tag and verify its content after checkout.

    'newfile5' is added and published, then ``DATASET_TAG`` is checked out
    again (6 files expected). 'newfile6' is added and published with
    ``--version=3``. After clearing the local state and checking out the
    version-3 tag, 'newfile5' must be absent, 'newfile6' present and 7
    files are expected.
    """
    entity = DATASETS
    self.set_up_checkout(entity)
    self.assertNotIn(ERROR_MESSAGE, check_output(MLGIT_CHECKOUT % (entity, DATASET_TAG)))

    workspace = os.path.join(self.tmp_dir, entity, entity + '-ex')
    create_file(workspace, 'newfile5', '0', file_path='')
    populate_entity_with_new_data(self, entity)

    # Go back to the older tag before adding the next file.
    self.assertNotIn(ERROR_MESSAGE, check_output(MLGIT_CHECKOUT % (DATASETS, DATASET_TAG)))
    expected_files_in_tag_1 = 6
    self.check_amount_of_files(entity, expected_files_in_tag_1, sampling=False)

    create_file(workspace, 'newfile6', '0', file_path='')
    populate_entity_with_new_data(self, entity, bumpversion='', version='--version=3')

    clear(os.path.join(self.tmp_dir, ML_GIT_DIR, entity))
    clear(workspace)

    git_remote = os.path.join(self.tmp_dir, GIT_PATH)
    metadata_dir = os.path.join(self.tmp_dir, ML_GIT_DIR, entity, 'metadata')
    self.assertIn(
        output_messages['INFO_METADATA_INIT'] % (git_remote, metadata_dir),
        check_output(MLGIT_ENTITY_INIT % entity))

    version_3_tag = 'computer-vision__images__datasets-ex__3'
    self.assertNotIn(ERROR_MESSAGE, check_output(MLGIT_CHECKOUT % (entity, version_3_tag)))

    entity_dir = os.path.join(self.tmp_dir, entity, entity + '-ex')
    path_of_tag_2_file = os.path.join(entity_dir, 'newfile5')
    path_of_tag_3_file = os.path.join(entity_dir, 'newfile6')
    self.assertFalse(os.path.exists(path_of_tag_2_file))
    self.assertTrue(os.path.exists(path_of_tag_3_file))

    expected_files_in_tag_3 = 7
    self.check_amount_of_files(entity, expected_files_in_tag_3, sampling=False)
def test_02_fetch_with_group_sample(self):
    """Fetch the version-1 tag with a group sample and expect no error.

    Also asserts that the dataset ``objects/hashfs`` directory exists
    afterwards.
    """
    self.set_up_fetch()

    fetch_command = MLGIT_FETCH % ('dataset', 'computer-vision__images__dataset-ex__1')
    sample_options = ' --sample-type=group --sampling=1:3 --seed=4'
    self.assertNotIn(ERROR_MESSAGE, check_output(fetch_command + sample_options))

    hashfs_dir = os.path.join(ML_GIT_DIR, 'dataset', 'objects', 'hashfs')
    self.assertTrue(os.path.exists(hashfs_dir))
def test_07_update_some_entities(self):
    """Update a repository in which only datasets and models were set up.

    The update output is checked for both prepared entities and must not
    contain the pull message for the labels metadata directory.
    """
    for entity in (DATASETS, MODELS):
        self._setup_update_entity(entity)

    response = check_output(MLGIT_REPOSITORY_UPDATE)
    self._check_update_output(response, DATASETS, MODELS)

    labels_metadata = os.path.join(self.tmp_dir, ML_GIT_DIR, LABELS, 'metadata')
    self.assertNotIn(output_messages['INFO_MLGIT_PULL'] % labels_metadata, response)
def test_01_status_after_put_on_new_file_in_dataset(self):
    """Status must list the spec file and a newly created file as untracked."""
    self.set_up_status(DATASETS)

    entity_dir = os.path.join(self.tmp_dir, DATASETS, DATASET_NAME)
    create_file(entity_dir, 'file', '0', '')

    status_output = check_output(MLGIT_STATUS % (DATASETS, DATASET_NAME))
    expected_pattern = ''.join((
        DATASET_NO_COMMITS_INFO_REGEX,
        r'Untracked files:\s+',
        DATASET_ADD_INFO_REGEX,
        r'datasets-ex\.spec\s+',
        r'file',
    ))
    self.assertRegex(status_output, expected_pattern)
def test_05_fetch_with_range_sample(self):
    """Fetch the version-1 tag with a range sample (2:4:1) and expect no error."""
    self.set_up_fetch()

    fetch_command = MLGIT_FETCH % ('dataset', 'computer-vision__images__dataset-ex__1')
    sample_options = ' --sample-type=range --sampling=2:4:1'
    self.assertNotIn(ERROR_MESSAGE, check_output(fetch_command + sample_options))
def test_14_random_sample_with_frequency_greater_or_equal_list_size(self):
    """Fetch with random sampling 2:10 and expect ``messages[31]`` in the output."""
    self.set_up_fetch()

    expected_message = messages[31]
    fetch_command = MLGIT_FETCH % ('dataset', 'computer-vision__images__dataset-ex__1')
    sample_options = ' --sample-type=random --sampling=2:10 --seed=3'
    self.assertIn(expected_message, check_output(fetch_command + sample_options))
def test_11_checkout_with_random_sample(self):
    """Run the fetch command with random sampling 2:3 (seed 3) and expect no error."""
    self.set_up_fetch()

    fetch_command = MLGIT_FETCH % ('dataset', 'computer-vision__images__dataset-ex__1')
    sample_options = ' --sample-type=random --sampling=2:3 --seed=3'
    self.assertNotIn(ERROR_MESSAGE, check_output(fetch_command + sample_options))
def test_03_unlock_flexible_mode(self):
    """Unlock 'data/file1' of a flexible dataset and expect it to become writable.

    Before unlocking, the file must have a link count of 2.
    """
    self.set_up_unlock(DATASETS, FLEXIBLE)

    link_count = os.stat(self.file_path).st_nlink
    self.assertEqual(2, link_count)

    expected_message = output_messages['INFO_PERMISSIONS_CHANGED_FOR'] % 'data/file1'
    unlock_command = MLGIT_UNLOCK % (DATASETS, DATASET_NAME, 'data/file1')
    self.assertIn(expected_message, check_output(unlock_command))

    self.assertTrue(os.access(self.file_path, os.W_OK))
def test_02_unlock_wrong_file(self):
    """Unlocking a non-existent file reports 'file not found' and leaves the link count at 2."""
    self.set_up_unlock(DATASETS, FLEXIBLE)

    links_before = os.stat(self.file_path).st_nlink
    self.assertEqual(2, links_before)

    expected_message = output_messages['ERROR_FILE_NOT_FOUND'] % 'data/file10'
    unlock_command = MLGIT_UNLOCK % (DATASETS, DATASET_NAME, 'data/file10')
    self.assertIn(expected_message, check_output(unlock_command))

    links_after = os.stat(self.file_path).st_nlink
    self.assertEqual(2, links_after)
def test_01_unlock_in_strict_mode(self):
    """Unlocking in strict mode reports the strict-mutability message and keeps the link count at 2."""
    self.set_up_unlock(DATASETS, STRICT)

    links_before = os.stat(self.file_path).st_nlink
    self.assertEqual(2, links_before)

    expected_message = output_messages['INFO_MUTABILITY_CANNOT_BE_STRICT']
    unlock_command = MLGIT_UNLOCK % (DATASETS, DATASET_NAME, 'data/file1')
    self.assertIn(expected_message, check_output(unlock_command))

    links_after = os.stat(self.file_path).st_nlink
    self.assertEqual(2, links_after)
def test_07_range_sample_with_start_parameter_less_than_zero(self):
    """Fetch with range sampling -3:2:1 and expect ``messages[42]`` in the output."""
    self.set_up_fetch()

    expected_message = messages[42]
    fetch_command = MLGIT_FETCH % ('dataset', 'computer-vision__images__dataset-ex__1')
    sample_options = ' --sample-type=range --sampling=-3:2:1'
    self.assertIn(expected_message, check_output(fetch_command + sample_options))
def test_09_range_sample_with_start_parameter_equal_to_stop(self):
    """Fetch with range sampling 2:2:1 and expect ``messages[23]`` in the output."""
    self.set_up_fetch()

    expected_message = messages[23]
    fetch_command = MLGIT_FETCH % ('dataset', 'computer-vision__images__dataset-ex__1')
    sample_options = ' --sample-type=range --sampling=2:2:1'
    self.assertIn(expected_message, check_output(fetch_command + sample_options))
def _remote_del(self, entity_type):
    """Delete the git remote of ``entity_type`` and assert the removal message.

    The remote URL that the message is expected to mention is read from the
    ``git`` key of the entity's section in ``<tmp_dir>/ML_GIT_DIR/config.yaml``
    before the delete command is run.
    """
    config_path = os.path.join(self.tmp_dir, ML_GIT_DIR, 'config.yaml')
    with open(config_path, 'r') as config_file:
        config = yaml_processor.load(config_file)

    git_url = config[entity_type]['git']
    expected_message = output_messages['INFO_REMOVE_REMOTE'] % (git_url, entity_type)
    self.assertIn(expected_message, check_output(MLGIT_REMOTE_DEL % entity_type))
def test_12_random_sample_with_frequency_less_or_equal_zero(self):
    """Fetch with random sampling 2:-2 and expect ``messages[40]`` in the output."""
    self.set_up_fetch()

    expected_message = messages[40]
    fetch_command = MLGIT_FETCH % ('dataset', 'computer-vision__images__dataset-ex__1')
    sample_options = ' --sample-type=random --sampling=2:-2 --seed=3'
    self.assertIn(expected_message, check_output(fetch_command + sample_options))
def test_04_push_with_wrong_repository(self):
    """Push after the git repository path has been cleared and expect an error.

    The push output must contain both ``ERROR_MESSAGE`` and the cleared
    git path.
    """
    init_repository('dataset', self)
    add_file(self, 'dataset', '--bumpversion', 'new')

    dataset_dir = os.path.join(self.tmp_dir, ML_GIT_DIR, 'dataset')
    metadata_path = os.path.join(dataset_dir, 'metadata')
    entity_path = os.path.join('computer-vision', 'images', 'dataset-ex')
    self.assertIn(
        messages[17] % (metadata_path, entity_path),
        check_output(MLGIT_COMMIT % ('dataset', 'dataset-ex', '')))

    head_path = os.path.join(dataset_dir, 'refs', 'dataset-ex', 'HEAD')
    self.assertTrue(os.path.exists(head_path))

    # Clear the git path so the push has no valid repository to talk to.
    git_path = os.path.join(self.tmp_dir, GIT_PATH)
    clear(git_path)

    push_output = check_output(MLGIT_PUSH % ('dataset', 'dataset-ex'))
    self.assertIn(ERROR_MESSAGE, push_output)
    self.assertIn(git_path, push_output)
def test_04_group_sample_with_seed_parameter_negative(self):
    """Fetch with group sampling and seed -4 and expect ``messages[41]`` in the output."""
    self.set_up_fetch()

    expected_message = messages[41]
    fetch_command = MLGIT_FETCH % ('dataset', 'computer-vision__images__dataset-ex__1')
    sample_options = ' --sample-type=group --sampling=1:2 --seed=-4'
    self.assertIn(expected_message, check_output(fetch_command + sample_options))
def test_02_clone_folder_non_empty(self):
    """Clone into a folder that already contains a file and expect ``messages[45]``."""
    os.mkdir(CLONE_FOLDER)

    # Put a 2048-character file in the target folder so it is not empty.
    existing_file = os.path.join(CLONE_FOLDER, "file")
    with open(existing_file, "wt") as handle:
        handle.write("0" * 2048)

    expected_message = messages[45] % (os.path.join(self.tmp_dir, CLONE_FOLDER))
    clone_command = MLGIT_CLONE % (self.GIT_CLONE, "--folder=" + CLONE_FOLDER)
    self.assertIn(expected_message, check_output(clone_command))
def test_01_models_metrics(self):
    """The metrics command output must list tags 1 and 2 with their info tables.

    Tag ``n`` is paired with the table built by
    ``_create_info_table(tag_version=n - 1)``.
    """
    repo_type = MODELS
    self.set_up_test(repo_type)

    entity_name = '{}-ex'.format(repo_type)
    output = check_output(MLGIT_MODELS_METRICS % (entity_name, ''))

    for tag_number in (1, 2):
        self.assertIn(self.TAG % tag_number, output)
        self.assertIn(self._create_info_table(tag_version=tag_number - 1), output)
def test_05_commit_tag_that_already_exists(self):
    """Committing again without a version bump reports that the tag already exists.

    After the helper commit, a new file is written and added without
    options; the second commit must mention the version-2 tag as already
    existing, and the entity HEAD file must still exist.
    """
    entity_type = DATASETS
    entity_name = entity_type + '-ex'
    self._commit_entity(entity_type)

    new_file_path = os.path.join(self.tmp_dir, entity_type, entity_name, 'newfile5')
    with open(new_file_path, 'wt') as new_file:
        new_file.write('0' * 100)

    self.assertIn(
        output_messages['INFO_ADDING_PATH'] % DATASETS,
        check_output(MLGIT_ADD % (entity_type, entity_name, '')))

    existing_tag = 'computer-vision__images__datasets-ex__2'
    self.assertIn(
        output_messages['INFO_TAG_ALREADY_EXISTS'] % existing_tag,
        check_output(MLGIT_COMMIT % (entity_type, entity_name, '')))

    head_path = os.path.join(self.tmp_dir, ML_GIT_DIR, entity_type, 'refs', entity_name, 'HEAD')
    self.assertTrue(os.path.exists(head_path))
def test_06_hard_entity_with_changed_dir(self):
    """A hard reset to the previous commit must undo a committed entity move.

    The entity is committed once, 'file2' is created, the entity is moved
    with ``move_entity_to_dir`` and committed again. After
    ``--hard --reference=head~1`` the moved 'file2' must be gone and the
    original workspace must exist again.
    """
    entity_type = DATASETS
    artifact_name = DATASET_NAME
    add_command = MLGIT_ADD % (entity_type, artifact_name, '--bumpversion')
    commit_command = MLGIT_COMMIT % (entity_type, artifact_name, '')

    init_repository(entity_type, self)

    # First version: a single file in the original location.
    create_file(os.path.join(entity_type, artifact_name), 'file1', '0', '')
    self.assertNotIn(ERROR_MESSAGE, check_output(add_command))
    self.assertNotIn(ERROR_MESSAGE, check_output(commit_command))

    # Second version: a new file plus the entity moved to another directory.
    create_file(os.path.join(entity_type, artifact_name), 'file2', '0', '')
    _, workspace, workspace_with_dir = move_entity_to_dir(self.tmp_dir, artifact_name, entity_type)
    self.assertNotIn(ERROR_MESSAGE, check_output(add_command))
    self.assertNotIn(ERROR_MESSAGE, check_output(commit_command))

    new_file_path = os.path.join(workspace_with_dir, artifact_name, 'file2')
    self.assertFalse(os.path.exists(workspace))
    self.assertTrue(os.path.exists(new_file_path))

    reset_command = MLGIT_RESET % (entity_type, artifact_name) + ' --hard --reference=head~1'
    self.assertIn(
        output_messages['INFO_INITIALIZING_RESET'] % ('--hard', 'HEAD~1'),
        check_output(reset_command))

    self.assertFalse(os.path.exists(new_file_path))
    self.assertTrue(os.path.exists(workspace))
def test_06_commit_with_large_version_number(self):
    """Committing with ``--version=9999999999`` is rejected for datasets, models and labels."""
    init_repository(DATASETS, self)
    create_spec(self, DATASETS, self.tmp_dir)

    expected_error = output_messages['ERROR_INVALID_VALUE_FOR'] % ('--version', '9999999999')
    entities = (
        (DATASETS, DATASET_NAME),
        (MODELS, MODELS + '-ex'),
        (LABELS, LABELS + '-ex'),
    )
    for entity_type, entity_name in entities:
        commit_command = MLGIT_COMMIT % (entity_type, entity_name, ' --version=9999999999')
        self.assertIn(expected_error, check_output(commit_command))
def test_01_fetch_metadata_specific_tag(self):
    """Fetch the version-1 tag without sampling and expect no error.

    Also asserts that the dataset ``objects/hashfs`` directory exists
    afterwards.
    """
    self.set_up_fetch()

    fetch_command = MLGIT_FETCH % ('dataset', 'computer-vision__images__dataset-ex__1')
    self.assertNotIn(ERROR_MESSAGE, check_output(fetch_command))

    hashfs_dir = os.path.join(ML_GIT_DIR, 'dataset', 'objects', 'hashfs')
    self.assertTrue(os.path.exists(hashfs_dir))
def test_07_update_some_entities(self):
    """Update a repository in which only 'dataset' and 'model' were set up.

    The update output is checked for both prepared entities and must not
    contain ``messages[37]`` formatted with the labels metadata directory.
    """
    for entity in ('dataset', 'model'):
        self._setup_update_entity(entity)

    response = check_output(MLGIT_REPOSITORY_UPDATE)
    self._check_update_output(response, 'dataset', 'model')

    labels_metadata = os.path.join(self.tmp_dir, ML_GIT_DIR, 'labels', 'metadata')
    self.assertNotIn(messages[37] % labels_metadata, response)
def test_01_status_after_put_on_new_file_in_dataset(self):
    """Short status must report a file created under 'data' as untracked.

    The path separator in the expected output may be either a backslash or
    a forward slash.
    """
    self.set_up_status(DATASETS)

    data_path = os.path.join(self.tmp_dir, DATASETS, DATASET_NAME, 'data')
    os.makedirs(data_path, exist_ok=True)
    create_file(data_path, 'file', '0', '')

    status_output = check_output(MLGIT_STATUS_SHORT % (DATASETS, DATASET_NAME))
    expected_pattern = r'Changes to be committed:\s+Untracked files:(\s|.)*data(\\|/)file(\s|.)*'
    self.assertRegex(status_output, expected_pattern)
def test_04_export_metrics_without_export_path(self):
    """Requesting a CSV metrics export without an export path reports the missing path."""
    repo_type = MODELS
    entity_name = '{}-ex'.format(repo_type)
    self.set_up_test(repo_type)

    expected_error = output_messages['ERROR_MISSING_EXPORT_PATH']
    export_option = ' --export-type={}'.format(FileType.CSV.value)
    metrics_command = MLGIT_MODELS_METRICS % (entity_name, export_option)
    self.assertIn(expected_error, check_output(metrics_command))