def test_secrets():
    """Exercise the secret round trip: create, inspect, list, delete."""
    client = python_pachyderm.Client()
    secret_name = util.test_repo_name("test-secrets")

    client.create_secret(
        secret_name,
        {
            "mykey": "my-value",
        },
    )

    # The new secret is visible both directly and via listing.
    assert client.inspect_secret(secret_name).secret.name == secret_name
    listed = client.list_secret()
    assert len(listed) == 1
    assert listed[0].secret.name == secret_name

    client.delete_secret(secret_name)

    # After deletion, inspection fails and the listing is empty.
    with pytest.raises(python_pachyderm.RpcError):
        client.inspect_secret(secret_name)
    assert len(client.list_secret()) == 0
def test_spout_commit():
    """A spout pipeline's `pachctl put file` should yield exactly one commit."""
    client = python_pachyderm.Client()
    client.delete_all()

    spout_script = [
        "echo 'commit time' >> file.txt",
        "pachctl put file pipeline-spout-commit@master:/file.txt -f file.txt",
    ]
    client.create_pipeline(
        pipeline_name="pipeline-spout-commit",
        transform=pps_proto.Transform(cmd=["bash"], stdin=spout_script),
        spout=pps_proto.Spout(),
    )

    # Block until the spout's first commit on master is finished.
    subscription = client.subscribe_commit(
        repo_name="pipeline-spout-commit",
        branch="master",
        state=pfs_proto.FINISHED,
        origin_kind=pfs_proto.USER,
    )
    next(subscription)

    assert len(list(client.list_commit("pipeline-spout-commit"))) == 1
def test_put_files():
    """put_files mirrors a local directory into a commit at several prefixes."""
    client = python_pachyderm.Client()
    repo_name = util.create_test_repo(client, "put_files")

    with tempfile.TemporaryDirectory(suffix="python_pachyderm") as tmp_dir:
        # Build the fixture tree:
        # 0.txt 1.txt 2.txt 3.txt 4.txt 0/0.txt 1/1.txt 2/2.txt
        # 3/3.txt 4/4.txt
        for idx in range(5):
            os.makedirs(os.path.join(tmp_dir, str(idx)))
        for idx in range(5):
            for target in (
                os.path.join(tmp_dir, "{}.txt".format(idx)),
                os.path.join(tmp_dir, str(idx), "{}.txt".format(idx)),
            ):
                with open(target, "w") as handle:
                    handle.write(str(idx))

        # Upload under both `/` and `/sub` (the latter redundantly, to test
        # correct path handling and re-putting files that already exist).
        commit = "{}/master".format(repo_name)
        python_pachyderm.put_files(client, tmp_dir, commit, "/")
        python_pachyderm.put_files(client, tmp_dir, commit, "/sub")
        python_pachyderm.put_files(client, tmp_dir, commit, "/sub/")

        expected = {"/", "/sub"}
        for idx in range(5):
            for pattern in ("/{0}", "/{0}.txt", "/{0}/{0}.txt",
                            "/sub/{0}", "/sub/{0}.txt", "/sub/{0}/{0}.txt"):
                expected.add(pattern.format(idx))

        check_expected_files(client, commit, expected)
def test_delete_all_transactions():
    """delete_all_transactions wipes every open transaction."""
    client = python_pachyderm.Client()

    for _ in range(2):
        client.start_transaction()
    assert len(client.list_transaction()) == 2

    client.delete_all_transactions()
    assert len(client.list_transaction()) == 0
def __init__(
    self,
    commit,
    path_prefix="/",
    pachy_host=None,
    pachy_port="30650",
    local_root='/data',
    transform=None,
):
    """Dataset backed by a Pachyderm repo.

    Args:
        commit: Pachyderm commit (e.g. "repo/branch") to read from.
        path_prefix: Repo path prefix under which meta.json lives.
        pachy_host: Pachyderm host; defaults to the PACHYDERM_HOST_URI
            environment variable, now read lazily at call time.
        pachy_port: Pachyderm port (kept as a string for backward
            compatibility with the original signature).
        local_root: Local directory the data is downloaded into.
        transform: Optional torchvision transform; defaults to the standard
            ImageNet resize/normalize pipeline.
    """
    # FIX: the original evaluated os.environ['PACHYDERM_HOST_URI'] and
    # T.Compose(...) in the default-argument list, i.e. at import time —
    # importing the module crashed with KeyError when the env var was
    # unset, and one default transform object was shared by all instances.
    # Resolve both lazily here instead.
    if pachy_host is None:
        pachy_host = os.environ['PACHYDERM_HOST_URI']
    if transform is None:
        transform = T.Compose([
            T.Resize((256, 256)),
            T.ToTensor(),
            T.Normalize(mean=[0.485, 0.456, 0.406],
                        std=[0.229, 0.224, 0.225]),
        ])

    self.commit = commit
    self.path_prefix = path_prefix
    self.local_root = local_root
    self.client = python_pachyderm.Client(host=pachy_host, port=pachy_port)

    # Locate meta.json in the repo and download it to local_root.
    self.meta_path_lst = [{
        'path': res.file.path,
        'size': res.size_bytes
    } for res in self.client.glob_file(commit, path_prefix + "meta.json")]
    self._download_data_from_pachyderm(self.meta_path_lst,
                                       self.path_prefix + "meta.json")

    # NOTE(review): assumes meta.json contains a "class_names" list — confirm
    # against the data-producing pipeline.
    with open(os.path.join(self.local_root, "meta.json")) as meta_f:
        meta = json.load(meta_f)
    self.meta = meta
    self.class_names = self.meta["class_names"]
    self.num_classes = len(self.class_names)
    self.transform = transform
def test_create_pipeline_from_request():
    """create_pipeline_from_request accepts a raw CreatePipelineRequest proto."""
    client = python_pachyderm.Client()
    repo_name = util.create_test_repo(client, "test_create_pipeline_from_request")
    pipeline_name = util.test_repo_name("test_create_pipeline_from_request")

    # More or less a copy of the opencv demo's edges pipeline spec.
    request = pps_proto.CreatePipelineRequest(
        pipeline=pps_proto.Pipeline(name=pipeline_name),
        description="A pipeline that performs image edge detection by using the OpenCV library.",
        input=pps_proto.Input(
            pfs=pps_proto.PFSInput(
                glob="/*",
                repo=repo_name,
            ),
        ),
        transform=pps_proto.Transform(
            cmd=["echo", "hi"],
            image="pachyderm/opencv",
        ),
    )
    client.create_pipeline_from_request(request)

    created_names = [p.pipeline.name for p in client.list_pipeline()]
    assert pipeline_name in created_names
def load():
    """Build the serving context: API clients, downloaded model, NLTK corpora.

    Downloads the trained model's config and weights from the `train_model`
    repo at MODEL_VERSION into /tmp/trained_model, loads the 3-label
    sequence-classification model from that directory, and fetches the NLTK
    corpora used downstream.

    Returns:
        dict with keys 'client', 'pach_client', 'classification_model_path',
        and 'classification_model'.
    """
    output = {}
    output['client'] = Algorithmia.client()
    output['pach_client'] = python_pachyderm.Client(
        host=os.environ["PACH_HOST"],
        port=os.environ["PACH_PORT"],
        auth_token=os.environ["PACH_AUTH"],
        tls=True)

    # Download the model and config. The original duplicated this block for
    # each artifact; a single loop keeps the two downloads in sync.
    model_dir = Path("/tmp/trained_model")
    model_dir.mkdir(parents=True, exist_ok=True)
    for artifact in ("config.json", "pytorch_model.bin"):
        with open(model_dir / artifact, "wb") as f:
            f.write(
                output['pach_client']
                .get_file(("train_model", MODEL_VERSION), artifact)
                .read())

    output['classification_model_path'] = Path("/tmp/trained_model/")
    output[
        'classification_model'] = AutoModelForSequenceClassification.from_pretrained(
            output['classification_model_path'], cache_dir=None, num_labels=3)

    nltk.download('stopwords')
    nltk.download('punkt')
    nltk.download('wordnet')
    return output
def download_pach_repo(pachyderm_host, pachyderm_port, repo, branch, root):
    """Download every file from `repo@branch` into the local `root` directory.

    Walks the repo from "/", recreates directories, writes each regular file,
    and extracts any downloaded .tar.gz archive into `root`.

    Args:
        pachyderm_host: pachd hostname.
        pachyderm_port: pachd port.
        repo: source repo name.
        branch: source branch name.
        root: local destination directory (created if missing).
    """
    client = python_pachyderm.Client(host=pachyderm_host, port=pachyderm_port)
    os.makedirs(root, exist_ok=True)

    # Split the walk into directories (file_type == 2) and regular files;
    # directories are created immediately, files downloaded afterwards.
    file_paths = []
    for file_info in client.walk_file((repo, branch), "/"):
        path = file_info.file.path
        local_path = os.path.join(root, path[1:])
        if file_info.file_type == 2:  # directory entry
            os.makedirs(local_path, exist_ok=True)
        else:
            file_paths.append((path, local_path))

    for path, local_path in file_paths:
        # FIX: the original opened the file without a context manager (leaked
        # on error), shadowed the builtin `bytes` as a loop variable, never
        # closed the tarfile, and carried unused `args`/`count` locals.
        with open(local_path, 'wb') as out:
            for chunk in client.get_file((repo, branch), path):
                out.write(chunk)
        if local_path.endswith('.tar.gz'):
            with tarfile.open(local_path) as tar:
                tar.extractall(path=root)
def main():
    """Build the opencv demo: images repo, edges + montage pipelines, input data."""
    # Connects to a pachyderm cluster on the default host:port
    # (`localhost:30650`). This will work for certain environments (e.g. k8s
    # running on docker for mac), as well as when port forwarding is being
    # used. For other setups, you'll want one of the alternatives:
    # 1) To connect to pachyderm when this script is running inside the
    # cluster, use `python_pachyderm.Client.new_in_cluster()`.
    # 2) To connect to pachyderm via a pachd address, use
    # `python_pachyderm.Client.new_from_pachd_address`.
    # 3) To explicitly set the host and port, pass parameters into
    # `python_pachyderm.Client()`.
    client = python_pachyderm.Client()

    # Create a repo called images
    client.create_repo("images")

    # Create a pipeline specifically designed for executing python code. This
    # is equivalent to the edges pipeline in the standard opencv example.
    python_pachyderm.create_python_pipeline(
        client,
        relpath("edges"),
        input=python_pachyderm.Input(
            pfs=python_pachyderm.PFSInput(glob="/*", repo="images")),
    )

    # Create the montage pipeline, which tiles the originals and the edge
    # detections into a single montage.png via imagemagick.
    client.create_pipeline(
        "montage",
        transform=python_pachyderm.Transform(
            cmd=["sh"],
            image="v4tech/imagemagick",
            stdin=[
                "montage -shadow -background SkyBlue -geometry 300x300+2+2 $(find /pfs -type f | sort) /pfs/out/montage.png"
            ],
        ),
        input=python_pachyderm.Input(cross=[
            python_pachyderm.Input(
                pfs=python_pachyderm.PFSInput(glob="/", repo="images")),
            python_pachyderm.Input(
                pfs=python_pachyderm.PFSInput(glob="/", repo="edges")),
        ]),
    )

    with client.commit("images", "master") as commit:
        # Add some images, recursively inserting content from the images
        # directory. Alternatively, you could use `client.put_file_url` or
        # `client_put_file_bytes`.
        python_pachyderm.put_files(client, relpath("images"), commit, "/")

    # Wait for the commit (and its downstream commits) to finish
    for _ in client.flush_commit([commit]):
        pass

    # Get the montage and copy it into a local temp file for viewing.
    source_file = client.get_file("montage/master", "/montage.png")
    with tempfile.NamedTemporaryFile(suffix="montage.png", delete=False) as dest_file:
        shutil.copyfileobj(source_file, dest_file)
        print("montage written to {}".format(dest_file.name))
def __init__(self, test_name):
    """Create a test pipeline and keep the handles it produced."""
    self.client = python_pachyderm.Client()
    (
        self.commit,
        self.input_repo_name,
        self.pipeline_repo_name,
    ) = util.create_test_pipeline(self.client, test_name)
def main() -> None:
    """Report per-pipeline and total job run times across a pipeline DAG."""
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('--host', required=True,
                            help='Only the hostname of a grpc URL.')
    arg_parser.add_argument('--port', required=True, help='The port number.')
    arg_parser.add_argument('--specification', required=True,
                            help='A DAG end node pipeline specification path.')
    arg_parser.add_argument('--specifications', required=True,
                            help='A path containing pipeline specification files.')
    args = arg_parser.parse_args()

    host = args.host
    port = int(args.port)
    specification = Path(args.specification)
    specifications = Path(args.specifications)
    print(f'host: {host}')
    print(f'port: {port}')
    print(f'specification: {specification}')
    print(f'specifications: {specifications}')

    client = python_pachyderm.Client(host=host, port=port)
    # Renamed from `parser`, which shadowed the argparse parser above.
    spec_parser = PipelineSpecificationParser(specification, specifications)
    dag_builder = DagManager(spec_parser).get_dag_builder()

    totals = {'upload': 0, 'download': 0, 'process': 0}
    for pipeline_name in dag_builder.get_pipeline_names():
        job = data_finder.get_latest_job(client, pipeline_name)
        if job is None:
            print(f'No jobs are available for {pipeline_name}')
            continue
        job_data = data_finder.get_job_run_times(job)
        upload_time = job_data.get('upload')
        download_time = job_data.get('download')
        process_time = job_data.get('process')
        datums_processed = job_data.get('datums_processed')
        print(f'pipeline: {pipeline_name} '
              f'upload time: {upload_time} sec. '
              f'download time: {download_time} sec. '
              f'process time {process_time} sec. '
              f'datums processed {datums_processed}')
        if upload_time is not None:
            totals['upload'] += upload_time
        if download_time is not None:
            totals['download'] += download_time
        if process_time is not None:
            totals['process'] += process_time

    print(f'total upload: {totals["upload"]} sec. '
          f'total download: {totals["download"]} sec. '
          f'total_process: {totals["process"]} sec. ')
def test_delete_all_repos():
    """delete_all_repos removes every repo in the cluster."""
    client = python_pachyderm.Client()
    for prefix in ("extra-1", "extra-2"):
        util.create_test_repo(client, "test_delete_all_repos", prefix=prefix)
    assert len(list(client.list_repo())) >= 2

    client.delete_all_repos()
    assert len(list(client.list_repo())) == 0
def test_transaction_context_mgr_nested():
    """Nested transaction contexts use distinct ids; the outer id is restored."""
    client = python_pachyderm.Client()
    with client.transaction():
        outer_id = client.transaction_id
        assert outer_id is not None
        with client.transaction():
            inner_id = client.transaction_id
            assert inner_id is not None
            assert inner_id != outer_id
        # Leaving the inner context restores the outer transaction.
        assert client.transaction_id == outer_id
def test_put_files_single_file():
    """put_files can upload one local file to several repo paths."""
    client = python_pachyderm.Client()
    client.delete_all()
    repo_name = util.create_test_repo(client, "put_files_single_file")

    with tempfile.NamedTemporaryFile() as tmp:
        tmp.write(b"abcd")
        tmp.flush()

        commit = (repo_name, "master")
        python_pachyderm.put_files(client, tmp.name, commit, "/f1.txt")
        python_pachyderm.put_files(client, tmp.name, commit, "/f/f1")

        expected = {"/", "/f1.txt", "/f/", "/f/f1"}
        check_expected_files(client, commit, expected)
def test_create_python_pipeline_bad_path():
    """create_python_pipeline raises when pointed at a nonexistent local path."""
    client = python_pachyderm.Client()
    repo_name = util.create_test_repo(client, "create_python_pipeline_bad_path")

    # Seed the input repo with one data file.
    with client.commit(repo_name, "master") as commit:
        client.put_file_bytes(commit, 'file.dat', b'DATA')

    # A source path that does not exist must fail pipeline creation.
    with pytest.raises(Exception):
        python_pachyderm.create_python_pipeline(
            client,
            "./foobar2000",
            input=python_pachyderm.Input(
                pfs=python_pachyderm.PFSInput(glob="/", repo=repo_name)
            ),
        )
def test_pachyderm_version():
    """Return the cluster version as a tuple, caching it in a module global.

    Prefers the PACHYDERM_VERSION environment variable; otherwise asks the
    cluster directly.
    """
    global _test_pachyderm_version
    if _test_pachyderm_version is not None:
        return _test_pachyderm_version

    env_value = os.environ.get("PACHYDERM_VERSION")
    if env_value is None:
        remote = python_pachyderm.Client().get_remote_version()
        _test_pachyderm_version = (remote.major, remote.minor, remote.micro)
    else:
        _test_pachyderm_version = tuple(int(part) for part in env_value.split("."))
    return _test_pachyderm_version
def test_transaction_context_mgr_exception():
    """An exception inside a transaction context rolls everything back."""
    client = python_pachyderm.Client()
    repo_count_before = len(list(client.list_repo()))

    with pytest.raises(Exception):
        with client.transaction():
            for _ in range(2):
                util.create_test_repo(client, "test_transaction_context_mgr_exception")
            raise Exception("oops!")

    # The transaction is gone and neither repo creation took effect.
    assert len(client.list_transaction()) == 0
    assert len(list(client.list_repo())) == repo_count_before
def download_data(self) -> str:
    """Download the configured Pachyderm repo/branch into the trial's data dir.

    Reads Pachyderm connection and repo settings from the experiment's data
    config, downloads the repo contents, and returns the local data path.

    Returns:
        The local directory the data was downloaded into.
    """
    data_config = self.context.get_data_config()
    data_dir = os.path.join(self.download_directory, 'data')

    pachyderm_config = data_config['pachyderm']
    # FIX: the original also constructed a python_pachyderm.Client here that
    # was never used — download_pach_repo opens its own connection.
    download_pach_repo(
        pachyderm_config['host'],
        pachyderm_config['port'],
        pachyderm_config['repo'],
        pachyderm_config['branch'],
        data_dir,
    )
    return data_dir
def test_create_spout():
    """A pipeline with a spout (and no input) can be created."""
    client = python_pachyderm.Client()
    client.delete_all()

    spout_transform = pps_proto.Transform(cmd=["sh"], image="alpine")
    client.create_pipeline(
        pipeline_name="pipeline-create-spout",
        transform=spout_transform,
        spout=pps_proto.Spout(),
    )

    assert len(list(client.list_pipeline())) == 1
def client():
    """Pytest fixture: a root-authenticated, enterprise-activated client.

    Activates the enterprise license, registers/activates the local cluster,
    enables auth as root, and points auth at a local identity server. Yields
    the configured client, then tears all of that state back down.
    """
    pc = python_pachyderm.Client()
    pc.activate_license(os.environ["PACH_PYTHON_ENTERPRISE_CODE"])
    pc.add_cluster("localhost", "localhost:1650", secret="secret")
    pc.activate_enterprise("localhost:1650", "localhost", "secret")
    pc.auth_token = "iamroot"
    pc.activate_auth(pc.auth_token)
    pc.set_identity_server_config(config=identity_proto.IdentityServerConfig(
        issuer="http://localhost:1658"))
    yield pc
    # Teardown: not redundant because auth_token could be overridden by tests.
    pc.auth_token = "iamroot"
    pc.delete_all()
    pc.deactivate_enterprise()
def test_create_pipeline():
    """A basic copy pipeline can be created against a fresh input repo."""
    client = python_pachyderm.Client()
    client.delete_all()
    input_repo_name = util.create_test_repo(client, "input_repo_test_create_pipeline")

    copy_transform = pps_proto.Transform(
        cmd=["sh"],
        image="alpine",
        stdin=["cp /pfs/{}/*.dat /pfs/out/".format(input_repo_name)],
    )
    client.create_pipeline(
        "pipeline_test_create_pipeline",
        transform=copy_transform,
        input=pps_proto.Input(
            pfs=pps_proto.PFSInput(glob="/*", repo=input_repo_name)
        ),
    )

    assert len(list(client.list_pipeline())) == 1
def main() -> None:
    """Print run times for the most recent job of a single pipeline."""
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('--host', required=True,
                            help='Only the hostname part of a grpc URL.')
    arg_parser.add_argument('--port', required=True, help='The port number.')
    arg_parser.add_argument('--pipeline', required=True, help='A pipeline name.')
    args = arg_parser.parse_args()

    pipeline_name = args.pipeline
    client = python_pachyderm.Client(host=args.host, port=int(args.port))

    job = get_latest_job(client, pipeline_name)
    if job is None:
        print(f'No jobs are available for {pipeline_name}.')
    else:
        get_job_run_times(job)
def test_list_commit():
    """list_commit with no repo argument returns commits across all repos."""
    python_pachyderm.Client().delete_all_repos()

    client, first_repo = sandbox("list_commit1")
    # Two commits in the first repo...
    for _ in range(2):
        with client.commit(first_repo, "master"):
            pass

    # ...and one in a second repo.
    second_repo = util.create_test_repo(client, "list_commit2")
    with client.commit(second_repo, "master"):
        pass

    assert len(list(client.list_commit())) == 3
def test_delete_transaction():
    """Deleting a transaction discards it but keeps untied repo creations."""
    client = python_pachyderm.Client()
    repo_count_before = len(list(client.list_repo()))

    transaction = client.start_transaction()
    for _ in range(2):
        util.create_test_repo(client, "test_delete_transaction")
    client.delete_transaction(transaction)

    assert len(client.list_transaction()) == 0
    # Even though the transaction was deleted, the repos were still created,
    # because the transaction wasn't tied to the client.
    assert len(list(client.list_repo())) == repo_count_before + 2

    # Re-deleting should cause an error.
    with pytest.raises(python_pachyderm.RpcError):
        client.delete_transaction(transaction)
def test_batch_transaction():
    """batch_transaction applies every request and leaves no open transaction."""
    client = python_pachyderm.Client()
    expected_repo_count = len(list(client.list_repo())) + 3

    def make_create_repo_request():
        # Each request creates a uniquely-named test repo.
        repo = pfs_pb2.Repo(name=util.test_repo_name("test_batch_transaction"))
        return transaction_pb2.TransactionRequest(
            create_repo=pfs_pb2.CreateRepoRequest(repo=repo)
        )

    client.batch_transaction([make_create_repo_request() for _ in range(3)])

    assert len(client.list_transaction()) == 0
    assert len(list(client.list_repo())) == expected_repo_count
def test_transaction_context_mgr():
    """The transaction context manager opens, exposes, and commits a transaction."""
    client = python_pachyderm.Client()
    expected_repo_count = len(list(client.list_repo())) + 2

    with client.transaction() as transaction:
        for _ in range(2):
            util.create_test_repo(client, "test_transaction_context_mgr")

        open_transactions = client.list_transaction()
        assert len(open_transactions) == 1
        assert open_transactions[0].transaction.id == transaction.id
        # inspect_transaction accepts either the object or the raw id.
        assert client.inspect_transaction(transaction).transaction.id == transaction.id
        assert client.inspect_transaction(transaction.id).transaction.id == transaction.id

    # On exit the transaction was committed: nothing open, repos created.
    assert len(client.list_transaction()) == 0
    assert len(list(client.list_repo())) == expected_repo_count
def main():
    """Create the spout demo DAG: spout -> processor -> reducer."""
    client = python_pachyderm.Client()

    # Spout: emulates receiving data from an external source.
    client.create_pipeline(
        pipeline_name="spout",
        transform=pps_proto.Transform(
            cmd=["python3", "consumer/main.py"],
            image="pachyderm/example-spout101:2.0.0-beta.5",
        ),
        spout=pps_proto.Spout(),
        description=
        "A spout pipeline that emulates the reception of data from an external source",
    )

    # Processor: splits the spout's output by file size (1KB vs 2KB).
    client.create_pipeline(
        pipeline_name="processor",
        transform=pps_proto.Transform(
            cmd=["python3", "processor/main.py"],
            image="pachyderm/example-spout101:2.0.0-beta.5",
        ),
        input=pps_proto.Input(
            pfs=pps_proto.PFSInput(repo="spout", branch="master", glob="/*")),
        description="A pipeline that sorts 1KB vs 2KB files",
    )

    # Reducer: concatenates each size-bucket directory into one output file.
    client.create_pipeline(
        pipeline_name="reducer",
        transform=pps_proto.Transform(
            cmd=["bash"],
            stdin=[
                "set -x",
                "FILES=/pfs/processor/*/*",
                "for f in $FILES",
                "do",
                "directory=`dirname $f`",
                "out=`basename $directory`",
                "cat $f >> /pfs/out/${out}.txt",
                "done",
            ],
        ),
        input=pps_proto.Input(pfs=pps_proto.PFSInput(
            repo="processor", branch="master", glob="/*")),
        description="A pipeline that reduces 1K/ and 2K/ directories",
    )
def test_enterprise():
    """End-to-end enterprise licensing: activate, inspect, deactivate."""
    client = python_pachyderm.Client()
    code = os.environ["PACH_PYTHON_ENTERPRISE_CODE"]

    client.delete_all_license()
    client.activate_license(code)
    client.add_cluster("localhost", "localhost:1650", secret="secret")
    client.update_cluster("localhost", "localhost:1650", "localhost:16650")
    client.activate_enterprise("localhost:1650", "localhost", "secret")

    assert len(client.list_clusters()) == len(client.list_user_clusters())
    assert client.get_enterprise_state().state == enterprise_proto.State.ACTIVE
    assert client.get_activation_code().activation_code == code

    client.delete_cluster("localhost")
    client.deactivate_enterprise()
    client.delete_all_license()
def sandbox():
    """Generator fixture: activate enterprise + auth as robot:root, then tear down.

    Yields an authenticated client. On exit, deactivates auth and then
    enterprise; if auth deactivation fails, prints the root token so an
    operator can disable auth manually, then re-raises.
    """
    client = python_pachyderm.Client()
    client.activate_enterprise(os.environ["PACH_PYTHON_ENTERPRISE_CODE"])
    root_auth_token = None
    try:
        root_auth_token = client.activate_auth("robot:root")
        client.auth_token = root_auth_token
        try:
            yield client
        finally:
            try:
                client.deactivate_auth()
                client.auth_token = None
            except BaseException:
                # FIX: was a bare `except:` (flagged by linters). BaseException
                # keeps the identical behavior — even on interrupt we surface
                # the root token before re-raising.
                print(
                    "an exception occurred trying to deactivate auth, please manually disable auth with the root auth token: {}"
                    .format(root_auth_token))
                raise
    finally:
        client.deactivate_enterprise()
def main():
    """Create the producer spout pipeline and its Python consumer pipeline."""
    client = python_pachyderm.Client()

    # Producer: a spout that appends marker-tracked data without overwriting.
    client.create_pipeline(
        pipeline_name="producer",
        transform=python_pachyderm.Transform(
            cmd=["python3", "/app/main.py"],
            image="ysimonson/pachyderm_spout_producer",
        ),
        spout=python_pachyderm.Spout(overwrite=False, marker="marker"),
    )

    # Consumer: a Python pipeline that reads everything the producer emits.
    python_pachyderm.create_python_pipeline(
        client,
        relpath("consumer"),
        input=python_pachyderm.Input(
            pfs=python_pachyderm.PFSInput(glob="/", repo="producer")
        ),
    )