def tarball_extract(tarball, path):
    """Retrieve a tarball from Keep and extract it to a local directory.

    Return the absolute path where the tarball was extracted. If the top
    level of the tarball contained just one file or directory, return the
    absolute path of that single item.

    tarball -- collection locator
    path -- where to extract the tarball: absolute, or relative to job tmp
    """
    if not path.startswith('/'):
        path = os.path.join(arvados.current_job().tmpdir, path)
    locator_path = os.path.join(path, '.locator')
    # Exclusive lock serializes extraction across concurrent tasks sharing
    # this directory.
    lockfile = open(path + '.lock', 'w')
    fcntl.flock(lockfile, fcntl.LOCK_EX)
    try:
        try:
            os.stat(path)
        except OSError:
            os.mkdir(path)
        # The '.locator' symlink records which tarball is already extracted
        # here; if it matches, reuse the cached extraction.
        already_have_it = False
        try:
            if os.readlink(locator_path) == tarball:
                already_have_it = True
        except OSError:
            pass
        if not already_have_it:
            # emulate "rm -f" (i.e., if the file does not exist, we win)
            try:
                os.unlink(locator_path)
            except OSError:
                if os.path.exists(locator_path):
                    os.unlink(locator_path)
            for f in CollectionReader(tarball).all_files():
                # Dots are escaped: the original '\.(tbz|tar.bz2)$' let the
                # unescaped dot match any character, and the non-raw '\.'
                # is an invalid escape sequence in modern Python.
                if re.search(r'\.(tbz|tar\.bz2)$', f.name()):
                    p = tar_extractor(path, 'j')
                elif re.search(r'\.(tgz|tar\.gz)$', f.name()):
                    p = tar_extractor(path, 'z')
                elif re.search(r'\.tar$', f.name()):
                    p = tar_extractor(path, '')
                else:
                    raise arvados.errors.AssertionError(
                        "tarball_extract cannot handle filename %s" % f.name())
                # Stream the tarball from Keep into tar's stdin in 1 MiB
                # chunks.
                while True:
                    buf = f.read(2**20)
                    if len(buf) == 0:
                        break
                    p.stdin.write(buf)
                p.stdin.close()
                p.wait()
                if p.returncode != 0:
                    raise arvados.errors.CommandFailedError(
                        "tar exited %d" % p.returncode)
            os.symlink(tarball, locator_path)
        tld_extracts = [f for f in os.listdir(path) if f != '.locator']
    finally:
        # The original leaked the lock file on some error paths (e.g. the
        # AssertionError above); always close it.
        lockfile.close()
    if len(tld_extracts) == 1:
        return os.path.join(path, tld_extracts[0])
    return path
def main(arguments=None):
    """Docker image format migration tool for Arvados.

    This converts Docker images stored in Arvados from image format v1
    (Docker <= 1.9) to image format v2 (Docker >= 1.10).

    Requires Docker running on the local host.

    Usage:

    1) Run arvados/docker/migrate-docker19/build.sh to create
    arvados/migrate-docker19 Docker image.

    2) Set ARVADOS_API_HOST and ARVADOS_API_TOKEN to the cluster you want
    to migrate.

    3) Run arv-migrate-docker19 from the Arvados Python SDK on the host (not
    in a container).

    This will query Arvados for v1 format Docker images.  For each image that
    does not already have a corresponding v2 format image (as indicated by a
    docker_image_migration tag) it will perform the following process:

    i) download the image from Arvados
    ii) load it into Docker
    iii) update the Docker version, which updates the image
    iv) save the v2 format image and upload to Arvados
    v) create a migration link
    """
    # --- Command line handling ---
    migrate19_parser = argparse.ArgumentParser()
    migrate19_parser.add_argument(
        '--version', action='version',
        version="%s %s" % (sys.argv[0], __version__),
        help='Print version and exit.')
    migrate19_parser.add_argument(
        '--verbose', action="store_true",
        help="Print stdout/stderr even on success")
    migrate19_parser.add_argument(
        '--force', action="store_true",
        help="Try to migrate even if there isn't enough space")
    migrate19_parser.add_argument(
        '--storage-driver', type=str, default="overlay",
        help="Docker storage driver, e.g. aufs, overlay, vfs")
    exgroup = migrate19_parser.add_mutually_exclusive_group()
    exgroup.add_argument(
        '--dry-run', action='store_true',
        help="Print number of pending migrations.")
    exgroup.add_argument(
        '--print-unmigrated', action='store_true', default=False,
        help="Print list of images needing migration.")
    migrate19_parser.add_argument('--tempdir', help="Set temporary directory")
    migrate19_parser.add_argument('infile', nargs='?', type=argparse.FileType('r'),
                                  default=None, help="List of images to be migrated")

    args = migrate19_parser.parse_args(arguments)

    if args.tempdir:
        tempfile.tempdir = args.tempdir

    if args.verbose:
        logger.setLevel(logging.DEBUG)

    # Optional input file restricts the run to the listed portable data
    # hashes, one per line.
    only_migrate = None
    if args.infile:
        only_migrate = set()
        for l in args.infile:
            only_migrate.add(l.strip())

    # Migration links are created under the system user, so an admin token
    # is required.
    api_client = arvados.api()
    user = api_client.users().current().execute()
    if not user['is_admin']:
        raise Exception("This command requires an admin token")
    sys_uuid = user['uuid'][:12] + '000000000000000'

    # Old-format images are the ones whose hash does NOT start with
    # "sha256:" (v2-format hashes do).
    images = arvados.commands.keepdocker.list_images_in_arv(api_client, 3)
    is_new = lambda img: img['dockerhash'].startswith('sha256:')

    count_new = 0
    old_images = []
    for uuid, img in images:
        if img["dockerhash"].startswith("sha256:"):
            continue
        key = (img["repo"], img["tag"], img["timestamp"])
        old_images.append(img)

    # Images that already have a docker_image_migration link are done.
    migration_links = arvados.util.list_all(api_client.links().list, filters=[
        ['link_class', '=', _migration_link_class],
        ['name', '=', _migration_link_name],
    ])

    already_migrated = set()
    for m in migration_links:
        already_migrated.add(m["tail_uuid"])

    items = arvados.util.list_all(api_client.collections().list,
                                  filters=[["uuid", "in", [img["collection"] for img in old_images]]],
                                  select=["uuid", "portable_data_hash", "manifest_text", "owner_uuid"])
    uuid_to_collection = {i["uuid"]: i for i in items}

    # Build the pending set; track total bytes and the biggest single image
    # to estimate temp-space requirements.
    need_migrate = {}
    totalbytes = 0
    biggest = 0
    biggest_pdh = None
    for img in old_images:
        i = uuid_to_collection[img["collection"]]
        pdh = i["portable_data_hash"]
        if pdh not in already_migrated and pdh not in need_migrate and (only_migrate is None or pdh in only_migrate):
            need_migrate[pdh] = img
            with CollectionReader(i["manifest_text"]) as c:
                size = list(c.values())[0].size()
                if size > biggest:
                    biggest = size
                    biggest_pdh = pdh
                totalbytes += size

    # vfs stores a full copy per layer, so it needs far more headroom than
    # union filesystems.
    if args.storage_driver == "vfs":
        will_need = (biggest*20)
    else:
        will_need = (biggest*2.5)

    if args.print_unmigrated:
        only_migrate = set()
        for pdh in need_migrate:
            print(pdh)
        return

    logger.info("Already migrated %i images", len(already_migrated))
    logger.info("Need to migrate %i images", len(need_migrate))
    logger.info("Using tempdir %s", tempfile.gettempdir())
    logger.info("Biggest image %s is about %i MiB", biggest_pdh, biggest>>20)
    logger.info("Total data to migrate about %i MiB", totalbytes>>20)

    # Parse the second line of `df -B1` output to get free space on the
    # temp filesystem.
    # NOTE(review): under Python 3, check_output returns bytes while the
    # pattern below is a str — confirm this runs on Python 2 or that df_out
    # is decoded somewhere not visible here.
    df_out = subprocess.check_output(["df", "-B1", tempfile.gettempdir()])
    ln = df_out.splitlines()[1]
    filesystem, blocks, used, available, use_pct, mounted = re.match(r"^([^ ]+) *([^ ]+) *([^ ]+) *([^ ]+) *([^ ]+) *([^ ]+)", ln).groups(1)
    if int(available) <= will_need:
        logger.warn("Temp filesystem mounted at %s does not have enough space for biggest image (has %i MiB, needs %i MiB)", mounted, int(available)>>20, will_need>>20)
        if not args.force:
            exit(1)
        else:
            logger.warn("--force provided, will migrate anyway")

    if args.dry_run:
        return

    success = []
    failures = []
    count = 1
    for old_image in list(need_migrate.values()):
        if uuid_to_collection[old_image["collection"]]["portable_data_hash"] in already_migrated:
            continue

        oldcol = CollectionReader(uuid_to_collection[old_image["collection"]]["manifest_text"])
        tarfile = list(oldcol.keys())[0]

        logger.info("[%i/%i] Migrating %s:%s (%s) (%i MiB)", count, len(need_migrate),
                    old_image["repo"], old_image["tag"], old_image["collection"],
                    list(oldcol.values())[0].size()>>20)
        count += 1
        start = time.time()

        # Each image is migrated in a privileged container with throwaway
        # /var/lib/docker and arv-keepdocker cache directories.
        varlibdocker = tempfile.mkdtemp()
        dockercache = tempfile.mkdtemp()
        try:
            with tempfile.NamedTemporaryFile() as envfile:
                # Pass Arvados credentials into the migration container via
                # docker's --env-file.
                envfile.write("ARVADOS_API_HOST=%s\n" % (arvados.config.get("ARVADOS_API_HOST")))
                envfile.write("ARVADOS_API_TOKEN=%s\n" % (arvados.config.get("ARVADOS_API_TOKEN")))
                if arvados.config.get("ARVADOS_API_HOST_INSECURE"):
                    envfile.write("ARVADOS_API_HOST_INSECURE=%s\n" % (arvados.config.get("ARVADOS_API_HOST_INSECURE")))
                envfile.flush()

                dockercmd = ["docker", "run",
                             "--privileged",
                             "--rm",
                             "--env-file", envfile.name,
                             "--volume", "%s:/var/lib/docker" % varlibdocker,
                             "--volume", "%s:/root/.cache/arvados/docker" % dockercache,
                             "arvados/migrate-docker19:1.0",
                             "/root/migrate.sh",
                             "%s/%s" % (old_image["collection"], tarfile),
                             tarfile[0:40],
                             old_image["repo"],
                             old_image["tag"],
                             uuid_to_collection[old_image["collection"]]["owner_uuid"],
                             args.storage_driver]

                proc = subprocess.Popen(dockercmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
                out, err = proc.communicate()

                # migrate.sh reports free space at each stage; scrape those
                # numbers for progress logging.
                initial_space = re.search(r"Initial available space is (\d+)", out)
                imgload_space = re.search(r"Available space after image load is (\d+)", out)
                imgupgrade_space = re.search(r"Available space after image upgrade is (\d+)", out)
                keepdocker_space = re.search(r"Available space after arv-keepdocker is (\d+)", out)
                cleanup_space = re.search(r"Available space after cleanup is (\d+)", out)

                if initial_space:
                    isp = int(initial_space.group(1))
                    logger.info("Available space initially: %i MiB", (isp)/(2**20))
                if imgload_space:
                    sp = int(imgload_space.group(1))
                    logger.debug("Used after load: %i MiB", (isp-sp)/(2**20))
                if imgupgrade_space:
                    sp = int(imgupgrade_space.group(1))
                    logger.debug("Used after upgrade: %i MiB", (isp-sp)/(2**20))
                if keepdocker_space:
                    sp = int(keepdocker_space.group(1))
                    logger.info("Used after upload: %i MiB", (isp-sp)/(2**20))
                if cleanup_space:
                    sp = int(cleanup_space.group(1))
                    logger.debug("Available after cleanup: %i MiB", (sp)/(2**20))

                if proc.returncode != 0:
                    logger.error("Failed with return code %i", proc.returncode)
                    logger.error("--- Stdout ---\n%s", out)
                    logger.error("--- Stderr ---\n%s", err)
                    raise MigrationFailed()

                if args.verbose:
                    logger.info("--- Stdout ---\n%s", out)
                    logger.info("--- Stderr ---\n%s", err)

            # On success, migrate.sh prints the uuid of the new collection;
            # record a migration link from old pdh to new pdh.
            migrated = re.search(r"Migrated uuid is ([a-z0-9]{5}-[a-z0-9]{5}-[a-z0-9]{15})", out)
            if migrated:
                newcol = CollectionReader(migrated.group(1))

                api_client.links().create(body={"link": {
                    'owner_uuid': sys_uuid,
                    'link_class': _migration_link_class,
                    'name': _migration_link_name,
                    'tail_uuid': oldcol.portable_data_hash(),
                    'head_uuid': newcol.portable_data_hash()
                    }}).execute(num_retries=3)

                logger.info("Migrated '%s' (%s) to '%s' (%s) in %is",
                            oldcol.portable_data_hash(), old_image["collection"],
                            newcol.portable_data_hash(), migrated.group(1),
                            time.time() - start)
                already_migrated.add(oldcol.portable_data_hash())
                success.append(old_image["collection"])
            else:
                logger.error("Error migrating '%s'", old_image["collection"])
                failures.append(old_image["collection"])
        except Exception as e:
            # Full traceback only for unexpected errors; MigrationFailed was
            # already logged above.
            logger.error("Failed to migrate %s in %is", old_image["collection"], time.time() - start,
                         exc_info=(not isinstance(e, MigrationFailed)))
            failures.append(old_image["collection"])
        finally:
            shutil.rmtree(varlibdocker)
            shutil.rmtree(dockercache)

    logger.info("Successfully migrated %i images", len(success))
    if failures:
        logger.error("Failed to migrate %i images", len(failures))
def zipball_extract(zipball, path):
    """Retrieve a zip archive from Keep and extract it to a local directory.

    Return the absolute path where the archive was extracted. If the top
    level of the archive contained just one file or directory, return the
    absolute path of that single item.

    zipball -- collection locator
    path -- where to extract the archive: absolute, or relative to job tmp
    """
    if not path.startswith('/'):
        path = os.path.join(arvados.current_job().tmpdir, path)
    locator_path = os.path.join(path, '.locator')
    # Exclusive lock serializes extraction across concurrent tasks sharing
    # this directory.
    lockfile = open(path + '.lock', 'w')
    fcntl.flock(lockfile, fcntl.LOCK_EX)
    try:
        try:
            os.stat(path)
        except OSError:
            os.mkdir(path)
        # The '.locator' symlink records which archive is already extracted
        # here; if it matches, reuse the cached extraction.
        already_have_it = False
        try:
            if os.readlink(locator_path) == zipball:
                already_have_it = True
        except OSError:
            pass
        if not already_have_it:
            # emulate "rm -f" (i.e., if the file does not exist, we win)
            try:
                os.unlink(locator_path)
            except OSError:
                if os.path.exists(locator_path):
                    os.unlink(locator_path)
            for f in CollectionReader(zipball).all_files():
                # Raw string: non-raw '\.zip$' is an invalid escape sequence
                # in modern Python.
                if not re.search(r'\.zip$', f.name()):
                    raise arvados.errors.NotImplementedError(
                        "zipball_extract cannot handle filename %s" % f.name())
                zip_filename = os.path.join(path, os.path.basename(f.name()))
                # Copy the archive out of Keep in 1 MiB chunks; 'with'
                # guarantees the local copy is closed even on error.
                with open(zip_filename, 'wb') as zip_file:
                    while True:
                        buf = f.read(2**20)
                        if len(buf) == 0:
                            break
                        zip_file.write(buf)
                p = subprocess.Popen(
                    ["unzip", "-q", "-o", "-d", path, zip_filename],
                    stdout=None, stdin=None, stderr=sys.stderr,
                    shell=False, close_fds=True)
                p.wait()
                if p.returncode != 0:
                    raise arvados.errors.CommandFailedError(
                        "unzip exited %d" % p.returncode)
                os.unlink(zip_filename)
            os.symlink(zipball, locator_path)
        tld_extracts = [f for f in os.listdir(path) if f != '.locator']
    finally:
        # The original leaked the lock file on some error paths (e.g. the
        # NotImplementedError above); always close it.
        lockfile.close()
    if len(tld_extracts) == 1:
        return os.path.join(path, tld_extracts[0])
    return path
def collection_extract(collection, path, files=None, decompress=True):
    """Retrieve a collection from Keep and extract it to a local directory.

    Return the absolute path where the collection was extracted.

    collection -- collection locator
    path -- where to extract: absolute, or relative to job tmp
    files -- optional list of file names to extract (default: all files)
    decompress -- write decompressed copies of compressed files
    """
    # Bug fix: the original declared `files=[]`, a mutable default argument
    # shared across calls; use None as the sentinel instead.
    if files is None:
        files = []
    matches = re.search(r'^([0-9a-f]+)(\+[\w@]+)*$', collection)
    if matches:
        collection_hash = matches.group(1)
    else:
        # Not a bare locator: key the local cache on a digest of the text.
        # NOTE(review): hashlib.md5() requires bytes on Python 3 — confirm
        # callers reach this branch only with Python 2 str input.
        collection_hash = hashlib.md5(collection).hexdigest()
    if not path.startswith('/'):
        path = os.path.join(arvados.current_job().tmpdir, path)
    # Exclusive lock serializes extraction across concurrent tasks sharing
    # this directory.
    lockfile = open(path + '.lock', 'w')
    fcntl.flock(lockfile, fcntl.LOCK_EX)
    try:
        try:
            os.stat(path)
        except OSError:
            os.mkdir(path)
        # NOTE(review): already_have_it is computed but never consulted
        # (matching the original); caching happens per-file via the
        # os.path.exists check below.
        already_have_it = False
        try:
            if os.readlink(os.path.join(path, '.locator')) == collection_hash:
                already_have_it = True
        except OSError:
            pass
        # emulate "rm -f" (i.e., if the file does not exist, we win)
        try:
            os.unlink(os.path.join(path, '.locator'))
        except OSError:
            if os.path.exists(os.path.join(path, '.locator')):
                os.unlink(os.path.join(path, '.locator'))
        files_got = []
        for s in CollectionReader(collection).all_streams():
            stream_name = s.name()
            for f in s.all_files():
                if (files == [] or
                    ((f.name() not in files_got) and
                     (f.name() in files or
                      (decompress and f.decompressed_name() in files)))):
                    outname = f.decompressed_name() if decompress else f.name()
                    files_got += [outname]
                    # Skip files already extracted on a previous run.
                    if os.path.exists(os.path.join(path, stream_name, outname)):
                        continue
                    mkdir_dash_p(
                        os.path.dirname(os.path.join(path, stream_name, outname)))
                    # 'with' guarantees the output file is closed even if a
                    # read from Keep fails mid-stream.
                    with open(os.path.join(path, stream_name, outname), 'wb') as outfile:
                        for buf in (f.readall_decompressed() if decompress
                                    else f.readall()):
                            outfile.write(buf)
        if len(files_got) < len(files):
            raise arvados.errors.AssertionError(
                "Wanted files %s but only got %s from %s" %
                (files, files_got,
                 [z.name() for z in CollectionReader(collection).all_files()]))
        os.symlink(collection_hash, os.path.join(path, '.locator'))
    finally:
        # The original leaked the lock file if extraction raised; always
        # close it.
        lockfile.close()
    return path
def main(arguments=None):
    """Docker image format migration tool for Arvados.

    This converts Docker images stored in Arvados from image format v1
    (Docker <= 1.9) to image format v2 (Docker >= 1.10).

    Requires Docker running on the local host.

    Usage:

    1) Run arvados/docker/migrate-docker19/build.sh to create
    arvados/migrate-docker19 Docker image.

    2) Set ARVADOS_API_HOST and ARVADOS_API_TOKEN to the cluster you want
    to migrate.

    3) Run arv-migrate-docker19 from the Arvados Python SDK on the host (not
    in a container).

    This will query Arvados for v1 format Docker images.  For each image that
    does not already have a corresponding v2 format image (as indicated by a
    docker_image_migration tag) it will perform the following process:

    i) download the image from Arvados
    ii) load it into Docker
    iii) update the Docker version, which updates the image
    iv) save the v2 format image and upload to Arvados
    v) create a migration link
    """
    # --- Command line handling ---
    migrate19_parser = argparse.ArgumentParser()
    migrate19_parser.add_argument('--version',
                                  action='version',
                                  version="%s %s" % (sys.argv[0], __version__),
                                  help='Print version and exit.')
    migrate19_parser.add_argument('--verbose',
                                  action="store_true",
                                  help="Print stdout/stderr even on success")
    migrate19_parser.add_argument(
        '--force',
        action="store_true",
        help="Try to migrate even if there isn't enough space")
    migrate19_parser.add_argument(
        '--storage-driver',
        type=str,
        default="overlay",
        help="Docker storage driver, e.g. aufs, overlay, vfs")
    exgroup = migrate19_parser.add_mutually_exclusive_group()
    exgroup.add_argument('--dry-run',
                         action='store_true',
                         help="Print number of pending migrations.")
    exgroup.add_argument('--print-unmigrated',
                         action='store_true',
                         default=False,
                         help="Print list of images needing migration.")
    migrate19_parser.add_argument('--tempdir', help="Set temporary directory")
    migrate19_parser.add_argument('infile',
                                  nargs='?',
                                  type=argparse.FileType('r'),
                                  default=None,
                                  help="List of images to be migrated")

    args = migrate19_parser.parse_args(arguments)

    if args.tempdir:
        tempfile.tempdir = args.tempdir

    if args.verbose:
        logger.setLevel(logging.DEBUG)

    # Optional input file restricts the run to the listed portable data
    # hashes, one per line.
    only_migrate = None
    if args.infile:
        only_migrate = set()
        for l in args.infile:
            only_migrate.add(l.strip())

    # Migration links are created under the system user, so an admin token
    # is required.
    api_client = arvados.api()
    user = api_client.users().current().execute()
    if not user['is_admin']:
        raise Exception("This command requires an admin token")
    sys_uuid = user['uuid'][:12] + '000000000000000'

    # Old-format images are the ones whose hash does NOT start with
    # "sha256:" (v2-format hashes do).
    images = arvados.commands.keepdocker.list_images_in_arv(api_client, 3)
    is_new = lambda img: img['dockerhash'].startswith('sha256:')

    count_new = 0
    old_images = []
    for uuid, img in images:
        if img["dockerhash"].startswith("sha256:"):
            continue
        key = (img["repo"], img["tag"], img["timestamp"])
        old_images.append(img)

    # Images that already have a docker_image_migration link are done.
    migration_links = arvados.util.list_all(
        api_client.links().list,
        filters=[
            ['link_class', '=', _migration_link_class],
            ['name', '=', _migration_link_name],
        ])

    already_migrated = set()
    for m in migration_links:
        already_migrated.add(m["tail_uuid"])

    items = arvados.util.list_all(
        api_client.collections().list,
        filters=[["uuid", "in", [img["collection"] for img in old_images]]],
        select=["uuid", "portable_data_hash", "manifest_text", "owner_uuid"])
    uuid_to_collection = {i["uuid"]: i for i in items}

    # Build the pending set; track total bytes and the biggest single image
    # to estimate temp-space requirements.
    need_migrate = {}
    totalbytes = 0
    biggest = 0
    biggest_pdh = None
    for img in old_images:
        i = uuid_to_collection[img["collection"]]
        pdh = i["portable_data_hash"]
        if pdh not in already_migrated and pdh not in need_migrate and (
                only_migrate is None or pdh in only_migrate):
            need_migrate[pdh] = img
            with CollectionReader(i["manifest_text"]) as c:
                size = list(c.values())[0].size()
                if size > biggest:
                    biggest = size
                    biggest_pdh = pdh
                totalbytes += size

    # vfs stores a full copy per layer, so it needs far more headroom than
    # union filesystems.
    if args.storage_driver == "vfs":
        will_need = (biggest * 20)
    else:
        will_need = (biggest * 2.5)

    if args.print_unmigrated:
        only_migrate = set()
        for pdh in need_migrate:
            print(pdh)
        return

    logger.info("Already migrated %i images", len(already_migrated))
    logger.info("Need to migrate %i images", len(need_migrate))
    logger.info("Using tempdir %s", tempfile.gettempdir())
    logger.info("Biggest image %s is about %i MiB", biggest_pdh, biggest >> 20)
    logger.info("Total data to migrate about %i MiB", totalbytes >> 20)

    # Parse the second line of `df -B1` output to get free space on the
    # temp filesystem.
    # NOTE(review): under Python 3, check_output returns bytes while the
    # pattern below is a str — confirm this runs on Python 2 or that df_out
    # is decoded somewhere not visible here.
    df_out = subprocess.check_output(["df", "-B1", tempfile.gettempdir()])
    ln = df_out.splitlines()[1]
    filesystem, blocks, used, available, use_pct, mounted = re.match(
        r"^([^ ]+) *([^ ]+) *([^ ]+) *([^ ]+) *([^ ]+) *([^ ]+)",
        ln).groups(1)
    if int(available) <= will_need:
        logger.warn(
            "Temp filesystem mounted at %s does not have enough space for biggest image (has %i MiB, needs %i MiB)",
            mounted,
            int(available) >> 20,
            will_need >> 20)
        if not args.force:
            exit(1)
        else:
            logger.warn("--force provided, will migrate anyway")

    if args.dry_run:
        return

    success = []
    failures = []
    count = 1
    for old_image in list(need_migrate.values()):
        if uuid_to_collection[old_image["collection"]][
                "portable_data_hash"] in already_migrated:
            continue

        oldcol = CollectionReader(
            uuid_to_collection[old_image["collection"]]["manifest_text"])
        tarfile = list(oldcol.keys())[0]

        logger.info("[%i/%i] Migrating %s:%s (%s) (%i MiB)", count,
                    len(need_migrate), old_image["repo"], old_image["tag"],
                    old_image["collection"],
                    list(oldcol.values())[0].size() >> 20)
        count += 1
        start = time.time()

        # Each image is migrated in a privileged container with throwaway
        # /var/lib/docker and arv-keepdocker cache directories.
        varlibdocker = tempfile.mkdtemp()
        dockercache = tempfile.mkdtemp()
        try:
            with tempfile.NamedTemporaryFile() as envfile:
                # Pass Arvados credentials into the migration container via
                # docker's --env-file.
                envfile.write("ARVADOS_API_HOST=%s\n" %
                              (arvados.config.get("ARVADOS_API_HOST")))
                envfile.write("ARVADOS_API_TOKEN=%s\n" %
                              (arvados.config.get("ARVADOS_API_TOKEN")))
                if arvados.config.get("ARVADOS_API_HOST_INSECURE"):
                    envfile.write(
                        "ARVADOS_API_HOST_INSECURE=%s\n" %
                        (arvados.config.get("ARVADOS_API_HOST_INSECURE")))
                envfile.flush()

                dockercmd = [
                    "docker", "run", "--privileged", "--rm", "--env-file",
                    envfile.name, "--volume",
                    "%s:/var/lib/docker" % varlibdocker, "--volume",
                    "%s:/root/.cache/arvados/docker" % dockercache,
                    "arvados/migrate-docker19:1.0", "/root/migrate.sh",
                    "%s/%s" % (old_image["collection"], tarfile),
                    tarfile[0:40], old_image["repo"], old_image["tag"],
                    uuid_to_collection[old_image["collection"]]["owner_uuid"],
                    args.storage_driver
                ]

                proc = subprocess.Popen(dockercmd,
                                        stdout=subprocess.PIPE,
                                        stderr=subprocess.PIPE)
                out, err = proc.communicate()

                # migrate.sh reports free space at each stage; scrape those
                # numbers for progress logging.
                initial_space = re.search(r"Initial available space is (\d+)",
                                          out)
                imgload_space = re.search(
                    r"Available space after image load is (\d+)", out)
                imgupgrade_space = re.search(
                    r"Available space after image upgrade is (\d+)", out)
                keepdocker_space = re.search(
                    r"Available space after arv-keepdocker is (\d+)", out)
                cleanup_space = re.search(
                    r"Available space after cleanup is (\d+)", out)

                if initial_space:
                    isp = int(initial_space.group(1))
                    logger.info("Available space initially: %i MiB",
                                (isp) / (2**20))
                if imgload_space:
                    sp = int(imgload_space.group(1))
                    logger.debug("Used after load: %i MiB",
                                 (isp - sp) / (2**20))
                if imgupgrade_space:
                    sp = int(imgupgrade_space.group(1))
                    logger.debug("Used after upgrade: %i MiB",
                                 (isp - sp) / (2**20))
                if keepdocker_space:
                    sp = int(keepdocker_space.group(1))
                    logger.info("Used after upload: %i MiB",
                                (isp - sp) / (2**20))
                if cleanup_space:
                    sp = int(cleanup_space.group(1))
                    logger.debug("Available after cleanup: %i MiB",
                                 (sp) / (2**20))

                if proc.returncode != 0:
                    logger.error("Failed with return code %i", proc.returncode)
                    logger.error("--- Stdout ---\n%s", out)
                    logger.error("--- Stderr ---\n%s", err)
                    raise MigrationFailed()

                if args.verbose:
                    logger.info("--- Stdout ---\n%s", out)
                    logger.info("--- Stderr ---\n%s", err)

            # On success, migrate.sh prints the uuid of the new collection;
            # record a migration link from old pdh to new pdh.
            migrated = re.search(
                r"Migrated uuid is ([a-z0-9]{5}-[a-z0-9]{5}-[a-z0-9]{15})",
                out)
            if migrated:
                newcol = CollectionReader(migrated.group(1))

                api_client.links().create(
                    body={
                        "link": {
                            'owner_uuid': sys_uuid,
                            'link_class': _migration_link_class,
                            'name': _migration_link_name,
                            'tail_uuid': oldcol.portable_data_hash(),
                            'head_uuid': newcol.portable_data_hash()
                        }
                    }).execute(num_retries=3)

                logger.info("Migrated '%s' (%s) to '%s' (%s) in %is",
                            oldcol.portable_data_hash(),
                            old_image["collection"],
                            newcol.portable_data_hash(), migrated.group(1),
                            time.time() - start)
                already_migrated.add(oldcol.portable_data_hash())
                success.append(old_image["collection"])
            else:
                logger.error("Error migrating '%s'", old_image["collection"])
                failures.append(old_image["collection"])
        except Exception as e:
            # Full traceback only for unexpected errors; MigrationFailed was
            # already logged above.
            logger.error("Failed to migrate %s in %is",
                         old_image["collection"],
                         time.time() - start,
                         exc_info=(not isinstance(e, MigrationFailed)))
            failures.append(old_image["collection"])
        finally:
            shutil.rmtree(varlibdocker)
            shutil.rmtree(dockercache)

    logger.info("Successfully migrated %i images", len(success))
    if failures:
        logger.error("Failed to migrate %i images", len(failures))
def main(fastq_project, workflows_project, metagenome_workflow_uuid,
         pangenome_workflow_uuid, pangenome_result_col_uuid):
    """Drive one round of the analysis pipeline.

    Scans the fastq project for read collections, submits a metagenome
    workflow for each unprocessed sample, collects per-sample reports from
    completed runs, and (re)submits the pangenome workflow when new results
    arrive.  Progress is persisted between invocations in ./state.json.

    fastq_project -- uuid of the project holding input read collections
    workflows_project -- uuid of the project holding the workflow definitions
    metagenome_workflow_uuid -- workflow to run per sample
    pangenome_workflow_uuid -- workflow combining all sample results
    pangenome_result_col_uuid -- collection updated with pangenome output
    """
    logging.info("Starting a analysis run")

    api = arvados.api('v1', host=ARVADOS_API_HOST, token=ARVADOS_API_TOKEN)
    col = arvados.collection.Collection(api_client=api)

    # Resume per-sample status from the previous run, if any.
    state = {}
    if os.path.exists('state.json'):
        state = json.loads(open('state.json').read())

    reads = arvados.util.list_all(api.collections().list,
                                  filters=[["owner_uuid", "=", fastq_project]])

    pangenome_data = []
    report_data = {'kraken': [], 'mlst': [], 'resistome': [],
                   'virulome': [], 'prokka': []}
    update_pangenome = False
    # NOTE(review): reads[1:] skips the first collection returned — confirm
    # this is intentional (e.g. a non-sample collection in the project).
    for it in reads[1:]:
        col = api.collections().get(uuid=it['uuid']).execute()
        if 'sequence_label' not in it['properties']:
            continue
        sample_id = it['properties']['sequence_label']
        # Samples already marked analyzed just contribute their reports.
        if 'analysis_status' in it['properties']:
            pangenome_data.append((sample_id, col['portable_data_hash']))
            col_reader = CollectionReader(col['uuid'])
            report_data['kraken'].append((sample_id, get_kraken_report(col_reader)))
            report_data['mlst'].append((sample_id, get_mlst_report(col_reader)))
            report_data['resistome'].append((sample_id, get_resistome_report(col_reader)))
            report_data['virulome'].append((sample_id, get_virulome_report(col_reader)))
            report_data['prokka'].append((sample_id, get_prokka_report(col_reader)))
            continue
        if sample_id not in state:
            state[sample_id] = {
                'status': 'new',
                'container_request': None,
                'output_collection': None,
            }
        sample_state = state[sample_id]
        # Per-sample state machine: new -> submitted -> complete (Failed
        # requests are reset to new for retry).
        if sample_state['status'] == 'new':
            container_request, status = submit_new_request(
                api, workflows_project, metagenome_workflow_uuid, sample_id,
                it['portable_data_hash'])
            sample_state['status'] = status
            sample_state['container_request'] = container_request
            logging.info('Submitted analysis request for %s', sample_id)
        elif sample_state['status'] == 'submitted':
            # TODO: check container request status
            if sample_state['container_request'] is None:
                raise Exception("Container request cannot be empty when status is submitted")
            cr = api.container_requests().get(
                uuid=sample_state["container_request"]).execute()
            cr_state = get_cr_state(api, cr)
            logging.info('Container request for %s is %s', sample_id, cr_state)
            if cr_state == 'Complete':
                out_col = api.collections().get(uuid=cr["output_uuid"]).execute()
                sample_state['output_collection'] = cr["output_uuid"]
                sample_state['status'] = 'complete'
                # Copy output files to reads collection
                it['properties']['analysis_status'] = 'complete'
                api.collections().update(
                    uuid=it['uuid'],
                    body={"manifest_text": col["manifest_text"] + out_col["manifest_text"],
                          "properties": it["properties"]}).execute()
                pangenome_data.append((sample_id, col['portable_data_hash']))
                update_pangenome = True
            elif cr_state == 'Failed':
                # Reset so the sample is resubmitted on the next pass.
                state[sample_id] = {
                    'status': 'new',
                    'container_request': None,
                    'output_collection': None,
                }
        elif sample_state['status'] == 'complete':
            # TODO: do nothing
            pass

    if update_pangenome:
        # New sample results: kick off a fresh pangenome run.
        container_request, status = submit_pangenome(api, workflows_project,
                                                     pangenome_workflow_uuid,
                                                     pangenome_data)
        if status == 'submitted':
            state['last_pangenome_request'] = container_request
            state['last_pangenome_request_status'] = 'submitted'
            logging.info('Submitted pangenome request %s', container_request)
    else:
        # No new samples: check on the previous pangenome run and publish
        # its output when it finishes.
        cr = api.container_requests().get(
            uuid=state["last_pangenome_request"]).execute()
        cr_state = get_cr_state(api, cr)
        logging.info('Container request for pangenome workflow is %s', cr_state)
        if state['last_pangenome_request_status'] == 'submitted' and cr_state == 'Complete':
            logging.info('Updating results collection')
            out_col = api.collections().get(uuid=cr["output_uuid"]).execute()
            api.collections().update(
                uuid=pangenome_result_col_uuid,
                body={"manifest_text": out_col["manifest_text"]}).execute()
            state['last_pangenome_request_status'] = 'complete'

    col_reader = CollectionReader(pangenome_result_col_uuid)
    report_data["iqtree"] = get_iqtree_result(col_reader)
    report_data["roary_svg"] = get_roary_svg(col_reader)
    report_data["roary_stats"] = get_roary_stats(col_reader)
    generate_report(report_data)

    # Persist state for the next invocation.
    with open('state.json', 'w') as f:
        f.write(json.dumps(state))
# NOTE(review): legacy Python 2 variant of the migration tool (uses the
# `print` statement and direct dict.values()/keys() indexing); kept
# byte-identical here.
def main(arguments=None):
    """Docker image format migration tool for Arvados.

    This converts Docker images stored in Arvados from image format v1
    (Docker <= 1.9) to image format v2 (Docker >= 1.10).

    Requires Docker running on the local host.

    Usage:

    1) Run arvados/docker/migrate-docker19/build.sh to create
    arvados/migrate-docker19 Docker image.

    2) Set ARVADOS_API_HOST and ARVADOS_API_TOKEN to the cluster you want
    to migrate.

    3) Run arv-migrate-docker19 from the Arvados Python SDK on the host (not
    in a container).

    This will query Arvados for v1 format Docker images.  For each image that
    does not already have a corresponding v2 format image (as indicated by a
    docker_image_migration tag) it will perform the following process:

    i) download the image from Arvados
    ii) load it into Docker
    iii) update the Docker version, which updates the image
    iv) save the v2 format image and upload to Arvados
    v) create a migration link
    """
    # --- Command line handling ---
    migrate19_parser = argparse.ArgumentParser()
    migrate19_parser.add_argument('--version',
                                  action='version',
                                  version="%s %s" % (sys.argv[0], __version__),
                                  help='Print version and exit.')
    exgroup = migrate19_parser.add_mutually_exclusive_group()
    exgroup.add_argument('--dry-run',
                         action='store_true',
                         help="Print number of pending migrations.")
    exgroup.add_argument('--print-unmigrated',
                         action='store_true',
                         default=False,
                         help="Print list of images needing migration.")
    migrate19_parser.add_argument('--tempdir', help="Set temporary directory")
    migrate19_parser.add_argument('infile',
                                  nargs='?',
                                  type=argparse.FileType('r'),
                                  default=None,
                                  help="List of images to be migrated")

    args = migrate19_parser.parse_args(arguments)

    if args.tempdir:
        tempfile.tempdir = args.tempdir

    # Optional input file restricts the run to the listed portable data
    # hashes, one per line.
    only_migrate = None
    if args.infile:
        only_migrate = set()
        for l in args.infile:
            only_migrate.add(l.strip())

    # Migration links are created under the system user, so an admin token
    # is required.
    api_client = arvados.api()
    user = api_client.users().current().execute()
    if not user['is_admin']:
        raise Exception("This command requires an admin token")
    sys_uuid = user['uuid'][:12] + '000000000000000'

    # Old-format images are the ones whose hash does NOT start with
    # "sha256:" (v2-format hashes do).
    images = arvados.commands.keepdocker.list_images_in_arv(api_client, 3)
    is_new = lambda img: img['dockerhash'].startswith('sha256:')

    count_new = 0
    old_images = []
    for uuid, img in images:
        if img["dockerhash"].startswith("sha256:"):
            continue
        key = (img["repo"], img["tag"], img["timestamp"])
        old_images.append(img)

    # Images that already have a docker_image_migration link are done.
    migration_links = arvados.util.list_all(
        api_client.links().list,
        filters=[
            ['link_class', '=', _migration_link_class],
            ['name', '=', _migration_link_name],
        ])

    already_migrated = set()
    for m in migration_links:
        already_migrated.add(m["tail_uuid"])

    items = arvados.util.list_all(
        api_client.collections().list,
        filters=[["uuid", "in", [img["collection"] for img in old_images]]],
        select=["uuid", "portable_data_hash", "manifest_text", "owner_uuid"])
    uuid_to_collection = {i["uuid"]: i for i in items}

    # Build the pending set and track the biggest single image to estimate
    # temp-space requirements.
    need_migrate = {}
    biggest = 0
    for img in old_images:
        i = uuid_to_collection[img["collection"]]
        pdh = i["portable_data_hash"]
        if pdh not in already_migrated and (only_migrate is None
                                            or pdh in only_migrate):
            need_migrate[pdh] = img
            with CollectionReader(i["manifest_text"]) as c:
                if c.values()[0].size() > biggest:
                    biggest = c.values()[0].size()

    if args.print_unmigrated:
        only_migrate = set()
        for pdh in need_migrate:
            print pdh
        return

    logger.info("Already migrated %i images", len(already_migrated))
    logger.info("Need to migrate %i images", len(need_migrate))
    logger.info("Using tempdir %s", tempfile.gettempdir())
    logger.info(
        "Biggest image is about %i MiB, tempdir needs at least %i MiB free",
        biggest / (2**20), (biggest * 2) / (2**20))

    if args.dry_run:
        return

    success = []
    failures = []
    count = 1
    for old_image in need_migrate.values():
        if uuid_to_collection[old_image["collection"]][
                "portable_data_hash"] in already_migrated:
            continue

        oldcol = CollectionReader(
            uuid_to_collection[old_image["collection"]]["manifest_text"])
        tarfile = oldcol.keys()[0]

        logger.info("[%i/%i] Migrating %s:%s (%s) (%i MiB)", count,
                    len(need_migrate), old_image["repo"], old_image["tag"],
                    old_image["collection"],
                    oldcol.values()[0].size() / (2**20))
        count += 1
        start = time.time()

        # Each image is migrated in a privileged container with throwaway
        # /var/lib/docker and arv-keepdocker cache directories.
        varlibdocker = tempfile.mkdtemp()
        dockercache = tempfile.mkdtemp()
        try:
            with tempfile.NamedTemporaryFile() as envfile:
                # Pass Arvados credentials into the migration container via
                # docker's --env-file.
                envfile.write("ARVADOS_API_HOST=%s\n" %
                              (os.environ["ARVADOS_API_HOST"]))
                envfile.write("ARVADOS_API_TOKEN=%s\n" %
                              (os.environ["ARVADOS_API_TOKEN"]))
                if "ARVADOS_API_HOST_INSECURE" in os.environ:
                    envfile.write("ARVADOS_API_HOST_INSECURE=%s\n" %
                                  (os.environ["ARVADOS_API_HOST_INSECURE"]))
                envfile.flush()

                dockercmd = [
                    "docker", "run", "--privileged", "--rm", "--env-file",
                    envfile.name, "--volume",
                    "%s:/var/lib/docker" % varlibdocker, "--volume",
                    "%s:/root/.cache/arvados/docker" % dockercache,
                    "arvados/migrate-docker19", "/root/migrate.sh",
                    "%s/%s" % (old_image["collection"], tarfile),
                    tarfile[0:40], old_image["repo"], old_image["tag"],
                    uuid_to_collection[old_image["collection"]]["owner_uuid"]
                ]

                proc = subprocess.Popen(dockercmd,
                                        stdout=subprocess.PIPE,
                                        stderr=subprocess.PIPE)
                out, err = proc.communicate()

                if proc.returncode != 0:
                    logger.error("Failed with return code %i", proc.returncode)
                    logger.error("--- Stdout ---\n%s", out)
                    logger.error("--- Stderr ---\n%s", err)
                    raise MigrationFailed()

            # On success, migrate.sh prints the uuid of the new collection;
            # record a migration link from old pdh to new pdh.
            migrated = re.search(
                r"Migrated uuid is ([a-z0-9]{5}-[a-z0-9]{5}-[a-z0-9]{15})",
                out)
            if migrated:
                newcol = CollectionReader(migrated.group(1))

                api_client.links().create(
                    body={
                        "link": {
                            'owner_uuid': sys_uuid,
                            'link_class': _migration_link_class,
                            'name': _migration_link_name,
                            'tail_uuid': oldcol.portable_data_hash(),
                            'head_uuid': newcol.portable_data_hash()
                        }
                    }).execute(num_retries=3)

                logger.info("Migrated '%s' (%s) to '%s' (%s) in %is",
                            oldcol.portable_data_hash(),
                            old_image["collection"],
                            newcol.portable_data_hash(), migrated.group(1),
                            time.time() - start)
                already_migrated.add(oldcol.portable_data_hash())
                success.append(old_image["collection"])
            else:
                logger.error("Error migrating '%s'", old_image["collection"])
                failures.append(old_image["collection"])
        except Exception as e:
            # Full traceback only for unexpected errors; MigrationFailed was
            # already logged above.
            logger.error("Failed to migrate %s in %is",
                         old_image["collection"],
                         time.time() - start,
                         exc_info=(not isinstance(e, MigrationFailed)))
            failures.append(old_image["collection"])
        finally:
            shutil.rmtree(varlibdocker)
            shutil.rmtree(dockercache)

    logger.info("Successfully migrated %i images", len(success))
    if failures:
        logger.error("Failed to migrate %i images", len(failures))
def main(fastq_project, workflows_project, metagenome_workflow_uuid,
         pangenome_workflow_uuid, pangenome_result_col_uuid,
         fastq_result_project):
    """Drive one polling cycle of the per-sample analysis pipeline.

    For each read collection in *fastq_project* (keyed by its
    ``sequence_label`` property) this:
      * submits a metagenome workflow for new samples (max 10 per run),
      * polls previously submitted container requests, recording outputs
        and resetting failed samples to 'new',
      * for completed samples, harvests report fragments (kraken, mlst,
        resistome, virulome, prokka) and accumulates pangenome inputs,
      * optionally submits/updates the pangenome workflow and regenerates
        the combined report.

    Progress is persisted to ``state.json`` in the working directory, so
    the function is safe to re-run; it always rewrites that file on exit,
    even after an error.

    fastq_project -- UUID of the project holding input read collections
    workflows_project -- UUID of the project holding workflow definitions
    metagenome_workflow_uuid -- workflow to run per sample
    pangenome_workflow_uuid -- workflow combining all sample outputs
    pangenome_result_col_uuid -- collection that receives pangenome output
    fastq_result_project -- unused here; kept for interface compatibility
    """
    logging.info("Starting a analysis run")

    api = arvados.api('v1', host=ARVADOS_API_HOST, token=ARVADOS_API_TOKEN)
    col = arvados.collection.Collection(api_client=api, num_retries=5)
    # Resume from previous run if a state file exists.
    state = {}
    if os.path.exists('state.json'):
        with open('state.json') as f:
            state = json.load(f)
    reads = arvados.util.list_all(
        api.collections().list,
        filters=[["owner_uuid", "=", fastq_project]])
    pangenome_data = []
    report_data = {
        'kraken': [],
        'mlst': [],
        'resistome': [],
        'virulome': [],
        'prokka': []
    }
    update_pangenome = False
    proc_cnt = 0
    # Samples excluded from the pangenome because of known data problems.
    bad_samples = set([
        'MRSA095', 'MRSA096', 'MRSA097', 'MRSA098', 'MRSA099', 'MRSA100',
        'MRSA101', 'MRSA102', 'MRSA117', 'MRSA118', 'MRSA124', 'MRSA133',
        'MRSA187', 'MRSA261', 'MRSA314', 'MRSA355', 'MRSA357', 'MRSA360',
        'MRSA361', 'MRSA390', 'MRSA420', 'MRSA422', 'MRSA477', 'MRSA028',
        'MRSA070', 'MRSA116', 'MRSA179', 'MRSA243', 'MRSA270', 'MRSA372',
        'MRSA384', 'MRSA413', 'MRSA442', 'MRSA478', 'MRSA480', 'MRSA481',
        'MRSA490', 'MRSA491', 'MRSA500', 'MRSA501', 'MRSA502', 'MRSA503',
        'MRSA088', 'MRSA112', 'MRSA260',
    ])
    # Pre-bind so the except handler below can't hit a NameError when a
    # failure happens before the first loop iteration.
    sample_state = None
    try:
        # NOTE(review): reads[0] is deliberately skipped — presumably a
        # non-sample collection; confirm against the project layout.
        for it in reads[1:]:
            col = api.collections().get(uuid=it['uuid']).execute()
            if 'sequence_label' not in it['properties']:
                continue
            sample_id = it['properties']['sequence_label']
            if sample_id not in state:
                state[sample_id] = {
                    'status': 'new',
                    'container_request': None,
                    'output_collection': None,
                }
            sample_state = state[sample_id]
            if sample_state['status'] == 'complete':
                out_col = api.collections().get(
                    uuid=sample_state['output_collection']).execute()
                if sample_id not in bad_samples:
                    pangenome_data.append(
                        (sample_id, out_col['portable_data_hash']))
                col_reader = CollectionReader(out_col['uuid'], num_retries=5)
                # print('Saving contigs for', sample_id)
                # save_contigs(sample_id, col_reader)
                # continue
                report_data['kraken'].append(
                    (sample_id, get_kraken_report(col_reader)))
                report_data['mlst'].append(
                    (sample_id, get_mlst_report(col_reader)))
                report_data['resistome'].append(
                    (sample_id, get_resistome_report(col_reader)))
                report_data['virulome'].append(
                    (sample_id, get_virulome_report(col_reader)))
                report_data['prokka'].append(
                    (sample_id, get_prokka_report(col_reader, sample_id)))
            if sample_state['status'] == 'new':
                if proc_cnt == 10:
                    # Do not submit more than 10 jobs
                    continue
                container_request, status = submit_new_request(
                    api, workflows_project, metagenome_workflow_uuid,
                    sample_id, it['portable_data_hash'])
                sample_state['status'] = status
                sample_state['container_request'] = container_request
                print(f'Submitted analysis request for {sample_id}')
                proc_cnt += 1
            elif sample_state['status'] == 'submitted':
                # TODO: check container request status
                if sample_state['container_request'] is None:
                    raise Exception(
                        "Container request cannot be empty when status is submitted"
                    )
                try:
                    cr = api.container_requests().get(
                        uuid=sample_state["container_request"]).execute()
                    cr_state = get_cr_state(api, cr)
                except Exception as e:
                    # Treat lookup errors as a failed request so the
                    # sample gets resubmitted below.
                    print(e)
                    cr_state = 'Failed'
                print(f'Container request for {sample_id} is {cr_state}')
                if cr_state == 'Complete':
                    out_col = api.collections().get(
                        uuid=cr["output_uuid"]).execute()
                    sample_state['output_collection'] = cr["output_uuid"]
                    sample_state['status'] = 'complete'
                    # Copy output files to reads collection
                    it['properties']['analysis_status'] = 'complete'
                    res = api.collections().update(
                        uuid=it['uuid'],
                        body={
                            "properties": it["properties"]
                        }).execute()
                    # update_pangenome = True
                elif cr_state == 'Failed':
                    # Reset so the sample is retried on the next run.
                    state[sample_id] = {
                        'status': 'new',
                        'container_request': None,
                        'output_collection': None,
                    }
            elif sample_state['status'] == 'complete':
                # TODO: do nothing
                pass
        if update_pangenome:
            container_request, status = submit_pangenome(
                api, workflows_project, pangenome_workflow_uuid,
                pangenome_data)
            if status == 'submitted':
                state['last_pangenome_request'] = container_request
                state['last_pangenome_request_status'] = 'submitted'
                print('Submitted pangenome request', container_request)
        else:
            # Poll the last pangenome request; on completion, copy its
            # output manifest into the shared results collection.
            cr = api.container_requests().get(
                uuid=state["last_pangenome_request"]).execute()
            cr_state = get_cr_state(api, cr)
            print(f'Container request for pangenome workflow is {cr_state}')
            if state[
                    'last_pangenome_request_status'] == 'submitted' and cr_state == 'Complete':
                print('Updating results collection')
                out_col = api.collections().get(
                    uuid=cr["output_uuid"]).execute()
                api.collections().update(
                    uuid=pangenome_result_col_uuid,
                    body={
                        "manifest_text": out_col["manifest_text"]
                    }).execute()
                state['last_pangenome_request_status'] = 'complete'
        # Build the combined report from the pangenome result collection.
        col_reader = CollectionReader(pangenome_result_col_uuid, num_retries=5)
        report_data["iqtree"] = get_iqtree_result(col_reader)
        report_data["roary_svg"] = get_roary_svg(col_reader)
        report_data["roary_stats"] = get_roary_stats(col_reader)
        snp_dists, hist_data = get_snp_dists(col_reader)
        report_data["snp_dists"] = snp_dists
        report_data["snp_hist"] = {
            'nums': json.dumps(hist_data),
            'start': 0,
            'end': max(hist_data)
        }
        report_data["core"] = get_core_genome(col_reader)
        generate_report(report_data)
    except Exception as e:
        # Best-effort diagnostics; state is still persisted below.
        print(sample_state)
        traceback.print_exc()
    with open('state.json', 'w') as f:
        f.write(json.dumps(state))
def main(fastq_project):
    """Collect lab vs. genotype resistance data for completed samples.

    Walks the read collections in *fastq_project*, and for every sample
    already marked 'complete' in ``state.json``:
      * reads the resistome report from the sample's output collection
        and accumulates it into a genotype table,
      * loads the sample's phenotype metadata YAML and splits its
        susceptibility entries into resistant vs. sensitive drug lists
        (interpretation PATO_0001178 == resistant — TODO confirm).

    Results are pickled to ``data/lab_resistance.pkl`` (lab phenotypes)
    and ``data/gt_resistance.pkl`` (genotypes). Read-only with respect to
    ``state.json``; samples not present in the state file are skipped.

    fastq_project -- UUID of the project holding input read collections
    """
    api = arvados.api('v1', host=ARVADOS_API_HOST, token=ARVADOS_API_TOKEN)
    col = arvados.collection.Collection(api_client=api, num_retries=5)
    # Resume state written by the analysis driver; read-only here.
    state = {}
    if os.path.exists('state.json'):
        with open('state.json') as f:
            state = json.load(f)
    reads = arvados.util.list_all(
        api.collections().list,
        filters=[["owner_uuid", "=", fastq_project]])
    pangenome_data = []
    report_data = {
        'kraken': [],
        'mlst': [],
        'resistome': [],
        'virulome': [],
        'prokka': []
    }
    update_pangenome = True
    proc_cnt = 0
    # Samples excluded because of known data problems.
    bad_samples = set([
        'MRSA095', 'MRSA096', 'MRSA097', 'MRSA098', 'MRSA099', 'MRSA100',
        'MRSA101', 'MRSA102', 'MRSA117', 'MRSA118', 'MRSA124', 'MRSA133',
        'MRSA187', 'MRSA261', 'MRSA314', 'MRSA355', 'MRSA357', 'MRSA360',
        'MRSA361', 'MRSA390', 'MRSA420', 'MRSA422', 'MRSA477'
    ])
    # Build a reverse map from drug IRI -> short drug name, merging the
    # curated drugs.yml with the uploader's antimicrobial_agent options.
    drug_names = {}
    with open('data/drugs.yml') as f:
        drugs = yaml.load(f, Loader=yaml.FullLoader)
        drugs = drugs['drugs']
        for key, value in drugs.items():
            drug_names[value] = key
    with open('uploader/options.yml') as f:
        options = yaml.load(f, Loader=yaml.FullLoader)
        drugs_list = options['antimicrobial_agent']
        for key, value in drugs_list.items():
            drug_names[value] = key
            drugs[key] = value
    # Pre-bind so the except handler below can't hit a NameError when a
    # failure happens before the first loop iteration.
    sample_state = None
    try:
        all_drugs = set()
        fp = 0
        tp = 0
        genotype = {'samples': []}
        labs_resist = []
        labs_sensitive = []
        samples = []
        # NOTE(review): reads[0] is deliberately skipped — presumably a
        # non-sample collection; confirm against the project layout.
        for it in reads[1:]:
            col = api.collections().get(uuid=it['uuid']).execute()
            if 'sequence_label' not in it['properties']:
                continue
            sample_id = it['properties']['sequence_label']
            if sample_id not in state:
                continue
            sample_state = state[sample_id]
            if sample_state['status'] == 'complete':
                out_col = api.collections().get(
                    uuid=sample_state['output_collection']).execute()
                if sample_id not in bad_samples:
                    col_reader = CollectionReader(out_col['uuid'])
                    res_drugs = get_resistome_report(col_reader)
                    header = res_drugs[0]
                    for item in header:
                        if item not in genotype:
                            genotype[item] = []
                    for item in res_drugs[1:]:
                        genotype['samples'].append(sample_id)
                        # 'col_name' instead of 'col' to avoid shadowing
                        # the collection variable above.
                        for col_name, value in zip(header, item):
                            genotype[col_name].append(value)
                    # drug_ids = set([drug_names[drugs[x]] for x in res_drugs])
                    with open(
                            f'/opt/data-mrsa/metadata/{sample_id}.yaml') as f:
                        metadata = yaml.load(f, Loader=yaml.FullLoader)
                    if 'susceptibility' not in metadata['phenotypes']:
                        continue
                    sus = metadata['phenotypes']['susceptibility']
                    meta_drugs = []
                    sens_drugs = []
                    for item in sus:
                        # PATO_0001178 ("resistant to") marks lab-confirmed
                        # resistance; everything else counts as sensitive.
                        if item['interpretation'] == 'http://purl.obolibrary.org/obo/PATO_0001178':
                            meta_drugs.append(
                                (drug_names[item['antimicrobial_agent']],
                                 item['mic']))
                        else:
                            sens_drugs.append(
                                (drug_names[item['antimicrobial_agent']],
                                 item['mic']))
                    samples.append(sample_id)
                    labs_resist.append(meta_drugs)
                    labs_sensitive.append(sens_drugs)
        df = pd.DataFrame({
            'samples': samples,
            'resistance': labs_resist,
            'sensitive': labs_sensitive
        })
        df.to_pickle('data/lab_resistance.pkl')
        df = pd.DataFrame(genotype)
        df.to_pickle('data/gt_resistance.pkl')
        # print(all_drugs)
        # drugs = []
        # for d in all_drugs:
        #     drugs.append({d: 'http://purl.obolibrary.org/obo/CHEBI_18208'})
        # with open('data/drugs.yml', 'w') as w:
        #     yaml.dump({'drugs': drugs}, w)
    except Exception as e:
        # Best-effort diagnostics before exiting.
        print(sample_state)
        traceback.print_exc()