def emmet(spec, run, issue, sbatch, bb, yes, no_dupe_check, verbose):
    """Command line interface for emmet"""
    logger.setLevel(logging.DEBUG if verbose else logging.INFO)
    ctx = click.get_current_context()
    ctx.ensure_object(dict)

    if not sbatch and bb:
        raise EmmetCliError("Burst buffer only available in SBatch mode (--sbatch).")

    if spec:
        client = calcdb_from_mgrant(spec)
        ctx.obj["CLIENT"] = client
        # ctx.obj["MONGO_HANDLER"] = BufferedMongoHandler(
        #     host=client.host,
        #     port=client.port,
        #     database_name=client.db_name,
        #     username=client.user,
        #     password=client.password,
        #     level=logging.WARNING,
        #     authentication_db=client.db_name,
        #     collection="emmet_logs",
        #     buffer_periodical_flush_timing=False,  # flush manually
        # )
        # logger.addHandler(ctx.obj["MONGO_HANDLER"])
        # coll = ctx.obj["MONGO_HANDLER"].collection
        # ensure_indexes(SETTINGS.log_fields, [coll])

    if run:
        if not issue:
            raise EmmetCliError("Need issue number via --issue!")

        ctx.obj["LOG_STREAM"] = StringIO()
        memory_handler = logging.StreamHandler(ctx.obj["LOG_STREAM"])
        formatter = logging.Formatter(
            "%(asctime)s %(name)-12s %(levelname)-8s %(message)s"
        )
        memory_handler.setFormatter(formatter)
        logger.addHandler(memory_handler)

        CREDENTIALS = os.path.join(os.path.expanduser("~"), ".emmet_credentials")
        if not os.path.exists(CREDENTIALS):
            user = click.prompt("GitHub Username")
            password = click.prompt("GitHub Password", hide_input=True)
            auth = authorize(
                user,
                password,
                ["user", "repo", "gist"],
                "emmet CLI",
                two_factor_callback=opt_prompt,
            )
            with open(CREDENTIALS, "w") as fd:
                fd.write(auth.token)

        with open(CREDENTIALS, "r") as fd:
            token = fd.readline().strip()
        ctx.obj["GH"] = login(token=token)
    else:
        click.secho("DRY RUN! Add --run flag to execute changes.", fg="green")

    install_mp_handler(logger=logger)
def check_pattern(nested_allowed=False):
    ctx = click.get_current_context()
    pattern = ctx.parent.params["pattern"]
    if not nested_allowed and os.sep in pattern:
        raise EmmetCliError(f"Nested pattern ({pattern}) not allowed!")
    elif not pattern.startswith(PREFIX):
        raise EmmetCliError(f"Pattern ({pattern}) only allowed to start with {PREFIX}!")
def get_format(fname):
    if fnmatch(fname, "*.cif*") or fnmatch(fname, "*.mcif*"):
        return "cif"
    elif fnmatch(fname, "*.json*") or fnmatch(fname, "*.mson*"):
        return "json"
    else:
        raise EmmetCliError(f"reading {fname} not supported (yet)")
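# Illustrative usage sketch (not part of the CLI): get_format() only inspects the
# file name, so callers read the contents themselves and pass the detected format
# to pymatgen. `load_structure` below is a hypothetical helper built on
# Structure.from_str, assuming a modern pymatgen import path.
def load_structure(path):
    """Read a CIF/mCIF or JSON/mson file into a pymatgen Structure (illustration)."""
    from pymatgen.core import Structure  # assumed import path for recent pymatgen

    with open(path, "r") as fd:
        contents = fd.read()
    return Structure.from_str(contents, fmt=get_format(os.path.basename(path)))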
def wrapper(*args, **kwargs):
    ret = func(*args, **kwargs)
    ctx = click.get_current_context()
    if not isinstance(ret, ReturnCodes):
        raise EmmetCliError(f"Tracking `{ctx.command_path}` requires ReturnCode!")

    if ctx.grand_parent.params["run"]:
        logger.info(ret.value)
        gh = ctx.grand_parent.obj["GH"]
        user = gh.me().login
        issue_number = ctx.grand_parent.params["issue"]
        issue = gh.issue(
            SETTINGS.tracker["org"], SETTINGS.tracker["repo"], issue_number
        )

        # gists iterator/resource based on latest etag
        ETAG = os.path.join(os.path.expanduser("~"), ".emmet_etag")
        etag = None
        if os.path.exists(ETAG):
            with open(ETAG, "r") as fd:
                etag = fd.readline().strip()

        gists_iterator = gh.gists(etag=etag)
        if gists_iterator.etag != etag:
            with open(ETAG, "w") as fd:
                fd.write(gists_iterator.etag)

        # create or retrieve gist for log files
        gist_name = f"#{issue_number}-{SETTINGS.tracker['repo']}.md"
        for gist in gists_iterator:
            if gist.files and gist_name in gist.files:
                break
        else:
            description = f"Logs for {SETTINGS.tracker['repo']}#{issue_number}"
            files = {gist_name: {"content": issue.html_url}}
            gist = gh.create_gist(description, files, public=False)
            zip_base = gist.html_url.replace(gist.id, user + "/" + gist.id)
            txt = GIST_COMMENT_TEMPLATE.format(
                gist.html_url, zip_base + "/archive/master.zip"
            )
            comment = issue.create_comment(txt)
            logger.info(f"Gist Comment: {comment.html_url}")

        # update gist with logs for new command
        logger.info(f"Log Gist: {gist.html_url}")
        now = str(datetime.now()).replace(" ", "-")
        filename = ctx.command_path.replace(" ", "-") + f"_{now}"
        logs = ctx.grand_parent.obj["LOG_STREAM"]
        gist.edit(files={filename: {"content": logs.getvalue()}})

        if not ctx.grand_parent.params["sbatch"]:
            # add comment for new command
            command = reconstruct_command()
            raw_url = f"{GIST_RAW_URL}/{user}/{gist.id}/raw/{filename}"
            txt = COMMENT_TEMPLATE.format(
                ctx.command_path, ret.value, command, raw_url
            )
            comment = issue.create_comment(txt)
            logger.info(comment.html_url)
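# Sketch of the decorator shape around wrapper() above. The outer `track`
# decorator itself is not part of this excerpt, so this is an assumed,
# simplified reconstruction: a tracked command must return a ReturnCodes
# member, which the wrapper enforces before any GitHub tracking happens.
from functools import wraps


def track_sketch(func):
    """Hypothetical simplified decorator: run the command, then report its result."""

    @wraps(func)
    def _inner(*args, **kwargs):
        ret = func(*args, **kwargs)
        if not isinstance(ret, ReturnCodes):
            raise EmmetCliError(f"Tracking `{func.__name__}` requires ReturnCode!")
        logger.info(ret.value)  # the real wrapper additionally posts gist/issue comments
        return ret

    return _inner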
def count_file_documents(file_obj):
    """Counts how many documents provided BSON file contains"""
    cnt = 0
    while True:
        # Read size of next object.
        size_data = file_obj.read(4)
        if len(size_data) == 0:
            break  # Finished with file normally.
        elif len(size_data) != 4:
            raise EmmetCliError("Invalid BSON: cut off in middle of objsize")
        obj_size = _UNPACK_INT(size_data)[0] - 4
        file_obj.seek(obj_size, os.SEEK_CUR)
        cnt += 1
    file_obj.seek(0)
    return cnt
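# Usage sketch: count_file_documents() relies on BSON framing, where each
# document starts with a little-endian int32 holding its total size, so
# _UNPACK_INT is assumed to be defined along the lines of
# struct.Struct("<i").unpack. The helper below (hypothetical, not part of the
# CLI) counts documents in a gzipped BSON dump the same way prep() does before
# decoding it.
def count_bson_gz_documents(path):
    """Return the number of BSON documents in a .bson.gz archive (illustration)."""
    import gzip  # assumed available; prep() opens archives with gzip the same way

    with gzip.open(path, "rb") as fd:
        return count_file_documents(fd)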
def calc(ctx, specs, nmax, skip):
    """Set up calculations to optimize structures using VASP"""
    if "CLIENT" not in ctx.obj:
        raise EmmetCliError("--spec option required with calc sub-command!")

    collections = {}
    for coll in [ctx.obj["CLIENT"].db.snls, ctx.obj["CLIENT"].db.tasks]:
        collections[coll.full_name] = coll

    # user collections
    for spec in specs:
        client = calcdb_from_mgrant(spec)
        names = client.db.list_collection_names(
            filter={"name": {"$regex": r"(snl|tasks)"}}
        )
        for name in names:
            collections[client.db[name].full_name] = client.db[name]

    for full_name, coll in collections.items():
        logger.debug(f"{coll.count()} docs in {full_name}")

    ctx.obj["COLLECTIONS"] = collections
    ctx.obj["NMAX"] = nmax
    ctx.obj["SKIP"] = skip
def wrapper(*args, **kwargs):
    ctx = click.get_current_context()
    ctx.grand_parent = ctx.parent.parent
    if not ctx.grand_parent.params["sbatch"]:
        return ctx.invoke(func, *args, **kwargs)

    run = ctx.grand_parent.params["run"]
    if run:
        click.secho("SBATCH MODE! Submitting to SLURM queue.", fg="green")

    directory = ctx.parent.params.get("directory")
    if not directory:
        raise EmmetCliError(f"{ctx.parent.command_path} needs --directory option!")

    track_dir = os.path.join(directory, ".emmet")
    if run and not os.path.exists(track_dir):
        os.mkdir(track_dir)
        logger.debug(f"{track_dir} created")

    bb = ctx.grand_parent.params["bb"]
    yes = ctx.grand_parent.params["yes"]
    if bb:
        if not yes:
            click.confirm("Did you run `module unload esslurm`?", abort=True)
        subdir = directory.rsplit(os.sep, 1)[1]
        stage_in = f"#DW stage_in source={directory} "
        stage_in += f"destination=$DW_JOB_STRIPED/{subdir} type=directory"
        script = [
            "#DW jobdw capacity=10TB access_mode=striped type=scratch",
            stage_in,
            "srun hostname",
            "",
        ]
        command = "\n".join(script)
        slurm_kwargs = {
            "qos": "premium",
            "nodes": 1,
            "tasks-per-node": 1,
            "constraint": "haswell",
            "time": "48:00:00",
        }
    else:
        if not yes:
            click.confirm("Did you run `module load esslurm`?", abort=True)
        slurm_kwargs = {
            "qos": "xfer",
            "time": "48:00:00",
            "licenses": "SCRATCH",
            "mem": "30GB",
        }
        command = ""

    s = Slurm(
        ctx.command_path.replace(" ", "-"),
        slurm_kwargs=slurm_kwargs,
        date_in_name=False,
        scripts_dir=track_dir,
        log_dir=track_dir,
        bash_strict=False,
    )
    command += reconstruct_command(sbatch=True)
    slurmpy_stderr = io.StringIO()
    with contextlib.redirect_stderr(slurmpy_stderr):
        s.run(command, _cmd="sbatch" if run else "cat", tries=1)  # 6 days
    ret = slurmpy_stderr.getvalue()[2:-1]
    logger.info("\n" + ret.encode("utf-8").decode("unicode_escape"))
    # TODO add jobid to SUBMITTED.value
    return ReturnCodes.SUBMITTED if run else ReturnCodes.SUCCESS
def parse(task_ids, snl_metas, nproc, store_volumetric_data):  # noqa: C901
    """Parse VASP launchers into tasks"""
    ctx = click.get_current_context()
    if "CLIENT" not in ctx.obj:
        raise EmmetCliError("Use --spec to set target DB for tasks!")

    run = ctx.parent.parent.params["run"]
    nmax = ctx.parent.params["nmax"]
    directory = ctx.parent.params["directory"].rstrip(os.sep)
    tag = os.path.basename(directory)
    target = ctx.obj["CLIENT"]
    snl_collection = target.db.snls_user
    logger.info(
        f"Connected to {target.collection.full_name} with {target.collection.count()} tasks."
    )
    ensure_indexes(
        ["task_id", "tags", "dir_name", "retired_task_id"], [target.collection]
    )

    chunk_size = math.ceil(nmax / nproc)
    if nproc > 1 and nmax <= chunk_size:
        nproc = 1
        logger.warning(
            f"nmax = {nmax} but chunk size = {chunk_size} -> sequential parsing."
        )
    pool = multiprocessing.Pool(processes=nproc)
    gen = VaspDirsGenerator()
    iterator = iterator_slice(gen, chunk_size)  # process in chunks
    queue = deque()
    count = 0

    sep_tid = None
    if task_ids:
        with open(task_ids, "r") as f:
            task_ids = json.load(f)
    else:
        # reserve list of task_ids to avoid collisions during multiprocessing
        # insert empty doc with max ID + 1 into target collection for parallel SLURM jobs
        # NOTE use regex first to reduce size of distinct below 16MB
        q = {"task_id": {"$regex": r"^mp-\d{7,}$"}}
        all_task_ids = [
            t["task_id"] for t in target.collection.find(q, {"_id": 0, "task_id": 1})
        ]
        if not all_task_ids:
            all_task_ids = target.collection.distinct("task_id")

        next_tid = max(int(tid.split("-")[-1]) for tid in all_task_ids) + 1
        lst = [f"mp-{next_tid + n}" for n in range(nmax)]
        task_ids = chunks(lst, chunk_size)

        if run:
            sep_tid = f"mp-{next_tid + nmax}"
            target.collection.insert({"task_id": sep_tid})
            logger.info(f"Inserted separator task with task_id {sep_tid}.")
            logger.info(f"Reserved {len(lst)} task ID(s).")
        else:
            logger.info(f"Would reserve {len(lst)} task ID(s).")

    sep_snlid = None
    if snl_metas:
        with open(snl_metas, "r") as f:
            snl_metas = json.load(f)

        # reserve list of snl_ids to avoid collisions during multiprocessing
        # insert empty doc with max ID + 1 into target collection for parallel SLURM jobs
        all_snl_ids = snl_collection.distinct("snl_id")
        prefixes = set()
        next_snlid = -1

        for snlid in all_snl_ids:
            prefix, index = snlid.split("-", 1)
            index = int(index)
            prefixes.add(prefix)
            if index > next_snlid:
                next_snlid = index

        next_snlid += 1
        prefix = prefixes.pop()  # NOTE use the first prefix found
        nsnls = len(snl_metas)

        for n, launcher in enumerate(snl_metas):
            snl_id = f"{prefix}-{next_snlid + n}"
            snl_metas[launcher]["snl_id"] = snl_id

        if run:
            sep_snlid = f"{prefix}-{next_snlid + nsnls}"
            snl_collection.insert({"snl_id": sep_snlid})
            logger.info(f"Inserted separator SNL with snl_id {sep_snlid}.")
            logger.info(f"Reserved {nsnls} SNL ID(s).")
        else:
            logger.info(f"Would reserve {nsnls} SNL ID(s).")

    while iterator or queue:
        try:
            args = [next(iterator), tag, task_ids, snl_metas]
            queue.append(pool.apply_async(parse_vasp_dirs, args))
        except (StopIteration, TypeError):
            iterator = None

        while queue and (len(queue) >= pool._processes or not iterator):
            process = queue.pop()
            process.wait(1)
            if not process.ready():
                queue.append(process)
            else:
                count += process.get()

    pool.close()

    if run:
        logger.info(
            f"Successfully parsed and inserted {count}/{gen.value} tasks in {directory}."
        )
        if sep_tid:
            target.collection.remove({"task_id": sep_tid})
            logger.info(f"Removed separator task {sep_tid}.")
        if sep_snlid:
            snl_collection.remove({"snl_id": sep_snlid})
            logger.info(f"Removed separator SNL {sep_snlid}.")
    else:
        logger.info(f"Would parse and insert {count}/{gen.value} tasks in {directory}.")

    return ReturnCodes.SUCCESS if count and gen.value else ReturnCodes.WARNING
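# Self-contained sketch of the chunked multiprocessing pattern used in parse()
# above: slice the generator into chunks, submit each chunk with apply_async,
# and poll a deque of pending results with backpressure. `_chunked` and `_work`
# are local stand-ins for emmet's iterator_slice/chunks and parse_vasp_dirs
# (assumption, simplified for illustration).
import multiprocessing
from collections import deque
from itertools import islice


def _chunked(iterable, size):
    """Yield lists of up to `size` items from `iterable`."""
    it = iter(iterable)
    while True:
        chunk = list(islice(it, size))
        if not chunk:
            return
        yield chunk


def _work(chunk):
    """Stand-in worker: pretend to parse a chunk and report how many items it had."""
    return len(chunk)


def run_chunked(items, nproc=2, chunk_size=10):
    """Process `items` in chunks across `nproc` workers; return the processed count."""
    pool = multiprocessing.Pool(processes=nproc)
    iterator = _chunked(items, chunk_size)
    queue, count = deque(), 0
    while iterator or queue:
        try:
            queue.append(pool.apply_async(_work, [next(iterator)]))
        except (StopIteration, TypeError):
            iterator = None  # everything submitted; keep draining the queue
        while queue and (len(queue) >= nproc or not iterator):
            process = queue.pop()
            process.wait(1)
            if not process.ready():
                queue.append(process)  # still running; re-queue and poll again
            else:
                count += process.get()
    pool.close()
    pool.join()
    return count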
def prep(ctx, archive, authors):
    """prep structures from an archive for submission"""
    run = ctx.obj["RUN"]
    collections = ctx.obj["COLLECTIONS"]
    snl_collection = ctx.obj["CLIENT"].db.snls
    handler = ctx.obj["MONGO_HANDLER"]
    nmax = ctx.obj["NMAX"]
    skip = ctx.obj["SKIP"]
    # TODO no_dupe_check flag

    fname, ext = os.path.splitext(os.path.basename(archive))
    tag, sec_ext = fname.rsplit(".", 1) if "." in fname else [fname, ""]
    logger.info(click.style(f"tag: {tag}", fg="cyan"))
    if sec_ext:
        ext = "".join([sec_ext, ext])

    exts = ["tar.gz", ".tgz", "bson.gz", ".zip"]
    if ext not in exts:
        raise EmmetCliError(f"{ext} not supported (yet)! Please use one of {exts}.")

    meta = {"authors": [Author.parse_author(a) for a in authors]}
    references = meta.get("references", "").strip()
    source_ids_scanned = handler.collection.distinct("source_id", {"tags": tag})

    # TODO add archive of StructureNL files
    input_structures, source_total = [], None
    if ext == "bson.gz":
        input_bson = gzip.open(archive)
        source_total = count_file_documents(input_bson)
        for doc in bson.decode_file_iter(input_bson):
            if len(input_structures) >= nmax:
                break
            if skip and doc["db_id"] in source_ids_scanned:
                continue
            elements = set(
                specie["element"]
                for site in doc["structure"]["sites"]
                for specie in site["species"]
            )
            for label in SETTINGS.skip_labels:
                if label in elements:
                    logger.log(
                        logging.ERROR if run else logging.INFO,
                        f'Skip structure {doc["db_id"]}: unsupported element {label}!',
                        extra={"tags": [tag], "source_id": doc["db_id"]},
                    )
                    break
            else:
                s = TransformedStructure.from_dict(doc["structure"])
                s.source_id = doc["db_id"]
                input_structures.append(s)
    elif ext == ".zip":
        input_zip = ZipFile(archive)
        namelist = input_zip.namelist()
        source_total = len(namelist)
        for fname in namelist:
            if len(input_structures) >= nmax:
                break
            if skip and fname in source_ids_scanned:
                continue
            contents = input_zip.read(fname)
            fmt = get_format(fname)
            s = Structure.from_str(contents, fmt=fmt)
            s.source_id = fname
            input_structures.append(s)
    else:
        tar = tarfile.open(archive, "r:gz")
        members = tar.getmembers()
        source_total = len(members)
        for member in members:
            if os.path.basename(member.name).startswith("."):
                continue
            if len(input_structures) >= nmax:
                break
            fname = member.name.lower()
            if skip and fname in source_ids_scanned:
                continue
            f = tar.extractfile(member)
            if f:
                contents = f.read().decode("utf-8")
                fmt = get_format(fname)
                s = Structure.from_str(contents, fmt=fmt)
                s.source_id = fname
                input_structures.append(s)

    total = len(input_structures)
    logger.info(
        f"{total} of {source_total} structure(s) loaded "
        f"({len(source_ids_scanned)} unique structures already scanned)."
    )
    save_logs(ctx)
    snls, index = [], None

    for istruct in input_structures:
        # number of log messages equals number of structures processed if --run
        # only logger.warning goes to DB if --run
        if run and len(handler.buffer) >= handler.buffer_size:
            insert_snls(ctx, snls)

        struct = (
            istruct.final_structure
            if isinstance(istruct, TransformedStructure)
            else istruct
        )
        struct.remove_oxidation_states()
        struct = struct.get_primitive_structure()
        formula = struct.composition.reduced_formula
        sg = get_sg(struct)

        if not (struct.is_ordered and struct.is_valid()):
            logger.log(
                logging.WARNING if run else logging.INFO,
                f"Skip structure {istruct.source_id}: disordered or invalid!",
                extra={
                    "formula": formula,
                    "spacegroup": sg,
                    "tags": [tag],
                    "source_id": istruct.source_id,
                },
            )
            continue

        for full_name, coll in collections.items():
            # load canonical structures in collection for current formula and
            # duplicate-check them against current structure
            load_canonical_structures(ctx, full_name, formula)
            for canonical_structure in canonical_structures[full_name][formula].get(
                sg, []
            ):
                if structures_match(struct, canonical_structure):
                    logger.log(
                        logging.WARNING if run else logging.INFO,
                        f"Duplicate for {istruct.source_id} ({formula}/{sg}): {canonical_structure.id}",
                        extra={
                            "formula": formula,
                            "spacegroup": sg,
                            "tags": [tag],
                            "source_id": istruct.source_id,
                            "duplicate_dbname": full_name,
                            "duplicate_id": canonical_structure.id,
                        },
                    )
                    break
            else:
                continue  # no duplicate found -> continue to next collection
            break  # duplicate found
        else:  # no duplicates in any collection
            prefix = snl_collection.database.name
            if index is None:
                # get start index for SNL id
                snl_ids = snl_collection.distinct("snl_id")
                index = max(int(snl_id[len(prefix) + 1:]) for snl_id in snl_ids)

            index += 1
            snl_id = "{}-{}".format(prefix, index)
            kwargs = {"references": references, "projects": [tag]}
            if isinstance(istruct, TransformedStructure):
                snl = istruct.to_snl(meta["authors"], **kwargs)
            else:
                snl = StructureNL(istruct, meta["authors"], **kwargs)

            snl_dct = snl.as_dict()
            snl_dct.update(get_meta_from_structure(struct))
            snl_dct["snl_id"] = snl_id
            snls.append(snl_dct)
            logger.log(
                logging.WARNING if run else logging.INFO,
                f"SNL {snl_id} created for {istruct.source_id} ({formula}/{sg})",
                extra={
                    "formula": formula,
                    "spacegroup": sg,
                    "tags": [tag],
                    "source_id": istruct.source_id,
                },
            )

    # final save
    if run:
        insert_snls(ctx, snls)
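# Control-flow note for the duplicate check in prep() above: Python runs a
# for-loop's else-branch only when the loop was *not* ended by `break`. The
# hypothetical helper below isolates the same nested for/else/break pattern
# with generic names (`groups`, `matches`) standing in for the canonical-
# structure collections and structures_match().
def first_duplicate(candidate, groups, matches):
    """Return (group_name, item) for the first duplicate of `candidate`, else None.

    `groups` maps a group name to an iterable of existing items; `matches` is a
    two-argument equality predicate. Both are illustrative stand-ins only.
    """
    for name, items in groups.items():
        for item in items:
            if matches(candidate, item):
                break  # duplicate found in this group
        else:
            continue  # inner loop finished without break -> try the next group
        return name, item  # reached only via the inner break
    return None  # no group produced a duplicate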
def admin(ctx):
    """Administrative and utility commands"""
    if "CLIENT" not in ctx.obj:
        raise EmmetCliError("--spec option required with admin sub-command!")
def parse(task_ids, nproc):
    """Parse VASP launchers into tasks"""
    ctx = click.get_current_context()
    if "CLIENT" not in ctx.obj:
        raise EmmetCliError("Use --spec to set target DB for tasks!")

    run = ctx.parent.parent.params["run"]
    nmax = ctx.parent.params["nmax"]
    directory = ctx.parent.params["directory"].rstrip(os.sep)
    tag = os.path.basename(directory)
    target = ctx.obj["CLIENT"]
    logger.info(
        f"Connected to {target.collection.full_name} with {target.collection.count()} tasks."
    )
    ensure_indexes(
        ["task_id", "tags", "dir_name", "retired_task_id"], [target.collection]
    )

    chunk_size = math.ceil(nmax / nproc)
    if nproc > 1 and nmax <= chunk_size:
        nproc = 1
        logger.warning(
            f"nmax = {nmax} but chunk size = {chunk_size} -> sequential parsing."
        )
    pool = multiprocessing.Pool(processes=nproc)
    gen = VaspDirsGenerator()
    iterator = iterator_slice(gen, chunk_size)  # process in chunks
    queue = deque()
    count = 0

    sep_tid = None
    if task_ids:
        with open(task_ids, "r") as f:
            task_ids = json.load(f)
    else:
        # reserve list of task_ids to avoid collisions during multiprocessing
        # insert empty doc with max ID + 1 into target collection for parallel SLURM jobs
        all_task_ids = target.collection.distinct("task_id")
        next_tid = max(int(tid.split("-")[-1]) for tid in all_task_ids) + 1
        lst = [f"mp-{next_tid + n}" for n in range(nmax)]
        if run:
            sep_tid = f"mp-{next_tid + nmax}"
            target.collection.insert({"task_id": sep_tid})
            logger.info(f"Inserted separator task with task_id {sep_tid}.")
        task_ids = chunks(lst, chunk_size)
        logger.info(f"Reserved {len(lst)} task ID(s).")

    while iterator or queue:
        try:
            args = [next(iterator), tag, task_ids]
            queue.append(pool.apply_async(parse_vasp_dirs, args))
        except (StopIteration, TypeError):
            iterator = None

        while queue and (len(queue) >= pool._processes or not iterator):
            process = queue.pop()
            process.wait(1)
            if not process.ready():
                queue.append(process)
            else:
                count += process.get()

    pool.close()

    if run:
        logger.info(
            f"Successfully parsed and inserted {count}/{gen.value} tasks in {directory}."
        )
        if sep_tid:
            target.collection.remove({"task_id": sep_tid})
            logger.info(f"Removed separator task {sep_tid}.")
    else:
        logger.info(
            f"Would parse and insert {count}/{gen.value} tasks in {directory}."
        )

    return ReturnCodes.SUCCESS if count and gen.value else ReturnCodes.WARNING