Example #1
def emmet(spec, run, issue, sbatch, bb, yes, no_dupe_check, verbose):
    """Command line interface for emmet"""
    logger.setLevel(logging.DEBUG if verbose else logging.INFO)
    ctx = click.get_current_context()
    ctx.ensure_object(dict)

    if not sbatch and bb:
        raise EmmetCliError("Burst buffer only available in SBatch mode (--sbatch).")

    if spec:
        client = calcdb_from_mgrant(spec)
        ctx.obj["CLIENT"] = client
        # ctx.obj["MONGO_HANDLER"] = BufferedMongoHandler(
        #    host=client.host,
        #    port=client.port,
        #    database_name=client.db_name,
        #    username=client.user,
        #    password=client.password,
        #    level=logging.WARNING,
        #    authentication_db=client.db_name,
        #    collection="emmet_logs",
        #    buffer_periodical_flush_timing=False,  # flush manually
        # )
        # logger.addHandler(ctx.obj["MONGO_HANDLER"])
        # coll = ctx.obj["MONGO_HANDLER"].collection
        # ensure_indexes(SETTINGS.log_fields, [coll])

    if run:
        if not issue:
            raise EmmetCliError(f"Need issue number via --issue!")

        ctx.obj["LOG_STREAM"] = StringIO()
        memory_handler = logging.StreamHandler(ctx.obj["LOG_STREAM"])
        formatter = logging.Formatter(
            "%(asctime)s %(name)-12s %(levelname)-8s %(message)s"
        )
        memory_handler.setFormatter(formatter)
        logger.addHandler(memory_handler)

        CREDENTIALS = os.path.join(os.path.expanduser("~"), ".emmet_credentials")
        if not os.path.exists(CREDENTIALS):
            user = click.prompt("GitHub Username")
            password = click.prompt("GitHub Password", hide_input=True)
            auth = authorize(
                user,
                password,
                ["user", "repo", "gist"],
                "emmet CLI",
                two_factor_callback=opt_prompt,
            )
            with open(CREDENTIALS, "w") as fd:
                fd.write(auth.token)

        with open(CREDENTIALS, "r") as fd:
            token = fd.readline().strip()
            ctx.obj["GH"] = login(token=token)
    else:
        click.secho("DRY RUN! Add --run flag to execute changes.", fg="green")

    install_mp_handler(logger=logger)
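The authorize() call above passes an opt_prompt callable as the two-factor callback; a minimal sketch of such a callback (the prompt wording is an assumption, not taken from emmet):

import click

def opt_prompt():
    """Prompt the user for a GitHub two-factor authentication code."""
    return click.prompt("2FA Code")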
Example #2
File: tasks.py Project: utf/emmet
def check_pattern(nested_allowed=False):
    ctx = click.get_current_context()
    pattern = ctx.parent.params["pattern"]
    if not nested_allowed and os.sep in pattern:
        raise EmmetCliError(f"Nested pattern ({pattern}) not allowed!")
    elif not pattern.startswith(PREFIX):
        raise EmmetCliError(f"Pattern ({pattern}) only allowed to start with {PREFIX}!")
Example #3
File: calc.py Project: utf/emmet
def get_format(fname):
    if fnmatch(fname, "*.cif*") or fnmatch(fname, "*.mcif*"):
        return "cif"
    elif fnmatch(fname, "*.json*") or fnmatch(fname, "*.mson*"):
        return "json"
    else:
        raise EmmetCliError(f"reading {fname} not supported (yet)")
Example #4
    def wrapper(*args, **kwargs):
        ret = func(*args, **kwargs)
        ctx = click.get_current_context()
        if not isinstance(ret, ReturnCodes):
            raise EmmetCliError(f"Tracking `{ctx.command_path}` requires ReturnCode!")

        if ctx.grand_parent.params["run"]:
            logger.info(ret.value)
            gh = ctx.grand_parent.obj["GH"]
            user = gh.me().login
            issue_number = ctx.grand_parent.params["issue"]
            issue = gh.issue(
                SETTINGS.tracker["org"], SETTINGS.tracker["repo"], issue_number
            )

            # gists iterator/resource based on latest etag
            ETAG = os.path.join(os.path.expanduser("~"), ".emmet_etag")
            etag = None
            if os.path.exists(ETAG):
                with open(ETAG, "r") as fd:
                    etag = fd.readline().strip()

            gists_iterator = gh.gists(etag=etag)
            if gists_iterator.etag != etag:
                with open(ETAG, "w") as fd:
                    fd.write(gists_iterator.etag)

            # create or retrieve gist for log files
            gist_name = f"#{issue_number}-{SETTINGS.tracker['repo']}.md"
            for gist in gists_iterator:
                if gist.files and gist_name in gist.files:
                    break
            else:
                description = f"Logs for {SETTINGS.tracker['repo']}#{issue_number}"
                files = {gist_name: {"content": issue.html_url}}
                gist = gh.create_gist(description, files, public=False)
                zip_base = gist.html_url.replace(gist.id, user + "/" + gist.id)
                txt = GIST_COMMENT_TEMPLATE.format(
                    gist.html_url, zip_base + "/archive/master.zip"
                )
                comment = issue.create_comment(txt)
                logger.info(f"Gist Comment: {comment.html_url}")

            # update gist with logs for new command
            logger.info(f"Log Gist: {gist.html_url}")
            now = str(datetime.now()).replace(" ", "-")
            filename = ctx.command_path.replace(" ", "-") + f"_{now}"
            logs = ctx.grand_parent.obj["LOG_STREAM"]
            gist.edit(files={filename: {"content": logs.getvalue()}})

            if not ctx.grand_parent.params["sbatch"]:
                # add comment for new command
                command = reconstruct_command()
                raw_url = f"{GIST_RAW_URL}/{user}/{gist.id}/raw/{filename}"
                txt = COMMENT_TEMPLATE.format(
                    ctx.command_path, ret.value, command, raw_url
                )
                comment = issue.create_comment(txt)
                logger.info(comment.html_url)
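The wrapper relies on two module-level format strings, GIST_COMMENT_TEMPLATE and COMMENT_TEMPLATE, that are not shown here; minimal stand-ins with the same number of positional placeholders as the .format() calls above (the wording is an assumption):

GIST_COMMENT_TEMPLATE = (
    "Logs for this issue are collected in the gist at {} "
    "(download everything as a [zip archive]({}))."
)
COMMENT_TEMPLATE = (
    "`{}` finished with return code `{}`.\n\n"
    "```\n{}\n```\n\n"
    "Full log: {}"
)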
Example #5
File: calc.py Project: utf/emmet
def count_file_documents(file_obj):
    """Counts how many documents provided BSON file contains"""
    cnt = 0
    while True:
        # Read size of next object.
        size_data = file_obj.read(4)
        if len(size_data) == 0:
            break  # Finished with file normally.
        elif len(size_data) != 4:
            raise EmmetCliError("Invalid BSON: cut off in middle of objsize")
        obj_size = _UNPACK_INT(size_data)[0] - 4
        file_obj.seek(obj_size, os.SEEK_CUR)
        cnt += 1
    file_obj.seek(0)
    return cnt
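count_file_documents assumes a module-level _UNPACK_INT helper. Each BSON document starts with its total size as a little-endian int32 (which is why 4 is subtracted before seeking), so a minimal definition is:

import struct

_UNPACK_INT = struct.Struct("<i").unpack  # little-endian int32, mirroring the bson package

The function can then be pointed at any binary file object, e.g. the gzip.open() handle used by the prep command further below.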
Example #6
File: calc.py Project: utf/emmet
def calc(ctx, specs, nmax, skip):
    """Set up calculations to optimize structures using VASP"""
    if "CLIENT" not in ctx.obj:
        raise EmmetCliError("--spec option required with calc sub-command!")

    collections = {}
    for coll in [ctx.obj["CLIENT"].db.snls, ctx.obj["CLIENT"].db.tasks]:
        collections[coll.full_name] = coll  # user collections

    for spec in specs:
        client = calcdb_from_mgrant(spec)
        names = client.db.list_collection_names(
            filter={"name": {
                "$regex": r"(snl|tasks)"
            }})
        for name in names:
            collections[client.db[name].full_name] = client.db[name]

    for full_name, coll in collections.items():
        logger.debug(f"{coll.count()} docs in {full_name}")

    ctx.obj["COLLECTIONS"] = collections
    ctx.obj["NMAX"] = nmax
    ctx.obj["SKIP"] = skip
Example #7
    def wrapper(*args, **kwargs):
        ctx = click.get_current_context()
        ctx.grand_parent = ctx.parent.parent
        if not ctx.grand_parent.params["sbatch"]:
            return ctx.invoke(func, *args, **kwargs)

        run = ctx.grand_parent.params["run"]
        if run:
            click.secho("SBATCH MODE! Submitting to SLURM queue.", fg="green")

        directory = ctx.parent.params.get("directory")
        if not directory:
            raise EmmetCliError(
                f"{ctx.parent.command_path} needs --directory option!")

        track_dir = os.path.join(directory, ".emmet")
        if run and not os.path.exists(track_dir):
            os.mkdir(track_dir)
            logger.debug(f"{track_dir} created")

        bb = ctx.grand_parent.params["bb"]
        yes = ctx.grand_parent.params["yes"]
        if bb:
            if not yes:
                click.confirm("Did you run `module unload esslurm`?",
                              abort=True)
            subdir = directory.rsplit(os.sep, 1)[1]
            stage_in = f"#DW stage_in source={directory} "
            stage_in += f"destination=$DW_JOB_STRIPED/{subdir} type=directory"
            script = [
                "#DW jobdw capacity=10TB access_mode=striped type=scratch",
                stage_in,
                "srun hostname",
                "",
            ]

            command = "\n".join(script)
            slurm_kwargs = {
                "qos": "premium",
                "nodes": 1,
                "tasks-per-node": 1,
                "constraint": "haswell",
                "time": "48:00:00",
            }
        else:
            if not yes:
                click.confirm("Did you run `module load esslurm`?", abort=True)
            slurm_kwargs = {
                "qos": "xfer",
                "time": "48:00:00",
                "licenses": "SCRATCH",
                "mem": "30GB",
            }
            command = ""

        s = Slurm(
            ctx.command_path.replace(" ", "-"),
            slurm_kwargs=slurm_kwargs,
            date_in_name=False,
            scripts_dir=track_dir,
            log_dir=track_dir,
            bash_strict=False,
        )

        command += reconstruct_command(sbatch=True)
        slurmpy_stderr = io.StringIO()
        with contextlib.redirect_stderr(slurmpy_stderr):
            s.run(command, _cmd="sbatch" if run else "cat", tries=1)  # 6 days
        ret = slurmpy_stderr.getvalue()[2:-1]
        logger.info("\n" + ret.encode("utf-8").decode("unicode_escape"))
        # TODO add jobid to SUBMITTED.value
        return ReturnCodes.SUBMITTED if run else ReturnCodes.SUCCESS
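For illustration, the burst-buffer branch above assembles a script header equivalent to the following (the directory path is hypothetical):

import os

directory = "/global/cscratch1/sd/user/block_2020-07-01"  # hypothetical
subdir = directory.rsplit(os.sep, 1)[1]
print("\n".join([
    "#DW jobdw capacity=10TB access_mode=striped type=scratch",
    f"#DW stage_in source={directory} destination=$DW_JOB_STRIPED/{subdir} type=directory",
    "srun hostname",
    "",
]))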
Example #8
def parse(task_ids, snl_metas, nproc, store_volumetric_data):  # noqa: C901
    """Parse VASP launchers into tasks"""
    ctx = click.get_current_context()
    if "CLIENT" not in ctx.obj:
        raise EmmetCliError("Use --spec to set target DB for tasks!")

    run = ctx.parent.parent.params["run"]
    nmax = ctx.parent.params["nmax"]
    directory = ctx.parent.params["directory"].rstrip(os.sep)
    tag = os.path.basename(directory)
    target = ctx.obj["CLIENT"]
    snl_collection = target.db.snls_user
    logger.info(
        f"Connected to {target.collection.full_name} with {target.collection.count()} tasks."
    )
    ensure_indexes(
        ["task_id", "tags", "dir_name", "retired_task_id"], [target.collection]
    )

    chunk_size = math.ceil(nmax / nproc)
    if nproc > 1 and nmax <= chunk_size:
        nproc = 1
        logger.warning(
            f"nmax = {nmax} but chunk size = {chunk_size} -> sequential parsing."
        )

    pool = multiprocessing.Pool(processes=nproc)
    gen = VaspDirsGenerator()
    iterator = iterator_slice(gen, chunk_size)  # process in chunks
    queue = deque()
    count = 0

    sep_tid = None
    if task_ids:
        with open(task_ids, "r") as f:
            task_ids = json.load(f)
    else:
        # reserve list of task_ids to avoid collisions during multiprocessing
        # insert empty doc with max ID + 1 into target collection for parallel SLURM jobs
        # NOTE use regex first to reduce size of distinct below 16MB
        q = {"task_id": {"$regex": r"^mp-\d{7,}$"}}
        all_task_ids = [
            t["task_id"] for t in target.collection.find(q, {"_id": 0, "task_id": 1})
        ]
        if not all_task_ids:
            all_task_ids = target.collection.distinct("task_id")

        next_tid = max(int(tid.split("-")[-1]) for tid in all_task_ids) + 1
        lst = [f"mp-{next_tid + n}" for n in range(nmax)]
        task_ids = chunks(lst, chunk_size)

        if run:
            sep_tid = f"mp-{next_tid + nmax}"
            target.collection.insert({"task_id": sep_tid})
            logger.info(f"Inserted separator task with task_id {sep_tid}.")
            logger.info(f"Reserved {len(lst)} task ID(s).")
        else:
            logger.info(f"Would reserve {len(lst)} task ID(s).")

    sep_snlid = None
    if snl_metas:
        with open(snl_metas, "r") as f:
            snl_metas = json.load(f)

        # reserve list of snl_ids to avoid collisions during multiprocessing
        # insert empty doc with max ID + 1 into target collection for parallel SLURM jobs
        all_snl_ids = snl_collection.distinct("snl_id")
        prefixes = set()
        next_snlid = -1

        for snlid in all_snl_ids:
            prefix, index = snlid.split("-", 1)
            index = int(index)
            prefixes.add(prefix)
            if index > next_snlid:
                next_snlid = index

        next_snlid += 1
        prefix = prefixes.pop()  # NOTE use the first prefix found
        nsnls = len(snl_metas)

        for n, launcher in enumerate(snl_metas):
            snl_id = f"{prefix}-{next_snlid + n}"
            snl_metas[launcher]["snl_id"] = snl_id

        if run:
            sep_snlid = f"{prefix}-{next_snlid + nsnls}"
            snl_collection.insert({"snl_id": sep_snlid})
            logger.info(f"Inserted separator SNL with snl_id {sep_snlid}.")
            logger.info(f"Reserved {nsnls} SNL ID(s).")
        else:
            logger.info(f"Would reserve {nsnls} SNL ID(s).")

    while iterator or queue:
        try:
            args = [next(iterator), tag, task_ids, snl_metas]
            queue.append(pool.apply_async(parse_vasp_dirs, args))
        except (StopIteration, TypeError):
            iterator = None

        while queue and (len(queue) >= pool._processes or not iterator):
            process = queue.pop()
            process.wait(1)
            if not process.ready():
                queue.append(process)
            else:
                count += process.get()

    pool.close()
    if run:
        logger.info(
            f"Successfully parsed and inserted {count}/{gen.value} tasks in {directory}."
        )
        if sep_tid:
            target.collection.remove({"task_id": sep_tid})
            logger.info(f"Removed separator task {sep_tid}.")
        if sep_snlid:
            snl_collection.remove({"snl_id": sep_snlid})
            logger.info(f"Removed separator SNL {sep_snlid}.")
    else:
        logger.info(f"Would parse and insert {count}/{gen.value} tasks in {directory}.")
    return ReturnCodes.SUCCESS if count and gen.value else ReturnCodes.WARNING
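parse() leans on two chunking helpers, chunks and iterator_slice; minimal sketches consistent with how they are used above (these implementations are assumptions, not copied from emmet):

from itertools import islice

def chunks(lst, n):
    """Yield successive n-sized chunks from a list."""
    for i in range(0, len(lst), n):
        yield lst[i:i + n]

def iterator_slice(iterator, n):
    """Yield lists of up to n items until the iterator is exhausted."""
    it = iter(iterator)
    while True:
        chunk = list(islice(it, n))
        if not chunk:
            return
        yield chunk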
Example #9
File: calc.py Project: utf/emmet
def prep(ctx, archive, authors):
    """prep structures from an archive for submission"""
    run = ctx.obj["RUN"]
    collections = ctx.obj["COLLECTIONS"]
    snl_collection = ctx.obj["CLIENT"].db.snls
    handler = ctx.obj["MONGO_HANDLER"]
    nmax = ctx.obj["NMAX"]
    skip = ctx.obj["SKIP"]
    # TODO no_dupe_check flag

    fname, ext = os.path.splitext(os.path.basename(archive))
    tag, sec_ext = fname.rsplit(".", 1) if "." in fname else [fname, ""]
    logger.info(click.style(f"tag: {tag}", fg="cyan"))
    if sec_ext:
        ext = "".join([sec_ext, ext])
    exts = ["tar.gz", ".tgz", "bson.gz", ".zip"]
    if ext not in exts:
        raise EmmetCliError(
            f"{ext} not supported (yet)! Please use one of {exts}.")

    meta = {"authors": [Author.parse_author(a) for a in authors]}
    references = meta.get("references", "").strip()
    source_ids_scanned = handler.collection.distinct("source_id",
                                                     {"tags": tag})

    # TODO add archive of StructureNL files
    input_structures, source_total = [], None
    if ext == "bson.gz":
        input_bson = gzip.open(archive)
        source_total = count_file_documents(input_bson)
        for doc in bson.decode_file_iter(input_bson):
            if len(input_structures) >= nmax:
                break
            if skip and doc["db_id"] in source_ids_scanned:
                continue
            elements = set([
                specie["element"] for site in doc["structure"]["sites"]
                for specie in site["species"]
            ])
            for l in SETTINGS.skip_labels:
                if l in elements:
                    logger.log(
                        logging.ERROR if run else logging.INFO,
                        f'Skip structure {doc["db_id"]}: unsupported element {l}!',
                        extra={
                            "tags": [tag],
                            "source_id": doc["db_id"]
                        },
                    )
                    break
            else:
                s = TransformedStructure.from_dict(doc["structure"])
                s.source_id = doc["db_id"]
                input_structures.append(s)
    elif ext == ".zip":
        input_zip = ZipFile(archive)
        namelist = input_zip.namelist()
        source_total = len(namelist)
        for fname in namelist:
            if len(input_structures) >= nmax:
                break
            if skip and fname in source_ids_scanned:
                continue
            contents = input_zip.read(fname)
            fmt = get_format(fname)
            s = Structure.from_str(contents, fmt=fmt)
            s.source_id = fname
            input_structures.append(s)
    else:
        tar = tarfile.open(archive, "r:gz")
        members = tar.getmembers()
        source_total = len(members)
        for member in members:
            if os.path.basename(member.name).startswith("."):
                continue
            if len(input_structures) >= nmax:
                break
            fname = member.name.lower()
            if skip and fname in source_ids_scanned:
                continue
            f = tar.extractfile(member)
            if f:
                contents = f.read().decode("utf-8")
                fmt = get_format(fname)
                s = Structure.from_str(contents, fmt=fmt)
                s.source_id = fname
                input_structures.append(s)

    total = len(input_structures)
    logger.info(
        f"{total} of {source_total} structure(s) loaded "
        f"({len(source_ids_scanned)} unique structures already scanned).")

    save_logs(ctx)
    snls, index = [], None
    for istruct in input_structures:
        # number of log messages equals number of structures processed if --run
        # only logger.warning goes to DB if --run
        if run and len(handler.buffer) >= handler.buffer_size:
            insert_snls(ctx, snls)

        struct = (istruct.final_structure if isinstance(
            istruct, TransformedStructure) else istruct)
        struct.remove_oxidation_states()
        struct = struct.get_primitive_structure()
        formula = struct.composition.reduced_formula
        sg = get_sg(struct)

        if not (struct.is_ordered and struct.is_valid()):
            logger.log(
                logging.WARNING if run else logging.INFO,
                f"Skip structure {istruct.source_id}: disordered or invalid!",
                extra={
                    "formula": formula,
                    "spacegroup": sg,
                    "tags": [tag],
                    "source_id": istruct.source_id,
                },
            )
            continue

        for full_name, coll in collections.items():
            # load canonical structures in collection for current formula and
            # duplicate-check them against current structure
            load_canonical_structures(ctx, full_name, formula)
            for canonical_structure in canonical_structures[full_name][
                    formula].get(sg, []):
                if structures_match(struct, canonical_structure):
                    logger.log(
                        logging.WARNING if run else logging.INFO,
                        f"Duplicate for {istruct.source_id} ({formula}/{sg}): {canonical_structure.id}",
                        extra={
                            "formula": formula,
                            "spacegroup": sg,
                            "tags": [tag],
                            "source_id": istruct.source_id,
                            "duplicate_dbname": full_name,
                            "duplicate_id": canonical_structure.id,
                        },
                    )
                    break
            else:
                continue  # no duplicate found -> continue to next collection

            break  # duplicate found
        else:
            # no duplicates in any collection
            prefix = snl_collection.database.name
            if index is None:
                # get start index for SNL id
                snl_ids = snl_collection.distinct("snl_id")
                index = max(
                    [int(snl_id[len(prefix) + 1:]) for snl_id in snl_ids])

            index += 1
            snl_id = "{}-{}".format(prefix, index)
            kwargs = {"references": references, "projects": [tag]}
            if isinstance(istruct, TransformedStructure):
                snl = istruct.to_snl(meta["authors"], **kwargs)
            else:
                snl = StructureNL(istruct, meta["authors"], **kwargs)

            snl_dct = snl.as_dict()
            snl_dct.update(get_meta_from_structure(struct))
            snl_dct["snl_id"] = snl_id
            snls.append(snl_dct)
            logger.log(
                logging.WARNING if run else logging.INFO,
                f"SNL {snl_id} created for {istruct.source_id} ({formula}/{sg})",
                extra={
                    "formula": formula,
                    "spacegroup": sg,
                    "tags": [tag],
                    "source_id": istruct.source_id,
                },
            )

    # final save
    if run:
        insert_snls(ctx, snls)
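prep() uses a small get_sg helper for the space-group label attached to the log messages; a minimal sketch using pymatgen (the symprec value is an assumption):

from pymatgen.symmetry.analyzer import SpacegroupAnalyzer

def get_sg(struct, symprec=0.1):
    """Return the space group number of a structure."""
    return SpacegroupAnalyzer(struct, symprec=symprec).get_space_group_number()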
Example #10
def admin(ctx):
    """Administrative and utility commands"""
    if "CLIENT" not in ctx.obj:
        raise EmmetCliError("--spec option required with admin sub-command!")
Example #11
def parse(task_ids, nproc):
    """Parse VASP launchers into tasks"""
    ctx = click.get_current_context()
    if "CLIENT" not in ctx.obj:
        raise EmmetCliError(f"Use --spec to set target DB for tasks!")

    run = ctx.parent.parent.params["run"]
    nmax = ctx.parent.params["nmax"]
    directory = ctx.parent.params["directory"].rstrip(os.sep)
    tag = os.path.basename(directory)
    target = ctx.obj["CLIENT"]
    logger.info(
        f"Connected to {target.collection.full_name} with {target.collection.count()} tasks."
    )
    ensure_indexes(["task_id", "tags", "dir_name", "retired_task_id"],
                   [target.collection])

    chunk_size = math.ceil(nmax / nproc)
    if nproc > 1 and nmax <= chunk_size:
        nproc = 1
        logger.warning(
            f"nmax = {nmax} but chunk size = {chunk_size} -> sequential parsing."
        )

    pool = multiprocessing.Pool(processes=nproc)
    gen = VaspDirsGenerator()
    iterator = iterator_slice(gen, chunk_size)  # process in chunks
    queue = deque()
    count = 0

    sep_tid = None
    if task_ids:
        with open(task_ids, "r") as f:
            task_ids = json.load(f)
    else:
        # reserve list of task_ids to avoid collisions during multiprocessing
        # insert empty doc with max ID + 1 into target collection for parallel SLURM jobs
        all_task_ids = target.collection.distinct("task_id")
        next_tid = max(int(tid.split("-")[-1]) for tid in all_task_ids) + 1
        lst = [f"mp-{next_tid + n}" for n in range(nmax)]
        if run:
            sep_tid = f"mp-{next_tid + nmax}"
            target.collection.insert({"task_id": sep_tid})
            logger.info(f"Inserted separator task with task_id {sep_tid}.")
        task_ids = chunks(lst, chunk_size)
        logger.info(f"Reserved {len(lst)} task ID(s).")

    while iterator or queue:
        try:
            args = [next(iterator), tag, task_ids]
            queue.append(pool.apply_async(parse_vasp_dirs, args))
        except (StopIteration, TypeError):
            iterator = None

        while queue and (len(queue) >= pool._processes or not iterator):
            process = queue.pop()
            process.wait(1)
            if not process.ready():
                queue.append(process)
            else:
                count += process.get()

    pool.close()
    if run:
        logger.info(
            f"Successfully parsed and inserted {count}/{gen.value} tasks in {directory}."
        )
        if sep_tid:
            target.collection.remove({"task_id": sep_tid})
            logger.info(f"Removed separator task {sep_tid}.")
    else:
        logger.info(
            f"Would parse and insert {count}/{gen.value} tasks in {directory}."
        )
    return ReturnCodes.SUCCESS if count and gen.value else ReturnCodes.WARNING