def load_canonical_structures(ctx, full_name, formula):
    """Load and cache canonical structures for *formula* from collection *full_name*.

    Populates the module-level ``canonical_structures[full_name][formula]``
    mapping (spacegroup -> list of representative structures) on first call for
    a given formula; subsequent calls are no-ops thanks to the cache guard.

    Args:
        ctx: click-style context; ``ctx.obj["COLLECTIONS"]`` maps collection
            names to MongoDB collections.
        full_name: collection name; branches on whether it contains ``"tasks"``
            or ``"snl"`` to pick the query/projection strategy.
        formula: pretty/reduced formula used as the cache key and query value.
    """
    from emmet.core.vasp.calc_types import task_type  # TODO import error

    collection = ctx.obj["COLLECTIONS"][full_name]

    # Cache guard: only hit the database once per (collection, formula) pair.
    if formula not in canonical_structures[full_name]:
        canonical_structures[full_name][formula] = {}
        structures = defaultdict(list)  # spacegroup -> raw structures

        if "tasks" in full_name:
            # Only structure-optimization tasks contribute input structures.
            query = {"formula_pretty": formula}
            query.update(SETTINGS.task_base_query)
            projection = {"input.structure": 1, "task_id": 1, "orig_inputs": 1}
            tasks = collection.find(query, projection)
            for task in tasks:
                task_label = task_type(task["orig_inputs"], include_calc_type=False)
                if task_label == "Structure Optimization":
                    s = load_structure(task["input"]["structure"])
                    s.id = task["task_id"]
                    structures[get_sg(s)].append(s)
        elif "snl" in full_name:
            # SNL collections: match the formula against any aggregation key.
            query = {"$or": [{k: formula} for k in SETTINGS.aggregation_keys]}
            query.update(SETTINGS.exclude)
            query.update(SETTINGS.base_query)
            for group in aggregate_by_formula(collection, query):
                for dct in group["structures"]:
                    s = load_structure(dct)
                    # SNL docs carry "snl_id"; task-derived docs carry "task_id".
                    s.id = dct["snl_id"] if "snl_id" in dct else dct["task_id"]
                    structures[get_sg(s)].append(s)

        if structures:
            # Keep one representative per group of matching structures,
            # bucketed by spacegroup.
            for sg, slist in structures.items():
                canonical_structures[full_name][formula][sg] = [
                    g[0] for g in group_structures(slist)
                ]

        # FIX: generator instead of an intermediate list inside sum().
        total = sum(
            len(x) for x in canonical_structures[full_name][formula].values()
        )
        logger.debug(f"{total} canonical structure(s) for {formula} in {full_name}")
def prep(ctx, archive, authors):
    """prep structures from an archive for submission"""
    # Pull runtime configuration off the click context.
    run = ctx.obj["RUN"]
    collections = ctx.obj["COLLECTIONS"]
    snl_collection = ctx.obj["CLIENT"].db.snls
    handler = ctx.obj["MONGO_HANDLER"]
    nmax = ctx.obj["NMAX"]
    skip = ctx.obj["SKIP"]
    # TODO no_dupe_check flag

    # Derive the tag and (possibly compound, e.g. "tar.gz") extension from the
    # archive filename: "mytag.tar.gz" -> tag "mytag", ext "tar.gz".
    fname, ext = os.path.splitext(os.path.basename(archive))
    tag, sec_ext = fname.rsplit(".", 1) if "." in fname else [fname, ""]
    logger.info(click.style(f"tag: {tag}", fg="cyan"))
    if sec_ext:
        ext = "".join([sec_ext, ext])
    exts = ["tar.gz", ".tgz", "bson.gz", ".zip"]
    if ext not in exts:
        raise EmmetCliError(
            f"{ext} not supported (yet)! Please use one of {exts}.")

    meta = {"authors": [Author.parse_author(a) for a in authors]}
    # NOTE(review): meta only ever gets "authors" above, so this is always "".
    references = meta.get("references", "").strip()
    # source_ids already recorded for this tag; used to skip rescans.
    source_ids_scanned = handler.collection.distinct("source_id", {"tags": tag})

    # TODO add archive of StructureNL files
    # Load up to nmax input structures from the archive, format-dependent.
    input_structures, source_total = [], None
    if ext == "bson.gz":
        input_bson = gzip.open(archive)
        source_total = count_file_documents(input_bson)
        for doc in bson.decode_file_iter(input_bson):
            if len(input_structures) >= nmax:
                break
            if skip and doc["db_id"] in source_ids_scanned:
                continue
            # All chemical elements present in the structure document.
            elements = set([
                specie["element"] for site in doc["structure"]["sites"]
                for specie in site["species"]
            ])
            # for/else: the else only runs if no skip-label element was found.
            for l in SETTINGS.skip_labels:
                if l in elements:
                    logger.log(
                        logging.ERROR if run else logging.INFO,
                        f'Skip structure {doc["db_id"]}: unsupported element {l}!',
                        extra={
                            "tags": [tag],
                            "source_id": doc["db_id"]
                        },
                    )
                    break
            else:
                s = TransformedStructure.from_dict(doc["structure"])
                s.source_id = doc["db_id"]
                input_structures.append(s)
    elif ext == ".zip":
        input_zip = ZipFile(archive)
        namelist = input_zip.namelist()
        source_total = len(namelist)
        for fname in namelist:
            if len(input_structures) >= nmax:
                break
            # Zip members are keyed by filename as the source_id.
            if skip and fname in source_ids_scanned:
                continue
            contents = input_zip.read(fname)
            fmt = get_format(fname)
            s = Structure.from_str(contents, fmt=fmt)
            s.source_id = fname
            input_structures.append(s)
    else:
        # Remaining supported extensions are gzipped tarballs.
        tar = tarfile.open(archive, "r:gz")
        members = tar.getmembers()
        source_total = len(members)
        for member in members:
            # Skip hidden files (e.g. macOS "._" resource forks).
            if os.path.basename(member.name).startswith("."):
                continue
            if len(input_structures) >= nmax:
                break
            fname = member.name.lower()
            if skip and fname in source_ids_scanned:
                continue
            f = tar.extractfile(member)
            if f:
                contents = f.read().decode("utf-8")
                fmt = get_format(fname)
                s = Structure.from_str(contents, fmt=fmt)
                s.source_id = fname
                input_structures.append(s)

    total = len(input_structures)
    logger.info(
        f"{total} of {source_total} structure(s) loaded "
        f"({len(source_ids_scanned)} unique structures already scanned).")
    save_logs(ctx)

    # Deduplicate each loaded structure against every collection, creating a
    # new SNL only when no collection already holds a matching structure.
    snls, index = [], None
    for istruct in input_structures:
        # number of log messages equals number of structures processed if --run
        # only logger.warning goes to DB if --run
        if run and len(handler.buffer) >= handler.buffer_size:
            insert_snls(ctx, snls)

        # Normalize: final structure (for transformed inputs), no oxidation
        # states, primitive cell.
        struct = (istruct.final_structure if isinstance(
            istruct, TransformedStructure) else istruct)
        struct.remove_oxidation_states()
        struct = struct.get_primitive_structure()
        formula = struct.composition.reduced_formula
        sg = get_sg(struct)
        if not (struct.is_ordered and struct.is_valid()):
            logger.log(
                logging.WARNING if run else logging.INFO,
                f"Skip structure {istruct.source_id}: disordered or invalid!",
                extra={
                    "formula": formula,
                    "spacegroup": sg,
                    "tags": [tag],
                    "source_id": istruct.source_id,
                },
            )
            continue

        for full_name, coll in collections.items():
            # load canonical structures in collection for current formula and
            # duplicate-check them against current structure
            load_canonical_structures(ctx, full_name, formula)
            # Only compare against canonical structures in the same spacegroup.
            for canonical_structure in canonical_structures[full_name][
                    formula].get(sg, []):
                if structures_match(struct, canonical_structure):
                    logger.log(
                        logging.WARNING if run else logging.INFO,
                        f"Duplicate for {istruct.source_id} ({formula}/{sg}): {canonical_structure.id}",
                        extra={
                            "formula": formula,
                            "spacegroup": sg,
                            "tags": [tag],
                            "source_id": istruct.source_id,
                            "duplicate_dbname": full_name,
                            "duplicate_id": canonical_structure.id,
                        },
                    )
                    break  # duplicate found in this collection
            else:
                continue  # no duplicate found -> continue to next collection
            break  # duplicate found -> stop scanning collections
        else:  # for/else: no duplicates found in any collection
            # Mint the next SNL id: "<dbname>-<n>", where n continues from the
            # highest existing numeric suffix in the snls collection.
            prefix = snl_collection.database.name
            if index is None:
                # Query the start index only once per invocation.
                snl_ids = snl_collection.distinct("snl_id")
                index = max(
                    [int(snl_id[len(prefix) + 1:]) for snl_id in snl_ids])
            index += 1
            snl_id = "{}-{}".format(prefix, index)
            kwargs = {"references": references, "projects": [tag]}
            if isinstance(istruct, TransformedStructure):
                # Preserve the transformation history in the SNL.
                snl = istruct.to_snl(meta["authors"], **kwargs)
            else:
                snl = StructureNL(istruct, meta["authors"], **kwargs)
            snl_dct = snl.as_dict()
            snl_dct.update(get_meta_from_structure(struct))
            snl_dct["snl_id"] = snl_id
            snls.append(snl_dct)
            logger.log(
                logging.WARNING if run else logging.INFO,
                f"SNL {snl_id} created for {istruct.source_id} ({formula}/{sg})",
                extra={
                    "formula": formula,
                    "spacegroup": sg,
                    "tags": [tag],
                    "source_id": istruct.source_id,
                },
            )

    # final save: flush any remaining SNLs (only writes when --run is set)
    if run:
        insert_snls(ctx, snls)