def run(self):
    """
    Propagate tags from newly processed tasks onto their materials documents.

    Steps:
        1. Build indexes via ``self._build_indexes()``.
        2. Gather every task_id already listed under
           ``_tagsbuilder.all_task_ids`` in the materials collection.
        3. Query the tasks collection for successful tasks that have a
           ``tags`` field and whose task_id is not in that list.
        4. For each such task, look up the material whose
           ``_tasksbuilder.all_task_ids`` contains the task and store the
           de-duplicated union of the task's and the material's tags on the
           material, together with the extended
           ``_tagsbuilder.all_task_ids`` list.

    A failure while handling a single task is logged (with traceback) and
    the loop continues with the next task.
    """
    logger.info("TagsBuilder starting...")
    self._build_indexes()

    logger.info("Initializing list of all new task_ids to process ...")
    previous_task_ids = []
    for m in self._materials.find({"_tagsbuilder": {"$exists": True}},
                                  {"_tagsbuilder.all_task_ids": 1}):
        previous_task_ids.extend(m["_tagsbuilder"]["all_task_ids"])

    # The materials collection stores prefixed ids; the tasks collection is
    # queried with the integer form.
    previous_task_ids = [dbid_to_int(t) for t in previous_task_ids]

    q = {"tags": {"$exists": True},
         "task_id": {"$nin": previous_task_ids},
         "state": "successful"}
    tasks = list(self._tasks.find(q, {"task_id": 1, "tags": 1}))

    pbar = tqdm(tasks)
    for t in pbar:
        try:
            pbar.set_description(
                "Processing task_id: {}".format(t['task_id']))

            # get the corresponding materials id
            task_id_str = dbid_to_str(self._tasks_prefix, t["task_id"])
            m = self._materials.find_one(
                {"_tasksbuilder.all_task_ids": task_id_str},
                {"material_id": 1, "tags": 1, "_tagsbuilder": 1})
            if not m:
                # task has not been assigned to any material (yet)
                continue

            # Concatenate into a new list so the task document itself is
            # left untouched, then filter duplicates.
            all_tags = t["tags"] + list(m.get("tags") or [])
            all_tags = list(set(all_tags))

            all_tasks = [task_id_str]
            if "_tagsbuilder" in m:
                all_tasks.extend(m["_tagsbuilder"]["all_task_ids"])

            self._materials.update_one(
                {"material_id": m["material_id"]},
                {"$set": {"tags": all_tags,
                          "_tagsbuilder.all_task_ids": all_tasks}})

        except Exception:
            # ``except Exception`` (not a bare ``except``) so that
            # KeyboardInterrupt / SystemExit still stop the builder.
            # logger.exception attaches the active traceback by itself, so
            # it is emitted exactly once between the delimiters.
            logger.error("<---")
            logger.exception(
                "There was an error processing task_id: {}".format(
                    t["task_id"]))
            logger.error("--->")

    logger.info("TagsBuilder finished processing.")
def run(self):
    """
    Assign every new successful task to a materials document.

    Collects the task_ids already recorded under
    ``_tasksbuilder.all_task_ids`` in the materials collection, queries the
    tasks collection for successful tasks whose ``task_label`` is one of
    ``self.supported_task_labels`` (optionally narrowed by ``self.query``),
    and processes each task that has not been seen before: the task is
    matched to a material with ``self._match_material``; when no match is
    returned a new material is created with ``self._create_new_material``;
    the material is then updated with ``self._update_material``.

    A failure while handling a single task is logged (with traceback) and
    the loop continues with the next task.

    Raises:
        ValueError: if ``self.query`` contains a key that the builder sets
            itself (``state`` or ``task_label``).
    """
    logger.info("MaterialsTaskBuilder starting...")
    logger.info("Initializing list of all new task_ids to process ...")

    # A set gives O(1) membership tests when filtering below.
    # Assumes the stored task ids are hashable (dbid_to_str output).
    previous_task_ids = set()
    for m in self._materials.find({}, {"_tasksbuilder.all_task_ids": 1}):
        previous_task_ids.update(m["_tasksbuilder"]["all_task_ids"])

    q = {
        "state": "successful",
        "task_label": {"$in": self.supported_task_labels},
    }

    if self.query:
        # The user query may only add constraints, never override ours.
        common_keys = [k for k in q if k in self.query]
        if common_keys:
            raise ValueError(
                "User query parameter cannot contain key(s): {}".format(
                    common_keys))
        q.update(self.query)

    all_task_ids = [
        dbid_to_str(self._t_prefix, t["task_id"])
        for t in self._tasks.find(q, {"task_id": 1})
    ]
    task_ids = [
        t_id for t_id in all_task_ids if t_id not in previous_task_ids
    ]

    logger.info("There are {} new task_ids to process.".format(
        len(task_ids)))

    pbar = tqdm(task_ids)
    for t_id in pbar:
        pbar.set_description("Processing task_id: {}".format(t_id))
        try:
            taskdoc = self._tasks.find_one({"task_id": dbid_to_int(t_id)})
            m_id = self._match_material(taskdoc)
            if not m_id:
                m_id = self._create_new_material(taskdoc)
            self._update_material(m_id, taskdoc)

        except Exception:
            # ``except Exception`` (not a bare ``except``) so that
            # KeyboardInterrupt / SystemExit still stop the builder.
            # logger.exception attaches the active traceback by itself.
            logger.error("<---")
            logger.exception(
                "There was an error processing task_id: {}".format(t_id))
            logger.error("--->")

    logger.info("TasksMaterialsBuilder finished processing.")
def _create_new_material(self, taskdoc):
    """
    Insert a brand-new materials document seeded from a task document.

    The new document receives creation/update timestamps, an empty
    ``_tasksbuilder`` bookkeeping section, the spacegroup and structure
    from the task output, a freshly allocated material_id (taken from the
    ``materialid`` counter document, which is incremented in the process),
    the formula/element fields of the task and, when present, the task's
    ``parent_structure``.

    Args:
        taskdoc (dict): a JSON-like task document

    Returns:
        material_id of the new document, as produced by ``dbid_to_str``
    """
    task_output = taskdoc["output"]
    new_doc = {
        "created_at": datetime.utcnow(),
        "_tasksbuilder": {
            "all_task_ids": [],
            "prop_metadata": {"labels": {}, "task_ids": {}},
            "updated_at": datetime.utcnow(),
        },
        "spacegroup": task_output["spacegroup"],
        "structure": task_output["structure"],
    }

    # Atomically bump the counter and use the post-increment value.
    id_prefix = self._m_prefix
    counter_doc = self._counter.find_one_and_update(
        {"_id": "materialid"},
        {"$inc": {"c": 1}},
        return_document=ReturnDocument.AFTER)
    new_doc["material_id"] = dbid_to_str(id_prefix, counter_doc["c"])

    spacegroup = new_doc["spacegroup"]
    new_doc["sg_symbol"] = spacegroup["symbol"]
    new_doc["sg_number"] = spacegroup["number"]

    # Fields copied verbatim from the task document.
    copied_fields = ("formula_anonymous", "formula_pretty",
                     "formula_reduced_abc", "elements", "nelements",
                     "chemsys")
    for field in copied_fields:
        new_doc[field] = taskdoc[field]

    if "parent_structure" in taskdoc:
        # NOTE: this is the very same dict object that lives in taskdoc, so
        # the key added below also shows up in the caller's task document.
        parent = taskdoc["parent_structure"]
        new_doc["parent_structure"] = parent
        parent_struct = Structure.from_dict(parent["structure"])
        parent["formula_reduced_abc"] = \
            parent_struct.composition.reduced_formula

    self._materials.insert_one(new_doc)
    return new_doc["material_id"]
def run(self):
    """
    Process all successful, supported tasks that no material references yet.

    Workflow:
        * read ``_tasksbuilder.all_task_ids`` from every materials document
          to learn which tasks were handled before;
        * query the tasks collection for ``state == "successful"`` tasks
          whose ``task_label`` is in ``self.supported_task_labels``,
          combined with ``self.query`` when one is set;
        * for each remaining task, find the matching material with
          ``self._match_material`` (creating one through
          ``self._create_new_material`` when nothing matches) and apply the
          task with ``self._update_material``.

    An error in one task is logged with its traceback; processing then
    moves on to the next task.

    Raises:
        ValueError: if ``self.query`` uses a key reserved by the builder
            (``state`` or ``task_label``).
    """
    logger.info("MaterialsTaskBuilder starting...")
    logger.info("Initializing list of all new task_ids to process ...")

    # Collected into a set so the "already processed?" test below is O(1)
    # per task instead of a linear scan of a list.
    # Assumes the stored task ids are hashable (dbid_to_str output).
    previous_task_ids = {
        seen_id
        for m in self._materials.find({}, {"_tasksbuilder.all_task_ids": 1})
        for seen_id in m["_tasksbuilder"]["all_task_ids"]
    }

    q = {"state": "successful",
         "task_label": {"$in": self.supported_task_labels}}

    if self.query:
        # Refuse user queries that would clobber the builder's own filters.
        common_keys = [k for k in q if k in self.query]
        if common_keys:
            raise ValueError("User query parameter cannot contain key(s): {}".
                             format(common_keys))
        q.update(self.query)

    all_task_ids = [dbid_to_str(self._t_prefix, t["task_id"])
                    for t in self._tasks.find(q, {"task_id": 1})]
    task_ids = [t_id for t_id in all_task_ids
                if t_id not in previous_task_ids]

    logger.info("There are {} new task_ids to process.".format(len(task_ids)))

    pbar = tqdm(task_ids)
    for t_id in pbar:
        pbar.set_description("Processing task_id: {}".format(t_id))
        try:
            taskdoc = self._tasks.find_one({"task_id": dbid_to_int(t_id)})
            m_id = self._match_material(taskdoc)
            if not m_id:
                m_id = self._create_new_material(taskdoc)
            self._update_material(m_id, taskdoc)

        except Exception:
            # Narrowed from a bare ``except`` so Ctrl-C / SystemExit are not
            # swallowed. logger.exception already appends the traceback, so
            # it is written once between the delimiter lines.
            logger.error("<---")
            logger.exception(
                "There was an error processing task_id: {}".format(t_id))
            logger.error("--->")

    logger.info("TasksMaterialsBuilder finished processing.")
def _create_new_material(self, taskdoc):
    """
    Create a materials document for a task that matched no existing material.

    A new material_id is allocated by incrementing the ``materialid``
    counter document; the resulting document is inserted into the materials
    collection.

    Args:
        taskdoc (dict): a JSON-like task document

    Returns:
        the material_id stored on the new document (output of
        ``dbid_to_str``)
    """
    material = {"created_at": datetime.utcnow()}

    # Bookkeeping section maintained by this builder; starts out empty.
    bookkeeping = {
        "all_task_ids": [],
        "prop_metadata": {"labels": {}, "task_ids": {}},
        "updated_at": datetime.utcnow(),
    }
    material["_tasksbuilder"] = bookkeeping

    material["spacegroup"] = taskdoc["output"]["spacegroup"]
    material["structure"] = taskdoc["output"]["structure"]

    # Increment the shared counter and read back the new value.
    material["material_id"] = dbid_to_str(
        self._m_prefix,
        self._counter.find_one_and_update(
            {"_id": "materialid"},
            {"$inc": {"c": 1}},
            return_document=ReturnDocument.AFTER,
        )["c"],
    )

    sg = material["spacegroup"]
    material["sg_symbol"], material["sg_number"] = sg["symbol"], sg["number"]

    # Composition-related fields are taken over unchanged from the task.
    for key in ["formula_anonymous", "formula_pretty",
                "formula_reduced_abc", "elements", "nelements", "chemsys"]:
        material[key] = taskdoc[key]

    if "parent_structure" in taskdoc:
        material["parent_structure"] = taskdoc["parent_structure"]
        parent_structure = Structure.from_dict(
            taskdoc["parent_structure"]["structure"])
        # The dict is shared with taskdoc, so this key is visible there too.
        material["parent_structure"]["formula_reduced_abc"] = \
            parent_structure.composition.reduced_formula

    self._materials.insert_one(material)

    return material["material_id"]
def _update_material(self, m_id, taskdoc):
    """
    Fold the data of one task into an existing materials document.

    For every property listed in ``self.property_settings`` the task's
    quality score (looked up by its task label) is compared with the score
    of the task label the material's current value came from. The property
    is overwritten when the material has no scored value yet, when the new
    task scores higher, or when the scores tie and the new task has the
    lower ``energy_per_atom``. Finally the task_id is appended to the
    material's ``_tasksbuilder.all_task_ids``.

    Args:
        m_id: material_id of the materials document to update
        taskdoc (dict): a JSON-like task document
    """
    material_filter = {"material_id": m_id}

    # Task label each property of the material is currently based on.
    # Read once up front; it decides whether the new task is an upgrade.
    label_doc = self._materials.find_one(
        material_filter, {"_tasksbuilder.prop_metadata.labels": 1})
    current_labels = label_doc["_tasksbuilder"]["prop_metadata"]["labels"]

    # task label of the new doc that updates this material
    task_label = taskdoc["task_label"]

    for setting in self.property_settings:
        for prop in setting["properties"]:
            scores = setting["quality_scores"]

            # skip settings for which this kind of task is not a valid source
            if task_label not in scores:
                continue

            new_score = scores[task_label]
            old_score = scores.get(current_labels.get(prop, None), None)

            # Three ways for the new task to win:
            #   i) nothing scored is stored for the property yet
            #  ii) the task label has a higher quality score
            # iii) equal score, but lower energy per atom
            if not old_score or new_score > old_score:
                task_wins = True
            elif new_score == old_score:
                new_energy = taskdoc["output"]["energy_per_atom"]
                stored = self._materials.find_one(
                    material_filter, {"_tasksbuilder": 1})
                stored_energy = \
                    stored["_tasksbuilder"]["prop_metadata"]["energies"][prop]
                task_wins = new_energy < stored_energy
            else:
                task_wins = False

            if not task_wins:
                continue

            # Locate the property in the materials doc and in the task doc;
            # either side may nest it under an optional parent key.
            materials_parent = setting.get("materials_key")
            if materials_parent:
                materials_key = "{}.{}".format(materials_parent, prop)
            else:
                materials_key = prop

            tasks_parent = setting.get("tasks_key")
            if tasks_parent:
                tasks_key = "{}.{}".format(tasks_parent, prop)
            else:
                tasks_key = prop

            # property data AND metadata about the task it came from
            changes = {}
            changes[materials_key] = get_mongolike(taskdoc, tasks_key)
            changes["_tasksbuilder.prop_metadata.labels.{}".format(prop)] = \
                task_label
            changes["_tasksbuilder.prop_metadata.task_ids.{}".format(prop)] = \
                dbid_to_str(self._t_prefix, taskdoc["task_id"])
            changes["_tasksbuilder.prop_metadata.energies.{}".format(prop)] = \
                taskdoc["output"]["energy_per_atom"]
            changes["_tasksbuilder.updated_at"] = datetime.utcnow()
            self._materials.update_one(material_filter, {"$set": changes})

            # Some properties are intentionally duplicated at the root level.
            if prop in self.properties_root:
                self._materials.update_one(
                    material_filter,
                    {"$set": {prop: get_mongolike(taskdoc, tasks_key)}})

    # record that this task_id has been processed for this material
    self._materials.update_one(
        material_filter,
        {"$push": {
            "_tasksbuilder.all_task_ids":
                dbid_to_str(self._t_prefix, taskdoc["task_id"])
        }})
def _update_material(self, m_id, taskdoc):
    """
    Apply a task document to a material, keeping the best data per property.

    Each property configured in ``self.property_settings`` is replaced by
    the task's value when the task is a better source than what the
    material currently holds (no scored value yet, a higher quality score
    for the task label, or an equal score with lower ``energy_per_atom``).
    The task_id is then pushed onto ``_tasksbuilder.all_task_ids`` of the
    material.

    Args:
        m_id: material_id of the materials document to update
        taskdoc (dict): a JSON-like task document
    """

    def has_lower_energy(prop_name):
        # Tie-break for equal quality scores: compare the task's energy
        # with the energy recorded for the property on the material.
        candidate = taskdoc["output"]["energy_per_atom"]
        record = self._materials.find_one({"material_id": m_id},
                                          {"_tasksbuilder": 1})
        recorded = record["_tasksbuilder"]["prop_metadata"]["energies"]
        return candidate < recorded[prop_name]

    # Which task label backs each property of the material right now.
    existing = self._materials.find_one(
        {"material_id": m_id}, {"_tasksbuilder.prop_metadata.labels": 1})
    labels_by_prop = existing["_tasksbuilder"]["prop_metadata"]["labels"]

    incoming_label = taskdoc["task_label"]

    for group in self.property_settings:
        for name in group["properties"]:
            # only task labels with a quality score may supply the property
            if incoming_label not in group["quality_scores"]:
                continue

            incoming_q = group["quality_scores"][incoming_label]
            existing_q = group["quality_scores"].get(
                labels_by_prop.get(name, None), None)

            # Keep the stored value when there is a scored one, the incoming
            # score is not higher, and a tie is not won on energy.
            if (existing_q
                    and not (incoming_q > existing_q)
                    and not (incoming_q == existing_q
                             and has_lower_energy(name))):
                continue

            # where the property lives in the materials doc / task doc
            materials_key = name
            if group.get("materials_key"):
                materials_key = "{}.{}".format(group["materials_key"], name)
            tasks_key = name
            if group.get("tasks_key"):
                tasks_key = "{}.{}".format(group["tasks_key"], name)

            # write the property together with the provenance metadata
            metadata_root = "_tasksbuilder.prop_metadata"
            self._materials.update_one(
                {"material_id": m_id},
                {"$set": {
                    materials_key: get_mongolike(taskdoc, tasks_key),
                    metadata_root + ".labels.{}".format(name):
                        incoming_label,
                    metadata_root + ".task_ids.{}".format(name):
                        dbid_to_str(self._t_prefix, taskdoc["task_id"]),
                    metadata_root + ".energies.{}".format(name):
                        taskdoc["output"]["energy_per_atom"],
                    "_tasksbuilder.updated_at": datetime.utcnow(),
                }})

            # properties_root entries are mirrored at the document root
            if name in self.properties_root:
                self._materials.update_one(
                    {"material_id": m_id},
                    {"$set": {name: get_mongolike(taskdoc, tasks_key)}})

    # mark the task as processed for this material
    processed_id = dbid_to_str(self._t_prefix, taskdoc["task_id"])
    self._materials.update_one(
        {"material_id": m_id},
        {"$push": {"_tasksbuilder.all_task_ids": processed_id}})