Beispiel #1
0
    def run(self):
        """
        Copy the tags of newly finished tasks onto their materials documents.

        Selects successful tasks that carry "tags" and whose task_id is not
        yet listed under "_tagsbuilder.all_task_ids" of any material. For each
        one, the material listing the task under "_tasksbuilder.all_task_ids"
        is looked up and its "tags" field is set to the de-duplicated union of
        the task's and the material's tags. An error while processing one
        task is logged and the remaining tasks are still processed.
        """
        logger.info("TagsBuilder starting...")
        self._build_indexes()

        logger.info("Initializing list of all new task_ids to process ...")
        previous_task_ids = []
        for m in self._materials.find({"_tagsbuilder": {"$exists": True}},
                                      {"_tagsbuilder.all_task_ids": 1}):
            previous_task_ids.extend(m["_tagsbuilder"]["all_task_ids"])

        # materials store prefixed string ids; the tasks collection is
        # queried with the converted ids
        previous_task_ids = [dbid_to_int(t) for t in previous_task_ids]

        q = {"tags": {"$exists": True}, "task_id": {"$nin": previous_task_ids},
             "state": "successful"}

        tasks = list(self._tasks.find(q, {"task_id": 1, "tags": 1}))
        pbar = tqdm(tasks)
        for t in pbar:
            try:
                pbar.set_description("Processing task_id: {}".format(t['task_id']))

                # get the corresponding materials id
                task_str_id = dbid_to_str(self._tasks_prefix, t["task_id"])
                m = self._materials.find_one(
                    {"_tasksbuilder.all_task_ids": task_str_id},
                    {"material_id": 1, "tags": 1, "_tagsbuilder": 1})
                if not m:
                    # no material was built from this task; nothing to tag
                    continue

                # union of task and material tags; the set filters duplicates
                # and leaves the fetched task document unmodified
                all_tags = list(set(t["tags"]).union(m.get("tags") or []))

                all_tasks = [task_str_id]
                if "_tagsbuilder" in m:
                    all_tasks.extend(m["_tagsbuilder"]["all_task_ids"])

                self._materials.update_one(
                    {"material_id": m["material_id"]},
                    {"$set": {"tags": all_tags,
                              "_tagsbuilder.all_task_ids": all_tasks}})

            except Exception:
                # `Exception` (not a bare except) lets KeyboardInterrupt and
                # SystemExit stop the builder. logger.exception already
                # appends the traceback, so a single call is enough.
                logger.exception(
                    "There was an error processing task_id: {}".format(t["task_id"]))
        logger.info("TagsBuilder finished processing.")
Beispiel #2
0
    def run(self):
        """
        Process every new successful task into the materials collection.

        Task ids already listed under "_tasksbuilder.all_task_ids" of any
        material are skipped. Each remaining task is matched to an existing
        material (or a new material is created) and the material is then
        updated from the task. An error while processing one task is logged
        and the remaining tasks are still processed.

        Raises:
            ValueError: if self.query contains a key that this method sets
                itself ("state" or "task_label").
        """
        logger.info("MaterialsTaskBuilder starting...")
        logger.info("Initializing list of all new task_ids to process ...")
        # a set keeps the "already processed?" test below O(1) per task
        previous_task_ids = set()
        for m in self._materials.find({}, {"_tasksbuilder.all_task_ids": 1}):
            previous_task_ids.update(m["_tasksbuilder"]["all_task_ids"])

        q = {
            "state": "successful",
            "task_label": {
                "$in": self.supported_task_labels
            }
        }

        if self.query:
            common_keys = [k for k in q if k in self.query]
            if common_keys:
                raise ValueError(
                    "User query parameter cannot contain key(s): {}".format(
                        common_keys))
            q.update(self.query)

        all_task_ids = [
            dbid_to_str(self._t_prefix, t["task_id"])
            for t in self._tasks.find(q, {"task_id": 1})
        ]
        task_ids = [
            t_id for t_id in all_task_ids if t_id not in previous_task_ids
        ]

        logger.info("There are {} new task_ids to process.".format(
            len(task_ids)))

        pbar = tqdm(task_ids)
        for t_id in pbar:
            pbar.set_description("Processing task_id: {}".format(t_id))
            try:
                taskdoc = self._tasks.find_one({"task_id": dbid_to_int(t_id)})
                m_id = self._match_material(taskdoc)
                if not m_id:
                    m_id = self._create_new_material(taskdoc)
                self._update_material(m_id, taskdoc)

            except Exception:
                # `Exception` (not a bare except) lets KeyboardInterrupt and
                # SystemExit stop the builder. logger.exception already
                # appends the traceback, so a single call is enough.
                logger.exception(
                    "There was an error processing task_id: {}".format(t_id))

        logger.info("TasksMaterialsBuilder finished processing.")
Beispiel #3
0
    def _create_new_material(self, taskdoc):
        """
        Create a new material document.

        The document is seeded with the structure, spacegroup and formula
        fields of the task, receives the next value of the "materialid"
        counter as its material_id, and is inserted into the materials
        collection. The task document passed in is not modified.

        Args:
            taskdoc (dict): a JSON-like task document

        Returns:
            material_id of the new document, i.e. the value produced by
            dbid_to_str from the materials prefix and the counter value
        """
        # one timestamp so created_at and updated_at agree on creation
        now = datetime.utcnow()
        doc = {"created_at": now}
        doc["_tasksbuilder"] = {
            "all_task_ids": [],
            "prop_metadata": {
                "labels": {},
                "task_ids": {}
            },
            "updated_at": now
        }
        doc["spacegroup"] = taskdoc["output"]["spacegroup"]
        doc["structure"] = taskdoc["output"]["structure"]

        # atomically increment the counter and read back the new value
        counter_doc = self._counter.find_one_and_update(
            {"_id": "materialid"}, {"$inc": {"c": 1}},
            return_document=ReturnDocument.AFTER)
        doc["material_id"] = dbid_to_str(self._m_prefix, counter_doc["c"])

        doc["sg_symbol"] = doc["spacegroup"]["symbol"]
        doc["sg_number"] = doc["spacegroup"]["number"]

        for x in [
                "formula_anonymous", "formula_pretty", "formula_reduced_abc",
                "elements", "nelements", "chemsys"
        ]:
            doc[x] = taskdoc[x]

        if "parent_structure" in taskdoc:
            # shallow copy: adding the formula key below must not leak into
            # the caller's task document
            parent = dict(taskdoc["parent_structure"])
            t_struct = Structure.from_dict(parent["structure"])
            # NOTE(review): the key is "formula_reduced_abc" but the value is
            # composition.reduced_formula - confirm this format is intended
            parent["formula_reduced_abc"] = t_struct.composition.reduced_formula
            doc["parent_structure"] = parent

        self._materials.insert_one(doc)

        return doc["material_id"]
Beispiel #4
0
    def run(self):
        """
        Process every new successful task into the materials collection.

        Task ids already listed under "_tasksbuilder.all_task_ids" of any
        material are skipped. Each remaining task is matched to an existing
        material (or a new material is created) and the material is then
        updated from the task. An error while processing one task is logged
        and the remaining tasks are still processed.

        Raises:
            ValueError: if self.query contains a key that this method sets
                itself ("state" or "task_label").
        """
        logger.info("MaterialsTaskBuilder starting...")
        logger.info("Initializing list of all new task_ids to process ...")
        # a set keeps the "already processed?" test below O(1) per task
        previous_task_ids = set()
        for m in self._materials.find({}, {"_tasksbuilder.all_task_ids": 1}):
            previous_task_ids.update(m["_tasksbuilder"]["all_task_ids"])

        q = {"state": "successful", "task_label": {"$in": self.supported_task_labels}}

        if self.query:
            common_keys = [k for k in q if k in self.query]
            if common_keys:
                raise ValueError("User query parameter cannot contain key(s): {}".
                                 format(common_keys))
            q.update(self.query)

        all_task_ids = [dbid_to_str(self._t_prefix, t["task_id"]) for t in
                        self._tasks.find(q, {"task_id": 1})]
        task_ids = [t_id for t_id in all_task_ids if t_id not in previous_task_ids]

        logger.info("There are {} new task_ids to process.".format(len(task_ids)))

        pbar = tqdm(task_ids)
        for t_id in pbar:
            pbar.set_description("Processing task_id: {}".format(t_id))
            try:
                taskdoc = self._tasks.find_one({"task_id": dbid_to_int(t_id)})
                m_id = self._match_material(taskdoc)
                if not m_id:
                    m_id = self._create_new_material(taskdoc)
                self._update_material(m_id, taskdoc)

            except Exception:
                # `Exception` (not a bare except) lets KeyboardInterrupt and
                # SystemExit stop the builder. logger.exception already
                # appends the traceback, so a single call is enough.
                logger.exception("There was an error processing task_id: {}".format(t_id))

        logger.info("TasksMaterialsBuilder finished processing.")
Beispiel #5
0
    def _create_new_material(self, taskdoc):
        """
        Create a new material document.

        The document is seeded with the structure, spacegroup and formula
        fields of the task, receives the next value of the "materialid"
        counter as its material_id, and is inserted into the materials
        collection. The task document passed in is not modified.

        Args:
            taskdoc (dict): a JSON-like task document

        Returns:
            material_id of the new document, i.e. the value produced by
            dbid_to_str from the materials prefix and the counter value
        """
        # one timestamp so created_at and updated_at agree on creation
        now = datetime.utcnow()
        doc = {"created_at": now}
        doc["_tasksbuilder"] = {"all_task_ids": [], "prop_metadata":
            {"labels": {}, "task_ids": {}}, "updated_at": now}
        doc["spacegroup"] = taskdoc["output"]["spacegroup"]
        doc["structure"] = taskdoc["output"]["structure"]

        # atomically increment the counter and read back the new value
        counter_doc = self._counter.find_one_and_update(
            {"_id": "materialid"}, {"$inc": {"c": 1}},
            return_document=ReturnDocument.AFTER)
        doc["material_id"] = dbid_to_str(self._m_prefix, counter_doc["c"])

        doc["sg_symbol"] = doc["spacegroup"]["symbol"]
        doc["sg_number"] = doc["spacegroup"]["number"]

        for x in ["formula_anonymous", "formula_pretty", "formula_reduced_abc", "elements",
                  "nelements", "chemsys"]:
            doc[x] = taskdoc[x]

        if "parent_structure" in taskdoc:
            # shallow copy: adding the formula key below must not leak into
            # the caller's task document
            parent = dict(taskdoc["parent_structure"])
            t_struct = Structure.from_dict(parent["structure"])
            # NOTE(review): the key is "formula_reduced_abc" but the value is
            # composition.reduced_formula - confirm this format is intended
            parent["formula_reduced_abc"] = t_struct.composition.reduced_formula
            doc["parent_structure"] = parent

        self._materials.insert_one(doc)

        return doc["material_id"]
Beispiel #6
0
    def _update_material(self, m_id, taskdoc):
        """
        Update a material document based on a new task and using complex logic

        For every property in self.property_settings whose "quality_scores"
        contain the task's label, the property value is copied from the task
        into the material when one of the following holds:
            i) the material has no scored data for that property yet
            ii) the task's quality score is higher than the stored one
            iii) the scores are equal and the task has the lower
                 energy_per_atom
        Finally the task id is appended to "_tasksbuilder.all_task_ids" so
        the task counts as processed.

        Args:
            m_id: material_id of the material document to update
            taskdoc (dict): a JSON-like task document
        """
        material_filter = {"material_id": m_id}

        # For each materials property, figure out what kind of task the data is currently based on
        # as defined by the task label.  This is used to decide if the new taskdoc is a type of
        # calculation that provides higher quality data for that property
        prop_tlabels = self._materials.find_one(
            material_filter,
            {"_tasksbuilder.prop_metadata.labels": 1
             })["_tasksbuilder"]["prop_metadata"]["labels"]

        task_label = taskdoc[
            "task_label"]  # task label of new doc that updates this material
        task_str_id = dbid_to_str(self._t_prefix, taskdoc["task_id"])

        # figure out what materials properties need to be updated based on new task
        for x in self.property_settings:
            quality_scores = x["quality_scores"]
            # the label check does not depend on the property, so do it once
            if task_label not in quality_scores:
                continue
            t_quality = quality_scores[task_label]

            for p in x["properties"]:
                # quality of the data currently stored (None if absent/unscored)
                m_quality = quality_scores.get(prop_tlabels.get(p))

                # `is None` rather than truthiness: a stored score of 0 is
                # real data and must still go through the comparisons below
                if m_quality is None or t_quality > m_quality:
                    is_better = True
                elif t_quality == m_quality:
                    # tie on quality: use the lowest energy task; fetch only
                    # the stored energies needed for the comparison
                    stored = self._materials.find_one(
                        material_filter,
                        {"_tasksbuilder.prop_metadata.energies": 1})
                    is_better = (
                        taskdoc["output"]["energy_per_atom"] <
                        stored["_tasksbuilder"]["prop_metadata"]["energies"][p])
                else:
                    is_better = False

                if not is_better:
                    continue

                # this task has better quality data
                # figure out where the property data lives in the materials doc and
                # in the task doc
                materials_key = "{}.{}".format(x["materials_key"], p) \
                    if x.get("materials_key") else p
                tasks_key = "{}.{}".format(x["tasks_key"], p) \
                    if x.get("tasks_key") else p
                value = get_mongolike(taskdoc, tasks_key)

                # insert property data AND metadata about this task
                self._materials.update_one(material_filter, {"$set": {
                    materials_key: value,
                    "_tasksbuilder.prop_metadata.labels.{}".format(p): task_label,
                    "_tasksbuilder.prop_metadata.task_ids.{}".format(p): task_str_id,
                    "_tasksbuilder.prop_metadata.energies.{}".format(p):
                        taskdoc["output"]["energy_per_atom"],
                    "_tasksbuilder.updated_at": datetime.utcnow()}})

                # copy property to document root if in properties_root
                # i.e., intentionally duplicate some data to the root level
                # (kept as a separate update so the root key can never clash
                # with materials_key inside a single $set)
                if p in self.properties_root:
                    self._materials.update_one(material_filter,
                                               {"$set": {p: value}})

        # update the database to reflect that this task_id was already processed
        self._materials.update_one(material_filter, {
            "$push": {
                "_tasksbuilder.all_task_ids": task_str_id
            }
        })
Beispiel #7
0
    def _update_material(self, m_id, taskdoc):
        """
        Update a material document based on a new task and using complex logic

        For every property in self.property_settings whose "quality_scores"
        contain the task's label, the property value is copied from the task
        into the material when one of the following holds:
            i) the material has no scored data for that property yet
            ii) the task's quality score is higher than the stored one
            iii) the scores are equal and the task has the lower
                 energy_per_atom
        Finally the task id is appended to "_tasksbuilder.all_task_ids" so
        the task counts as processed.

        Args:
            m_id: material_id of the material document to update
            taskdoc (dict): a JSON-like task document
        """
        material_filter = {"material_id": m_id}

        # For each materials property, figure out what kind of task the data is currently based on
        # as defined by the task label.  This is used to decide if the new taskdoc is a type of
        # calculation that provides higher quality data for that property
        prop_tlabels = self._materials.find_one(
            material_filter, {"_tasksbuilder.prop_metadata.labels": 1})[
            "_tasksbuilder"]["prop_metadata"]["labels"]

        task_label = taskdoc["task_label"]  # task label of new doc that updates this material
        task_str_id = dbid_to_str(self._t_prefix, taskdoc["task_id"])

        # figure out what materials properties need to be updated based on new task
        for x in self.property_settings:
            quality_scores = x["quality_scores"]
            # the label check does not depend on the property, so do it once
            if task_label not in quality_scores:
                continue
            t_quality = quality_scores[task_label]

            for p in x["properties"]:
                # quality of the data currently stored (None if absent/unscored)
                m_quality = quality_scores.get(prop_tlabels.get(p))

                # `is None` rather than truthiness: a stored score of 0 is
                # real data and must still go through the comparisons below
                if m_quality is None or t_quality > m_quality:
                    is_better = True
                elif t_quality == m_quality:
                    # tie on quality: use the lowest energy task; fetch only
                    # the stored energies needed for the comparison
                    stored = self._materials.find_one(
                        material_filter,
                        {"_tasksbuilder.prop_metadata.energies": 1})
                    is_better = (
                        taskdoc["output"]["energy_per_atom"] <
                        stored["_tasksbuilder"]["prop_metadata"]["energies"][p])
                else:
                    is_better = False

                if not is_better:
                    continue

                # this task has better quality data
                # figure out where the property data lives in the materials doc and
                # in the task doc
                materials_key = "{}.{}".format(x["materials_key"], p) \
                    if x.get("materials_key") else p
                tasks_key = "{}.{}".format(x["tasks_key"], p) \
                    if x.get("tasks_key") else p
                value = get_mongolike(taskdoc, tasks_key)

                # insert property data AND metadata about this task
                self._materials.update_one(material_filter, {"$set": {
                    materials_key: value,
                    "_tasksbuilder.prop_metadata.labels.{}".format(p): task_label,
                    "_tasksbuilder.prop_metadata.task_ids.{}".format(p): task_str_id,
                    "_tasksbuilder.prop_metadata.energies.{}".format(p):
                        taskdoc["output"]["energy_per_atom"],
                    "_tasksbuilder.updated_at": datetime.utcnow()}})

                # copy property to document root if in properties_root
                # i.e., intentionally duplicate some data to the root level
                # (kept as a separate update so the root key can never clash
                # with materials_key inside a single $set)
                if p in self.properties_root:
                    self._materials.update_one(material_filter,
                                               {"$set": {p: value}})

        # update the database to reflect that this task_id was already processed
        self._materials.update_one(material_filter,
                                   {"$push": {"_tasksbuilder.all_task_ids": task_str_id}})