Beispiel #1
0
 def test_default_namespace_inheritance(self):
     """An entity created inside a bundle must inherit the document-level
     default namespace and survive a round trip."""
     document = ProvDocument()
     document.set_default_namespace('http://www.example.org/')
     sub_bundle = document.bundle('bundle')
     entity = sub_bundle.entity('e1')
     self.assertIsNotNone(entity.identifier, "e1's identifier is None!")
     self.assertRoundTripEquivalence(document)
 def test_default_namespace_inheritance(self):
     """A bundle entity picks up the default namespace declared on the
     enclosing document."""
     doc = ProvDocument()
     doc.set_default_namespace("http://www.example.org/")
     nested = doc.bundle("bundle")
     ent = nested.entity("e1")
     self.assertIsNotNone(ent.identifier, "e1's identifier is None!")
     self.do_tests(doc)
    def test_bundle_update_simple(self):
        """Bundle.update must reject non-bundle arguments and merge the
        records of a sibling bundle."""
        document = ProvDocument()
        document.set_default_namespace(EX_URI)

        first = document.bundle('b1')
        first.entity('e')

        second = document.bundle('b2')
        second.entity('e')

        # Only another bundle is a valid argument.
        with self.assertRaises(ProvException):
            first.update(1)
        with self.assertRaises(ProvException):
            first.update(document)

        first.update(second)
        self.assertEqual(2, len(first.get_records()))
Beispiel #4
0
    def test_bundle_update_simple(self):
        """Updating a bundle from a sibling bundle doubles its record count."""
        doc = ProvDocument()
        doc.set_default_namespace(EX_URI)

        target = doc.bundle('b1')
        target.entity('e')
        source = doc.bundle('b2')
        source.entity('e')

        # Anything that is not a bundle must raise.
        self.assertRaises(ProvException, target.update, 1)
        self.assertRaises(ProvException, target.update, doc)

        target.update(source)
        self.assertEqual(len(target.get_records()), 2)
Beispiel #5
0
def create_graph(packages: List[Union[CommitModelPackage, ResourceModelPackage, ReleaseTagPackage]]) -> ProvDocument:
    """Create a provenance graph from a homogeneous list of packages.

    The graph model is chosen from the type of the first package.  The
    dispatch table covers commit, resource and release-tag packages; the
    previous type annotation omitted ``ReleaseTagPackage`` even though it
    was dispatched on.  Duplicated specializationOf relations are removed
    by the uniqueness pass.

    :param packages: packages of a single model type; may be empty.
    :return: the populated ``ProvDocument`` (empty when *packages* is empty).
    :raises KeyError: if the first package is of an unsupported type.
    """
    graph = ProvDocument()
    graph.set_default_namespace("gitlab2prov")

    if not packages:
        return graph

    # Dispatch on the type of the first package; the list is assumed to be
    # homogeneous — TODO confirm with callers.
    model = {
        CommitModelPackage: commit_package_model,
        ResourceModelPackage: resource_package_model,
        ReleaseTagPackage: release_tag_model
    }[type(packages[0])]

    graph = model(graph, packages)
    graph = enforce_uniqueness_constraints(graph)
    return graph
    def test_document_update_simple(self):
        """Document.update merges records and unions bundles from another
        document, rejecting non-document arguments."""
        target = ProvDocument()
        target.set_default_namespace(EX_URI)
        target.entity('e')

        bundle = target.bundle('b1')
        bundle.entity('e')

        source = ProvDocument()
        source.set_default_namespace(EX_URI)
        source.entity('e')

        bundle = source.bundle('b1')
        bundle.entity('e')
        other = source.bundle('b2')
        other.entity('e')

        with self.assertRaises(ProvException):
            target.update(1)

        target.update(source)
        self.assertEqual(2, len(target.get_records()))
        self.assertEqual(2, len(target.bundles))
Beispiel #7
0
    def test_document_update_simple(self):
        """Merging one document into another deduplicates records while
        unioning the sets of bundles."""
        primary = ProvDocument()
        primary.set_default_namespace(EX_URI)
        primary.entity('e')
        primary.bundle('b1').entity('e')

        secondary = ProvDocument()
        secondary.set_default_namespace(EX_URI)
        secondary.entity('e')
        secondary.bundle('b1').entity('e')
        secondary.bundle('b2').entity('e')

        self.assertRaises(ProvException, lambda: primary.update(1))

        primary.update(secondary)
        self.assertEqual(len(primary.get_records()), 2)
        self.assertEqual(len(primary.bundles), 2)
Beispiel #8
0
    def create_prov(self):
        """Create one blank ProvDocument per successfully crawled visit.

        Reads visit ids for this crawl from ``site_visits``, skips visits
        whose ``crawl_history.command_status`` is ``'error'``, and stores a
        fresh document in ``self.documents`` keyed by visit id.
        """
        # Collect all visit ids for this crawl up front so the cursor is
        # free to be reused inside the loop below.
        visits = [
            row[0]
            for row in self.db_cursor.execute(
                "select visit_id from site_visits where crawl_id=?",
                [str(self.crawl_id)])
        ]

        for visit_id in visits:
            try:
                self.db_cursor.execute(
                    "select command_status from crawl_history where visit_id=?",
                    [str(visit_id)])
                # Check if visit had an error; a missing history row raises
                # (fetchone() -> None) and is treated like any other failure.
                if self.db_cursor.fetchone()[0] == 'error':
                    print('Error in visit %s' % visit_id)
                    continue
                document = ProvDocument()
                document.set_default_namespace('http://prj.com')
                # Save a blank document for this visit
                self.documents[visit_id] = document
            except Exception:
                # Was a bare ``except:`` — narrowed so KeyboardInterrupt and
                # SystemExit propagate.  A failing visit is still skipped
                # silently, preserving the original best-effort behaviour.
                pass
Beispiel #9
0
class Provenance(object):
    """Collects W3C PROV records for a daops/housemartin processing run.

    Builds a ``prov.ProvDocument`` describing the software agents involved
    and, per operator call, the derivation of output files from inputs.
    Use ``start()`` first, then ``add_operator()`` per operation, then
    ``write_json()`` / ``write_png()`` to persist.
    """

    def __init__(self, output_dir):
        # Directory where write_json()/write_png() place their files.
        self.output_dir = output_dir
        # The prov.ProvDocument; created in start().
        self.doc = None
        # provone:Workflow entity; set in start() only when workflow=True.
        self.workflow = None

    def start(self, workflow=False):
        """Initialise the document, its namespaces and software agents.

        :param workflow: when True, additionally record a provone:Workflow
            entity and an ``:orchestrate`` activity associated with it.
        """
        from daops import __version__ as daops_version
        from housemartin import __version__ as housemartin_version

        self.doc = ProvDocument()
        # Declaring namespaces for various prefixes
        self.doc.set_default_namespace(uri="http://purl.org/roocs/prov#")
        self.doc.add_namespace("prov", uri="http://www.w3.org/ns/prov#")
        self.doc.add_namespace(
            "provone", uri="http://purl.dataone.org/provone/2015/01/15/ontology#"
        )
        self.doc.add_namespace("dcterms", uri="http://purl.org/dc/terms/")
        # Define entities
        project_cds = self.doc.agent(
            ":copernicus_CDS",
            {
                "prov:type": "prov:Organization",
                "dcterms:title": "Copernicus Climate Data Store",
            },
        )
        self.sw_housemartin = self.doc.agent(
            ":housemartin",
            {
                "prov:type": "prov:SoftwareAgent",
                "dcterms:source": f"https://github.com/cedadev/housemartin/releases/tag/v{housemartin_version}",
            },
        )
        self.doc.wasAttributedTo(self.sw_housemartin, project_cds)
        self.sw_daops = self.doc.agent(
            ":daops",
            {
                "prov:type": "prov:SoftwareAgent",
                "dcterms:source": f"https://github.com/roocs/daops/releases/tag/v{daops_version}",
            },
        )
        # workflow
        if workflow is True:
            self.workflow = self.doc.entity(
                ":workflow", {"prov:type": "provone:Workflow"}
            )
            # NOTE(review): start/end times are hard-coded placeholders —
            # confirm whether real timestamps should be recorded here.
            orchestrate = self.doc.activity(
                ":orchestrate",
                other_attributes={
                    "prov:startedAtTime": "2020-11-26T09:15:00",
                    "prov:endedAtTime": "2020-11-26T09:30:00",
                },
            )
            self.doc.wasAssociatedWith(
                orchestrate, agent=self.sw_housemartin, plan=self.workflow
            )

    def add_operator(self, operator, parameters, collection, output):
        """Record one operator run: the activity, its input and its output.

        :param operator: operator name; used as the activity identifier.
        :param parameters: mapping; only 'time' and 'apply_fixes' are recorded.
        :param collection: input paths; only the first element's basename is used.
        :param output: output paths; only the first element's basename is used.
        """
        op = self.doc.activity(
            f":{operator}",
            other_attributes={
                ":time": parameters.get("time"),
                ":apply_fixes": parameters.get("apply_fixes"),
            },
        )
        # input data
        ds_in = os.path.basename(collection[0])
        # ds_in_attrs = {
        #     'prov:type': 'provone:Data',
        #     'prov:value': f'{ds_in}',
        # }
        op_in = self.doc.entity(f":{ds_in}")
        # operator started by daops
        if self.workflow:
            self.doc.wasAssociatedWith(op, agent=self.sw_daops, plan=self.workflow)
        else:
            self.doc.start(op, starter=self.sw_daops, trigger=self.sw_housemartin)
        # Generated output file

        ds_out = os.path.basename(output[0])
        # ds_out_attrs = {
        #     'prov:type': 'provone:Data',
        #     'prov:value': f'{ds_out}',
        # }
        op_out = self.doc.entity(f":{ds_out}")
        self.doc.wasDerivedFrom(op_out, op_in, activity=op)

    def write_json(self):
        """Serialize the document as PROV-JSON; return the output path."""
        outfile = os.path.join(self.output_dir, "provenance.json")
        self.doc.serialize(outfile, format="json")
        return outfile

    def write_png(self):
        """Render the document to PNG via graphviz; return the output path."""
        outfile = os.path.join(self.output_dir, "provenance.png")
        figure = prov_to_dot(self.doc)
        figure.write_png(outfile)
        return outfile
Beispiel #10
0
def provlist2provdoc(provlist, default_ns=DEFAULT_NS):
    """Convert a list of provenance dictionaries to a W3C PROV document.

    Each dict in *provlist* may describe a session ("session_id"), an
    activity ("activity_id") or an entity ("entity_id").  Recognised keys
    are popped as they are consumed; any leftover keys are attached to the
    record as plain string attributes.  Created records are cached in
    ``records`` (keyed by qualified id) so repeated ids reuse one record.

    Fix: the session cache lookup previously tested ``sess_id in records``
    although the cache is keyed by ``sess_qid``, so the cache never hit and
    a session mentioned more than once produced duplicate entity records.

    :param provlist: list of provenance dictionaries (mutated via ``pop``).
    :param default_ns: namespace prefix used to qualify unprefixed ids.
    :return: a ``prov.model.ProvDocument``.
    """
    pdoc = ProvDocument()
    pdoc.set_default_namespace("param:")
    pdoc.add_namespace(default_ns, default_ns + ":")
    pdoc.add_namespace("voprov", "voprov:")
    records = {}  # qualified id -> already-created PROV record
    sess_id = ""
    for provdict in provlist:
        # session
        if "session_id" in provdict:
            sess_id = str(provdict.pop("session_id"))
            sess_qid = default_ns + ":" + sess_id
            if sess_qid in records:  # was `sess_id in records` — never matched
                sess = records[sess_qid]
            else:
                sess = pdoc.entity(sess_qid)
                records[sess_qid] = sess
            sess.add_attributes({
                "prov:label":
                "LogProvSession",
                "prov:type":
                "LogProvSession",
                "prov:generatedAtTime":
                provdict.pop("startTime"),
                #'configFile': provdict.pop('configFile'),
                'module':
                str(provdict.pop('module')),
                'class':
                str(provdict.pop('class')),
                'system':
                str(provdict.pop('system'))[:50],
                'definitions':
                str(provdict.pop('definitions'))[:50],
            })
        # activity
        if "activity_id" in provdict:
            act_id = default_ns + ":" + "_".join(
                [sess_id,
                 str(provdict.pop("activity_id")).replace("-", "")])
            if act_id in records:
                act = records[act_id]
            else:
                act = pdoc.activity(act_id)
                records[act_id] = act
            # activity name
            if "name" in provdict:
                act.add_attributes({"prov:label": provdict.pop("name")})
            # activity start
            if "startTime" in provdict:
                act.set_time(startTime=datetime.datetime.fromisoformat(
                    provdict.pop("startTime")))
            # activity end
            if "endTime" in provdict:
                act.set_time(endTime=datetime.datetime.fromisoformat(
                    provdict.pop("endTime")))
            # in session?
            # if "in_session" in provdict:
            #     sess_qid = default_ns + ":" + str(provdict.pop("in_session"])
            #     pdoc.wasInfluencedBy(
            #         act_id, sess_id
            #     )  # , other_attributes={'prov:type': "Context"})
            # activity configuration
            if "agent_name" in provdict:
                agent_id = str(provdict.pop("agent_name"))
                if ":" not in agent_id:
                    agent_id = default_ns + ":" + agent_id
                else:
                    # Register the id's own prefix as a namespace.
                    new_ns = agent_id.split(":").pop(0)
                    pdoc.add_namespace(new_ns, new_ns + ":")
                if agent_id in records:
                    agent = records[agent_id]
                else:
                    agent = pdoc.agent(agent_id)
                    records[agent_id] = agent
                act.wasAssociatedWith(agent,
                                      attributes={"prov:role": "Operator"})
            if "parameters" in provdict:
                params_record = provdict.pop("parameters")
                params = {k: str(params_record[k]) for k in params_record}
                # par_id = act_id + "_parameters"
                # par = pdoc.entity(par_id, other_attributes=params)
                # par.add_attributes({"prov:type": "Parameters"})
                # par.add_attributes({"prov:label": "WasConfiguredBy"})
                # act.used(par, attributes={"prov:type": "Setup"})
                for name, value in params.items():
                    # Long values are truncated for display; "..." marks it.
                    value_short = str(value)[:20]
                    if len(value_short) == 20:
                        value_short += "..."
                    par = pdoc.entity(act_id + "_" + name)
                    par.add_attributes(
                        {"prov:label": name + " = " + value_short})
                    par.add_attributes({"prov:type": "voprov:Parameter"})
                    par.add_attributes({"voprov:name": name})
                    par.add_attributes({"prov:value": value_short})
                    act.used(par, attributes={"prov:type": "Setup"})
            # usage
            if "used_id" in provdict:
                ent_id = str(provdict.pop("used_id"))
                if ":" not in ent_id:
                    ent_id = default_ns + ":" + "_".join([sess_id, ent_id])
                else:
                    new_ns = ent_id.split(":").pop(0)
                    pdoc.add_namespace(new_ns, new_ns + ":")
                if ent_id in records:
                    ent = records[ent_id]
                else:
                    ent = pdoc.entity(ent_id)
                    records[ent_id] = ent
                rol = provdict.pop("used_role", None)
                # if rol:
                #     ent.add_attributes({'prov:label': rol})
                act.used(ent, attributes={"prov:role": rol})
            # generation
            if "generated_id" in provdict:
                ent_id = str(provdict.pop("generated_id"))
                if ":" not in ent_id:
                    ent_id = default_ns + ":" + "_".join([sess_id, ent_id])
                else:
                    new_ns = ent_id.split(":").pop(0)
                    pdoc.add_namespace(new_ns, new_ns + ":")
                if ent_id in records:
                    ent = records[ent_id]
                else:
                    ent = pdoc.entity(ent_id)
                    records[ent_id] = ent
                rol = provdict.pop("generated_role", None)
                # if rol:
                #     ent.add_attributes({'prov:label': rol})
                ent.wasGeneratedBy(act, attributes={"prov:role": rol})
            # Any keys not consumed above become plain activity attributes.
            for k, v in provdict.items():
                act.add_attributes({k: str(v)})
        # entity
        if "entity_id" in provdict:
            ent_id = str(provdict.pop("entity_id"))
            label = ""
            if ":" not in ent_id:
                ent_id = default_ns + ":" + "_".join([sess_id, ent_id])
            else:
                new_ns = ent_id.split(":").pop(0)
                pdoc.add_namespace(new_ns, new_ns + ":")
            if ent_id in records:
                ent = records[ent_id]
            else:
                ent = pdoc.entity(ent_id)
                records[ent_id] = ent
            if "name" in provdict:
                label = provdict.pop("name")
                ent.add_attributes({"voprov:name": label})
            if "entity_description" in provdict:
                label = provdict.pop("entity_description")
                ent.add_attributes({"voprov:entity_description": label})
            if "type" in provdict:
                ent.add_attributes({"prov:type": provdict.pop("type")})
            if "value" in provdict:
                value_short = str(provdict.pop("value"))[:20]
                if len(value_short) == 20:
                    value_short += "..."
                ent.add_attributes({"prov:value": value_short})
            if "location" in provdict:
                location = str(provdict.pop("location"))
                ent.add_attributes({"prov:location": location})
                if label:
                    label = label + " in " + location
            if label:
                ent.add_attributes({"prov:label": label})
            if "generated_time" in provdict:
                ent.add_attributes({
                    "prov:generatedAtTime":
                    str(provdict.pop("generated_time"))
                })
            # member
            if "member_id" in provdict:
                mem_id = str(provdict.pop("member_id"))
                if ":" not in mem_id:
                    mem_id = default_ns + ":" + "_".join([sess_id, mem_id])
                else:
                    new_ns = mem_id.split(":").pop(0)
                    pdoc.add_namespace(new_ns, new_ns + ":")
                if mem_id in records:
                    mem = records[mem_id]
                else:
                    mem = pdoc.entity(mem_id)
                    records[mem_id] = mem
                ent.hadMember(mem)
            if "progenitor_id" in provdict:
                progen_id = str(provdict.pop("progenitor_id"))
                if ":" not in progen_id:
                    progen_id = default_ns + ":" + "_".join(
                        [sess_id, progen_id])
                else:
                    new_ns = progen_id.split(":").pop(0)
                    pdoc.add_namespace(new_ns, new_ns + ":")
                if progen_id in records:
                    progen = records[progen_id]
                else:
                    progen = pdoc.entity(progen_id)
                    records[progen_id] = progen
                ent.wasDerivedFrom(progen)
            # Any keys not consumed above become plain entity attributes.
            for k, v in provdict.items():
                ent.add_attributes({k: str(v)})
        # agent
    return pdoc
Beispiel #11
0
class Context(object):
    """
    Context is a singleton storing all
    of the run specific data.
    """
    def __init__(self):
        """Set up empty run-state containers and the provenance document."""
        # Warning;
        # If new data is added with a site dimension the
        # clip exposure function may need to be updated
        # so the site data stays consistent.

        # --------------  These variables are saved ----
        #  If new variables are added the save functions
        # will need to be modified.

        # Latitude and longitude values of the exposure data
        # Has a site dimension
        self.exposure_lat = None
        self.exposure_long = None

        # Data with a site dimension
        # key - data name
        # value - A numpy array. First dimension is site. (0 axis)
        # Has a site dimension
        self.exposure_att = None

        # Data for aggregation across sites
        self.exposure_agg = None

        #
        # --------------  The above variables are saved ----

        # key - intensity measure
        # value - One instance of RealisedVulnerabilityCurves.  An att in this
        #         class has a site dimension.
        self.exposure_vuln_curves = None

        # A dictionary of the vulnerability sets.
        # Not associated with exposures.
        # key - vulnerability set ID
        # value - vulnerability set instance
        self.vulnerability_sets = {}

        # A dictionary with keys being vulnerability_set_ids and
        # value being the exposure attribute who's values are vulnerability
        # function ID's.
        self.vul_function_titles = {}

        # A `prov.ProvDocument` to manage provenance information, including
        # adding required namespaces
        self.prov = ProvDocument()
        self.prov.set_default_namespace("")
        self.prov.add_namespace('prov', 'http://www.w3.org/ns/prov#')
        self.prov.add_namespace('xsd', 'http://www.w3.org/2001/XMLSchema#')
        self.prov.add_namespace('foaf', 'http://xmlns.com/foaf/0.1/')
        self.prov.add_namespace('void', 'http://vocab.deri.ie/void#')
        self.prov.add_namespace('dcterms', 'http://purl.org/dc/terms/')

        commit, branch, dt = misc.get_git_commit()
        # Create the fundamental software agent that is this code:
        self.prov.agent(
            ":hazimp", {
                "prov:type": "prov:SoftwareAgent",
                "prov:Revision": commit,
                "prov:branch": branch,
                "prov:date": dt
            })
        # Record the current OS user as the person running the analysis.
        self.prov.agent(f":{getpass.getuser()}", {"prov:type": "foaf:Person"})
        self.prov.actedOnBehalfOf(":hazimp", f":{getpass.getuser()}")
        # Qualified label of the top-level analysis activity; set by
        # set_prov_label() and referenced by the save methods.
        self.provlabel = ''

    def set_prov_label(self, label, title="HazImp analysis"):
        """
        Set the qualified label for the provenance data
        """

        self.provlabel = f":{label}"
        self.prov.activity(f":{label}",
                           datetime.now().strftime(DATEFMT), None, {
                               "dcterms:title": title,
                               "prov:type": "void:Analysis"
                           })
        self.prov.wasAttributedTo(self.provlabel, ":hazimp")

    def get_site_shape(self):
        """
        Get the numpy shape of sites the context is storing.
        It is based on the shape of exposure_long.

        :return: The numpy shape of sites the context is storing.
        """
        if self.exposure_long is None:
            # NOTE(review): (0) is the int 0, not the 1-tuple (0,) — confirm
            # whether callers expect a tuple here.
            shape = (0)
        else:
            shape = self.exposure_long.shape
        return shape

    def clip_exposure(self, min_long, min_lat, max_long, max_lat):
        """ min_long, min_lat, max_long, max_lat
        Clip the exposure data so only the exposure values within
        the rectangle formed by  max_lat, min_lat, max_long and
        min_long are included.

        Note: This must be called before the exposure_vuln_curves
        are determined, since the curves have a site dimension.
        """
        assert self.exposure_vuln_curves is None

        # Collect indexes of sites falling outside the bounding box.
        bad_indexes = set()
        bad_indexes = bad_indexes.union(
            numpy.where(self.exposure_long < min_long)[0])
        bad_indexes = bad_indexes.union(
            numpy.where(self.exposure_long > max_long)[0])
        bad_indexes = bad_indexes.union(
            numpy.where(self.exposure_lat < min_lat)[0])
        bad_indexes = bad_indexes.union(
            numpy.where(self.exposure_lat > max_lat)[0])
        good_indexes = numpy.array(list(
            set(range(self.exposure_lat.size)).difference(bad_indexes)),
                                   dtype=int)

        if good_indexes.shape[0] == 0:
            self.exposure_lat = numpy.array([])
            self.exposure_long = numpy.array([])
        else:
            self.exposure_lat = self.exposure_lat[good_indexes]
            self.exposure_long = self.exposure_long[good_indexes]

        # Keep every per-site attribute consistent with the clipped coords.
        if isinstance(self.exposure_att, dict):
            for key in self.exposure_att:
                if good_indexes.shape[0] == 0:
                    exp_att = numpy.array([])
                else:
                    exp_att = self.exposure_att[key][good_indexes]
                self.exposure_att[key] = exp_att
        else:
            # presumably a pandas DataFrame here — take() selects rows.
            self.exposure_att = self.exposure_att.take(good_indexes)

    def save_exposure_atts(self, filename, use_parallel=True):
        """
        Save the exposure attributes, including latitude and longitude.
        The file type saved is based on the filename extension.
        Options
           '.npz': Save the arrays into a single file in uncompressed .npz
                   format.

        :param use_parallel: Set to True for parallel behaviour
        Which is only node 0 writing to file.
        :param filename: The file to be written.
        :return write_dict: The whole dictionary, returned for testing.
        """
        [filename, bucket_name, bucket_key] = \
            misc.create_temp_file_path_for_s3(filename)
        # Record the output file and the save activity in the provenance doc.
        s1 = self.prov.entity(
            ":HazImp output file", {
                "prov:label": "Full HazImp output file",
                "prov:type": "void:Dataset",
                "prov:atLocation": os.path.basename(filename)
            })
        a1 = self.prov.activity(":SaveImpactData",
                                datetime.now().strftime(DATEFMT), None)
        self.prov.wasGeneratedBy(s1, a1)
        self.prov.wasInformedBy(a1, self.provlabel)
        write_dict = self.exposure_att.copy()
        write_dict[EX_LAT] = self.exposure_lat
        write_dict[EX_LONG] = self.exposure_long

        if use_parallel:
            assert misc.INTID in write_dict
            write_dict = parallel.gather_dict(write_dict,
                                              write_dict[misc.INTID])

        # Only rank 0 writes; other ranks implicitly return None.
        if parallel.STATE.rank == 0 or not use_parallel:
            if filename[-4:] == '.csv':
                save_csv(write_dict, filename)
            else:
                numpy.savez(filename, **write_dict)
            misc.upload_to_s3_if_applicable(filename, bucket_name, bucket_key)
            # The write_dict is returned for testing
            # When running in paralled this is a way of getting all
            # of the context info
            return write_dict

    def save_exposure_aggregation(self, filename, use_parallel=True):
        """
        Save the aggregated exposure attributes.
        The file type saved is based on the filename extension.
        Options
           '.npz': Save the arrays into a single file in uncompressed .npz
                   format.

        :param use_parallel: Set to True for parallel behaviour which
        is only node 0 writing to file.
        :param filename: The file to be written.
        :return write_dict: The whole dictionary, returned for testing.
        """
        write_dict = self.exposure_agg.copy()

        s1 = self.prov.entity(
            ":Aggregated HazImp output file", {
                "prov:label": "Aggregated HazImp output file",
                "prov:type": "void:Dataset",
                "prov:atLocation": os.path.basename(filename)
            })
        a1 = self.prov.activity(":SaveAggregatedImpactData",
                                datetime.now().strftime(DATEFMT), None)
        self.prov.wasGeneratedBy(s1, a1)
        self.prov.wasInformedBy(a1, self.prov.activity(":AggregateLoss"))

        # Only rank 0 writes; other ranks implicitly return None.
        if parallel.STATE.rank == 0 or not use_parallel:
            if filename[-4:] == '.csv':
                save_csv_agg(write_dict, filename)
            else:
                numpy.savez(filename, **write_dict)
            # The write_dict is returned for testing
            # When running in paralled this is a way of getting all
            # of the context info
            return write_dict

    def save_aggregation(self,
                         filename,
                         boundaries,
                         impactcode,
                         boundarycode,
                         categories,
                         use_parallel=True):
        """
        Save data aggregated to geospatial regions

        :param str filename: Destination filename
        :param bool use_parallel: True for parallel behaviout, which
                                  is only node 0 writing to file

        """
        LOGGER.info("Saving aggregated data")
        boundaries = misc.download_file_from_s3_if_needed(boundaries)
        [filename, bucket_name, bucket_key] = \
            misc.create_temp_file_path_for_s3(filename)
        write_dict = self.exposure_att.copy()
        dt = datetime.now().strftime(DATEFMT)
        # Provenance: boundary dataset, aggregation activity, output file.
        atts = {
            "prov:type": "void:Dataset",
            "prov:atLocation": os.path.basename(boundaries),
            "prov:generatedAtTime": misc.get_file_mtime(boundaries),
            "void:boundary_code": boundarycode
        }
        bdyent = self.prov.entity(":Aggregation boundaries", atts)
        aggact = self.prov.activity(":AggregationByRegions", dt, None,
                                    {'prov:type': "Spatial aggregation"})
        aggatts = {
            "prov:type": "void:Dataset",
            "prov:atLocation": os.path.basename(filename),
            "prov:generatedAtTime": dt
        }
        aggfileent = self.prov.entity(":AggregationFile", aggatts)
        self.prov.used(aggact, bdyent)
        self.prov.wasInformedBy(aggact, self.provlabel)
        self.prov.wasGeneratedBy(aggfileent, aggact)
        if parallel.STATE.rank == 0 or not use_parallel:
            aggregate.choropleth(write_dict, boundaries, impactcode,
                                 boundarycode, filename, categories)
            misc.upload_to_s3_if_applicable(filename, bucket_name, bucket_key)
            # Shapefiles are multi-file datasets: upload the sidecar files
            # (.dbf, .shx, .prj, .cpg) alongside the .shp.
            if (bucket_name is not None and bucket_key is not None
                    and bucket_key.endswith('.shp')):
                [rootname, ext] = os.path.splitext(filename)
                base_bucket_key = bucket_key[:-len(ext)]
                misc.upload_to_s3_if_applicable(rootname + '.dbf', bucket_name,
                                                base_bucket_key + '.dbf')
                misc.upload_to_s3_if_applicable(rootname + '.shx', bucket_name,
                                                base_bucket_key + '.shx')
                misc.upload_to_s3_if_applicable(rootname + '.prj', bucket_name,
                                                base_bucket_key + '.prj')
                misc.upload_to_s3_if_applicable(rootname + '.cpg', bucket_name,
                                                base_bucket_key + '.cpg', True)
        else:
            pass

    def aggregate_loss(self, groupby=None, kwargs=None):
        """
        Aggregate data by the `groupby` attribute, using the `kwargs` to
        perform any arithmetic aggregation on fields (e.g. summation,
        mean, etc.)

        :param str groupby: A column in the `DataFrame` that corresponds to
        regions by which to aggregate data
        :param dict kwargs: A `dict` with keys of valid column names (from the
        `DataFrame`) and values being lists of aggregation functions to apply
        to the columns.

        For example::

        kwargs = {'REPLACEMENT_VALUE': ['mean', 'sum'],
                'structural_loss_ratio': ['mean', 'std']}


        See
        https://pandas.pydata.org/pandas-docs/stable/user_guide/groupby.html#aggregation
        for more guidance on using aggregation with `DataFrames`

        """
        LOGGER.info(f"Aggregating loss using {groupby} attribute")
        a1 = self.prov.activity(":AggregateLoss",
                                datetime.now().strftime(DATEFMT), None, {
                                    "prov:type": "Aggregation",
                                    "void:aggregator": repr(groupby)
                                })
        self.prov.wasInformedBy(a1, self.provlabel)
        self.exposure_agg = aggregate.aggregate_loss_atts(
            self.exposure_att, groupby, kwargs)

    def categorise(self, bins, labels, field_name):
        """
        Bin values into discrete intervals.

        :param list bins: Monotonically increasing array of bin edges,
                          including the rightmost edge, allowing for
                          non-uniform bin widths.
        :param labels: Specifies the labels for the returned
                       bins. Must be the same length as the resulting bins.
        :param str field_name: Name of the new column in the `exposure_att`
                                `DataFrame`
        """

        # NOTE(review): lct ends up as the loss category type of whichever
        # curve the loop visits last — confirm that one curve set (or a
        # shared lct) is the expected case.
        for intensity_key in self.exposure_vuln_curves:
            vc = self.exposure_vuln_curves[intensity_key]
            lct = vc.loss_category_type
        LOGGER.info(f"Categorising {lct} values into {len(labels)} categories")
        self.exposure_att[field_name] = pd.cut(self.exposure_att[lct],
                                               bins,
                                               right=False,
                                               labels=labels)

    def tabulate(self, file_name, index=None, columns=None, aggfunc=None):
        """
        Reshape data (produce a "pivot" table) based on column values. Uses
        unique values from specified `index` / `columns` to form axes of the
        resulting DataFrame, then writes to an Excel file. This function does
        not support data aggregation - multiple values will result in a
        MultiIndex in the columns.
        See
        https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.pivot_table.html
        for further details.

        Parameters
        ----------
        file_name : destination for the pivot table
        index : column or list of columns
            Keys to group by on the pivot table index.  If an array is passed,
            it is being used as the same manner as column values.
        columns : column, or list of the columns
            Keys to group by on the pivot table column.  If an array is passed,
            it is being used as the same manner as column values.
        aggfunc : function, list of functions, dict, default numpy.mean
            If list of functions passed, the resulting pivot table will have
            hierarchical columns whose top level are the function names
            (inferred from the function objects themselves)
            If dict is passed, the key is column to aggregate and value
            is function or list of functions.
        """
        # Guard clauses: both index and columns must exist in the data.
        if index not in self.exposure_att.columns:
            LOGGER.error(f"Cannot tabulate data using {index} as index")
            LOGGER.error(f"{index} is not an attribute of the exposure data")
            return

        if columns not in self.exposure_att.columns:
            LOGGER.error(
                f"Required attribute(s) {columns} not in the exposure data")
            LOGGER.error(
                "Maybe you need to run a categorise job before this one?")
            return

        dt = datetime.now().strftime(DATEFMT)
        a1 = self.prov.activity(
            ":Tabulate", dt, None, {
                "prov:type": "Tabulation",
                "void:aggregator": repr(index),
                "void:attributes": repr(columns),
                "void:aggregation": repr(aggfunc)
            })
        tblatts = {
            "prov:type": "void:Dataset",
            "prov:atLocation": os.path.basename(file_name),
            "prov:generatedAtTime": dt
        }
        tblfileent = self.prov.entity(":TabulationFile", tblatts)

        self.pivot = self.exposure_att.pivot_table(index=index,
                                                   columns=columns,
                                                   aggfunc=aggfunc,
                                                   fill_value=0)
        try:
            self.pivot.to_excel(file_name)
        except TypeError as te:
            LOGGER.error(te)
            raise
        except KeyError as ke:
            LOGGER.error(ke)
            raise
        except ValueError as ve:
            # Not re-raised: a ValueError leaves the pivot computed but unsaved.
            LOGGER.error(f"Unable to save tabulated data to {file_name}")
            LOGGER.error(ve)
        else:
            # Only record provenance when the file was actually written.
            self.prov.wasGeneratedBy(tblfileent, a1)
            self.prov.wasInformedBy(a1, self.provlabel)
Beispiel #12
0
def get_prov():
    """Build a minimal PROV document with a default namespace and return it.

    Returns
    -------
    ProvDocument
        The constructed document. (Previously the function returned ``None``
        and the ``get_provn()`` serialization was silently discarded.)
    """
    doc = ProvDocument()
    doc.set_default_namespace('http://roocs.org/')
    # Exercise PROV-N serialization as a sanity check of the document.
    doc.get_provn()
    return doc
Beispiel #13
0
"""PROV model fpr GitLab2PROV."""

__author__ = "Claas de Boer, Andreas Schreiber, Lynn von Kurnatowski"
__copyright__ = "Copyright 2020, German Aerospace Center (DLR) and individual contributors"
__license__ = "MIT"
__version__ = "0.5"
__status__ = "Development"

from prov.model import ProvDocument
from prov.constants import PROV_LABEL
from prov.dot import prov_to_dot

add = ProvDocument()
add.set_default_namespace("gitlab2prov:")
add.activity("Commit",
             other_attributes={
                 "prov:type": "commit",
                 "title": "",
                 "message": "",
                 "id": "",
                 "short_id": "",
                 "prov:startedAt": "",
                 "prov:endedAt": ""
             })
add.activity("Parent Commit",
             other_attributes={
                 "prov:type": "commit",
                 "title": "",
                 "message": "",
                 "id": "",
                 "short_id": "",
def prov_default_namespace_example(ns_postfix: str):
    """Return a ProvDocument whose default namespace ends in *ns_postfix*.

    The document contains a single entity identified as ``Entity1``.
    """
    document = ProvDocument()
    namespace_uri = f"https://example.com/{ns_postfix}"
    document.set_default_namespace(namespace_uri)
    document.entity(identifier="Entity1")
    return document
Beispiel #15
0
    def write_targets_prov(self, tlist, C, bundle_id):
        """Record the current target list as a PROV bundle and upload it.

        Parameters
        ----------
        tlist : sequence of rows
            Per-target data, indexed positionally: 0=target id, 1=longitude,
            2=latitude, 4=version, 5=report texts, 7=reporter agent ids.
            (Index 3, the asset type, is deliberately not recorded here.)
        C : 2-D array-like
            Crowd-report matrix; for report r, columns 1 and 2 hold its
            longitude and latitude.
        bundle_id : str
            NOTE(review): this parameter is shadowed near the end of the
            method by a freshly generated timestamp-based id — confirm
            whether the argument is meant to be used at all.
        """
        #Initialisation
#         cs = b.agent('CrowdScanner')
         
        # Lazily create the provstore document the first time we write.
        if self.document_id == -1:
            d = ProvDocument()
            d.add_namespace(AO)
            d.set_default_namespace(self.defaultns % self.game_id)
            if uploadprov:
                provstore_document = self.api.document.create(d, name="Operation%s CrowdScanner" % self.game_id, public=True)
                document_uri = provstore_document.url
                logging.info("prov doc URI: " + str(document_uri))
                self.provfilelist.append(provstore_document.id)
                self.savelocalrecord()
                self.document_id = provstore_document.id
         
        b = ProvDocument()  # Create a new document for this update
        b.add_namespace(AO)
        b.set_default_namespace(self.defaultns % self.game_id)
            
        # cs to be used with all targets
        cs = b.agent('agent/CrowdScanner', (('prov:type', AO['IBCCAlgo']), ('prov:type', PROV['SoftwareAgent'])))
        
        timestamp = time.time()  # Record the timestamp at each update to generate unique identifiers
        startTime = datetime.datetime.fromtimestamp(timestamp)
        endTime = startTime
        activity = b.activity('activity/cs/update_report_%s' % timestamp, startTime, endTime)
        activity.wasAssociatedWith(cs)

        #Add target and report entities
        for i, tdata in enumerate(tlist):
            # Skip targets unchanged since the last update.
            if self.changedtargets[i]==0:
                continue
            
            #Target entity for target i
            tid = int(tdata[0])
            x = tdata[1]
            y = tdata[2]
#             targettype = tdata[3] #don't record here, it will be revealed and recorded by UAVs
            v = int(tdata[4])
            agentids = tdata[7]
            
            targetattributes = {'ao:longitude': x, 'ao:latitude': y, }
            #'ao:asset_type':str(targettype)}
            # Versioned entity: one entity per (target id, version) pair.
            target_v0 = b.entity('cs/target/'+str(tid)+'.'+str(v), targetattributes)
            #Post the root report if this is the first version
            if v==0:
                self.targets[tid] = b.entity('cs/target/'+str(tid))
            else:
                try:
                    # Chain this version to the previously recorded one.
                    target_v0.wasDerivedFrom(self.targetversions[tid])
                except KeyError:
                    logging.error("Got a key error for key " + str(tid) + ', which is supposed to be version' + str(v))
            self.targetversions[tid] = target_v0
            target_v0.specializationOf(self.targets[tid])
            target_v0.wasAttributedTo(cs)
            
            #Report entities for origins of target i
            for j, r in enumerate(self.target_rep_ids[i]):
                # Only create each report entity once; reuse it afterwards.
                if r not in self.postedreports:
                    Crow = C[r,:]
                    x = Crow[1]
                    y = Crow[2]
                    reptext = tdata[5][j].decode('utf8')
                    # Try to replace unusual characters
                    reptext = reptext.encode('ascii', 'replace')
                    agentid = agentids[j]
                    
                    reporter_name = 'agent/crowdreporter%s' % agentid
                    b.agent(reporter_name, (('prov:type', AO['CrowdReporter']), ('prov:type', PROV['Person'])))
                    
                    reportattributes = {'ao:longitude': x, 'ao:latitude': y, 'ao:report': reptext}
                    
                    self.postedreports[r] = b.entity('cs/report/'+str(r), reportattributes)
                    self.postedreports[r].wasAttributedTo(reporter_name)
                activity.used(self.postedreports[r])
                target_v0.wasDerivedFrom(self.postedreports[r])
        
        if uploadprov:
            #Invalidate old targets no longer in use
            for i,tid in enumerate(self.targets_to_invalidate):
                target_v = self.targetversions[tid]
                b.wasInvalidatedBy(target_v, activity)
            #Post the document to the server
            #bundle = b.bundle('crowd_scanner')
            # NOTE(review): shadows the bundle_id parameter — see docstring.
            bundle_id = 'bundle/csupdate/%s' % timestamp
            self.api.add_bundle(self.document_id, b.serialize(), bundle_id)