""" def __init__(self, etk): ETKModule.__init__(self, etk) self.inferlink_extractor = InferlinkExtractor( InferlinkRuleSet( InferlinkRuleSet.load_rules_file( '../html_basic/sample_inferlink_rules.json'))) def process_document(self, doc): """ Add your code for processing the document """ raw = doc.select_segments("$.raw_content")[0] extractions = doc.extract(self.inferlink_extractor, raw) doc.store(extractions, "inferlink_extraction") return list() if __name__ == "__main__": sample_html = json.load(codecs.open('../html_basic/sample_html.json', 'r')) # read sample file from disk etk = ETK(modules=InferlinkETKModule) doc = etk.create_document(sample_html, mime_type="text/html", url="http://ex.com/123") docs = etk.process_ems(doc) print(json.dumps(docs[0].value, indent=2))
}, "matched_sentence": { "type": "string" }, "date": { "type": "string" } } } kg_schema = KGSchema(master_config) etk = ETK(kg_schema, ["./"]) # read the news news_file = open( '/Users/amandeep/Github/etk/examples/ranking_pipeline/resources/new_2018-04-03-first-10000.jl' ) # news_file = open('/Users/amandeep/Github/etk/examples/ranking_pipeline/resources/news_stories_3.jl') news_stories = [ etk.create_document(json.loads(line), url=json.loads(line)['tld'], doc_id=json.loads(line)['doc_id']) for line in news_file ] results = list() for news_story in news_stories: results.extend(etk.process_ems(news_story)) o = open('ifp_news_similarity.jl', 'w') for result in results: o.write(json.dumps(result.value)) o.write('\n')
class TripleGenerator(Generator):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        prop_declaration = kwargs.pop("prop_declaration")
        dest_fp = kwargs.pop("dest_fp")
        truthy = kwargs.pop("truthy")
        use_id = kwargs.pop("use_id")
        prefix_path = kwargs.pop("prefix_path")
        self.datatype_mapping = {
            # nomenclature from https://w.wiki/Tfn
            "item": Item,
            "WikibaseItem": Item,
            "time": TimeValue,
            "Time": TimeValue,
            "globe-coordinate": GlobeCoordinate,
            "GlobeCoordinate": GlobeCoordinate,
            "quantity": QuantityValue,
            "Quantity": QuantityValue,
            "monolingualtext": MonolingualText,
            "Monolingualtext": MonolingualText,
            "string": StringValue,
            "String": StringValue,
            "external-identifier": ExternalIdentifier,
            "ExternalId": ExternalIdentifier,
            "url": StringValue,  # TODO: potentially a bug in rdflib
            "Url": StringValue,
            "property": WDProperty,
            "WikibaseProperty": WDProperty
        }
        self.set_prefix(prefix_path)
        self.prop_declaration = prop_declaration
        self.set_properties(self.prop_file)
        self.fp = dest_fp
        self.truthy = truthy
        self.reset_etk_doc()
        self.serialize_prefix()
        self.use_id = use_id

    def set_prefix(self, prefix_path: str):
        self.prefix_dict = {}
        if prefix_path != "NONE":
            with open(prefix_path, "r") as fp:
                for line_num, edge in enumerate(fp):
                    edge_list = edge.strip("\r\n").split("\t")
                    if line_num == 0:
                        node1_index, node2_index = edge_list.index(
                            "node1"), edge_list.index("node2")
                    else:
                        prefix, expand = edge_list[node1_index], edge_list[
                            node2_index]
                        self.prefix_dict[prefix] = expand

    def read_prop_declaration(self, line_number: int, edge: str):
        node1, node2, prop, e_id = self.parse_edges(edge)
        if prop == "data_type":
            self.prop_types[node1] = self.datatype_mapping[node2.strip()]
        return

    def set_properties(self, prop_file: str):
        self.prop_types = {}
        if prop_file == "NONE":
            return
        with open(prop_file, "r") as fp:
            props = fp.readlines()
        for line in props[1:]:
            node1, _, node2 = line.split("\t")
            try:
                self.prop_types[node1] = self.datatype_mapping[node2.strip()]
            except KeyError:
                raise KGTKException(
                    "DataType {} of node {} is not supported.\n".format(
                        node2, node1))

    def _node_2_entity(self, node: str):
        '''
        A node can be Qxxx or Pxxx; return the proper entity.
        '''
        if node in self.prop_types:
            entity = WDProperty(node, self.prop_types[node])
        else:
            entity = WDItem(TripleGenerator.replace_illegal_string(node))
        return entity

    def reset_etk_doc(self, doc_id: str = "http://isi.edu/default-ns/projects"):
        """
        Reset the doc object and return it. Called at initialization and after
        outputting triples.
        """
        kg_schema = KGSchema()
        kg_schema.add_schema("@prefix : <http://isi.edu/> .", "ttl")
        self.etk = ETK(kg_schema=kg_schema, modules=ETKModule)
        self.doc = self.etk.create_document({}, doc_id=doc_id)
        for k, v in wiki_namespaces.items():
            if k in self.prefix_dict:
                self.doc.kg.bind(k, self.prefix_dict[k])
            else:
                self.doc.kg.bind(k, v)

    def serialize(self):
        """
        Serialize the triples. Uses a hack to avoid serializing the prefix again.
        """
        docs = self.etk.process_ems(self.doc)
        self.fp.write("\n\n".join(
            docs[0].kg.serialize("ttl").split("\n\n")[1:]))
        self.fp.flush()
        self.reset()

    def serialize_prefix(self):
        """
        This function should be called only once after the doc object is
        initialized. In order to serialize the prefix at the very beginning it
        has to be printed, per the change of rdflib 4.2.2 -> 5.0.0.
        Relevant issue: https://github.com/RDFLib/rdflib/issues/965
        """
        for k, v in wiki_namespaces.items():
            if k in self.prefix_dict:
                line = "@prefix " + k + ": <" + self.prefix_dict[k] + "> .\n"
            else:
                line = "@prefix " + k + ": <" + v + "> .\n"
            self.fp.write(line)
        self.fp.write("\n")
        self.fp.flush()
        self.reset()

    def reset(self):
        self.to_append_statement_id = None
        self.to_append_statement = None
        self.read_num_of_lines = 0
        self.reset_etk_doc()

    def generate_label_triple(self, node1: str, node2: str) -> bool:
        entity = self._node_2_entity(node1)
        text_string, lang = TripleGenerator.process_text_string(node2)
        entity.add_label(text_string, lang=lang)
        self.doc.kg.add_subject(entity)
        return True

    def generate_description_triple(self, node1: str, node2: str) -> bool:
        entity = self._node_2_entity(node1)
        text_string, lang = TripleGenerator.process_text_string(node2)
        entity.add_description(text_string, lang=lang)
        self.doc.kg.add_subject(entity)
        return True

    def generate_alias_triple(self, node1: str, node2: str) -> bool:
        entity = self._node_2_entity(node1)
        text_string, lang = TripleGenerator.process_text_string(node2)
        entity.add_alias(text_string, lang=lang)
        self.doc.kg.add_subject(entity)
        return True

    def generate_prop_declaration_triple(self, node1: str, node2: str) -> bool:
        # update the known prop_types
        if node1 in self.prop_types:
            if not self.prop_declaration:
                raise KGTKException(
                    "Duplicated property definition of {} found!".format(
                        node1))
        else:
            self.prop_types[node1] = node2

        prop = WDProperty(node1, self.datatype_mapping[node2])
        self.doc.kg.add_subject(prop)
        return True

    def generate_normal_triple(self, node1: str, property: str, node2: str,
                               is_qualifier_edge: bool, e_id: str) -> bool:
        if self.use_id:
            e_id = TripleGenerator.replace_illegal_string(e_id)
        entity = self._node_2_entity(node1)
        edge_type = self.prop_types[property]
        if edge_type == Item:
            object = WDItem(TripleGenerator.replace_illegal_string(node2))
        elif edge_type == WDProperty:
            object = WDProperty(TripleGenerator.replace_illegal_string(node2),
                                self.prop_types[node2])
        elif edge_type == TimeValue:
            if self.yyyy_mm_dd_pattern.match(node2):
                try:
                    dateTimeString = node2
                    object = TimeValue(
                        value=dateTimeString,  # TODO
                        calendar=Item("Q1985727"),
                        precision=Precision.year,
                        time_zone=0,
                    )
                except:
                    return False
            elif self.yyyy_pattern.match(node2):
                try:
                    dateTimeString = node2 + "-01-01"
                    object = TimeValue(
                        value=dateTimeString,  # TODO
                        calendar=Item("Q1985727"),
                        precision=Precision.year,
                        time_zone=0,
                    )
                except:
                    return False
            else:
                try:
                    # TODO: in the future, the two cases above will be dropped
                    # in principle to comply with the ISO format;
                    # now it is ISO format
                    assert (node2[0] == "^")
                    node2 = node2[1:]  # remove ^
                    if node2.startswith("+"):
                        node2 = node2[1:]
                    dateTimeString, precision = node2.split("/")
                    dateTimeString = dateTimeString[:-1]  # remove Z
                    object = TimeValue(
                        value=dateTimeString,
                        calendar=Item("Q1985727"),
                        precision=precision,
                        time_zone=0,
                    )
                except:
                    return False
        elif edge_type == GlobeCoordinate:
            latitude, longitude = node2[1:].split("/")
            latitude = float(latitude)
            longitude = float(longitude)
            object = GlobeCoordinate(latitude, longitude, 0.0001,
                                     globe=Item("Q2"))  # earth
        elif edge_type == QuantityValue:
            # +70[+60,+80]Q743895
            res = self.quantity_pattern.match(node2).groups()
            amount, lower_bound, upper_bound, unit = res
            amount = TripleGenerator.clean_number_string(amount)
            num_type = self.xsd_number_type(amount)
            lower_bound = TripleGenerator.clean_number_string(lower_bound)
            upper_bound = TripleGenerator.clean_number_string(upper_bound)
            if unit is not None:
                if upper_bound is not None and lower_bound is not None:
                    object = QuantityValue(amount,
                                           unit=Item(unit),
                                           upper_bound=upper_bound,
                                           lower_bound=lower_bound,
                                           type=num_type)
                else:
                    object = QuantityValue(amount,
                                           unit=Item(unit),
                                           type=num_type)
            else:
                if upper_bound is not None and lower_bound is not None:
                    object = QuantityValue(amount,
                                           upper_bound=upper_bound,
                                           lower_bound=lower_bound,
                                           type=num_type)
                else:
                    object = QuantityValue(amount, type=num_type)
        elif edge_type == MonolingualText:
            text_string, lang = TripleGenerator.process_text_string(node2)
            object = MonolingualText(text_string, lang)
        elif edge_type == ExternalIdentifier:
            object = ExternalIdentifier(node2)
        elif edge_type == URLValue:
            if TripleGenerator.is_valid_uri_with_scheme_and_host(node2):
                object = URLValue(node2)
            else:
                return False
        else:
            # treat everything else as StringValue
            object = StringValue(node2)

        if type(object) == WDItem or type(object) == WDProperty:
            self.doc.kg.add_subject(object)

        if is_qualifier_edge:
            # edge: e8 p9 ^2013-01-01T00:00:00Z/11
            # create qualifier edge on previous STATEMENT and return the
            # updated STATEMENT
            self.to_append_statement.add_qualifier(property, object)
            self.doc.kg.add_subject(self.to_append_statement)
        else:
            # edge: q1 p8 q2 e8
            # create brand new property edge and replace STATEMENT
            if self.truthy:
                self.to_append_statement = entity.add_truthy_statement(
                    property, object, statement_id=e_id
                ) if self.use_id else entity.add_truthy_statement(
                    property, object)
            else:
                self.to_append_statement = entity.add_statement(
                    property, object, statement_id=e_id
                ) if self.use_id else entity.add_statement(property, object)
            self.doc.kg.add_subject(entity)
        return True

    def entry_point(self, line_number: int, edge: str):
        # print(line_number, edge)
        """
        Determines the edge type of the given line and whether it is a valid
        property edge or qualifier edge, then calls the corresponding
        downstream generate_* function.
        """
        if line_number == 1:
            # initialize the order_map
            self.initialize_order_map(edge)
            return

        # use the order_map to map the node
        node1, node2, prop, e_id = self.parse_edges(edge)
        if line_number == 2:
            # by default a statement edge
            is_qualifier_edge = False
        else:
            if node1 != self.to_append_statement_id and node1 != self.corrupted_statement_id:
                is_qualifier_edge = False
                # also a new statement edge
                if self.read_num_of_lines >= self.n:
                    self.serialize()
            else:
                # qualifier edge or property declaration edge
                is_qualifier_edge = True
                if node1 == self.corrupted_statement_id:
                    self.warn_log.write(
                        "QUALIFIER edge at line [{}] associated with corrupted statement edge of id [{}] dropped.\n"
                        .format(line_number, self.corrupted_statement_id))
                    return

        if prop in self.label_set:
            success = self.generate_label_triple(node1, node2)
        elif prop in self.description_set:
            success = self.generate_description_triple(node1, node2)
        elif prop in self.alias_set:
            success = self.generate_alias_triple(node1, node2)
        elif prop == "data_type":
            # special edge of prop declaration
            success = self.generate_prop_declaration_triple(node1, node2)
        else:
            if prop in self.prop_types:
                success = self.generate_normal_triple(
                    node1, prop, node2, is_qualifier_edge, e_id)
            else:
                raise KGTKException(
                    "property [{}]'s type is unknown at line [{}].\n".format(
                        prop, line_number))

        if (not success) and self.warning:
            if not is_qualifier_edge:
                self.warn_log.write(
                    "CORRUPTED_STATEMENT edge at line: [{}] with edge id [{}].\n"
                    .format(line_number, e_id))
                self.corrupted_statement_id = e_id
            else:
                self.warn_log.write(
                    "CORRUPTED_QUALIFIER edge at line: [{}] with edge id [{}].\n"
                    .format(line_number, e_id))
        else:
            self.read_num_of_lines += 1
            if not is_qualifier_edge:
                self.to_append_statement_id = e_id

    @staticmethod
    def xsd_number_type(num):
        if isinstance(num, float) and 'e' in str(num).lower():
            return LiteralType.double
        return LiteralType.decimal
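# A minimal driver sketch for TripleGenerator above, kept as comments because
# the remaining constructor keywords (e.g. prop_file, n, warning, log_path) and
# parse_edges()/initialize_order_map() live on the Generator base class and are
# not shown here; treat the exact argument list as an assumption:
#
#   with open("wikidata_edges.tsv") as edges, open("out.ttl", "w") as dest:
#       gen = TripleGenerator(prop_declaration=False, dest_fp=dest,
#                             truthy=False, use_id=True, prefix_path="NONE", ...)
#       for line_number, edge in enumerate(edges, start=1):
#           gen.entry_point(line_number, edge)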
        if extractions:
            path = '$."' + extractions[0].value + '"[?(@.country == "Italy")]'
            jsonpath_expr = jex.parse(path)
            city_match = jsonpath_expr.find(self.city_dataset)
            if city_match:
                # add corresponding values of city_dataset into knowledge graph of the doc
                for field in city_match[0].value:
                    doc.kg.add_value(field, value=city_match[0].value[field])
            new_docs.append(doc)
        return new_docs

    def document_selector(self, doc) -> bool:
        return doc.cdr_document.get("dataset") == "italy_team"


if __name__ == "__main__":
    # url = 'https://en.wikipedia.org/wiki/List_of_football_clubs_in_Italy'
    cdr = json.load(
        open('./resources/italy_teams.json', mode='r', encoding='utf-8'))
    kg_schema = KGSchema(json.load(open('./resources/master_config.json')))

    etk = ETK(modules=ItalyTeamsModule, kg_schema=kg_schema)
    etk.parser = jex.parse

    cdr_doc = Document(etk, cdr_document=cdr, mime_type='json', url=cdr['url'])

    results = etk.process_ems(cdr_doc)[1:]
    print('Total docs:', len(results))
    print("Sample result:\n")
    print(json.dumps(results[0].value, indent=2))
parser.add_option("-o", "--output_file", action="store", type="string", dest="output_file") (c_options, args) = parser.parse_args() input_file = c_options.input_file output_file = c_options.output_file f = open(input_file, mode='r', encoding='utf-8') o = open(output_file, mode='w', encoding='utf-8') l = open('{}.log'.format(output_file), mode='w', encoding='utf-8') print('Starting to process file: {}'.format(input_file)) count = 0 sum = 0 for line in f: if count == 10000: sum += count l.write('Processed {} lines'.format(str(sum))) l.write('\n') count = 0 json_x = json.loads(line) doc = etk.create_document(json_x) doc.doc_id = json_x['doc_id'] sentences = etk.process_ems(doc) for s in sentences: o.write(json.dumps(s.value)) o.write('\n') count += 1
class ETKWorker(object):
    def __init__(self, master_config, em_paths, logger, worker_id,
                 project_name, kafka_input_args=None, kafka_output_args=None):
        self.logger = logger
        self.worker_id = worker_id
        self.check_interval = 1000
        self.exit_sign = False

        try:
            kg_schema = KGSchema(master_config)
            self.etk_ins = ETK(kg_schema, em_paths, logger=logger)
        except Exception as e:
            logger.exception('ETK initialization failed')
            raise e

        # kafka input (`config` is the module-level configuration dict,
        # defined elsewhere in the original file)
        self.kafka_input_server = config['input_server']
        self.kafka_input_session_timeout = config['input_session_timeout']
        self.kafka_input_group_id = config['input_group_id']
        self.kafka_input_topic = '{project_name}_in'.format(project_name=project_name)
        self.kafka_input_args = dict() if kafka_input_args is None else kafka_input_args
        self.kafka_consumer = KafkaConsumer(
            bootstrap_servers=self.kafka_input_server,
            group_id=self.kafka_input_group_id,
            consumer_timeout_ms=self.check_interval,
            value_deserializer=lambda v: json.loads(v.decode('utf-8')),
            **self.kafka_input_args
        )
        self.kafka_consumer.subscribe([self.kafka_input_topic])

        # kafka output
        self.kafka_output_server = config['output_server']
        self.kafka_output_topic = '{project_name}_out'.format(project_name=project_name)
        self.kafka_output_args = dict() if kafka_output_args is None else kafka_output_args
        self.kafka_producer = KafkaProducer(
            bootstrap_servers=self.kafka_output_server,
            value_serializer=lambda v: json.dumps(v).encode('utf-8'),
            **self.kafka_output_args
        )

        self.timeout_count = self.kafka_input_session_timeout / self.check_interval
        self.current_timeout_count = 0

    def process(self):
        # prev_doc_sent_time = None

        while not self.exit_sign:
            # high level api handles batching
            # will exit once timeout
            try:
                for msg in self.kafka_consumer:
                    # force to commit, block till getting response
                    self.kafka_consumer.commit()
                    # get message, clear timeout count
                    self.current_timeout_count = 0

                    cdr = msg.value
                    # TODO better way to add execution profile
                    # cdr['@execution_profile'] = {'@worker_id': self.worker_id}
                    # doc_arrived_time = time.time()
                    # cdr['@execution_profile']['@doc_arrived_time'] = \
                    #     datetime.utcfromtimestamp(doc_arrived_time).isoformat()
                    # cdr['@execution_profile']['@doc_wait_time'] = \
                    #     0.0 if not prev_doc_sent_time \
                    #     else float(doc_arrived_time - prev_doc_sent_time)
                    # cdr['@execution_profile']['@doc_length'] = len(json.dumps(cdr))

                    if 'doc_id' not in cdr or len(cdr['doc_id']) == 0:
                        self.logger.error('invalid cdr: unknown doc_id')
                        continue

                    self.logger.info('processing %s' % cdr['doc_id'])
                    try:
                        # start_run_core_time = time.time()
                        # run etk module
                        doc = self.etk_ins.create_document(
                            cdr, url=cdr['url'], doc_id=cdr['doc_id'])
                        # process_ems returns a list of Documents
                        results = self.etk_ins.process_ems(doc)
                        for result in results:
                            cdr_result = result.cdr_document

                            # indexing
                            # TODO
                            indexed_cdr = index_knowledge_graph_fields(cdr_result)
                            if not indexed_cdr:
                                self.logger.error('indexing in sandpaper failed')
                                continue
                            # cdr = indexed_cdr

                            # cdr['@execution_profile']['@run_core_time'] = \
                            #     float(time.time() - start_run_core_time)
                            # doc_sent_time = time.time()
                            # cdr['@execution_profile']['@doc_sent_time'] = \
                            #     datetime.utcfromtimestamp(doc_sent_time).isoformat()
                            # prev_doc_sent_time = doc_sent_time
                            # cdr['@execution_profile']['@doc_processed_time'] = \
                            #     float(doc_sent_time - doc_arrived_time)

                            # output result
                            r = self.kafka_producer.send(self.kafka_output_topic,
                                                         indexed_cdr)
                            r.get(timeout=60)  # wait till sent
                            self.logger.info('{} done'.format(indexed_cdr['doc_id']))
                    except Exception as e:
                        self.logger.exception('failed at %s' % cdr['doc_id'])

            except ValueError as e:
                # I/O operation on closed epoll fd
                self.logger.info('consumer closed')
                self.exit_sign = True
            except StopIteration as e:
                # timeout
                self.current_timeout_count += 1
                if self.current_timeout_count >= self.timeout_count:
                    self.exit_sign = True
            except CommitFailedError as e:
                self.exit_sign = True
                # https://github.com/dpkp/kafka-python/blob/535d8f6a85969c4e07de0bc81e14513c677995be/kafka/errors.py#L65
                # if this worker is dead, restart and reattach to the group
                g_restart_worker = True

    def __del__(self):
        self.logger.info('ETK worker {} is exiting...'.format(self.worker_id))
        try:
            self.kafka_consumer.close()
        except:
            pass
        try:
            self.kafka_producer.close()
        except:
            pass
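# Hypothetical wiring for the worker above (values are placeholders; the
# module-level `config` dict with the kafka settings must already be loaded):
#
#   logger = logging.getLogger('etk_worker')
#   worker = ETKWorker(master_config, em_paths=['./ems'], logger=logger,
#                      worker_id=0, project_name='my_project')
#   worker.process()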
        # Record the country of this actor
        doc.kg.add_value("country", json_path="$.Side")

        # Add a title to the actor document
        doc.kg.add_value("title", json_path="$.Side")

        # Return an empty list because we didn't create new documents
        return list()


# The main is for testing, and is not used in the DIG pipeline
if __name__ == "__main__":
    # Tell ETK the schema of the fields in the KG; the DIG master_config can
    # be used as the schema.
    kg_schema = KGSchema(json.load(open('master_config.json')))

    # Instantiate ETK, with the two processing modules and the schema.
    etk = ETK(modules=[UCDPModule, UCDPActorModule], kg_schema=kg_schema)

    # Create a CSV processor to create documents for the relevant rows in the
    # Excel sheet
    cp = CsvProcessor(etk=etk, heading_row=1)

    with open("ucdp.jl", "w") as f:
        # Iterate over all the rows in the spreadsheet
        for doc in cp.tabular_extractor(filename="ucdp_sample.xls",
                                        dataset='ucdp'):
            # Each row produces a document, which we send to ETK.
            # Note that each invocation of process_ems will also process any
            # new documents created while processing each doc.
            for result in etk.process_ems(doc):
                print(result.cdr_document["knowledge_graph"])
                f.write(json.dumps(result.cdr_document) + "\n")
        # for segment in doc.select_segments(jsonpath='$.notes'):
        #     doc.kg.add_value("description", segment.value)
        doc.kg.add_value("description", json_path='$.notes')

    def document_selector(self, doc) -> bool:
        """
        Boolean function for selecting document

        Args:
            doc: Document

        Returns:
            bool: True if the document should be processed by this module
        """
        return DefaultDocumentSelector().select_document(doc)


if __name__ == "__main__":
    kg_schema = KGSchema(json.load(open('master_config.json')))
    etk = ETK(modules=AcledModule, kg_schema=kg_schema)
    cp = CsvProcessor(etk=etk, heading_row=1)

    data_set = 'test_data_set_csv'

    docs = cp.tabular_extractor(filename="acled_raw_data.csv",
                                dataset='acled',
                                doc_id_field="data_id")
    results = etk.process_ems(docs[0])
    print(json.dumps(results[0].value, indent=2))
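    # The call above runs only the first extracted row through ETK; a hedged
    # variant that processes every row (same names as above):
    all_results = []
    for d in docs:
        all_results.extend(etk.process_ems(d))
    print('Processed {} documents'.format(len(all_results)))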
"version 2 of etk, implemented by Runqi, Dongyu, Sylvia, Amandeep, Anika and others." }, { "name": "rltk", "description": "record linkage toolkit, implemented by Pedro, Mayank, Yixiang and several students." }] } #provenance use example case for fetching origins kg_schema = KGSchema(json.load(open("master_config.json", "r"))) etk = ETK(kg_schema=kg_schema, modules=ProvenanceOriginExtractionETKModule) doc = etk.create_document(sample_input) doc_ = etk.process_ems(doc) print(json.dumps(doc.kg.value, indent=2)) provenanceAPI = ProvenanceAPI(doc) #print(json.dumps(doc.value, indent=2)) #Use case/Example 1 origins = provenanceAPI.get_origins("developer", 1) print("Use case/Example 1: ") for origin in origins: print("start char: " + str(origin.start_char)) print("end char: " + str(origin.end_char)) print("jsonPath char: " + str(origin.json_path)) #Use case/Example 2 origins = provenanceAPI.get_origins("developer")
class TripleGenerator:
    """
    A class to maintain the status of the generator
    """

    def __init__(
        self,
        propFile: str,
        labelSet: str,
        aliasSet: str,
        descriptionSet: str,
        n: int,
        destFp: TextIO = sys.stdout,
    ):
        self.propTypes = self.__setPropTypes(propFile)
        self.labelSet, self.aliasSet, self.descriptionSet = self.__setSets(
            labelSet, aliasSet, descriptionSet)
        # TODO handle standard output
        self.fp = destFp
        self.n = int(n)
        self.read = 0
        # serialize prefix
        kg_schema = KGSchema()
        kg_schema.add_schema("@prefix : <http://isi.edu/> .", "ttl")
        self.etk = ETK(kg_schema=kg_schema, modules=ETKModule)
        self.doc = self.__setDoc()
        self.__serialize_prefix()

    def __setPropTypes(self, propFile: str):
        dataTypeMappings = {
            "item": Item,
            "time": TimeValue,
            "globe-coordinate": GlobeCoordinate,
            "quantity": QuantityValue,
            "monolingualtext": MonolingualText,
            "string": StringValue,
        }
        with open(propFile, "r") as fp:
            props = fp.readlines()
        __propTypes = {}
        for line in props[1:]:
            node1, _, node2 = line.split("\t")
            try:
                __propTypes[node1] = dataTypeMappings[node2.strip()]
            except KeyError:
                raise KGTKException(
                    "DataType {} of node {} is not supported.\n".format(
                        node2, node1))
        return __propTypes

    def __setSets(self, labelSet: str, aliasSet: str, descriptionSet: str):
        return (
            set(labelSet.split(",")),
            set(aliasSet.split(",")),
            set(descriptionSet.split(",")),
        )

    def __setDoc(self, doc_id: str = "http://isi.edu/default-ns/projects"):
        """
        Reset the doc object and return it. Called at initialization and after
        outputting triples.
        """
        doc = self.etk.create_document({}, doc_id=doc_id)
        # bind prefixes
        doc.kg.bind("wikibase", "http://wikiba.se/ontology#")
        doc.kg.bind("wd", "http://www.wikidata.org/entity/")
        doc.kg.bind("wdt", "http://www.wikidata.org/prop/direct/")
        doc.kg.bind("wdtn", "http://www.wikidata.org/prop/direct-normalized/")
        doc.kg.bind("wdno", "http://www.wikidata.org/prop/novalue/")
        doc.kg.bind("wds", "http://www.wikidata.org/entity/statement/")
        doc.kg.bind("wdv", "http://www.wikidata.org/value/")
        doc.kg.bind("wdref", "http://www.wikidata.org/reference/")
        doc.kg.bind("p", "http://www.wikidata.org/prop/")
        doc.kg.bind("pr", "http://www.wikidata.org/prop/reference/")
        doc.kg.bind("prv", "http://www.wikidata.org/prop/reference/value/")
        doc.kg.bind("prn", "http://www.wikidata.org/prop/reference/value-normalized/")
        doc.kg.bind("ps", "http://www.wikidata.org/prop/statement/")
        doc.kg.bind("psv", "http://www.wikidata.org/prop/statement/value/")
        doc.kg.bind("psn", "http://www.wikidata.org/prop/statement/value-normalized/")
        doc.kg.bind("pq", "http://www.wikidata.org/prop/qualifier/")
        doc.kg.bind("pqv", "http://www.wikidata.org/prop/qualifier/value/")
        doc.kg.bind("pqn", "http://www.wikidata.org/prop/qualifier/value-normalized/")
        doc.kg.bind("skos", "http://www.w3.org/2004/02/skos/core#")
        doc.kg.bind("prov", "http://www.w3.org/ns/prov#")
        doc.kg.bind("schema", "http://schema.org/")
        return doc

    def genLabelTriple(self, node1: str, label: str, node2: str) -> bool:
        if node1 in self.propTypes:
            entity = WDProperty(node1.upper(), self.propTypes[node1])
        else:
            entity = WDItem(node1.upper())
        if "@" in node2:
            node2, lang = node2.split("@")
            entity.add_label(node2.replace('"', "").replace("'", ""),
                             lang=lang)
        else:
            entity.add_label(node2.replace('"', "").replace("'", ""),
                             lang="en")  # default
        self.doc.kg.add_subject(entity)
        return True

    def genDescriptionTriple(self, node1: str, label: str, node2: str) -> bool:
        if node1 in self.propTypes:
            entity = WDProperty(node1.upper(), self.propTypes[node1])
        else:
            entity = WDItem(node1.upper())
        if "@" in node2:
            node2, lang = node2.split("@")
            entity.add_description(node2.replace('"', "").replace("'", ""),
                                   lang=lang)
        else:
            entity.add_description(node2.replace('"', "").replace("'", ""),
                                   lang="en")  # default
        self.doc.kg.add_subject(entity)
        return True

    def genAliasTriple(self, node1: str, label: str, node2: str) -> bool:
        if node1 in self.propTypes:
            entity = WDProperty(node1.upper(), self.propTypes[node1])
        else:
            entity = WDItem(node1.upper())
        if "@" in node2:
            node2, lang = node2.split("@")
            entity.add_alias(node2.replace('"', "").replace("'", ""),
                             lang=lang)
        else:
            entity.add_alias(node2.replace('"', "").replace("'", ""),
                             lang="en")  # default
        self.doc.kg.add_subject(entity)
        return True

    def genPropDeclarationTriple(self, node1: str, label: str, node2: str) -> bool:
        prop = WDProperty(node1.upper(), self.propTypes[node1])
        self.doc.kg.add_subject(prop)
        return True

    def genNormalTriple(self, node1: str, label: str, node2: str,
                        isPropEdge: bool) -> bool:
        """
        The normal triple's type is determined by
        1. label's datatype in prop_types.tsv
        2. kgtk format convention of node2 field
        Updates self.STATEMENT.
        """
        # determine the node type [property|item]
        if node1 in self.propTypes:
            entity = WDProperty(node1.upper(), self.propTypes[node1])
        else:
            entity = WDItem(node1.upper())
        # determine the edge type
        edgeType = self.propTypes[label]
        if edgeType == Item:
            OBJECT = Item(node2.upper())
        elif edgeType == TimeValue:
            # https://www.wikidata.org/wiki/Help:Dates
            # ^2013-01-01T00:00:00Z/11
            dateTimeString, precision = node2[1:].split("/")
            dateString, timeString = dateTimeString.split("T")
            OBJECT = TimeValue(
                value=dateString,
                calendar=Item("Q1985727"),
                precision=precision,
                time_zone=0,
            )
        elif edgeType == GlobeCoordinate:
            latitude, longitude = node2[1:].split("/")
            OBJECT = GlobeCoordinate(latitude, longitude, 0.0001,
                                     globe=StringValue("Earth"))
        elif edgeType == QuantityValue:
            amount, unit = (re.compile(r"([\+|\-]?[0-9]+\.?[0-9]*)U([0-9]+)")
                            .match(node2).groups())
            OBJECT = QuantityValue(amount=float(amount), unit=Item(unit))
        elif edgeType == MonolingualText:
            try:
                textString, lang = node2.split("@")
                OBJECT = MonolingualText(textString, lang)
            except ValueError:
                # fall back to English; `node2` is used here because
                # `textString` is unbound when the unpacking above fails
                OBJECT = MonolingualText(node2, "en")
        else:
            # treat everything else as StringValue
            OBJECT = StringValue(node2)
        if isPropEdge:
            # edge: q1 p8 q2 e8
            # create brand new property edge and replace STATEMENT
            self.STATEMENT = entity.add_statement(label.upper(), OBJECT)
        else:
            # edge: e8 p9 ^2013-01-01T00:00:00Z/11
            # create qualifier edge on previous STATEMENT and return the
            # updated STATEMENT
            self.STATEMENT.add_qualifier(label.upper(), OBJECT)
        self.doc.kg.add_subject(self.STATEMENT)
        return True

    def entryPoint(self, edge: str):
        """
        edge: "p8\tp1\t'hasFather'@en\te5\n", a line in the edges.tsv file.
        Determines the edge type and whether this is a valid property edge or
        qualifier edge, then calls the corresponding downstream functions.
        """
        edgeList = edge.strip().split("\t")
        l = len(edgeList)
        if l == 4:
            # property statement edge
            # try to serialize when a new property statement is encountered.
            if self.read >= self.n:
                self.serialize()
            isPropEdge = True
            [node1, label, node2, eID] = edgeList
            node1, label, node2, eID = (
                node1.strip(),
                label.strip(),
                node2.strip(),
                eID.strip(),
            )
            if eID == self.ID:
                raise KGTKException(
                    "id {} of edge {} duplicates latest property statement id {}.\n"
                    .format(eID, edge, self.ID))
            else:
                self.ID = eID
        elif l == 3:
            # qualifier edge or property declaration edge
            isPropEdge = False
            [node1, label, node2] = edgeList
            node1, label, node2 = node1.strip(), label.strip(), node2.strip()
            if label != "type" and node1 != self.ID:
                # 1. not a property declaration edge and
                # 2. the current qualifier's node1 is not the latest property
                #    edge id; throw errors.
                raise KGTKException(
                    "Node1 {} of qualifier edge {} doesn't agree with latest property edge id {}.\n"
                    .format(node1, edge, self.ID))
        else:
            raise KGTKException(
                "Length {} of edge {} is not valid.\n".format(l, edge))

        if label in self.labelSet:
            self.read += self.genLabelTriple(node1, label, node2)
        elif label in self.descriptionSet:
            self.read += self.genDescriptionTriple(node1, label, node2)
        elif label in self.aliasSet:
            self.read += self.genAliasTriple(node1, label, node2)
        elif label == "type":
            # special edge of prop declaration
            self.read += self.genPropDeclarationTriple(node1, label, node2)
        else:
            if label in self.propTypes:
                self.read += self.genNormalTriple(node1, label, node2,
                                                  isPropEdge)
            else:
                raise KGTKException(
                    "property {}'s type is unknown as in edge {}.\n".format(
                        label, edge))

    def serialize(self):
        """
        Serialize the triples. Uses a hack to avoid serializing the prefix again.
        """
        docs = self.etk.process_ems(self.doc)
        self.fp.write("\n\n".join(
            docs[0].kg.serialize("ttl").split("\n\n")[1:]))
        self.__reset()

    def __serialize_prefix(self):
        """
        This function should be called only once after the doc object is
        initialized.
        """
        docs = self.etk.process_ems(self.doc)
        self.fp.write(docs[0].kg.serialize("ttl").split("\n\n")[0] + "\n\n")
        self.__reset()

    def __reset(self):
        self.ID = None
        self.STATEMENT = None
        self.read = 0
        self.doc = self.__setDoc()

    def finalize(self):
        return
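# A minimal usage sketch for the legacy generator above. File names are
# placeholders; the edges file is assumed to carry a KGTK header row, skipped
# here because entryPoint treats every line as an edge.
if __name__ == "__main__":
    gen = TripleGenerator(propFile="prop_types.tsv", labelSet="label",
                          aliasSet="alias", descriptionSet="description",
                          n=1000)
    with open("edges.tsv") as fp:
        for edge in list(fp)[1:]:  # skip the header row
            gen.entryPoint(edge)
    gen.serialize()  # flush whatever remains in the current document
    gen.finalize()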