Example #1
def create_query_store(store):
    new_store = SPARQLStore(endpoint=store.endpoint,
                            default_query_method=POST,
                            returnFormat=JSON,
                            node_to_sparql=node_to_sparql)
    new_store._defaultReturnFormat = JSON
    new_store.setReturnFormat(JSON)
    return new_store
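
Example #1 relies on names defined elsewhere in its module. A hedged sketch of what those imports and the node_to_sparql helper might look like (the POST/JSON constants are assumed to come from SPARQLWrapper, and the helper below is only an illustration, not the author's code):

from rdflib import URIRef
from rdflib.plugins.stores.sparqlstore import SPARQLStore
from SPARQLWrapper import POST, JSON  # assumed source of the POST/JSON constants

def node_to_sparql(node):
    # hypothetical serializer handed to SPARQLStore: render URIRefs as <...> terms
    if isinstance(node, URIRef):
        return "<%s>" % node
    raise ValueError("this sketch only serializes URIRefs")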
Example #2
def query(form_params):
    namespaces = get_namespaces()
    sparql_store = SPARQLStore("https://yago-knowledge.org/sparql/query")
    query_string = prepare_query(form_params)
    result = sparql_store.query(query_string, initNs=namespaces)
    # for row in list(result):
    #     print(row)
    return result
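
For reference, a minimal self-contained sketch of the same pattern, SPARQLStore.query with an initNs prefix mapping, pointed at the public DBpedia endpoint (the endpoint and namespaces are illustrative, not part of the original example):

from rdflib.namespace import FOAF, RDF
from rdflib.plugins.stores.sparqlstore import SPARQLStore

store = SPARQLStore("https://dbpedia.org/sparql")
# initNs supplies the prefix bindings used inside the query body
result = store.query(
    "SELECT ?person WHERE { ?person rdf:type foaf:Person } LIMIT 5",
    initNs={"rdf": RDF, "foaf": FOAF},
)
for row in result:
    print(row.person)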
Example #3
    def _open_sparql_store(self, endpoint: str = config.FUSEKI_ENDPOINT):
        """
        Open RDFlib SPARQL store with query URL at `endpoint`.

        Args:
            endpoint (str, optional): Defaults to config.FUSEKI_ENDPOINT.
        """

        self.sparql_store = SPARQLStore(endpoint)
        self.sparql_store.open(endpoint)
Example #4
 def __init__(self, *, server_base_url=None):
     """
     Construct a TWKS client.
     :param server_base_url: base URL of the server, excluding the path, e.g., http://localhost:8080
     """
     if not server_base_url:
         server_base_url = "http://localhost:8080"
     self.__server_base_url = server_base_url
     assertions_sparql_query_endpoint = server_base_url + "/sparql/assertions"
     self.assertions_sparql_store = SPARQLStore(endpoint=assertions_sparql_query_endpoint, )
     # query_endpoint=assertions_sparql_query_endpoint)
     nanopublications_sparql_query_endpoint = server_base_url + "/sparql/nanopublications"
     self.nanopublications_sparql_store = SPARQLStore(endpoint=nanopublications_sparql_query_endpoint, )
Example #5
 def sparql(self, query, sites=None):
     if sites is None:
         res = self._sparql_endpoint.query(query)
         return pd.DataFrame.from_records(
             list(res), columns=[str(c) for c in res.vars])
     dfs = []
     for site in sites:
         ep = SPARQLStore(f"{self._endpoint}/sparql?site={site}")
         res = ep.query(query)
         df = pd.DataFrame.from_records(list(res),
                                        columns=[str(c) for c in res.vars])
         df["site"] = site
         dfs.append(df)
     if len(dfs) == 0:
         return pd.DataFrame()
     return functools.reduce(lambda x, y: pd.concat([x, y], axis=0), dfs)
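
A condensed, self-contained sketch of the DataFrame construction used above, run against a single endpoint (the DBpedia URL stands in for self._endpoint):

import pandas as pd
from rdflib.plugins.stores.sparqlstore import SPARQLStore

ep = SPARQLStore("https://dbpedia.org/sparql")
res = ep.query("SELECT ?s ?p ?o WHERE { ?s ?p ?o } LIMIT 10")
# res.vars lists the SELECT variables; each result row is a tuple in the same order
df = pd.DataFrame.from_records(list(res), columns=[str(c) for c in res.vars])
print(df.head())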
Example #6
def main():

    store = SPARQLStore(SPARQL_ENDPOINT)
    g = ConjunctiveGraph(store=store)
    # g.bind("sg", "http://www.springernature.com/scigraph/ontologies/core/")

    # get a few articles
    q1 = g.query(ALL_ARTICLES_IDS_SAMPLE)
    for row in q1:
        print("Article URI:", str(row[0]))

    # extract more article info
    for row in q1:
        try:
            with time_limit(MAX_TIMEOUT):
                raw = g.query(ARTICLE_INFO_QUERY % str(row[0]))
                g1 = ConjunctiveGraph()
                g1.parse(data=raw.serialize())

                # create JSON-LD
                context = {
                    "@vocab": "http://elastic-index.scigraph.com/",
                    "@language": "en"
                }
                print(g1.serialize(format='json-ld', context=context,
                                   indent=4))
                print("======")
        except TimeoutException:
            error = "Timed out!"
            print(error)
        except Exception as e:
            error = "Exception: %s" % e
            print(error)
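
The time_limit context manager and TimeoutException used above are not shown in the snippet. A common signal-based implementation looks roughly like the following (an assumption, not necessarily the author's version; SIGALRM only works on Unix):

import signal
from contextlib import contextmanager

class TimeoutException(Exception):
    pass

@contextmanager
def time_limit(seconds):
    # raise TimeoutException if the wrapped block runs longer than `seconds`
    def _handler(signum, frame):
        raise TimeoutException("Timed out!")
    signal.signal(signal.SIGALRM, _handler)
    signal.alarm(seconds)
    try:
        yield
    finally:
        signal.alarm(0)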
Example #7
 def set_graph(self):
     store = SPARQLStore(endpoint=self.__class__.SPARQL_QUERY_URI.format(
         self.sparql_hostname))
     self.sparql_graph = Graph(
         store=store,
         identifier=self.__class__.SPARQL_GRAPH_URI.format(
             self.sparql_hostname))
Example #8
def show_entity_cluster_list(type_, repo):
    graph_uri = request.args.get('g', default=None)
    limit = request.args.get('limit', default=100, type=int)
    offset = request.args.get('offset', default=0, type=int)
    sortby = request.args.get('sortby', default='size')
    sparql = SPARQLStore(setting.endpoint + '/' + repo)
    model = Model(sparql, repo, graph_uri)
    if type_ == 'entity':
        return render_template('list.html',
                               url_prefix=url_prefix,
                               type_='entity',
                               repo=repo,
                               graph=graph_uri,
                               limit=limit,
                               offset=offset,
                               sortby=sortby,
                               clusters=model.get_cluster_list(
                                   types.Entity, limit, offset, sortby))
    elif type_ == 'event':
        return render_template('list.html',
                               url_prefix=url_prefix,
                               type_='event',
                               repo=repo,
                               graph=graph_uri,
                               limit=limit,
                               offset=offset,
                               sortby=sortby,
                               clusters=model.get_cluster_list(
                                   types.Events, limit, offset, sortby))
    else:
        abort(404)
Example #9
    def test_counting_graph_and_store_queries(self):
        query = """
            SELECT ?s
            WHERE {
                ?s ?p ?o .
            }
            LIMIT 5
            """
        g = Graph("SPARQLStore")
        g.open(self.path)
        count = 0
        result = helper.query_with_retry(g, query)
        for _ in result:
            count += 1

        assert count == 5, "Graph(\"SPARQLStore\") didn't return 5 records"

        from rdflib.plugins.stores.sparqlstore import SPARQLStore
        st = SPARQLStore(query_endpoint=self.path)
        count = 0
        result = helper.query_with_retry(st, query)
        for _ in result:
            count += 1

        assert count == 5, "SPARQLStore() didn't return 5 records"
Example #10
def show_columbia_cluster(repo, uri):
    graph_uri = request.args.get('g', default=None)
    uri = 'http://www.columbia.edu/AIDA/' + uri
    show_image = request.args.get('image', default=True)
    show_limit = request.args.get('limit', default=100)
    sparql = SPARQLStore(setting.endpoint + '/' + repo)
    model = Model(sparql, repo, graph_uri)
    return show_cluster(model, uri, show_image, show_limit)
Example #11
def show_event_cluster(repo, uri):
    uri = 'http://www.isi.edu/gaia/events/' + uri
    graph_uri = request.args.get('g', default=None)
    show_image = request.args.get('image', default=True)
    show_limit = request.args.get('limit', default=100)
    sparql = SPARQLStore(setting.endpoint + '/' + repo)
    model = Model(sparql, repo, graph_uri)
    return show_cluster(model, uri, show_image, show_limit)
Example #12
    def _graph(self):
        """
        Get the graph, creating a new one if necessary.

        """
        if self.__graph is None:
            store = SPARQLStore(endpoint='http://%s/sparql' %
                                (SPARQL_HOST_NAME))
            self.__graph = ConjunctiveGraph(store=store)
        return self.__graph
Example #13
def hello_world(repo):
    graph_uri = request.args.get('g', '')
    sparql = SPARQLStore(setting.endpoint + '/' + repo)
    model = Model(sparql, repo, graph_uri)
    return render_template('clusters.html',
                           url_prefix=url_prefix,
                           repo=repo,
                           graph=graph_uri,
                           entities=model.get_cluster_list(types.Entity),
                           events=model.get_cluster_list(types.Events),
                           relations=model.get_cluster_list(types.Relation))
Example #14
def test_20000_tsv():

    store = SPARQLStore(returnFormat='tsv')
    g = rdflib.Graph(store, identifier='http://nlg.orbit.ai/graphs/DataGraph')
    g.open('http://localhost:3030/ntb/query')

    start = time.time()

    res = list(g.query('SELECT ?s ?p ?o ?x WHERE { ?s ?p ?o } LIMIT 20000'))

    print('tsv', time.time() - start)
Example #15
def show_entity_gt(repo):
    uri = request.args.get('e', default=None)
    graph_uri = request.args.get('g', default=None)
    sparql = SPARQLStore(setting.endpoint + '/' + repo)
    model = Model(sparql, repo, graph_uri)
    cluster = model.get_cluster(uri)
    return render_template('groundtruth.html',
                           url_prefix=url_prefix,
                           repo=repo,
                           graph=graph_uri,
                           cluster=cluster)
Example #16
    def __init__(self,*args,**kw):
        super(TripleStoreResource,self).__init__(*args,**kw)
        self.attr(kw,'defns',default=None)
        self.attr(kw,'endpt',required=True)
        self.attr(kw,'verbose',default=False)

        self.attr(kw,'cachefile',default=False)
        self.attr(kw,'usecache',default=False)
        self.attr(kw,'cachemode',default='r')
        self.attr(kw,'prefetch',default=False)
        if not self._prefetch:
            self._usecache = False

        if self._defns:
            self._ns = rdflib.Namespace(self._defns)

        # from rdflib.query import ResultParser
        # from rdflib.plugin import register
        # register( 'text/plain', ResultParser, 'rdflib.plugins.sparql.results.xmlresults', 'XMLResultParser')
        # register( 'application/sparql-results+xml', ResultParser, 'rdflib.plugins.sparql.results.xmlresults', 'XMLResultParser')

        from rdflib.plugins.stores.sparqlstore import SPARQLStore
        store = SPARQLStore(self._endpt)
        store.method = 'POST'
        self._ts = rdflib.ConjunctiveGraph(store=store)

        self._cache = None

        if self._prefetch:
            self._cache = {}

        if self._usecache:
            self.opencache()

        if self._verbose:
            print("TripleStoreResource:endpt = %s"%(self._endpt), file=sys.stderr)
            print("TripleStoreResource:prefetch = %s"%(self._prefetch), file=sys.stderr)
            print("TripleStoreResource:usecache = %s"%(self._usecache), file=sys.stderr)
            print("TripleStoreResource:cachemode = %s"%(self._cachemode), file=sys.stderr)
Example #17
    def test_counting_graph_and_store_queries(self):
        q = """
            SELECT ?s
            WHERE {
                ?s ?p ?o .
            }
            LIMIT 5
            """
        g = Graph("SPARQLStore")
        g.open(self.path)
        c = 0
        for r in g.query(q):
            c += 1

        assert c == 5, "Graph(\"SPARQLStore\") didn't return 5 records"

        from rdflib.plugins.stores.sparqlstore import SPARQLStore
        st = SPARQLStore(query_endpoint=self.path)
        c = 0
        for r in st.query(q):
            c += 1

        assert c == 5, "SPARQLStore() didn't return 5 records"
Example #18
def load_data(data_url: str, old_graph: Optional[PPGraph] = None) -> PPGraph:
    """Create new PPGraph or add triples to the provided one.

    Args:
        data_url: path to RDF file or url address of SPARQL endpoint,
                    passing an url will invalidate old_graph
        old_graph: existing graph, will add triples to it

    Returns:
        Graph with triples loaded from data_url (lazy loaded in case of SPARQL endpoint)
    """
    if old_graph:
        graph = old_graph
    else:
        graph = PPGraph(ConjunctiveGraph())

    if isfile(data_url):
        L.info('Loading triples from file `%s`', data_url)
        data_format = guess_format(data_url)
        graph.parse(data_url, format=data_format)

    elif isdir(data_url):
        L.info('Loading triples from files in directory `%s`', data_url)
        for extension in TRIPLE_FILE_EXTENSIONS:
            triples_files = glob(f'{data_url}/*.{extension}')
            if len(triples_files) > 0:
                L.info('Found %d `.%s` files', len(triples_files), extension)

            for i, triples_file in enumerate(triples_files):
                data_format = guess_format(triples_file)
                L.debug('%d / %d (`%s`), data format: %s', i,
                        len(triples_files), triples_file, data_format)
                graph.parse(triples_file, format=data_format)

    else:
        L.info('Using remote graph from SPARQL endpoint `%s`', data_url)
        graph = PPGraph(SPARQLStore(data_url))

        # early fail
        try:
            graph.query('''SELECT DISTINCT ?s 
                   WHERE { 
                      ?s rdf:type foaf:Person
                   } LIMIT 1''')
        except Exception as e:
            L.error("Can't load data from remote endpoint")
            raise e

    return graph
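
A hypothetical call sequence for load_data, based only on its docstring (PPGraph and the file paths belong to the surrounding project, so this is just an illustration of the intended flow):

# load local triples first, then extend the same graph, then switch to a remote endpoint
graph = load_data("data/people.ttl")                     # single RDF file (hypothetical path)
graph = load_data("data/triples_dir", old_graph=graph)   # directory of RDF files
graph = load_data("https://dbpedia.org/sparql")          # lazy SPARQL-backed graph; old_graph is not reused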
Example #19
def fetch(endpoint, timeout=0):
    store = SPARQLStore(endpoint)
    ds = Dataset(store)
    for rs_name, rs_uri in get_rule_sets(endpoint + rs_table_page):
        # TODO: maybe do not discard but try to merge? no.
        if rs_uri not in rule_sets:
            # TODO: handle possible query error?
            gr = ds.get_context(rs_uri)
            try:
                rs_triples = gr.query(q)
                yield rs_name, rs_uri, rs_triples
                time.sleep(timeout)
            except:
                print('error with', rs_uri)
                other_rs.append(rs_uri)
Example #20
class RemoteGraph(SparqlQueryable):
    """
    RemoteGraph is used for accessing remote SPARQL endpoints.
    """
    def __init__(self, *, endpoint: str):
        self.graph = SPARQLStore(endpoint)

    def query(self, *, sparql: str) -> Result:
        """
        Query the remote graph using the API endpoint.

        :param sparql: A string containing valid SPARQL to query the graph.
        :return: A Result containing the result from calling the SPARQL query.
        """
        try:
            result = self.graph.query(sparql)
        except ResultException:
            # SPARQLStore raises an exception when no result is found
            result = Graph()
        return result
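
A usage sketch for the RemoteGraph class defined above (the endpoint URL is illustrative):

remote = RemoteGraph(endpoint="https://dbpedia.org/sparql")
result = remote.query(sparql="SELECT ?s WHERE { ?s a ?type } LIMIT 3")
for row in result:
    print(row)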
Example #21
class NanoPubTripleStore(object):

    RSA = Namespace(ns_dict['RSA'])
    HG19 = Namespace(ns_dict['HG19'])
    NP = Namespace(ns_dict['NP'])

    def __init__(self, endpoint):
        self.store = SPARQLStore(endpoint)
        self.dataset = Dataset()

    def _get_resources_by_context(self, context):
        g = self.dataset.graph(context)
        results = self.store.query("select ?s ?p ?o where {GRAPH <%s> {?s ?p ?o}}" % context)
        for s, p, o in results:
            self.dataset.add((s, p, o, g))

    def get_nanopub(self, base):
        self._get_resources_by_context(base)
        self._get_resources_by_context(base + '#assertion')
        self._get_resources_by_context(base + '#publicationInfo')
        self._get_resources_by_context(base + '#provenance')
        bind_namespaces(self.dataset, base)
        return self.dataset.serialize(base=base, format='trig')
Example #22
 def __init__(self, grab, task, graph, spider=None):
     Parser.__init__(self, grab, task, graph, spider=spider)
     self.dbpedia = Graph(SPARQLStore(config.sparqlstore['dbpedia_url'],
                                      context_aware=False), namespace_manager=self.graph.namespace_manager)
Example #23
def sparql_store():
    return SPARQLStore(endpoint=getattr(settings, 'SPARQL_QUERY_ENDPOINT'))
Example #24
class Disambiguator(Classifier):
    """
    Implementation of a classifier for finding sameAs links between items in the Heritage Connector and items on Wikidata.
    TODO: link to documentation on exactly how this works.

    Attributes:
        table_name (str): `skos:hasTopConcept` value to use for disambiguator. This should
            have been set to refer to its original data source when importing data to the graph.
        random_state (int, optional): random state for all methods involving randomness. Defaults to 42.
        TODO: tune these decision tree params automatically when training the classifier.
        max_depth (int, optional): max depth of the decision tree classifier.
        class_weight (str, optional): See sklearn.tree.DecisionTreeClassifier docs. Defaults to "balanced".
        min_samples_split (int, optional): See sklearn.tree.DecisionTreeClassifier docs. Defaults to 2.
        min_samples_leaf (int, optional): See sklearn.tree.DecisionTreeClassifier docs. Defaults to 5.
        max_features (int, optional): See sklearn.tree.DecisionTreeClassifier docs. Defaults to None.
        bidirectional_distance (bool, optional): whether to include Wikidata types not in the immediate
            class tree when calculating similarity between entity types. Defaults to False, i.e. only considers
            types to have a similarity greater than 0 if they are in the same instance of/subclass of Wikidata
            hierarchy.
        enforce_entities_have_type (bool, optional): only entities with values for `rdf:type` will be retrieved
            from the heritage connector graph. Defaults to True.
    """
    def __init__(
        self,
        table_name: str,
        random_state=42,
        max_depth=5,
        class_weight="balanced",
        min_samples_split=2,
        min_samples_leaf=5,
        max_features=None,
        bidirectional_distance=False,
        enforce_entities_have_type=True,
        extra_sparql_lines: str = "",
    ):
        super().__init__()

        self.table_name = table_name.upper()
        self.table_mapping = field_mapping.mapping[self.table_name]
        self.enforce_entities_have_type = enforce_entities_have_type

        self.clf = DecisionTreeClassifier(
            random_state=random_state,
            max_depth=max_depth,
            class_weight=class_weight,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            max_features=max_features,
        )

        # whether to use an entity distance measure that can change direction
        self.bidirectional_distance = bidirectional_distance

        # in-memory caching for entity similarities, prefilled with case for where there is no type specified
        self.entity_distance_cache = {hash((None, None)): 0}

        if extra_sparql_lines:
            if not isinstance(extra_sparql_lines, str):
                raise ValueError(
                    f"Argument `extra_sparql_lines` must be a string. Type {type(extra_sparql_lines)} passed."
                )
            elif extra_sparql_lines[-1] != ".":
                raise ValueError(
                    f"Argument `extra_sparql_lines` must end in a full-stop. Value given was {extra_sparql_lines}"
                )

        self.extra_sparql_lines = extra_sparql_lines

    def fit(self, X: np.ndarray, y: np.ndarray):
        self.clf = self.clf.fit(X, y)

        return self

    def predict_proba(self, X: np.ndarray) -> np.ndarray:
        """
        Returns probabilities for the positive class

        Args:
            X (np.ndarray)

        Returns:
            np.ndarray: a value for each row of X
        """
        return self.clf.predict_proba(X)[:, 1]

    def predict(self, X: np.ndarray, threshold=0.5) -> np.ndarray:
        """
        Returns predictions for the positive class at a threshold.

        Args:
            X (np.ndarray)
            threshold (float, optional): Defaults to 0.5.

        Returns:
            np.ndarray: boolean values
        """
        pred_proba = self.predict_proba(X)

        return pred_proba >= threshold

    def get_predictions_table(self,
                              X: np.ndarray,
                              pairs: pd.DataFrame,
                              threshold=0.5) -> pd.DataFrame:
        """
        Returns a `pairs` dataframe with predictions and probabilities (y_pred, y_pred_proba) made by the classifier.

        Args:
            X (np.ndarray)
            pairs (pd.DataFrame): with columns internal_id, wikidata_id. returned by self.build_training_data
            threshold (float, optional): Defaults to 0.5.

        Returns:
            pd.DataFrame: with columns internal_id, wikidata_id, y_pred, y_pred_proba
        """

        pairs_new = pairs.copy()

        y_pred_proba = self.predict_proba(X)
        pairs_new["y_pred_proba"] = y_pred_proba
        pairs_new["y_pred"] = y_pred_proba >= threshold

        return pairs_new

    def get_top_ranked_pairs(self,
                             predictions_table: pd.DataFrame) -> pd.DataFrame:
        """
        Returns a dataframe of highest ranked Wikidata candidate for each internal record based on the classifier output.
        Any predictions below the threshold aren't counted. If there are multiple Wikidata candidates with the same
        predicted probability, all candidates with the maximum probability are returned.

        Args:
            predictions_table: returned by `get_predictions_table`

        Returns:
            pd.DataFrame: with same columns as predictions_table (internal_id, wikidata_id, y_pred, y_pred_proba)
        """

        pairs = predictions_table.copy()

        pairs_true = pairs[pairs["y_pred"] == True]  # noqa: E712

        pairs_true_filtered = pd.DataFrame()

        for _id in pairs_true["internal_id"].unique().tolist():
            tempdf = pairs_true[pairs_true["internal_id"] == _id]
            max_proba = tempdf["y_pred_proba"].max()

            pairs_true_filtered = pairs_true_filtered.append(
                tempdf[tempdf["y_pred_proba"] == max_proba])

        return pairs_true_filtered

    def score(self,
              X: np.ndarray,
              y: np.ndarray,
              threshold: float = 0.5,
              output_dict=False) -> float:
        """
        Returns balanced accuracy, precision and recall for given test data and labels.

        Args:
            X (np.ndarray): data to return score for.
            y (np.ndarray): True labels.
            threshold (np.ndarray): threshold to use for classification.
            output_dict (bool, optional): whether to output a dictionary with the results. Defaults to False,
                where the results will be printed.

        Returns:
            float: score
        """

        y_pred = self.predict(X, threshold)

        results = {
            "balanced accuracy score": balanced_accuracy_score(y, y_pred),
            "precision score": precision_score(y, y_pred),
            "recall score": recall_score(y, y_pred),
        }

        if output_dict:
            return results
        else:
            return "\n".join([f"{k}: {v}" for k, v in results.items()])

    def print_tree(self, feature_names: list = None):
        """
        Print textual representation of the decision tree.

        Args:
            feature_names (list, optional): List of feature names to use. Defaults to None.
        """

        print(export_text(self.clf, feature_names=feature_names))

    def save_classifier_to_disk(self, path: str):
        """
        Pickle classifier to disk.

        Args:
            path (str): path to pickle to
        """

        # TODO: should maybe raise a warning if model hasn't been trained,
        # but not sure how to do this without testing predict (which needs X, or
        # at least the required dimensions of X)

        dump(self.clf, path)

    def load_classifier_from_disk(self, path: str):
        """
        Load pickled classifier from disk

        Args:
            path (str): path of pickled classifier
        """

        # TODO: maybe there should be a warning if overwriting a trained model.
        # See todo above.

        self.clf = load(path)

    def save_training_data_to_folder(
        self,
        path: str,
        limit: int = None,
        page_size=100,
        search_limit=20,
    ):
        """
        Make training data from the labelled records in the Heritage Connector and save it to a folder. The folder will contain:
            - X.npy: numpy array X
            - y.npy: numpy array y
            - pids.txt: newline separated list of column labels of X (properties used)
            - ids.txt: tab-separated CSV (tsv) of internal and external ID pairs (rows of X)

        These can be loaded from the folder using `heritageconnector.disambiguation.helpers.load_training_data`.

        Args:
            path (str): path of folder to save files to
            limit (int, optional): Optionally limit the number of records processed. Defaults to None.
            page_size (int, optional): Batch size. Defaults to 100.
            search_limit (int, optional): Number of Wikidata candidates to process per SMG record, one of which
                is the correct match. Defaults to 20.
        """

        if not os.path.exists(path):
            errors.raise_file_not_found_error(path, "folder")

        X, y, pid_labels, id_pairs = self.build_training_data(
            True,
            page_size=page_size,
            limit=limit,
            search_limit=search_limit,
        )

        np.save(os.path.join(path, "X.npy"), X)
        np.save(os.path.join(path, "y.npy"), y)

        with open(os.path.join(path, "pids.txt"), "w") as f:
            f.write("\n".join(pid_labels))

        with open(os.path.join(path, "ids.txt"), "w") as f:
            wr = csv.writer(f, delimiter="\t")
            wr.writerows(id_pairs)

    def save_test_data_to_folder(
        self,
        path: str,
        limit: int = None,
        page_size=100,
        search_limit=20,
    ):
        """
        Make test data from the unlabelled records in the Heritage Connector and save it to a folder. The folder will contain:
            - X.npy: numpy array X
            - pids.txt: newline separated list of column labels of X (properties used)
            - ids.txt: tab-separated CSV (tsv) of internal and external ID pairs (rows of X)

        These can be loaded from the folder using `heritageconnector.disambiguation.helpers.load_training_data`.

        Args:
            path (str): path of folder to save files to
            limit (int, optional): Optionally limit the number of records processed. Defaults to None.
            page_size (int, optional): Batch size. Defaults to 100.
            search_limit (int, optional): Number of Wikidata candidates to process per SMG record, one of which
                is the correct match. Defaults to 20.
        """

        if not os.path.exists(path):
            errors.raise_file_not_found_error(path, "folder")

        X, pid_labels, id_pairs = self.build_training_data(
            False,
            page_size=page_size,
            limit=limit,
            search_limit=search_limit,
        )

        np.save(os.path.join(path, "X.npy"), X)

        with open(os.path.join(path, "pids.txt"), "w") as f:
            f.write("\n".join(pid_labels))

        with open(os.path.join(path, "ids.txt"), "w") as f:
            wr = csv.writer(f, delimiter="\t")
            wr.writerows(id_pairs)

    def _process_wikidata_results(
            self, wikidata_results: pd.DataFrame) -> pd.DataFrame:
        """
        - fill empty firstname (P735) and lastname (P734) fields by taking the first and last words of the label field
        - convert any date-like values to positive or negative integers
        - add label column combining itemLabel and altLabel lists
        - join P31 and P279 columns
        """
        firstname_from_label = lambda l: l.split(" ")[0]
        lastname_from_label = lambda l: l.split(" ")[-1]

        # firstname, lastname
        if ("P735Label" in wikidata_results.columns
                and "P734Label" in wikidata_results.columns):
            for idx, row in wikidata_results.iterrows():
                wikidata_results.loc[idx, "P735Label"] = (firstname_from_label(
                    row["label"]) if not row["P735Label"] else
                                                          row["P735Label"])
                wikidata_results.loc[idx, "P734Label"] = (lastname_from_label(
                    row["label"]) if not row["P734Label"] else
                                                          row["P734Label"])

        # combine labels and aliases into one list: label
        wikidata_results["label"] = wikidata_results["label"].apply(
            lambda i: [i] if isinstance(i, str) else i)
        wikidata_results["aliases"] = wikidata_results["aliases"].apply(
            lambda i: [i] if isinstance(i, str) else i)
        wikidata_results["label"] = (wikidata_results["label"] +
                                     wikidata_results["aliases"])

        # join P31 and P279 columns
        wikidata_results[["P31", "P279"]] = wikidata_results[[
            "P31", "P279"
        ]].applymap(lambda i: [i] if not isinstance(i, list) else i)
        wikidata_results["P31_and_P279"] = (
            wikidata_results["P31"] +
            wikidata_results["P279"]).apply(lambda i: list(set(i)))
        # ensure that empty strings don't exist in the same list as valid QIDs
        # [""] to ""; ["Q1234", ""] to ["Q1234"]
        wikidata_results["P31_and_P279"] = wikidata_results[
            "P31_and_P279"].apply(lambda i: "" if i == [""] else i)
        wikidata_results["P31_and_P279"] = wikidata_results[
            "P31_and_P279"].apply(lambda i: [x for x in i if x != ""]
                                  if (len(i) > 1) else i)

        return wikidata_results

    def _get_geographic_properties(self, pids: List[str]) -> List[str]:
        """
        Filter list of properties to ones which are geographic properties. Used so
        they can be compared using a separate similarity function.

        Args:
            pids (list): Wikidata properties

        Returns:
            list: geographic properties only
        """

        # Q18615777 is 'Wikidata property to indicate a location'
        return filter_qids_in_class_tree(pids,
                                         "Q18615777",
                                         include_instanceof=True)

    def _get_labelled_records_from_elasticsearch(self, limit: int = None):
        """
        Get labelled records (with sameAs) from Elasticsearch for training.

        Args:
            limit (int, optional): Defaults to None.

        """

        query = {
            "query": {
                "bool": {
                    "must": [
                        {
                            "wildcard": {
                                "graph.@owl:sameAs.@id": "*"
                            }
                        },
                        {
                            "term": {
                                "type.keyword": self.table_name.upper()
                            }
                        },
                    ]
                }
            }
        }
        # set 'scroll' timeout to longer than default here to deal with large times between subsequent ES requests
        search_res = helpers.scan(es,
                                  query=query,
                                  index=config.ELASTIC_SEARCH_INDEX,
                                  size=500,
                                  scroll="30m")
        if limit:
            search_res = islice(search_res, limit)

        return search_res

    def _get_unlabelled_records_from_elasticsearch(self, limit: int = None):
        """
        Get unlabelled records (without sameAs) from Elasticsearch for inference.

        Args:
            limit (int, optional): Defaults to None.
        """

        query = {
            "query": {
                "bool": {
                    "must": {
                        "term": {
                            "type.keyword": self.table_name.upper()
                        }
                    },
                    "must_not": {
                        "exists": {
                            "field": "graph.@owl:sameAs.@id"
                        }
                    },
                }
            }
        }

        search_res = helpers.scan(es,
                                  query=query,
                                  index=config.ELASTIC_SEARCH_INDEX,
                                  size=500,
                                  scroll="30m")
        if limit:
            search_res = islice(search_res, limit)

        return search_res

    def _get_type_constraint(self) -> str:
        """For _get_labelled_records_from_sparql_store/_get_unlabelled_records_from_sparql_store"""

        if self.enforce_entities_have_type:
            return "?item rdf:type ?type."
        else:
            return ""

    def _get_labelled_records_from_sparql_store(self,
                                                limit: int = None
                                                ) -> Iterable[dict]:
        """
        Get all records with an owl:sameAs value (URIs and labels) from the Fuseki instance.

        Args:
            limit (int, optional): Defaults to None.

        Returns:
            Generator of dicts. Each dict has the form {"id": __, "label": ___}
        """

        query = f"""SELECT DISTINCT ?item ?itemLabel WHERE {{
            ?item owl:sameAs ?object.
            ?item rdfs:label ?itemLabel.
            {self._get_type_constraint()}
            {self.extra_sparql_lines}
            ?item skos:hasTopConcept '{self.table_name}'.
        }}"""

        if limit is not None:
            query = query + f"LIMIT {limit}"

        res = get_sparql_results(config.FUSEKI_ENDPOINT,
                                 query)["results"]["bindings"]

        return ({
            "id": item["item"]["value"],
            "label": item["itemLabel"]["value"]
        } for item in res)

    def _get_unlabelled_records_from_sparql_store(self,
                                                  limit: int = None
                                                  ) -> Iterable[dict]:
        """
        Get all records without an owl:sameAs value (URIs and labels) from the Fuseki instance.

        Args:
            limit (int, optional): Defaults to None.

        Returns:
            Generator of dicts. Each dict has the form {"id": __, "label": ___}
        """

        query = f"""SELECT DISTINCT ?item ?itemLabel WHERE {{
            FILTER NOT EXISTS {{?item owl:sameAs ?object}}.
            ?item rdfs:label ?itemLabel.
            {self._get_type_constraint()}
            {self.extra_sparql_lines}
            ?item skos:hasTopConcept '{self.table_name}'.
        }}"""

        if limit is not None:
            query = query + f"LIMIT {limit}"

        res = get_sparql_results(config.FUSEKI_ENDPOINT,
                                 query)["results"]["bindings"]

        return ({
            "id": item["item"]["value"],
            "label": item["itemLabel"]["value"]
        } for item in res)

    def _get_predicates(
        self,
        predicates_ignore: List[str] = [
            RDFS.label,
            OWL.sameAs,
            SKOS.hasTopConcept,
            FOAF.title,
        ],
    ) -> List[str]:
        """
        Get a unique list of predicates for the table. These will form the columns of X.

        Args:
            predicates_ignore (List[str]): predicates to ignore

        Returns:
            list of URLs for each predicate, excluding those in `predicates_ignore`
        """

        # TODO: remove this when using pydantic as it will coerce rdflib.term.URIRef to string
        predicates_ignore = [str(i) for i in predicates_ignore]

        query = f"""
        SELECT DISTINCT ?predicate
        WHERE {{
        ?subject <http://www.w3.org/2004/02/skos/core#hasTopConcept> '{self.table_name}'.
        ?subject ?predicate ?object.
        }}"""

        res = get_sparql_results(config.FUSEKI_ENDPOINT,
                                 query)["results"]["bindings"]

        if len(res) > 0:
            return [
                i["predicate"]["value"] for i in res
                if i["predicate"]["value"] not in predicates_ignore
            ]

        else:
            return []

    def _open_sparql_store(self, endpoint: str = config.FUSEKI_ENDPOINT):
        """
        Open RDFlib SPARQL store with query URL at `endpoint`.

        Args:
            endpoint (str, optional): Defaults to config.FUSEKI_ENDPOINT.
        """

        self.sparql_store = SPARQLStore(endpoint)
        self.sparql_store.open(endpoint)

    def _get_triples_from_store(
        self, spo: tuple = (None, None, None)) -> Iterable[tuple]:
        """
        Get triples with the mask (subject, predicate, object). Returns generator of tuples, where
        each tuple is a triple (ignores graph names).

        By default the SPARQL store is at the endpoint specified by FUSEKI_ENDPOINT in config. If you want
        to change this, call `self._open_sparql_store(endpoint='http://my_endpoint')` first.
        """
        if not hasattr(self, "sparql_store"):
            self._open_sparql_store()

        return self.sparql_store.triples(spo)

    def _add_instanceof_distances_to_inmemory_cache(
            self, batch_instanceof_comparisons):
        """
        Adds instanceof distances for a batch to the in-memory/in-class-instance cache.
        """

        batch_instanceof_comparisons_unique = list(
            set(batch_instanceof_comparisons))

        logger.debug("Finding distances between entities...")
        for ent_1, ent_2 in tqdm(batch_instanceof_comparisons_unique):
            if (ent_1, ent_2) != (None, None):
                if isinstance(ent_2, list):
                    ent_set = {ent_1, tuple(ent_2)}
                else:
                    ent_set = {ent_1, ent_2}

                if hash((ent_1, ent_2)) not in self.entity_distance_cache:
                    self.entity_distance_cache[hash(
                        (ent_1,
                         ent_2))] = get_distance_between_entities_multiple(
                             ent_set,
                             bidirectional=self.bidirectional_distance,
                             reciprocal=True,
                         )

    def _to_tuple(self, val):
        """Convert lists to tuples, but leave values that aren't lists as they are."""
        return tuple(val) if isinstance(val, list) else val

    def _replace_internal_id_with_sameas_or_label(
        self, internal_url: rdflib.URIRef
    ) -> Union[rdflib.Literal, rdflib.URIRef, List[rdflib.Literal],
               List[rdflib.URIRef]]:
        """
        Replaces internal URL with Wikidata sameAs link (if exists) or label, in that order of preference.
        If neither exist, returns an empty string.
        """

        sameas_links = [
            i[0][-1] for i in self._get_triples_from_store((internal_url,
                                                            OWL.sameAs, None))
        ]
        item_labels = [
            i[0][-1] for i in self._get_triples_from_store((internal_url,
                                                            RDFS.label, None))
        ]

        if len(sameas_links) > 0:
            return sameas_links
        elif len(item_labels) > 0:
            # an item can only have one rdfs.label
            return item_labels[0]
        else:
            return ""

    def build_training_data(
        self,
        train: bool,
        page_size: int = 100,
        limit: int = None,
        search_limit=20,
    ) -> Tuple[np.ndarray, np.ndarray]:
        """
        Get training arrays X, y from all the records in the Heritage Connector index with an existing sameAs
        link to Wikidata.

        Args:
            train (str): whether to build training data (True) or data for inference (False). If True a y vector
                is returned, otherwise one isn't.
            page_size (int, optional): the number of records to fetch from Wikidata per iteration. Larger numbers
                will speed up the process but may cause the SPARQL query to time out. Defaults to 10.
                (TODO: set better default)
            limit (int, optional): set a limit on the number of records to use for training (useful for testing).
                Defaults to None.
            search_limit (int, optional): number of search results to retrieve from the Wikidata dump per record.
                Defaults to 20.

        Returns:
            Tuple[np.ndarray, np.ndarray]: X, y
        """

        predicates = self._get_predicates()
        predicate_pid_mapping = get_wikidata_equivalents_for_properties(
            predicates)
        pids_ignore = (config.PIDS_IGNORE).split(" ")
        pids_categorical = (config.PIDS_CATEGORICAL).split(" ")

        # remove instanceof (P31) and add to end, as the type distance calculations are appended to X last
        predicate_pid_mapping = {
            k: url_to_pid(v)
            for k, v in predicate_pid_mapping.items()
            if v is not None and url_to_pid(v) not in pids_ignore + ["P31"]
        }
        #  TODO: add P279 into here then combine P13 with P279 to form item_instanceof
        pids = list(predicate_pid_mapping.values()) + ["P31", "P279"]
        predicate_pid_mapping.update({RDFS.label: "label"})

        pids_geographical = self._get_geographic_properties(pids)

        X_list = []
        if train:
            y_list = []
        ent_similarity_list = []
        id_pair_list = []

        # get records to process from Elasticsearch
        search = es_text_search(index=config.ELASTIC_SEARCH_WIKI_INDEX)

        if train:
            search_res = self._get_labelled_records_from_sparql_store(limit)
        else:
            search_res = self._get_unlabelled_records_from_sparql_store(limit)

        search_res_paginated = paginate_generator(search_res, page_size)

        total = None if limit is None else math.ceil(limit / page_size)

        # for each record, get Wikidata results and create X: feature matrix and y: boolean vector (correct/incorrect match)
        for item_list in tqdm(search_res_paginated, total=total):
            id_qid_mapping = dict()
            qid_instanceof_mapping = dict()
            batch_instanceof_comparisons = []

            logger.debug("Running search")
            start = time.time()
            for item in item_list:
                # text search for Wikidata matches
                qids, qid_instanceof_temp = search.run_search(
                    item["label"],
                    limit=search_limit,
                    include_aliases=True,
                    return_instanceof=True,
                )
                id_qid_mapping[item["id"]] = qids
                qid_instanceof_mapping.update(qid_instanceof_temp)

            end = time.time()
            logger.debug(f"...search complete in {end-start}s")

            # get Wikidata property values for the batch
            logger.debug("Getting wikidata fields")
            start = time.time()
            wikidata_results_df = get_wikidata_fields(
                pids=pids, id_qid_mapping=id_qid_mapping)
            end = time.time()
            logger.debug(f"...retrieved in {end-start}s")

            wikidata_results_df = self._process_wikidata_results(
                wikidata_results_df)

            logger.debug("Calculating field similarities for batch..")
            # create X array for each record
            for item in item_list:
                # we get all the triples for the item here (rather than each triple in the for loop below)
                # to reduce the load on the SPARQL DB
                try:
                    item_triples = list(
                        self._get_triples_from_store(
                            (URIRef(item["id"]), None, None)))

                except:  # noqa: E722
                    # sparql store has crashed
                    sleep_time = 120
                    logger.debug(
                        f"get_triples query failed for item {item['id']}. Retrying in {sleep_time} seconds"
                    )
                    time.sleep(sleep_time)
                    self._open_sparql_store()
                    item_triples = list(
                        self._get_triples_from_store(
                            (URIRef(item["id"]), None, None)))

                X_temp = []
                qids_wikidata = wikidata_results_df.loc[
                    wikidata_results_df["id"] == item["id"], "qid"]

                if train:
                    item_qid = url_to_qid([
                        i for i in item_triples if i[0][1] == OWL.sameAs
                    ][0][0][-1])
                    y_item = [item_qid == qid for qid in qids_wikidata]

                id_pairs = [[item["id"], qid] for qid in qids_wikidata]

                # calculate instanceof distances
                try:
                    item_instanceof = [
                        url_to_qid(i[0][-1]) for i in item_triples
                        if i[0][1] == RDF.type
                    ]
                    wikidata_instanceof = wikidata_results_df.loc[
                        wikidata_results_df["id"] == item["id"],
                        "P31_and_P279"].tolist()

                    batch_instanceof_comparisons += [(
                        self._to_tuple(item_instanceof),
                        self._to_tuple(url_to_qid(q, raise_invalid=False)),
                    ) for q in wikidata_instanceof]
                except:  # noqa: E722
                    # TODO: better error handling here. Why does this fail?
                    logger.warning("Getting types for comparison failed.")

                    batch_instanceof_comparisons += [
                        (None, None) for q in range(
                            len(wikidata_results_df.loc[
                                wikidata_results_df["id"] == item["id"], :]))
                    ]

                for predicate, pid in predicate_pid_mapping.items():
                    item_values = [
                        i for i in item_triples if i[0][1] == URIRef(predicate)
                    ]

                    # RDFS.label is a special case that has no associated PID. We just want to compare it
                    # to the 'label' column which is the labels + aliases for each Wikidata item.
                    if predicate == RDFS.label:
                        item_labels = [
                            str(triple[0][-1]) for triple in item_values
                        ]
                        wikidata_labels = wikidata_results_df.loc[
                            wikidata_results_df["id"] == item["id"],
                            "label"].tolist()
                        sim_list = [
                            similarity_string(item_labels, label_list)
                            for label_list in wikidata_labels
                        ]

                    elif pid in pids_geographical:
                        item_values = self._to_tuple(
                            url_to_qid(
                                [triple[0][-1] for triple in item_values],
                                raise_invalid=False,
                            ))

                        wikidata_values = wikidata_results_df.loc[
                            wikidata_results_df["id"] == item["id"],
                            pid].tolist()

                        if len(item_values) == 0:
                            sim_list = [1] * len(wikidata_values)
                        else:
                            sim_list = [
                                get_distance_between_entities_multiple(
                                    {self._to_tuple(wiki_val), item_values},
                                    vertex_pid="P131",
                                    reciprocal=True,
                                ) for wiki_val in wikidata_values
                            ]

                    else:
                        wikidata_values = wikidata_results_df.loc[
                            wikidata_results_df["id"] == item["id"],
                            pid].tolist()
                        wikidata_labels = wikidata_results_df.loc[
                            wikidata_results_df["id"] == item["id"],
                            pid + "Label"].tolist()

                        if len(item_values) == 0:
                            # if the internal item has no values for the PID return zero similarity
                            # for this PID with each of the candidate QIDs
                            sim_list = [0] * len(wikidata_values)

                        else:
                            item_values = [
                                triple[0][-1] for triple in item_values
                            ]
                            item_values = flatten_list_of_lists([
                                self._replace_internal_id_with_sameas_or_label(
                                    val) if is_internal_uri(val) else val
                                for val in item_values
                            ])

                            if all([not bool(i) for i in item_values]):
                                sim_list = [0] * len(wikidata_values)

                            else:
                                if pid in pids_categorical:
                                    sim_list = [
                                        similarity_categorical(
                                            [str(i) for i in item_values],
                                            label,
                                            raise_on_diff_types=False,
                                        ) for label in wikidata_labels
                                    ]
                                else:
                                    sim_list = [
                                        compare(
                                            item_values,
                                            wikidata_values[i],
                                            wikidata_labels[i],
                                        ) for i in range(len(wikidata_values))
                                    ]

                    X_temp.append(sim_list)

                X_item = np.asarray(X_temp, dtype=np.float32).transpose()

                # TODO (checkpoint): here we would want to save X_list, y_list, id_pair_list, self.entity_distance_cache to disk
                X_list.append(X_item)

                if train:
                    y_list += y_item

                id_pair_list += id_pairs

            self._add_instanceof_distances_to_inmemory_cache(
                batch_instanceof_comparisons)

            for ent_1, ent_2 in batch_instanceof_comparisons:
                ent_similarity_list.append(self.entity_distance_cache[hash(
                    (ent_1, ent_2))])

        if train:
            X = np.column_stack([np.vstack(X_list), ent_similarity_list])
            y = np.asarray(y_list, dtype=bool)
            X_columns = list(predicate_pid_mapping.values()) + ["P31"]

            return X, y, X_columns, id_pair_list

        else:
            X = np.column_stack([np.vstack(X_list), ent_similarity_list])
            X_columns = list(predicate_pid_mapping.values()) + ["P31"]

            return X, X_columns, id_pair_list
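
A hedged end-to-end sketch of how the Disambiguator above appears to be driven, based only on the method signatures and docstrings shown (the table name, threshold and paths are placeholders):

import pandas as pd

d = Disambiguator("PEOPLE")  # hypothetical skos:hasTopConcept value

# build training data from labelled records, then fit and inspect the tree
X, y, pid_labels, id_pairs = d.build_training_data(True, page_size=100, search_limit=20)
d.fit(X, y)
d.print_tree(feature_names=pid_labels)
print(d.score(X, y, threshold=0.5))

# keep the highest-ranked Wikidata candidate for each internal record
pairs = pd.DataFrame(id_pairs, columns=["internal_id", "wikidata_id"])
predictions = d.get_predictions_table(X, pairs, threshold=0.5)
top_ranked = d.get_top_ranked_pairs(predictions)

d.save_classifier_to_disk("disambiguator_clf.joblib")  # path is illustrative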
Example #25
        for o in srcgraph.objects(subject, p):
            destgraph.add((subject, p, o))
            if m:
                populate_predicate_objects(destgraph, srcgraph, m, o)


if __name__ == '__main__':

    # Allows to debug SPARQLWrapper queries to remote endpoint
    if 0:
        handler = urllib.request.HTTPHandler(debuglevel=1)
        opener = urllib.request.build_opener(handler)
        urllib.request.install_opener(opener)

    # Declare a remote RDFLib SPARQL Store
    store = SPARQLStore("http://dbpedia.org/sparql", context_aware=False)

    # Plug
    graph = Graph(store)

    # Load French films from DBpedia
    subject = DCTERMS.subject
    french_film = URIRef("http://dbpedia.org/resource/Category:French_films")

    # load all the details we're interested in into the main graph in memory
    # (actually, only the first 50 films)
    print("Select subjects for French films with SPARQLStore -> SPARQLWrapper:")
    print("------------------------------------------------------------------")
    #for film in graph.subjects(subject, french_film):
    for film in islice(graph.subjects(subject, french_film), 50):
        print(film)
Example #26
from rdflib.plugins.stores.sparqlstore import SPARQLStore
from source_context import LTFSourceContext
from rdflib import URIRef, Literal
from rdflib.namespace import Namespace, RDF, SKOS, split_uri
from collections import namedtuple, Counter
import pickle
from setting import wikidata_endpoint, groundtruth_url
import requests
import debug
import json
import os
import tmp
import time_person_label
import re

wikidata_sparql = SPARQLStore(wikidata_endpoint)
AIDA = Namespace(
    'https://tac.nist.gov/tracks/SM-KBP/2019/ontologies/InterchangeOntology#')
WDT = Namespace('http://www.wikidata.org/prop/direct/')
namespaces = {'aida': AIDA, 'rdf': RDF, 'skos': SKOS, 'wdt': WDT}
types = namedtuple('AIDATypes',
                   ['Entity', 'Events', 'Relation'])(AIDA.Entity, AIDA.Event,
                                                     AIDA.Relation)


class Model:
    def __init__(self, sparql, repo, graph):
        self.__sparql = sparql
        self.__repo = repo
        self.__graph = graph
        pkl_file = 'pkl/' + repo
Example #27
import datetime
import requests
from rdflib import *
import editdistance

import sys

service_url = 'http://localhost:8080/annotate/vlinking3/'

oboe = Namespace('http://ecoinformatics.org/oboe/oboe.1.1/oboe-core.owl#')
cmo = Namespace('http://purl.org/twc/ontologies/cmo.owl#')
skos = Namespace('http://www.w3.org/2004/02/skos/core#')
dbo = Namespace('http://dbpedia.org/ontology/')

from rdflib.plugins.stores.sparqlstore import SPARQLStore
dbpedia_store = SPARQLStore('http://dbpedia.org/sparql')
dbpedia = ConjunctiveGraph(dbpedia_store)


def extract_mentions(text):
    urls = collections.defaultdict(float)
    params = {
        'numResult': 20,
        #'minScore':5,
        'query': text
    }
    response = requests.get(service_url, params=params).json()
    for r in response['results']:
        for annotation in r['annotations']:
            urls[annotation['url']] += float(annotation['score'])
    urls = [(URIRef(url), score) for url, score in sorted(
Example #28
if __name__ == "__main__":

    dbo = Namespace("http://dbpedia.org/ontology/")

    # EXAMPLE 1: using a Graph with the Store type string set to "SPARQLStore"
    graph = Graph("SPARQLStore", identifier="http://dbpedia.org")
    graph.open("http://dbpedia.org/sparql")

    pop = graph.value(URIRef("http://dbpedia.org/resource/Berlin"),
                      dbo.populationTotal)

    print("According to DBPedia, Berlin has a population of {0:,}".format(
        int(pop), ",d").replace(",", "."))

    # EXAMPLE 2: using a SPARQLStore object directly
    st = SPARQLStore(query_endpoint="http://dbpedia.org/sparql")

    for p in st.objects(URIRef("http://dbpedia.org/resource/Brisbane"),
                        dbo.populationTotal):
        print("According to DBPedia, Brisbane has a population of "
              "{0:,}".format(int(pop), ",d"))

    # EXAMPLE 3: doing RDFlib triple navigation using SPARQLStore as a Graph()
    graph = Graph("SPARQLStore", identifier="http://dbpedia.org")
    graph.open("http://dbpedia.org/sparql")
    # we are asking DBPedia for 3 skos:Concept instances
    count = 0
    from rdflib.namespace import RDF, SKOS
    for s in graph.subjects(predicate=RDF.type, object=SKOS.Concept):
        count += 1
        print(s)
        if count >= 3:
            break
Example #29
from rdflib import Graph
from rdflib.plugins.stores.sparqlstore import SPARQLStore
import pandas as pd

data_graph = Graph(SPARQLStore("http://dbpedia.org/sparql", context_aware=False))
counter = 0

pages2 = []
results6 = data_graph.query("""
PREFIX foaf: <http://xmlns.com/foaf/0.1/>
PREFIX dcterms: <http://purl.org/dc/terms/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX dbpo: <http://dbpedia.org//>
select distinct ?Page where {?Page rdf:type yago:WikicatArtificialNeuralNetworks}
""")

for row in results6:
    str_r = str(row)
    if '(rdflib.term.URIRef("' in str_r:
        str_rq = str_r.split('"')
        pages2.append(str_rq[1])
    else:
        str_rq = str_r.split("'")
        pages2.append(str_rq[1])
    counter+=1

print(counter)

results7 = data_graph.query("""
PREFIX foaf: <http://xmlns.com/foaf/0.1/>
PREFIX dcterms: <http://purl.org/dc/terms/>
Example #30
 def __init__(self, *, endpoint: str):
     self.graph = SPARQLStore(endpoint)
Example #31
 def __init__(self, endpoint):
     self.store = SPARQLStore(endpoint)
     self.dataset = Dataset()
Example #32
class TwksClient:
    """
    Client for the TWKS server.

    The client mirrors the primary TWKS API: CRUD operations on nanopublications, querying assertions and nanopublications via SPARQL.
    """

    def __init__(self, *, server_base_url=None):
        """
        Construct a TWKS client.
        :param server_base_url: base URL of the server, excluding the path, e.g., http://localhost:8080
        """
        if not server_base_url:
            server_base_url = "http://localhost:8080"
        self.__server_base_url = server_base_url
        assertions_sparql_query_endpoint = server_base_url + "/sparql/assertions"
        self.assertions_sparql_store = SPARQLStore(endpoint=assertions_sparql_query_endpoint, )
        # query_endpoint=assertions_sparql_query_endpoint)
        nanopublications_sparql_query_endpoint = server_base_url + "/sparql/nanopublications"
        self.nanopublications_sparql_store = SPARQLStore(endpoint=nanopublications_sparql_query_endpoint, )
        # query_endpoint=nanopublications_sparql_query_endpoint)

    def delete_nanopublication(self, nanopublication_uri: str) -> bool:
        """
        Delete a nanopublication by its URI
        :param nanopublication_uri: nanopublication URI
        :return: True if the nanopublication was deleted, else False
        """

        request = urllib.request.Request(url=self.__nanopublication_url(nanopublication_uri), method="DELETE")

        try:
            with urllib.request.urlopen(request) as _:
                return True
        except HTTPError as e:
            if e.code == 404:
                return False
            else:
                raise

    def dump(self) -> None:
        """
        Tell the server to dump the contents of the store to its (local) disk.
        """

        request = urllib.request.Request(url=self.__server_base_url + "/dump", method="POST")

        with urllib.request.urlopen(request) as _:
            return

    def get_assertions(self, store='default') -> rdflib.Graph:
        """
        Get the union of all assertions in the store, as a new Graph.
        :param store: store for the returned Graph
        """

        request = urllib.request.Request(url=self.__server_base_url + "/assertions", headers={"Accept": "text/trig"},
                                         method="GET")

        with urllib.request.urlopen(request) as f:
            response_trig = f.read()
            result = rdflib.Graph(store=store)
            result.parse(format="trig",
                         data=response_trig)
            return result

    def get_nanopublication(self, nanopublication_uri: str) -> Optional[Nanopublication]:
        """
        Get a nanopublication by its URI.
        :param nanopublication_uri: nanopublication URI
        :return: the nanopublication if present, else None
        """

        request = urllib.request.Request(url=self.__nanopublication_url(nanopublication_uri),
                                         headers={"Accept": "text/trig"})

        try:
            with urllib.request.urlopen(request) as f:
                response_trig = f.read()
                return Nanopublication.parse(format="trig",
                                             data=response_trig)
        except HTTPError as e:
            if e.code == 404:
                return None
            else:
                raise

    def get_ontology_assertions(self, ontology_uris: Set[URIRef], store='default') -> rdflib.Graph:
        """
        Get the union of all assertions in the store, as a new Graph.
        :param store: store for the returned Graph
        """

        if not ontology_uris:
            return rdflib.Graph(store=store)

        url = self.__server_base_url + "/assertions/ontology?" + urlencode(
            tuple(("uri", str(ontology_uri)) for ontology_uri in ontology_uris))
        # print(url)

        request = urllib.request.Request(url=url,
                                         headers={"Accept": "text/trig"},
                                         method="GET")

        with urllib.request.urlopen(request) as f:
            response_trig = f.read()
            result = rdflib.Graph(store=store)
            result.parse(format="trig",
                         data=response_trig)
            return result

    def __nanopublication_url(self, nanopublication_uri: str) -> str:
        return self.__server_base_url + "/nanopublication/" + quote(str(nanopublication_uri), safe="")

    def put_nanopublication(self, nanopublication: Nanopublication) -> None:
        """
        Put a nanopublication.

        :param nanopublication: the nanopublication
        """

        request = urllib.request.Request(url=self.__server_base_url + "/nanopublication",
                                         data=nanopublication.serialize(format="trig").encode("utf-8"),
                                         headers={"Content-Type": "text/trig; charset=utf-8"}, method="PUT")
        with urllib.request.urlopen(request) as _:
            pass

    def query_assertions(self, query: str, **kwds):
        """
        Query (only) the assertions in the store.
        :param query: SPARQL query string
        :param kwds: see rdflib.SPARQLStore.query
        :return: depends on query type
        """
        return self.assertions_sparql_store.query(query=query, **kwds)

    def query_nanopublications(self, query: str, **kwds):
        """
        Query all nanopublications in the store.
        :param query: SPARQL query string
        :param kwds: see rdflib.SPARQLStore.query
        :return: depends on query type
        """
        return self.nanopublications_sparql_store.query(query=query, **kwds)
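
A short usage sketch for TwksClient, assuming a TWKS server is running at the default base URL (the query string is illustrative):

client = TwksClient(server_base_url="http://localhost:8080")

# SPARQL over the assertions store
for row in client.query_assertions("SELECT ?s ?p ?o WHERE { ?s ?p ?o } LIMIT 5"):
    print(row)

# ask the server to dump its store to local disk
client.dump()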