Example #1
def test_generate():
    path = "tests"

    #dir_path = os.path.dirname(os.path.realpath(path)) + "/" + path
    dir_path = os.getcwd() + "/tests"
    src_path = dir_path + "/test2.turtle"
    dest_path = dir_path + "/test2.hdt"

    print("Source: %s" % src_path)
    print("Dest  : %s" % dest_path)

    doc = hdt.generate_hdt(src_path, "aff4://foo")
    retcode = doc.save_to_hdt(dest_path)
    assert retcode == 0

    document = HDTDocument(dest_path)

    (triples, triplesCard) = document.search_triples("missingtriple", "", "")
    assert triplesCard == 0

    (triples, triplesCard) = document.search_triples("", "", "")
    assert triplesCard == 12

    (triples, triplesCard) = document.search_triples(
        "aff4://5aea2dd0-32b4-4c61-a9db-677654be6f83//test_images/AFF4-L/dream.txt",
        "", "")
    assert triplesCard == 12

    os.unlink(dest_path)
Example #2
def extract_wikidata(classname, typeproperty):
    doc = HDTDocument("kg/wikidata-20170313-all-BETA.hdt")
    wd = "http://www.wikidata.org/entity/"
    wdt = "http://www.wikidata.org/prop/direct/"
    wd_classes = {
        "BoxerWikidata" : "Q11338576",
        "CyclistWikidata": "Q2309784",
        "CapitalWikidata" : "Q5119",
        "CountryWikidata" : "Q6256",
        "MetroAreaWikidata" : "Q1907114",
        "GeographicRegionWikidata" : "Q82794",
        "FilmFestivalWikidata" : "Q220505",
    }
    edgelist = []
    instances = set()
    (triples, count) = doc.search_triples("", f"{wdt}{typeproperty}", f"{wd}{wd_classes[classname]}")

    for triple in triples:
        instances.add(triple[0])

    for instance in tqdm(instances, total=len(instances)):
        (triples, count) = doc.search_triples(instance, "", "")
        for triple in triples:
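            # `blacklist` is assumed to be a module-level collection of predicate URIs to skip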
            if not triple[1] in blacklist:
                edgelist.append((triple[0], triple[1]))

    return list(set(edgelist)) # Exclude duplicate entity-property relations
Example #3
    def __init__(self, dataset_name='lcquad'):
        '''
        Setup models, indices, embeddings and connection to the KG through the HDT API
        '''
        
        # connect to the entity and predicate catalogs
        self.e_index = IndexSearch('dbpedia201604e')
        self.p_index = IndexSearch('dbpedia201604p')

        # load embeddings
        self.word_vectors = load_embeddings(embeddings_path, embeddings_choice)
        self.p_vectors = load_embeddings(embeddings_path, 'fasttext_p_labels')
        
        # load pre-trained question type classification model
        with open(model_path+'qtype_lcquad_%s.pkl'%(embeddings_choice), 'rb') as f:
            self.model_settings = pkl.load(f)
        self.qt_model = build_qt_inference_model(self.model_settings)
        self.qt_model.load_weights(model_path+'_qtype_weights.best.hdf5', by_name=True)

        # load pre-trained question parsing model
        with open(model_path+'lcquad_%s.pkl'%(embeddings_choice), 'rb') as f:
            ep_model_settings = pkl.load(f)
        self.ep_model = build_ep_inference_model(ep_model_settings)
        # load weights
        # ep_model.load_weights('checkpoints/_'+modelname+'_weights.best.hdf5', by_name=True)
        self.ep_model.load_weights(model_path+'2hops-types.h5', by_name=True)

        # connect to the knowledge graph hdt file
        self.kg = HDTDocument(hdt_path+hdt_file)
Example #4
def extract_by_instance(fn, wdt_class, property, out=True):

    doc = HDTDocument(fn)

    wd = "http://www.wikidata.org/entity/"
    wdt = "http://www.wikidata.org/prop/direct/"

    properties = {"instance_of": "P31", "occupation": "P106"}

    instances = set()

    (triples, count) = doc.search_triples("", f"{wdt}{properties[property]}",
                                          f"{wd}{wdt_class}")

    for triple in tqdm(triples, total=count):
        instances.add(triple[0])

    with open(f'{wdt_class}.csv', "w") as csvfile:
        spamwriter = csv.writer(csvfile, delimiter=',')

        for instance in tqdm(instances, total=len(instances)):
            if out:
                pattern = (instance, "", "")
            else:
                pattern = ("", "", instance)

            (triples, count) = doc.search_triples(*pattern)

            for triple in triples:
                if out:
                    spamwriter.writerow([triple[0], triple[1]])
                else:
                    spamwriter.writerow([triple[2], triple[1]])
Example #5
    def __init__(self, path_hdt=PATH_LOD, path_eq=PATH_EQ):
        self.hdt = HDTDocument(path_hdt)

        self.subClassOf = "http://www.w3.org/2000/01/rdf-schema#subClassOf"
        self.id_subClassOf = self.hdt.convert_term(
            "http://www.w3.org/2000/01/rdf-schema#subClassOf",
            IdentifierPosition.Predicate)

        self.equivalent = "http://www.w3.org/2002/07/owl#equivalentClass"
        self.id_equivalentClass = self.hdt.convert_term(
            "http://www.w3.org/2002/07/owl#equivalentClass",
            IdentifierPosition.Predicate)

        self.subPropertyOf = "http://www.w3.org/2000/01/rdf-schema#subPropertyOf"
        self.id_subPropertyOf = self.hdt.convert_term(
            "http://www.w3.org/2000/01/rdf-schema#subPropertyOf",
            IdentifierPosition.Predicate)

        self.equivalentProperty = "http://www.w3.org/2002/07/owl#equivalentProperty"
        self.id_equivalentProperty = self.hdt.convert_term(
            "http://www.w3.org/2002/07/owl#equivalentProperty",
            IdentifierPosition.Predicate)

        self.graph = nx.DiGraph()

        self.equi_graph_manager = None  #equiClassManager(path_eq)
        print('set up the equivalence class manager')
        self.diagnosed_relations = []  # the result
        self.suggestion_on_relations = []  # triples from the manual decisions and Joe's sameAs data
        self.leaf_classes = set()

        print('finished initialization')
Example #6
class HDT(KG):
    def __init__(self, hdt_file_path: str):
        self.hdt = HDTDocument(hdt_file_path)

    def predicate_objects(self, subject: str) -> Iterator[Tuple[str, str]]:
        (triples, cardinality) = self.hdt.search_triples(subject, "", "")
        for s, p, o in triples:
            yield p, o

    def subjects(self, predicate: str, obj: str) -> Iterator[str]:
        (triples, cardinality) = self.hdt.search_triples("", predicate, obj)
        for s, p, o in triples:
            yield s

    def triples(self, subject: str, predicate: str,
                obj: str) -> Iterator[Tuple[str, str, str]]:
        (triples,
         cardinality) = self.hdt.search_triples(subject, predicate, obj)
        for s, p, o in triples:
            yield (s, p, o)

    def objects(self, subject: str, predicate: str) -> Iterator[str]:
        (triples,
         cardinality) = self.hdt.search_triples(subject, predicate, "")
        for s, p, o in triples:
            yield o

    def count(self, subject: str, predicate: str, obj: str) -> int:
        (triples,
         cardinality) = self.hdt.search_triples(subject, predicate, obj)
        return cardinality

    def total_triples(self) -> int:
        return self.hdt.total_triples

    def nb_subjects(self) -> int:
        return self.hdt.nb_subjects

    def nb_predicates(self) -> int:
        return self.hdt.nb_predicates

    def nb_objects(self) -> int:
        return self.hdt.nb_objects

    def nb_shared(self) -> int:
        return self.hdt.nb_shared

    def get_schema_description(self, resource: str) -> Optional[str]:
        """Get english description of the specified resource.
        Use the http://schema.org/description property.
        Trailing double quotes and @en are removed!"""
        for o in self.objects(resource, "http://schema.org/description"):
            if o.endswith("@en"):
                # delete trailing @en and double quotes
                input_str = o[1:len(o) - 4]
                # input_str = re.sub(r'\d+', '', input_str) # remove numbers
                # input_str = input_str.translate(str.maketrans("","", string.punctuation)) # Punctuation removal
                # input_str = input_str.strip().lower() # To remove leading and ending spaces and put it in lower case
                return input_str
        return None
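
A minimal usage sketch of the wrapper above (the HDT file name and entity URI are placeholders, not taken from the original project):

# hypothetical file and resource; iterate matching triples and fetch the schema.org description
kg = HDT("data/wikidata.hdt")
print(kg.total_triples())
for p, o in kg.predicate_objects("http://www.wikidata.org/entity/Q42"):
    print(p, o)
print(kg.get_schema_description("http://www.wikidata.org/entity/Q42"))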
Example #7
    def __init__(self, hdt_path: Optional[str] = None,
                 graph: Optional[HDTDocument] = None,
                 redis_client: Optional[redis.Redis] = None):
        self.cache = redis_client
        if graph:
            self.graph = graph
        else:
            self.graph = HDTDocument(hdt_path, map=False, progress=True)
Example #8
def load_KG(path_file, predicate_string, orientation=True):
    # load the file according to the given predicate
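    # `graph` is assumed to be a module-level (e.g. networkx) graph shared with the caller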
    hdt_file = HDTDocument(path_file)
    (triples, cardinality) = hdt_file.search_triples('', predicate_string, '')
    for (s, _, o) in triples:
        if orientation:
            graph.add_edge(s, o)
        else:
            graph.add_edge(o, s)
Example #9
def get_nb_triples(file_path: str, format: str) -> int:
    if format == 'nt':
        return wccount(file_path)
    elif format == 'hdt':
        doc = HDTDocument(file_path, indexed=False)
        _, nb_triples = doc.search_triples("", "", "")
        return nb_triples
    else:
        raise Exception(f'Unsupported RDF format: "{format}"')
Example #10
def filter_answer_by_class(classes, answers_ids):
    classes_ids = [_id for e in classes for _id in e]
    kg = HDTDocument(hdt_path+hdt_file)
    a_ids = [_id for e in answers_ids for _id in e]
    a_ids = kg.filter_types(a_ids, classes_ids)
    kg.remove()
    a_ids = [_id for _a_ids in a_ids for _id in _a_ids]
    answers_ids = [{_id: a_score} for e in answers_ids for _id, a_score in e.items() if _id in a_ids]
    return answers_ids
Example #11
    def parsefile(self, file_path):
        """Parse an HDT file as an N-Triples file."""

        from hdt import HDTDocument

        doc = HDTDocument(file_path, indexed=False)
        iterator, _ = doc.search_triples("", "", "")
        self.iterator = iterator
        self.parse()
Example #12
    def __init__(self, wiki_filename: str, **kwargs) -> None:
        """

        Args:
            wiki_filename: hdt file with wikidata
            **kwargs:
        """
        log.debug(f'__init__ wiki_filename: {wiki_filename}')
        wiki_path = expand_path(wiki_filename)
        self.document = HDTDocument(str(wiki_path))
Example #13
    def __init__(self, **kwargs):

        self.__source = kwargs.get("file", None)
        if self.__source is None:
            raise TypeError()
        try:
            self.document = HDTDocument(self.__source)
        except Exception as e:
            logger.exception("Could not load HDT File from {}.".format(
                self.__source))
            raise e
        self.card = None
Example #14
def generate():
    hdt_file = None
    output_filename = None
    if sys.argv[1] == 'lod':
        hdt_file = HDTDocument(PATH_LOD)
        output_filename = 'all_lod_subClassOf.csv'

    else:
        hdt_file = HDTDocument(PATH_DBpedia)
        output_filename = 'all_dbpedia_subClassOf.csv'

    subClassOf = "http://www.w3.org/2000/01/rdf-schema#subClassOf"
    id_subClassOf = hdt_file.convert_term("http://www.w3.org/2000/01/rdf-schema#subClassOf", IdentifierPosition.Predicate)
    count = 0
    with open(output_filename, 'w', newline='') as file:
        (subclass_triples, cardinality) = hdt_file.search_triples("", subClassOf, "")
        writer = csv.writer(file)
        writer.writerow([ "SUBJECT_ID", "SUBJECT", "OBJECT_ID", "OBJECT"])
        for (s, p, o) in subclass_triples:
            # store it in a csv file
            s_id = hdt_file.convert_term(s, IdentifierPosition.Subject)
            o_id = hdt_file.convert_term(o, IdentifierPosition.Object)
            writer.writerow([s_id, s, o_id, o])
            # print ([s_id, s, o_id, o])
            count += 1
    print('total entries = ', count)
Example #15
    def __init__(self, path_hdt=PATH_LOD):
        self.hdt = HDTDocument(path_hdt)
        self.subClassOf = "http://www.w3.org/2000/01/rdf-schema#subClassOf"
        self.id_subClassOf = self.hdt.convert_term(
            "http://www.w3.org/2000/01/rdf-schema#subClassOf",
            IdentifierPosition.Predicate)
        self.equivalent = "http://www.w3.org/2002/07/owl#equivalentClass"
        self.id_equivalentClass = self.hdt.convert_term(
            "http://www.w3.org/2002/07/owl#equivalentClass",
            IdentifierPosition.Predicate)
        self.graph = nx.DiGraph()
        self.equi_graph = nx.Graph()
        self.diagnosed_relations = {}
        self.diagnosed_classes = {}
        self.leaf_classes = set()
Example #16
def extract_classes(fn):
    doc = HDTDocument(fn)

    rdf_type = "http://www.w3.org/1999/02/22-rdf-syntax-ns#type"
    types = set()
    types_dct = {}
    (triples, count) = doc.search_triples("", rdf_type, "")

    for triple in tqdm(triples, total=count):
        types.add(triple[2])

    for type in tqdm(types):
        (instances, instance_count) = doc.search_triples("", rdf_type, type)
        types_dct[type] = instance_count

    return types_dct
Example #17
def get_rdf_reader(file_path, format='nt'):
    """Get an iterator over RDF triples from a file"""
    iterator = None
    nb_triples = 0
    # load standard RDF formats using rdflib
    if format == 'nt' or format == 'ttl':
        g = Graph()
        g.parse(file_path, format=format)
        nb_triples = len(g)
        iterator = map(__n3_to_str, g.triples((None, None, None)))
    elif format == 'hdt':
        # load HDTDocument without additional indexes
        # they are not needed since we only search by "?s ?p ?o"
        doc = HDTDocument(file_path, indexed=False)
        iterator, nb_triples = doc.search_triples("", "", "")
    return iterator, nb_triples
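
A short usage sketch, assuming a local HDT file (the file name is a placeholder):

# hypothetical file; 'nt' and 'ttl' files go through rdflib instead
iterator, nb_triples = get_rdf_reader("dataset.hdt", format='hdt')
print(nb_triples)
for s, p, o in iterator:
    print(s, p, o)
    break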
Example #18
    def __init__(self,
                 wiki_filename: str,
                 lang: str = "@en",
                 **kwargs) -> None:
        """

        Args:
            wiki_filename: hdt file with wikidata
            lang: Russian or English language
            **kwargs:
        """
        log.debug(f'__init__ wiki_filename: {wiki_filename}')
        wiki_path = expand_path(wiki_filename)
        self.description_rel = "http://schema.org/description"
        self.lang = lang
        self.document = HDTDocument(str(wiki_path))
Example #19
    def loadMetadata(self, zip):
        # Load the turtle metadata.
        aff4cache = os.path.join(expanduser("~"), ".aff4")
        if not os.path.exists(aff4cache):
            try:
                os.makedirs(aff4cache)
            except OSError as exc:  # Guard against race condition
                if exc.errno != errno.EEXIST:
                    raise
        cached_turtle = os.path.join(aff4cache, "%s.hdt" % str(zip.urn)[7:])
        if not os.path.exists(cached_turtle):
            self.createHDTviaLib(zip, cached_turtle)

        if os.path.exists(cached_turtle):
            # assume we have a HDT cache of turtle at this point
            self.hdt = HDTDocument(cached_turtle)
Example #20
def load():
    output_filename = None
    if sys.argv[1] == 'lod':
        hdt_file = HDTDocument(PATH_LOD)
        output_filename = 'all_lod_subClassOf.csv'

    else:
        hdt_file = HDTDocument(PATH_DBpedia)
        output_filename = 'all_dbpedia_subClassOf.csv'

    with open(output_filename, newline='') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            print(row['SUBJECT_ID'], row['OBJECT_ID'])
            print(row['SUBJECT'], row['OBJECT'])
            print ('----------------------------------------------')
Example #21
def evaluate_subgraph_extraction(nhops,
                                 e_field,
                                 p_field,
                                 limit=None,
                                 show_errors=False):
    '''
    e_field, p_field <str> names of the fields in MongoDB to look up the IDs
    '''
    samples = mongo.get_sample(limit=limit)
    # iterate over the cursor
    accs = []
    for doc in samples:
        # get correct entities and predicates from the GS annotations
        e_ids = doc[e_field]
        p_uris = doc[p_field]

        # extract the subgraph
        kg = HDTDocument(hdt_path + hdt_file)
        kg.configure_hops(nhops, p_uris, namespace, True)
        entities, _, _ = kg.compute_hops(e_ids)
        kg.remove()

        # check if we hit the answer set
        if 'answers_ids' in doc:
            correct_answers_ids = set(doc['answers_ids'])
            #         print(correct_answers_ids)
            n_hits = len(correct_answers_ids & set(entities))
            # accuracy
            acc = float(n_hits) / len(correct_answers_ids)
            accs.append(acc)
            if show_errors and acc < 1:
                print(doc['question'])
                print(doc['entity_ids'])
                print(doc['predicate_uris'])
    return accs
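
A hedged usage sketch (the hop count and sample limit are illustrative; the field names are the ones printed in the error branch above):

# mean accuracy of 2-hop subgraph extraction over a sample of 100 questions
accs = evaluate_subgraph_extraction(2, 'entity_ids', 'predicate_uris', limit=100)
if accs:
    print(sum(accs) / len(accs))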
Example #22
class HDTFileConnector(DatabaseConnector):
    """A HDTFileConnector search for RDF triples in a HDT file"""
    def __init__(self, file):
        super(HDTFileConnector, self).__init__()
        self._hdt = HDTDocument(file)

    def search_triples(self, subject, predicate, obj, limit=0, offset=0):
        """
            Get an iterator over all RDF triples matching a triple pattern.

            Args:
                - subject ``string`` - Subject of the triple pattern
                - predicate ``string`` - Predicate of the triple pattern
                - object ``string`` - Object of the triple pattern
                - limit ``int=0`` ``optional`` -  LIMIT modifier, i.e., maximum number of RDF triples to read
                - offset ``int=0`` ``optional`` -  OFFSET modifier, i.e., number of RDF triples to skip

            Returns:
                A Python iterator over RDF triples matching the given triple pattern
        """
        subject = subject if (subject is not None) and (
            not subject.startswith('?')) else ""
        predicate = predicate if (predicate is not None) and (
            not predicate.startswith('?')) else ""
        obj = obj if (obj is not None) and (not obj.startswith('?')) else ""
        return self._hdt.search_triples(subject,
                                        predicate,
                                        obj,
                                        offset=offset,
                                        limit=limit)

    @property
    def nb_triples(self):
        return self._hdt.total_triples

    @property
    def nb_subjects(self):
        """Get the number of subjects in the database"""
        return self._hdt.nb_subjects

    @property
    def nb_predicates(self):
        """Get the number of predicates in the database"""
        return self._hdt.nb_predicates

    @property
    def nb_objects(self):
        """Get the number of objects in the database"""
        return self._hdt.nb_objects

    def from_config(config):
        """Build a HDTFileFactory from a config file"""
        if not os.path.isfile(config["file"]):
            raise Exception("Configuration file not found: {}".format(
                config["file"]))
        return HDTFileConnector(config["file"])
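
A minimal usage sketch of the connector above (the HDT file name is a placeholder; from_config expects a dict whose "file" entry points to an existing HDT file):

# paginated lookup over the full ?s ?p ?o pattern
connector = HDTFileConnector("data/watdiv.hdt")
triples, cardinality = connector.search_triples("?s", "?p", "?o", limit=10, offset=0)
print(cardinality, connector.nb_triples)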
Example #23
    def __init__(self, wiki_filename: str, file_format: str = "hdt", lang: str = "@en", **kwargs) -> None:
        """

        Args:
            wiki_filename: file with Wikidata
            file_format: format of Wikidata file
            lang: Russian or English language
            **kwargs:
        """
        self.description_rel = "http://schema.org/description"
        self.file_format = file_format
        self.wiki_filename = str(expand_path(wiki_filename))
        if self.file_format == "hdt":
            self.document = HDTDocument(self.wiki_filename)
        elif self.file_format == "pickle":
            self.document = load_pickle(self.wiki_filename)
        else:
            raise ValueError("Unsupported file format")
        self.lang = lang
Example #24
def extract_dbpedia(superclass):
    """ Get edgelist for superclass and all its subclasses """
    edgelist = []
    instances = set()
    doc = HDTDocument(run.config["kg_source"])
    subject_limit = run.config["subject_limit"]
    predicate_limit = run.config["predicate_limit"]
    subclasses = query_subclasses(superclass)
    print("[Info] query instances for each subclass")
    for subclass in tqdm(subclasses):
        if subject_limit > 0:
            (triples, count) = doc.search_triples("", rdf + "type", subclass, limit=subject_limit)
        else:
            (triples, count) = doc.search_triples("", rdf + "type", subclass)
        for triple in triples:
            instances.add(triple[0])
    print("[Info] query predicates for each instance")
    for subject in tqdm(instances):
        if predicate_limit > 0:
            triples = doc.search_triples(subject, "", "", limit=predicate_limit)[0]
        else:
            (triples, count) = doc.search_triples(subject, "", "")
        for triple in triples:
            # Either blacklist
            if not triple[1] in blacklist:
                edgelist.append((triple[0], triple[1]))
            # Or whitelist
            # if triple[1] in whitelist:
            #     edgelist.append((triple[0], triple[1]))
    return list(set(edgelist)) # Exclude duplicate entity-property relations
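
A hedged usage sketch (the superclass name is illustrative; the function also relies on run.config, query_subclasses, rdf and the module-level blacklist):

edgelist = extract_dbpedia("http://dbpedia.org/ontology/Athlete")
print(len(edgelist), "unique entity-property pairs")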
Example #25
def get_rdf_reader(file_path, format='nt'):
    """Get an iterator over RDF triples from a file"""
    iterator = None
    nb_triples = 0
    # load using rdflib
    if format == 'ttl':
        g = Graph()
        g.parse(file_path, format=format)
        nb_triples = len(g)
        iterator = map(__n3_to_str, g.triples((None, None, None)))
    elif format == 'nt':
        print('Counting triples using the wc command...')
        total = wccount(file_path)
        print('The file contains {} triples.'.format(total))
        f = open(file_path, 'r')
        iter = yield_triples(f)
        return iter, total, f

    elif format == 'hdt':
        # load HDTDocument without additional indexes (not needed since we do a ?s ?p ?o)
        doc = HDTDocument(file_path, indexed=False)
        iterator, nb_triples = doc.search_triples_bytes("", "", "")
    return iterator, nb_triples
Example #26
    def __init__(self,
                 config,
                 hdt_file='wikidata2018_09_11.hdt',
                 topk_entities=10,
                 bottleneck_dim=32):
        super(MessagePassingHDTBert, self).__init__(config)

        # entity matching Transformer
        self.bert = DistilBertModel(config)
        self.dropout = nn.Dropout(config.dropout)
        self.pre_classifier = nn.Linear(config.hidden_size, bottleneck_dim)
        self.classifier = nn.Linear(bottleneck_dim, self.config.num_labels)

        # initialise connection to the Wikidata KG through the HDT API
        kg = HDTDocument(hdt_path + hdt_file)
        # sampling layer with subgraph retrieval
        self.subgraph_sampling = SamplingLayer(kg, topk_entities)

        # predicted scores are propagated via the MP layer into the entity subset distribution defined by the subgraph
        self.mp = MPLayer()

        self.init_weights()
Example #27
# hdt_iterators_test.py
# Author: Thomas MINIER - MIT License 2017-2019
import pytest
from hdt import HDTDocument

path = "tests/test.hdt"
document = HDTDocument(path)
nbTotalTriples = 132


def test_read_document_base():
    (triples, cardinality) = document.search_triples("", "", "")
    assert triples.subject == "?s"
    assert triples.predicate == "?p"
    assert triples.object == "?o"
    assert cardinality == nbTotalTriples
    for subj, pred, obj in triples:
        assert subj is not None
        assert pred is not None
        assert obj is not None
    assert triples.nb_reads == cardinality


def test_read_document_base_bytes():
    (triples, cardinality) = document.search_triples_bytes("", "", "")
    assert triples.subject == "?s"
    assert triples.predicate == "?p"
    assert triples.object == "?o"
    assert cardinality == nbTotalTriples
    for subj, pred, obj in triples:
        assert isinstance(subj, bytes)
Example #28
# coding: utf-8

import convex as cx
import requests
import time

import spacy
nlp = spacy.load("en_core_web_lg")


def get_nlp(sentence):
    return nlp(sentence)


from hdt import HDTDocument
hdt_wd = HDTDocument("data/kb/wikidata2018_09_11.hdt")

#questions = [
#    "Which actor voiced the Unicorn in The Last Unicorn?",
#    "And Alan Arkin was behind...?",
#    "And Alan Arkin be behind...? Why How when which was happy make fun",
#    "Who is the composer of the soundtrack?",
#    "So who performed the songs?",
#    "Genre of this band's music?",
#    "By the way, who was the director?"
#            ]
#
#q_test = str("Which actor voiced the Unicorn in The Last Unicorn? "+
#    "And Alan Arkin was behind...? "+
#    "And Alan Arkin be behind...? Why How when which was happy make fun. "+
#    "Who is the composer of the soundtrack? "+
Example #29
#!/usr/bin/env python3
# coding: utf-8

import csv
import urllib

from hdt import HDTDocument
import pandas as pd

from constants import SEP
from settings import (HDT_FILE, DATASET_FILE, OUTPUT_DATASET_FILE, STATS_FILE,
                      PREDICATES_EXCLUDED, QUERY, RATIO)
from functions import get_sujeto_atr, get_predicado_atr, get_objeto_atr

# HDTDocument creation
document = HDTDocument(HDT_FILE)

# Query the triples by subject/predicate/object
(triples, cardinality) = document.search_triples("", "", QUERY)


def query(query):
    print("{}: {} objetos.".format(query, cardinality))

    #%% Processing
    # triple = s p o
    lista_objetos = []
    for triple in triples:
        s, p, o = triple
        sujeto_descripcion, sujeto_URI = get_sujeto_atr(s)
        lista_objetos.append(sujeto_URI[1:-1])
            _row = np.hstack([row, col])
            col = np.hstack([col, row])
            row = _row
            n_edges *= 2

        # create adjacency matrix for this predicate
        data = np.ones(n_edges)
        adj = sp.csr_matrix((data, (row, col)), shape=adj_shape)
        sp_adjacencies.append(adj)

    return np.asarray(sp_adjacencies)


from sklearn.preprocessing import normalize, binarize

kg = HDTDocument(hdt_path + hdt_file)


def hop(entities,
        constraints,
        top_predicates,
        verbose=False,
        max_triples=500000):
    '''
    Extract the subgraph for the selected entities
    '''
    #     print(top_predicates)
    n_constraints = len(constraints)
    if entities:
        n_constraints += 1