Ejemplo n.º 1
0
 def test_parse_shared_bnode_context_same_graph(self):
     """Parsing the same nquads source twice with a shared bnode_context
     must resolve blank nodes identically, yielding equal object sets."""
     shared_ctx = dict()
     graph = ConjunctiveGraph()
     graph.parse(self.data_obnodes, format="nquads", bnode_context=shared_ctx)
     first_pass = set(graph.objects())
     # Rewind the source and parse again with the same bnode context.
     self.data_obnodes.seek(0)
     graph.parse(self.data_obnodes, format="nquads", bnode_context=shared_ctx)
     self.assertEqual(first_pass, set(graph.objects()))
Ejemplo n.º 2
0
class ConstraintParser(object):
    """Build a class-hierarchy tree (rooted at OWL.Thing) from Turtle graphs.

    ``journal`` memoizes URI -> Node so each class URI maps to exactly one
    Node instance shared across parent/child links.
    """

    def __init__(self, input_graphs):
        self.root = Node(OWL.Thing, [])
        self.g = ConjunctiveGraph()
        self.journal = {OWL.Thing: self.root}
        for path in input_graphs:
            print('Loading ', path)
            self.g.load(path, format='turtle')

    def get_node(self, uri):
        """Return the memoized Node for *uri*, creating it on first use."""
        if uri in self.journal:
            return self.journal[uri]
        else:
            # NOTE(review): unicode() implies this snippet targets Python 2.
            logging.debug('Creating node for : ' + unicode(uri))
            new_node = Node(uri, [])
            self.journal[uri] = new_node
            return new_node

    def get_all_children(self, uri):
        """Return the set of all transitive descendant Nodes of *uri*.

        NOTE(review): assumes the hierarchy is acyclic; a cycle would loop
        forever since visited nodes are re-queued — confirm with input data.
        """
        children = self.get_node(uri).children
        nodes_to_visit = children.copy()
        result = children.copy()
        childs_visited = 0
        while len(nodes_to_visit) > 0:
            current_node = nodes_to_visit.pop()
            nodes_to_visit = nodes_to_visit.union(current_node.children)
            result.add(current_node)
            childs_visited += 1
            logging.debug("Child : " + str(childs_visited) + ' ' +
                          unicode(current_node))
        return result

    def parse_hierarchy(self):
        """Wire parent/child links for every rdf:type object in the graph."""
        types = self.g.objects(None, RDF.type)
        all_super_classes = set()
        # first find upper classes under root
        for t in types:
            super_classes = list(self.g.objects(t, RDFS.subClassOf))
            all_super_classes = all_super_classes.union(set(super_classes))
            logging.debug(
                unicode(t) + ' has super classes: ' + ''.join(super_classes))
            if len(super_classes) == 0 or (len(super_classes) == 1
                                           and super_classes[0] == OWL.Thing):
                # super_class is root
                tmp_node = self.get_node(t)  # Node(t, [self.root])
                tmp_node.parents.add(self.root)
                self.root.children.add(tmp_node)
            else:
                tmp_node = self.get_node(t)
                # BUG FIX: set.union() returns a new set and does not mutate;
                # the original call discarded its result, so non-root parents
                # were never recorded. update() mutates in place.
                tmp_node.parents.update(
                    self.get_node(p) for p in super_classes)
                for p in super_classes:
                    self.get_node(p).children.add(tmp_node)
        logging.debug(all_super_classes)
Ejemplo n.º 3
0
def test_null_values_with_single_string():
    """A single configured null string must suppress matching subjects,
    objects and triples in the generated RDF."""
    csvw = CSVW(csv_path="tests/null1.csv",
                metadata_path="tests/null1.single.csv-metadata.json")
    graph = ConjunctiveGraph()
    graph.parse(data=csvw.to_rdf(), format="turtle")

    # There should be no subject NA
    subjects = set(graph.subjects())
    assert subj_ns['null_key'] not in subjects
    assert subj_ns['1'] in subjects
    assert len(subjects) == 4

    # Null valued objects should not be created
    objects = set(graph.objects())
    assert Literal('null_key', datatype=XSD.token) not in objects
    assert Literal('null_sector') not in objects
    assert Literal('null_id', datatype=XSD.token) not in objects
    assert Literal('PUBLIC') in objects
    assert Literal('12', datatype=XSD.token) in objects

    # Spot check: null-valued triples absent, siblings from the same row present.
    def triple_count(subject, predicate, obj):
        return len(list(graph.triples((subject, predicate, obj))))

    assert triple_count(subj_ns['2'], id_uri,
                        Literal('null_id', datatype=XSD.token)) == 0
    assert triple_count(subj_ns['2'], sect_uri, Literal('PRIVATE')) == 1
    assert triple_count(subj_ns['3'], sect_uri, Literal('null_sector')) == 0
    assert triple_count(subj_ns['3'], id_uri,
                        Literal('12', datatype=XSD.token)) == 1
Ejemplo n.º 4
0
def verify_rdf(rdf_output):
    """Assert the expected triple count and distinct-term counts for the
    turtle serialization in *rdf_output*."""
    graph = ConjunctiveGraph()
    graph.parse(data=rdf_output, format="turtle")
    assert len(graph) == 6
    for accessor, expected in ((graph.subjects, 2),
                               (graph.predicates, 3),
                               (graph.objects, 6)):
        assert len(set(accessor())) == expected
Ejemplo n.º 5
0
    def test_remove_period(self):
        """Merging a period-removal patch must tombstone the entity and
        record the change's provenance in the history graph."""
        with open(filepath('test-patch-remove-period.json')) as f:
            patch_body = f.read()
        with self.client as client:
            res = client.patch(
                '/d/',
                data=patch_body,
                content_type='application/json',
                headers={'Authorization': 'Bearer '
                         + 'NTAwNWViMTgtYmU2Yi00YWMwLWIwODQtMDQ0MzI4OWIzMzc4'})
            patch_url = urlparse(res.headers['Location']).path
            res = client.post(
                patch_url + 'merge',
                buffered=True,
                headers={'Authorization': 'Bearer '
                         + 'ZjdjNjQ1ODQtMDc1MC00Y2I2LThjODEtMjkzMmY1ZGFhYmI4'})
            self.assertEqual(res.status_code, http.client.NO_CONTENT)
            self.assertEqual(database.get_removed_entity_keys(),
                             set(['p0trgkvwbjd']))
            # Removed entity: GONE now, absent at version 0, OK at version 1.
            for path, expected_status in (
                    ('/trgkvwbjd', http.client.GONE),
                    ('/trgkvwbjd.json', http.client.GONE),
                    ('/trgkvwbjd?version=0', http.client.NOT_FOUND),
                    ('/trgkvwbjd.json?version=0', http.client.NOT_FOUND),
                    ('/trgkvwbjd?version=1', http.client.OK),
                    ('/trgkvwbjd.json?version=1', http.client.OK)):
                res = client.get(path,
                                 headers={'Accept': 'application/json'},
                                 follow_redirects=True)
                self.assertEqual(res.status_code, expected_status)

            res = client.get('/history.jsonld?inline-context')
            self.assertEqual(res.headers['Cache-Control'],
                             'public, max-age=0')
            self.assertEqual(res.headers['X-Accel-Expires'],
                             '{}'.format(cache.MEDIUM_TIME))

            g = ConjunctiveGraph()
            g.parse(format='json-ld', data=res.get_data(as_text=True))

            generated = list(g.objects(subject=HOST['h#change-2'],
                                       predicate=PROV.generated))
            self.assertEqual(len(generated), 1)
            self.assertIn(HOST['d?version=2'], generated)
Ejemplo n.º 6
0
def get_mediator_details(userid):
    """Look up a mediator's details (name parts, email, departments, uri)
    from their RDF file; returns a dict of mostly-None defaults when the
    account or file cannot be found."""
    details = {
        'userid': userid,
        'uri': None,
        'name': None,
        'fname': None,
        'lname': None,
        'title': None,
        'email': None,
        'dept': [],
    }
    if userid.startswith('uuid'):
        # Resolve a uuid-style id to the actual account name first.
        userid = get_mediator_account(userid)
        details['userid'] = userid
        if not userid:
            return details
    rdf_path = os.path.join(ag.mediatorsdir, '%s.rdf' % userid)
    if not os.path.isfile(rdf_path):
        return details
    graph = Graph()
    graph.parse(rdf_path)
    # Collect title / first / last name, keeping the last non-blank value.
    name_parts = {'title': '', 'firstName': '', 'lastName': ''}
    for detail_key, foaf_pred in (('title', 'title'),
                                  ('fname', 'firstName'),
                                  ('lname', 'lastName')):
        for value in graph.objects(None, namespaces['foaf'][foaf_pred]):
            if value.strip():
                name_parts[foaf_pred] = value
                details[detail_key] = value
    details['name'] = ("%s %s %s" % (name_parts['title'],
                                     name_parts['firstName'],
                                     name_parts['lastName'])).strip()
    if not details['name']:
        details['name'] = userid
    for email in graph.objects(None, namespaces['foaf']['mbox']):
        details['email'] = email
    for dept in graph.objects(None, namespaces['dcterms']['isPartOf']):
        details['dept'].append(dept)
    for uri in graph.subjects(namespaces['foaf']['account'], None):
        details['uri'] = uri
    return details
def get_uri_types(uri, lang):
    """Return the rdf:type objects of *uri* (as strings) queried from the
    DBpedia SPARQL endpoint for language *lang*."""
    store = ConjunctiveGraph('SPARQLStore')
    store.open(get_dbpedia_endpoint(lang))
    return [str(rdf_type) for rdf_type in store.objects(URIRef(uri), RDF.type)]
Ejemplo n.º 8
0
def get_mediator_account(user_uuid):
    """Return the first truthy foaf:account for the owner URI derived from
    *user_uuid*, or False when none exists (interface kept as-is)."""
    owner_uri = URIRef("http://vocab.ox.ac.uk/owner/%s" % user_uuid)
    graph = Graph()
    graph.parse(ag.mediatorslist)
    accounts = graph.objects(owner_uri, namespaces['foaf']['account'])
    return next((acct for acct in accounts if acct), False)
Ejemplo n.º 9
0
    def test_remove_definition(self):
        """Merging a definition-removal patch must tombstone the definition
        and record both invalidation and generation provenance."""
        with open(filepath('test-patch-remove-definition.json')) as f:
            patch_body = f.read()
        with self.client as client:
            res = client.patch(
                '/d/',
                data=patch_body,
                content_type='application/json',
                headers={'Authorization': 'Bearer '
                         + 'NTAwNWViMTgtYmU2Yi00YWMwLWIwODQtMDQ0MzI4OWIzMzc4'})
            patch_url = urlparse(res.headers['Location']).path
            res = client.post(
                patch_url + 'merge',
                headers={'Authorization': 'Bearer '
                         + 'ZjdjNjQ1ODQtMDc1MC00Y2I2LThjODEtMjkzMmY1ZGFhYmI4'})
            self.assertEqual(res.status_code, http.client.NO_CONTENT)
            self.assertEqual(database.get_removed_entity_keys(),
                             set(['p0trgkvwbjd']))
            # Removed entity: GONE now, absent at version 0, OK at version 1.
            for path, expected_status in (
                    ('/trgkvwbjd', http.client.GONE),
                    ('/trgkvwbjd.json', http.client.GONE),
                    ('/trgkvwbjd?version=0', http.client.NOT_FOUND),
                    ('/trgkvwbjd.json?version=0', http.client.NOT_FOUND),
                    ('/trgkvwbjd?version=1', http.client.OK),
                    ('/trgkvwbjd.json?version=1', http.client.OK)):
                res = client.get(path,
                                 headers={'Accept': 'application/json'},
                                 follow_redirects=True)
                self.assertEqual(res.status_code, expected_status)

            res = client.get('/h')

            g = ConjunctiveGraph()
            g.parse(format='json-ld', data=res.get_data(as_text=True))

            invalidated = g.value(subject=PERIODO['p0h#change-2'],
                                  predicate=PROV.invalidated,
                                  any=False)
            self.assertEqual(invalidated, PERIODO['p0trgkvwbjd'])

            generated = list(g.objects(subject=PERIODO['p0h#change-2'],
                                       predicate=PROV.generated))
            self.assertEqual(len(generated), 2)
            self.assertIn(PERIODO['p0d?version=2'], generated)
            self.assertIn(PERIODO['p0trgkv?version=2'], generated)
Ejemplo n.º 10
0
def get_vocab_properties(vocabprefix):
    """Read preferredNamespaceUri/Prefix for a vocabulary from its
    status.rdf; returns just the uri when the status file is missing."""
    vocab_dir = os.path.join(ag.vocabulariesdir, vocabprefix)
    status_file = os.path.join(vocab_dir, "status.rdf")
    properties = {'uri': URIRef("http://vocab.ox.ac.uk/%s" % vocabprefix)}
    if not os.path.isfile(status_file):
        return properties
    properties['path'] = vocab_dir
    properties['preferredNamespaceUri'] = None
    properties['preferredNamespacePrefix'] = None
    graph = Graph()
    graph.parse(status_file)
    # Both keys share their vann predicate name; keep the last value found.
    for key in ('preferredNamespaceUri', 'preferredNamespacePrefix'):
        for value in graph.objects(None, namespaces['vann'][key]):
            properties[key] = value
    return properties
Ejemplo n.º 11
0
class Store:
    """An n3-file-backed RDF store of movies, reviews and their author."""

    def __init__(self):
        self.graph = ConjunctiveGraph()
        if os.path.exists(storefn):
            self.graph.load(storeuri, format='n3')
        for prefix, ns in (
                ('dc', 'http://purl.org/dc/elements/1.1/'),
                ('foaf', 'http://xmlns.com/foaf/0.1/'),
                ('imdb', 'http://www.csd.abdn.ac.uk/~ggrimnes/dev/imdb/IMDB#'),
                ('rev', 'http://purl.org/stuff/rev#')):
            self.graph.bind(prefix, ns)

    def save(self):
        """Serialize the graph back to *storeuri* in n3 format."""
        self.graph.serialize(storeuri, format='n3')

    def who(self, who=None):
        """With *who* ("Name <email>"), record the author and save;
        without it, return the author's recorded names."""
        author = URIRef(storeuri + '#author')
        if who is None:
            return self.graph.objects(author, FOAF['name'])
        match = r_who.match(who)
        name, email = match.group(1), match.group(2)
        for triple in ((URIRef(storeuri), DC['title'], Literal(title % name)),
                       (author, RDF.type, FOAF['Person']),
                       (author, FOAF['name'], Literal(name)),
                       (author, FOAF['mbox'], Literal(email))):
            self.graph.add(triple)
        self.save()

    def new_movie(self, movie):
        """Record an IMDB movie with its title and year, then persist."""
        movie_uri = URIRef('http://www.imdb.com/title/tt%s/' % movie.movieID)
        for triple in ((movie_uri, RDF.type, IMDB['Movie']),
                       (movie_uri, DC['title'], Literal(movie['title'])),
                       (movie_uri, IMDB['year'], Literal(int(movie['year'])))):
            self.graph.add(triple)
        self.save()

    def new_review(self, movie, date, rating, comment=None):
        """Attach a review (blank node) to *movie*, then persist."""
        review = BNode()  # @@ humanize the identifier (something like #rev-$date)
        movie_uri = URIRef('http://www.imdb.com/title/tt%s/' % movie.movieID)
        self.graph.add(
            (movie_uri, REV['hasReview'], URIRef('%s#%s' % (storeuri, review))))
        for triple in ((review, RDF.type, REV['Review']),
                       (review, DC['date'], Literal(date)),
                       (review, REV['maxRating'], Literal(5)),
                       (review, REV['minRating'], Literal(0)),
                       (review, REV['reviewer'], URIRef(storeuri + '#author')),
                       (review, REV['rating'], Literal(rating))):
            self.graph.add(triple)
        if comment is not None:
            self.graph.add((review, REV['text'], Literal(comment)))
        self.save()

    def movie_is_in(self, uri):
        """True iff *uri* is already recorded as an IMDB Movie."""
        return (URIRef(uri), RDF.type, IMDB['Movie']) in self.graph
Ejemplo n.º 12
0
class Store:
    """RDF store of movies and reviews persisted to an n3 file."""

    def __init__(self):
        self.graph = ConjunctiveGraph()
        if os.path.exists(storefn):
            self.graph.load(storeuri, format="n3")
        for prefix, ns in (("dc", DC),
                           ("foaf", FOAF),
                           ("imdb", IMDB),
                           ("rev", "http://purl.org/stuff/rev#")):
            self.graph.bind(prefix, ns)

    def save(self):
        """Write the graph out to *storeuri* as n3."""
        self.graph.serialize(storeuri, format="n3")

    def who(self, who=None):
        """Record the author parsed from *who* ("Name <email>"), or, when
        called without an argument, return the author's recorded names."""
        author = URIRef(storeuri + "#author")
        if who is None:
            return self.graph.objects(author, FOAF["name"])
        match = r_who.match(who)
        name, email = match.group(1), match.group(2)
        for triple in ((URIRef(storeuri), DC["title"], Literal(title % name)),
                       (author, RDF.type, FOAF["Person"]),
                       (author, FOAF["name"], Literal(name)),
                       (author, FOAF["mbox"], Literal(email))):
            self.graph.add(triple)
        self.save()

    def new_movie(self, movie):
        """Record an IMDB movie node with its title and year, then persist."""
        movie_uri = URIRef("http://www.imdb.com/title/tt%s/" % movie.movieID)
        for triple in ((movie_uri, RDF.type, IMDB["Movie"]),
                       (movie_uri, DC["title"], Literal(movie["title"])),
                       (movie_uri, IMDB["year"], Literal(int(movie["year"])))):
            self.graph.add(triple)
        self.save()

    def new_review(self, movie, date, rating, comment=None):
        """Attach a review (blank node) to *movie*, then persist."""
        review = BNode()  # @@ humanize the identifier (something like #rev-$date)
        movie_uri = URIRef("http://www.imdb.com/title/tt%s/" % movie.movieID)
        self.graph.add(
            (movie_uri, REV["hasReview"], URIRef("%s#%s" % (storeuri, review))))
        for triple in ((review, RDF.type, REV["Review"]),
                       (review, DC["date"], Literal(date)),
                       (review, REV["maxRating"], Literal(5)),
                       (review, REV["minRating"], Literal(0)),
                       (review, REV["reviewer"], URIRef(storeuri + "#author")),
                       (review, REV["rating"], Literal(rating))):
            self.graph.add(triple)
        if comment is not None:
            self.graph.add((review, REV["text"], Literal(comment)))
        self.save()

    def movie_is_in(self, uri):
        """True iff *uri* is already known to be an IMDB Movie."""
        return (URIRef(uri), RDF.type, IMDB["Movie"]) in self.graph
Ejemplo n.º 13
0
def get_vocab_mediator(vocabprefix):
    """Return {account: mediator-details} for every foaf:account found in
    the vocabulary's status.rdf; empty dict when the file is missing.

    Fix: dropped the unused local ``vocab_uri`` present in the original.
    """
    vocabdir = os.path.join(ag.vocabulariesdir, vocabprefix)
    vocabstatusfile = os.path.join(vocabdir, "status.rdf")
    mediators = {}
    if not os.path.isfile(vocabstatusfile):
        return mediators
    graph = Graph()
    graph.parse(vocabstatusfile)
    for o in graph.objects(None, namespaces['foaf']['account']):
        mediators[str(o)] = get_mediator_details(str(o))
    return mediators
Ejemplo n.º 14
0
def get_vocab_files(vocabprefix):
    """Return {file-uri: {'name', 'format', 'path'}} for each dcterms:hasFormat
    entry in the vocabulary's status.rdf; empty dict when the file is missing.

    Fix: dropped the unused local ``vocab_uri`` present in the original.
    """
    vocabdir = os.path.join(ag.vocabulariesdir, vocabprefix)
    vocabstatusfile = os.path.join(vocabdir, "status.rdf")
    vocab_files = {}
    if not os.path.isfile(vocabstatusfile):
        return vocab_files
    graph = Graph()
    graph.parse(vocabstatusfile)
    for v in graph.objects(None, namespaces['dcterms']['hasFormat']):
        v_str = str(v)
        vocab_files[v_str] = {'name': '', 'format': '', 'path': ''}
        # Keep the last value found for each property, as the original did.
        for f in graph.objects(URIRef(v), namespaces['dcterms']['format']):
            vocab_files[v_str]['format'] = str(f)
        for n in graph.objects(URIRef(v), namespaces['nfo']['fileName']):
            vocab_files[v_str]['name'] = str(n)
        for p in graph.objects(URIRef(v), namespaces['nfo']['fileUrl']):
            vocab_files[v_str]['path'] = str(p).replace('file://', '')
    return vocab_files
Ejemplo n.º 15
0
class Store:
    """Movie/review RDF store persisted as n3."""

    def __init__(self):
        self.graph = ConjunctiveGraph()
        if os.path.exists(storefn):
            self.graph.load(storeuri, format='n3')
        for prefix, ns in (('dc', DC), ('foaf', FOAF),
                           ('imdb', IMDB), ('rev', 'http://purl.org/stuff/rev#')):
            self.graph.bind(prefix, ns)

    def _add_all(self, triples):
        """Add every triple in *triples* to the graph."""
        for triple in triples:
            self.graph.add(triple)

    def save(self):
        """Persist the graph to *storeuri* as n3."""
        self.graph.serialize(storeuri, format='n3')

    def who(self, who=None):
        """Record the author from *who* ("Name <email>") and save, or
        return the recorded author names when *who* is omitted."""
        author = URIRef(storeuri + '#author')
        if who is None:
            return self.graph.objects(author, FOAF['name'])
        match = r_who.match(who)
        name, email = match.group(1), match.group(2)
        self._add_all(((URIRef(storeuri), DC['title'], Literal(title % name)),
                       (author, RDF.type, FOAF['Person']),
                       (author, FOAF['name'], Literal(name)),
                       (author, FOAF['mbox'], Literal(email))))
        self.save()

    def new_movie(self, movie):
        """Record an IMDB movie with its title and year, then persist."""
        movie_uri = URIRef('http://www.imdb.com/title/tt%s/' % movie.movieID)
        self._add_all(((movie_uri, RDF.type, IMDB['Movie']),
                       (movie_uri, DC['title'], Literal(movie['title'])),
                       (movie_uri, IMDB['year'], Literal(int(movie['year'])))))
        self.save()

    def new_review(self, movie, date, rating, comment=None):
        """Attach a review (blank node) to *movie*, then persist."""
        review = BNode()  # @@ humanize the identifier (something like #rev-$date)
        movie_uri = URIRef('http://www.imdb.com/title/tt%s/' % movie.movieID)
        self.graph.add((movie_uri, REV['hasReview'],
                        URIRef('%s#%s' % (storeuri, review))))
        self._add_all(((review, RDF.type, REV['Review']),
                       (review, DC['date'], Literal(date)),
                       (review, REV['maxRating'], Literal(5)),
                       (review, REV['minRating'], Literal(0)),
                       (review, REV['reviewer'], URIRef(storeuri + '#author')),
                       (review, REV['rating'], Literal(rating))))
        if comment is not None:
            self.graph.add((review, REV['text'], Literal(comment)))
        self.save()

    def movie_is_in(self, uri):
        """True iff *uri* is already recorded as an IMDB Movie."""
        return (URIRef(uri), RDF.type, IMDB['Movie']) in self.graph
Ejemplo n.º 16
0
def get_vocab_editorial_note(vocabprefix):
    """Return a list of (note-text, filename) pairs for each skos:editorialNote
    in the vocabulary's status.rdf; empty list when the file is missing.

    Fix: dropped the unused local ``vocab_uri`` present in the original.
    """
    vocabdir = os.path.join(ag.vocabulariesdir, vocabprefix)
    vocabstatusfile = os.path.join(vocabdir, "status.rdf")
    msgs = []
    if not os.path.isfile(vocabstatusfile):
        return msgs
    graph = Graph()
    graph.parse(vocabstatusfile)
    for s, p, o in graph.triples((None, namespaces['skos']['editorialNote'], None)):
        # Pair the note with the subject's fileName (last one wins), if any.
        nm = None
        for n in graph.objects(URIRef(s), namespaces['nfo']['fileName']):
            nm = str(n)
        msgs.append((str(o), nm))
    return msgs
Ejemplo n.º 17
0
def _get_thing_graph(td):
    """Copy the thing description's resource graph into a named graph,
    bind the fountain prefixes, and ensure a core:describedBy link back
    to the TD node."""
    thing_graph = ConjunctiveGraph(identifier=td.resource.node)
    for prefix, uri in R.agora.fountain.prefixes.items():
        thing_graph.bind(prefix, uri)

    for triple in td.resource.to_graph():
        thing_graph.add(triple)

    # Only add the describedBy link if the resource doesn't carry one already.
    if not list(thing_graph.objects(td.resource.node, CORE.describedBy)):
        thing_graph.add((td.resource.node, CORE.describedBy, td.node))
    return thing_graph
Ejemplo n.º 18
0
def get_vocab_description(vocabfile, vocabprefix):
    """Extract description fields from a vocabulary file, trying the default
    parser first and n3 as a fallback; returns {} when unreadable.

    Fixes: bare ``except:`` clauses (which also swallow SystemExit and
    KeyboardInterrupt) narrowed to ``except Exception``; removed the dead
    ``graph = None`` assignment that was immediately overwritten.
    NOTE(review): ``iteritems`` means this snippet targets Python 2.
    """
    if not os.path.isfile(vocabfile):
        return {}
    graph = Graph()
    try:
        graph.parse(vocabfile)
    except Exception:
        # Retry with an explicit n3 parser before giving up.
        graph = Graph()
        try:
            graph.parse(vocabfile, format="n3")
        except Exception:
            return {}
    descriptions = defaultdict(list)
    base = None
    properties = get_vocab_properties(vocabprefix)
    if 'preferredNamespaceUri' in properties and properties['preferredNamespaceUri']:
        base = properties['preferredNamespaceUri']
    else:
        (id, base, prefix) = get_vocab_base(vocabfile)
    # First collect values attached to the base URI, then fall back to
    # untargeted predicates for any still-empty keys.
    if base:
        for k, predicates in vocab_description_uri.iteritems():
            for p in predicates:
                for val in graph.objects(URIRef(base), p):
                    if val not in descriptions[k]:
                        descriptions[k].append(val)
    for k, predicates in vocab_description.iteritems():
        if k not in descriptions or not descriptions[k]:
            for p in predicates:
                for val in graph.objects(None, p):
                    if val not in descriptions[k]:
                        descriptions[k].append(val)
    return dict(descriptions)
Ejemplo n.º 19
0
def get_influence_links():
    """For each Wikipedia subject in G, fetch its DBpedia resource and add
    influencedBy links back into G for influences G already knows about."""
    for wp_url in set(G.subjects()):
        match = re.match("https://en.wikipedia.org/wiki/(.+)", wp_url)
        if match is None:
            continue
        dbpedia_url = URIRef('http://dbpedia.org/resource/%s' % match.group(1))
        dbp_graph = ConjunctiveGraph()
        dbp_graph.parse(dbpedia_url)

        for influence in dbp_graph.objects(dbpedia_url, dbpedia.influencedBy):
            res_match = re.match("http://dbpedia.org/resource/(.+)$", influence)
            if res_match is None:
                continue
            target = URIRef("https://en.wikipedia.org/wiki/" + res_match.group(1))
            # Only link influences that already have triples in G.
            if list(G.predicate_objects(target)):
                G.add((wp_url, dbpedia.influencedBy, target))
Ejemplo n.º 20
0
def get_influence_links():
    """Add dbpedia:influencedBy edges to G between Wikipedia pages it
    already contains, by consulting each page's DBpedia resource."""
    wiki_subjects = set(G.subjects())
    for wp_url in wiki_subjects:
        title_match = re.match("https://en.wikipedia.org/wiki/(.+)", wp_url)
        if not title_match:
            continue
        resource_uri = URIRef('http://dbpedia.org/resource/%s'
                              % title_match.group(1))
        remote = ConjunctiveGraph()
        remote.parse(resource_uri)

        for influencer in remote.objects(resource_uri, dbpedia.influencedBy):
            name_match = re.match("http://dbpedia.org/resource/(.+)$", influencer)
            if not name_match:
                continue
            influencer_wp = URIRef("https://en.wikipedia.org/wiki/"
                                   + name_match.group(1))
            # Only link back to pages G already has triples for.
            if len(list(G.predicate_objects(influencer_wp))) > 0:
                G.add((wp_url, dbpedia.influencedBy, influencer_wp))
Ejemplo n.º 21
0
def test_encoding_rdf():
    """CSV read with an explicit ISO-8859-1 encoding must produce the
    expected percent-encoded unit URI in the output graph."""
    csvw = CSVW(csv_path="./tests/iso_encoding.csv",
                metadata_path="./tests/iso_encoding.csv-metadata.json",
                csv_encoding="ISO-8859-1")
    graph = ConjunctiveGraph()
    graph.parse(data=csvw.to_rdf(), format="turtle")

    units = Namespace('http://example.org/units/')
    cars = Namespace('http://example.org/cars/')
    meta = Namespace("http://example.org/properties/")

    # µ100, UTF-8 encoded then percent-escaped into the units namespace.
    expected_unit = units[quote(u"\xb5100".encode('utf-8'))]
    assert (cars['1'], meta['UnitOfMeasurement'], expected_unit) in graph
    assert expected_unit in set(graph.objects())
Ejemplo n.º 22
0
    def test_included_schemas(self):
        """Check that add_default_schemas loads core RDF/RDFS/OWL triples."""
        model = ConjunctiveGraph()
        add_default_schemas(model)

        # rdf test
        # NOTE(review): this first `s` is dead (reassigned below before use),
        # and `title` appears to be a lazy iterator that is never consumed,
        # so `title is not None` is always true — the assertion is vacuous.
        # Consider asserting on list(title) instead; confirm intended behavior.
        s = [RDF, DC["title"], None]
        title = model.objects(RDF, DC["title"])
        self.assertTrue(title is not None)

        s = [RDF["Property"], RDF["type"], RDFS["Class"]]
        self.assertIn(s, model)

        # rdfs test
        s = [RDFS["Class"], RDF["type"], RDFS["Class"]]
        self.assertIn(s, model)

        s = [OWL["inverseOf"], RDF["type"], RDF["Property"]]
        self.assertIn(s, model)
Ejemplo n.º 23
0
def test_null_values_with_multiple_strings():
    """Every one of the multiple configured null strings must be dropped
    from the generated RDF."""
    csvw = CSVW(csv_path="tests/null1.csv",
                metadata_path="tests/null1.multiple.csv-metadata.json")
    graph = ConjunctiveGraph()
    graph.parse(data=csvw.to_rdf(), format="turtle")

    objects = set(graph.objects())

    assert Literal('null_key', datatype=XSD.token) not in objects
    assert Literal('null_sector') not in objects
    assert Literal('null_id', datatype=XSD.token) not in objects
    for row_id in ('10', '11', '12', '13'):
        assert Literal(row_id, datatype=XSD.token) not in objects

    # With every id nulled out, the id predicate should be absent entirely.
    assert id_uri not in set(graph.predicates())

    assert Literal('1', datatype=XSD.token) not in objects
Ejemplo n.º 24
0
    def handle(self, **options):
        _logger.debug("linking places")
        for place in models.Place.objects.filter(dbpedia__isnull=True):
            if not place.city or not place.state:
                continue

            # formulate a dbpedia place uri
            path = urllib2.quote('%s,_%s' %
                                 (_clean(place.city), _clean(place.state)))
            url = URIRef('http://dbpedia.org/resource/%s' % path)

            # attempt to get a graph from it
            graph = ConjunctiveGraph()
            try:
                _logger.debug("looking up %s" % url)
                graph.load(url)
            except urllib2.HTTPError, e:
                _logger.error(e)

            # if we've got more than 3 assertions extract some stuff from
            # the graph and save back some info to the db, would be nice
            # to have a triple store underneath where we could persist
            # all the facts eh?

            if len(graph) >= 3:
                place.dbpedia = url
                place.latitude = graph.value(url, geo['lat'])
                place.longitude = graph.value(url, geo['long'])
                for object in graph.objects(URIRef(url), owl['sameAs']):
                    if object.startswith('http://sws.geonames.org'):
                        place.geonames = object
                place.save()
                _logger.info("found dbpedia resource %s" % url)
            else:
                _logger.warn("couldn't find dbpedia resource for %s" % url)

            reset_queries()
Ejemplo n.º 25
0
    def handle(self, **options):
        _logger.debug("linking places")
        for place in models.Place.objects.filter(dbpedia__isnull=True):
            if not place.city or not place.state:
                continue

            # formulate a dbpedia place uri
            path = urllib2.quote('%s,_%s' % (_clean(place.city), 
                                             _clean(place.state)))
            url = URIRef('http://dbpedia.org/resource/%s' % path)

            # attempt to get a graph from it
            graph = ConjunctiveGraph()
            try: 
                _logger.debug("looking up %s" % url)
                graph.load(url)
            except urllib2.HTTPError, e:
                _logger.error(e)

            # if we've got more than 3 assertions extract some stuff from 
            # the graph and save back some info to the db, would be nice
            # to have a triple store underneath where we could persist
            # all the facts eh?

            if len(graph) >= 3:
                place.dbpedia = url
                place.latitude = graph.value(url, geo['lat'])
                place.longitude = graph.value(url, geo['long'])
                for object in graph.objects(URIRef(url), owl['sameAs']):
                    if object.startswith('http://sws.geonames.org'):
                        place.geonames = object
                place.save()
                _logger.info("found dbpedia resource %s" % url)
            else:
                _logger.warn("couldn't find dbpedia resource for %s" % url)

            reset_queries()
Ejemplo n.º 26
0
class Inspector(object):

    """Query helper for an RDFS/OWL ontology loaded into an rdflib graph.

    Parses the ontology at construction time and caches the full class
    list in ``self.allclasses``.
    """

    def __init__(self, uri, language=""):
        # Try RDF/XML first, then fall back to N3; give up with a
        # project-specific Error if neither format parses.
        super(Inspector, self).__init__()
        self.rdfGraph = ConjunctiveGraph()
        try:
            self.rdfGraph.parse(uri, format="application/rdf+xml")
        except:
            try:
                self.rdfGraph.parse(uri, format="n3")
            except:
                raise exceptions.Error("Could not parse the file! Is it a valid RDF/OWL ontology?")
        finally:
            # NOTE(review): this finally block runs even when Error is
            # raised above, so get_OntologyURI / __getAllClasses execute
            # against a possibly empty graph — confirm this is intended.
            self.baseURI = self.get_OntologyURI() or uri
            self.allclasses = self.__getAllClasses(includeDomainRange=True, includeImplicit=True, removeBlankNodes=False, excludeRDF_OWL=False)

    def get_OntologyURI(self, return_as_string=True):
        """Return the subject of the first owl:Ontology triple, or None."""
        test = [x for x, y, z in self.rdfGraph.triples((None, RDF.type, Ontology))]
        if test:
            if return_as_string:
                return str(test[0])
            else:
                return test[0]
        else:
            return None

    def __getAllClasses(self, classPredicate="", includeDomainRange=False, includeImplicit=False, removeBlankNodes=True, addOWLThing=True, excludeRDF_OWL=True):
        """Collect class URIs declared (or implied) in the graph.

        classPredicate: "rdfs", "owl", or "" for both.
        includeDomainRange: also count rdfs:domain / rdfs:range objects.
        includeImplicit: also count rdfs:subClassOf endpoints and rdf:type objects.
        Returns a name-sorted list (via sort_uri_list_by_name).
        """
        rdfGraph = self.rdfGraph
        # used as an ordered set: keys are the collected URIs
        exit = {}

        def addIfYouCan(x, mydict):
            # skip core OWL/RDF/RDFS vocabulary terms when requested
            if excludeRDF_OWL:
                if x.startswith('http://www.w3.org/2002/07/owl#') or  \
                   x.startswith("http://www.w3.org/1999/02/22-rdf-syntax-ns#") or \
                   x.startswith("http://www.w3.org/2000/01/rdf-schema#"):
                    return mydict
            if x not in mydict:
                mydict[x] = None
            return mydict

        if addOWLThing:
            exit = addIfYouCan(Thing, exit)

        if classPredicate == "rdfs" or classPredicate == "":
            for s in rdfGraph.subjects(RDF.type, RDFS.Class):
                exit = addIfYouCan(s, exit)

        if classPredicate == "owl" or classPredicate == "":
            for s in rdfGraph.subjects(RDF.type, Class):
                exit = addIfYouCan(s, exit)

        if includeDomainRange:
            # anything used as a property domain/range is implicitly a class
            for o in rdfGraph.objects(None, RDFS.domain):
                exit = addIfYouCan(o, exit)
            for o in rdfGraph.objects(None, RDFS.range):
                exit = addIfYouCan(o, exit)

        if includeImplicit:
            # both ends of subClassOf, and every rdf:type object, are classes
            for s, v, o in rdfGraph.triples((None, RDFS.subClassOf, None)):
                exit = addIfYouCan(s, exit)
                exit = addIfYouCan(o, exit)
            for o in rdfGraph.objects(None, RDF.type):
                exit = addIfYouCan(o, exit)

        # get a list
        exit = exit.keys()
        if removeBlankNodes:
            exit = [x for x in exit if not isBlankNode(x)]
        return sort_uri_list_by_name(exit)

    def __getTopclasses(self, classPredicate=''):
        """Return classes with no direct superclass (the hierarchy roots)."""
        returnlist = []

        for eachclass in self.__getAllClasses(classPredicate):
            x = self.get_classDirectSupers(eachclass)
            if not x:
                returnlist.append(eachclass)
        return sort_uri_list_by_name(returnlist)

    def __getTree(self, father=None, out=None):
        """Recursively build {class: [direct subclasses]} starting at the roots.

        NOTE(review): relies on self.toplayer, which is not assigned in this
        class as shown — presumably set elsewhere; verify before use.
        """
        if not father:
            out = {}
            topclasses = self.toplayer
            # key 0 holds the list of root classes
            out[0] = topclasses

            for top in topclasses:
                children = self.get_classDirectSubs(top)
                out[top] = children
                for potentialfather in children:
                    self.__getTree(potentialfather, out)

            return out

        else:
            # recursive case mutates the shared `out` dict; no return value
            children = self.get_classDirectSubs(father)
            out[father] = children
            for ch in children:
                self.__getTree(ch, out)

    def __buildClassTree(self, father=None, out=None):
        """Like __getTree but rooted at owl:Thing with name-sorted children."""
        if not father:
            out = {}
            topclasses = self.toplayer
            out[0] = [Thing]
            out[Thing] = sort_uri_list_by_name(topclasses)
            for top in topclasses:
                children = self.get_classDirectSubs(top)
                out[top] = sort_uri_list_by_name(children)
                for potentialfather in children:
                    self.__buildClassTree(potentialfather, out)
            return out
        else:
            children = self.get_classDirectSubs(father)
            out[father] = sort_uri_list_by_name(children)
            for ch in children:
                self.__buildClassTree(ch, out)

    # methods for getting ancestores and descendants of classes: by default, we do not include blank nodes
    def get_classDirectSupers(self, aClass, excludeBnodes=True, sortUriName=False):
        """Return the direct superclasses of aClass (owl:Thing excluded)."""
        returnlist = []
        for o in self.rdfGraph.objects(aClass, RDFS.subClassOf):
            if not (o == Thing):
                if excludeBnodes:
                    if not isBlankNode(o):
                        returnlist.append(o)
                else:
                    returnlist.append(o)
        if sortUriName:
            return sort_uri_list_by_name(remove_duplicates(returnlist))
        else:
            return remove_duplicates(returnlist)

    def get_classDirectSubs(self, aClass, excludeBnodes=True):
        """Return the direct subclasses of aClass, name-sorted."""
        returnlist = []
        for s, v, o in self.rdfGraph.triples((None, RDFS.subClassOf, aClass)):
            if excludeBnodes:
                if not isBlankNode(s):
                    returnlist.append(s)
            else:
                returnlist.append(s)
        return sort_uri_list_by_name(remove_duplicates(returnlist))

    def get_classSiblings(self, aClass, excludeBnodes=True):
        """Return classes sharing a direct superclass with aClass."""
        returnlist = []
        for father in self.get_classDirectSupers(aClass, excludeBnodes):
            for child in self.get_classDirectSubs(father, excludeBnodes):
                if child != aClass:
                    returnlist.append(child)

        return sort_uri_list_by_name(remove_duplicates(returnlist))

    def entitySynonyms(self, anEntity, language=DEFAULT_LANGUAGE, getall=True):
        """Return synonym literals for anEntity.

        With getall=True, collects Uberon, EFO and OBI synonym properties;
        otherwise returns the first Uberon synonym matching `language`,
        or "" if none is found.
        """
        if getall:
            temp = []
            # Uberon synonyms
            for o in self.rdfGraph.objects(anEntity, Synonym):
                temp += [o]
            # EFO synonyms
            for o in self.rdfGraph.objects(anEntity, EFO_Synonym):
                temp += [o]
            # OBI synonyms
            for o in self.rdfGraph.objects(anEntity, OBO_Synonym):
                temp += [o]
            return temp
        else:
            for o in self.rdfGraph.objects(anEntity, Synonym):
                if getattr(o, 'language') and getattr(o, 'language') == language:
                    return o
            return ""

    def classFind(self, name, exact=False):
        """Case-insensitive search over allclasses.

        exact=True returns the single exact match as a one-element list;
        otherwise returns every class whose URI contains `name`.
        """
        temp = []
        if name:
            for x in self.allclasses:
                if exact:
                    if x.__str__().lower() == str(name).lower():
                        return [x]
                else:
                    if x.__str__().lower().find(str(name).lower()) >= 0:
                        temp.append(x)
        return temp
Ejemplo n.º 27
0
# step 1: find all the classes.
rdftype = URIRef("http://www.w3.org/1999/02/22-rdf-syntax-ns#type")
rdfsdomain = URIRef("http://www.w3.org/2000/01/rdf-schema#domain")
rdfsrange = URIRef("http://www.w3.org/2000/01/rdf-schema#range")
rdfsresource = URIRef("http://www.w3.org/1999/02/22-rdf-syntax-ns#Resource")
rdfssco = URIRef("http://www.w3.org/2000/01/rdf-schema#subClassOf")
asColl = URIRef("http://www.w3.org/ns/activitystreams#OrderedCollection")
skosConcept = URIRef("http://www.w3.org/2004/02/skos/core#Concept")

otherClasses = [asColl, skosConcept]
classes = list(g.subjects(rdftype, URIRef("http://www.w3.org/2000/01/rdf-schema#Class")))
props = list(g.subjects(rdftype, URIRef("http://www.w3.org/1999/02/22-rdf-syntax-ns#Property")))

for p in props:
    domains = list(g.objects(p, rdfsdomain))
    for d in domains:
        assert(d in classes)

for p in props:
    ranges = list(g.objects(p, rdfsrange))
    for r in ranges:
        if not r in classes and not str(r).startswith("http://www.w3.org/2001/XMLSchema#") and \
            not r == rdfsresource:
            print "Found inconsistent property: %s has unknown range" % p

for c in classes:
    parents = list(g.objects(c, rdfssco))
    for p in parents:
        if not p in classes and not p in otherClasses:
            print "Found inconsistent class: %s has unknown superClass" % c
Ejemplo n.º 28
0
    def __load_citations_from_rdf_file(data_f_path, prov_f_path, service_name,
                                       id_type, id_shape, citation_type):
        """Yield Citation objects rebuilt from an N-Triples data file and an
        N-Quads provenance file.

        data_f_path: path to the citation data (nt11).
        prov_f_path: path to the provenance data (nquads).
        The remaining arguments are passed through to each Citation.
        """
        citation_data = Graph()
        citation_data.load(data_f_path, format="nt11")

        citation_prov = ConjunctiveGraph()
        citation_prov.load(prov_f_path, format="nquads")

        for cit_ent in citation_data.subjects(RDF.type, Citation.citation):
            # select the most recent provenance entity: the one whose
            # .../se/<n> suffix carries the highest snapshot number
            prov_entity = None
            snapshot = 0

            for entity in citation_prov.subjects(Citation.specialization_of,
                                                 cit_ent):
                entity_snapshot = int(sub("^.+/se/(.+)$", "\\1", entity))
                if prov_entity is None or snapshot < entity_snapshot:
                    prov_entity = entity
                    snapshot = entity_snapshot

            invalidated = None
            update = None
            creation_date = None
            timespan = None
            for en in citation_prov.objects(prov_entity,
                                            Citation.invalidated_at_time):
                invalidated = str(en)
            for en in citation_prov.objects(prov_entity,
                                            Citation.has_update_query):
                update = str(en)
            for en in citation_data.objects(
                    cit_ent, Citation.has_citation_creation_date):
                creation_date = str(en)
            for en in citation_data.objects(cit_ent,
                                            Citation.has_citation_time_span):
                timespan = str(en)

            # bugfix: pass `snapshot` (the maximum, matching the selected
            # prov_entity) rather than `entity_snapshot`, which held the
            # value of whichever entity happened to be iterated last (and
            # was unbound when no provenance entity existed at all)
            c = Citation(
                sub("^.+/ci/(.+)$", "\\1", str(cit_ent)),
                str(
                    list(
                        citation_data.objects(cit_ent,
                                              Citation.has_citing_entity))[0]),
                None,
                str(
                    list(
                        citation_data.objects(cit_ent,
                                              Citation.has_cited_entity))[0]),
                None, creation_date, timespan, snapshot,
                str(
                    list(
                        citation_prov.objects(prov_entity,
                                              Citation.was_attributed_to))[0]),
                str(
                    list(
                        citation_prov.objects(
                            prov_entity, Citation.had_primary_source))[0]),
                str(
                    list(
                        citation_prov.objects(prov_entity,
                                              Citation.generated_at_time))[0]),
                service_name, id_type, id_shape, citation_type,
                Citation.journal_self_citation in citation_data.objects(
                    cit_ent, RDF.type), Citation.author_self_citation
                in citation_data.objects(cit_ent, RDF.type), invalidated,
                str(
                    list(
                        citation_prov.objects(prov_entity,
                                              Citation.description))[0]),
                update)

            yield c
Ejemplo n.º 29
0
class Owler(object):

    """ Class that includes methods for building an RDF graph from an OWL ontology
        and retrieving information from it """

    def __init__(self, uri, language=""):
        """Load the ontology at *uri*, trying RDF/XML first and N3 second."""
        super(Owler, self).__init__()
        self.rdfGraph = ConjunctiveGraph()
        try:
            self.rdfGraph.parse(uri, format="application/rdf+xml")
        except:
            try:
                self.rdfGraph.parse(uri, format="n3")
            except:
                raise exceptions.Error("Could not parse the file! Is it a valid RDF/OWL ontology?")
        finally:
            # executed even when parsing failed and Error was raised
            self.baseURI = self.__get_OntologyURI() or uri
            self.allclasses = self.__getAllClasses(includeDomainRange=True,
                                                   includeImplicit=True,
                                                   removeBlankNodes=False,
                                                   excludeRDF_OWL=False)

    def __get_OntologyURI(self, return_as_string=True):
        """Return the subject of the first owl:Ontology declaration, or None."""
        candidates = list(self.rdfGraph.subjects(RDF.type, Ontology))
        if not candidates:
            return None
        found = candidates[0]
        return str(found) if return_as_string else found

    def __getAllClasses(self, classPredicate="", includeDomainRange=False, includeImplicit=False, removeBlankNodes=True, addOWLThing=True, excludeRDF_OWL=True):
        """Collect class URIs from explicit declarations and, optionally,
        from domain/range statements and implicit typing; returns a
        name-sorted list."""
        graph = self.rdfGraph
        # insertion-ordered pseudo-set: keys are the collected URIs
        seen = {}

        skip_prefixes = ('http://www.w3.org/2002/07/owl#',
                         "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
                         "http://www.w3.org/2000/01/rdf-schema#")

        def collect(candidate):
            # optionally drop core OWL/RDF/RDFS vocabulary terms
            if excludeRDF_OWL and candidate.startswith(skip_prefixes):
                return
            if candidate not in seen:
                seen[candidate] = None

        if addOWLThing:
            collect(Thing)

        if classPredicate in ("rdfs", ""):
            for subj in graph.subjects(RDF.type, RDFS.Class):
                collect(subj)

        if classPredicate in ("owl", ""):
            for subj in graph.subjects(RDF.type, Class):
                collect(subj)

        if includeDomainRange:
            # property domains and ranges are implicitly classes
            for predicate in (RDFS.domain, RDFS.range):
                for obj in graph.objects(None, predicate):
                    collect(obj)

        if includeImplicit:
            # both ends of subClassOf, and every rdf:type object, are classes
            for subj, _, obj in graph.triples((None, RDFS.subClassOf, None)):
                collect(subj)
                collect(obj)
            for obj in graph.objects(None, RDF.type):
                collect(obj)

        result = seen.keys()
        if removeBlankNodes:
            result = [c for c in result if not isBlankNode(c)]
        return sortUriListByName(result)

    # methods for getting ancestors and descendants of classes: by default, we do not include blank nodes
    def get_classDirectSupers(self, aClass, excludeBnodes=True, sortUriName=False):
        """Return the direct superclasses of *aClass*, excluding owl:Thing."""
        supers = []
        for parent in self.rdfGraph.objects(aClass, RDFS.subClassOf):
            if parent == Thing:
                continue
            if excludeBnodes and isBlankNode(parent):
                continue
            supers.append(parent)
        deduped = removeDuplicates(supers)
        return sortUriListByName(deduped) if sortUriName else deduped
Ejemplo n.º 30
0
    from pprint import pprint

    print("All the things in the Graph:")
    pprint(list(primer))

    # just think .whatever((s, p, o))
    # here we report on what we know

    print("==================")

    print("Subjects:")
    pprint(list(primer.subjects()))
    print("Predicates:")
    pprint(list(primer.predicates()))
    print("Objects:")
    pprint(list(primer.objects()))

    print("==================")
    # and other things that make sense

    print("What we know about pat:")
    pprint(list(primer.predicate_objects(myNS.pat)))

    print("Who is what age?")
    pprint(list(primer.subject_objects(myNS.age)))

    print("==================")
    print("==================")

    # Okay, so lets now work with a bigger
    # dataset from the example, and start
Ejemplo n.º 31
0
    def handle(self, **options):
        """Link Place rows to dbpedia resources, then dump the harvested
        facts as a place_links.json fixture.

        Only the dbpedia-derived fields are serialized (not the whole
        model) so the fixture can be overlaid onto places created later.
        """
        LOGGER.debug("linking places")
        for place in models.Place.objects.filter(dbpedia__isnull=True):
            # both city and state are needed to build the resource name
            if not place.city or not place.state:
                continue

            # formulate a dbpedia place uri
            path = urllib2.quote('%s,_%s' %
                                 (_clean(place.city), _clean(place.state)))
            url = URIRef('http://dbpedia.org/resource/%s' % path)

            # attempt to get a graph from it
            graph = ConjunctiveGraph()
            try:
                LOGGER.debug("looking up %s" % url)
                graph.load(url)
            except urllib2.HTTPError as e:
                # lookup failed: graph stays empty, len() check treats
                # this as a miss
                LOGGER.error(e)

            # if we've got more than 3 assertions extract some stuff from
            # the graph and save back some info to the db, would be nice
            # to have a triple store underneath where we could persist
            # all the facts eh?

            if len(graph) >= 3:
                place.dbpedia = url
                place.latitude = graph.value(url, geo['lat'])
                place.longitude = graph.value(url, geo['long'])
                for object in graph.objects(URIRef(url), owl['sameAs']):
                    if object.startswith('http://sws.geonames.org'):
                        place.geonames = object
                place.save()
                LOGGER.info("found dbpedia resource %s" % url)
            else:
                LOGGER.warn("couldn't find dbpedia resource for %s" % url)

            # keep Django's debug query log from growing without bound
            reset_queries()
        LOGGER.info("finished looking up places in dbpedia")

        LOGGER.info("dumping place_links.json fixture")

        # so it would be nice to use django.core.serializer here
        # but it serializes everything about the model, including
        # titles that are linked to ... and this could theoretically
        # change over time, so we only preserve the facts that have
        # been harvested from dbpedia, so they can overlay over
        # the places that have been extracted during title load

        json_src = []
        places_qs = models.Place.objects.filter(dbpedia__isnull=False)
        for p in places_qs.order_by('name'):
            json_src.append({
                'name': p.name,
                'dbpedia': p.dbpedia,
                'geonames': p.geonames,
                'longitude': p.longitude,
                'latitude': p.latitude
            })
            reset_queries()
        # use open() in a context manager instead of the removed Python 2
        # file() builtin so the fixture is flushed and closed deterministically
        with open('core/fixtures/place_links.json', 'w') as fixture:
            json.dump(json_src, fixture, indent=2)
        LOGGER.info("finished dumping place_links.json fixture")
Ejemplo n.º 32
0
Archivo: skosify.py Proyecto: edsu/lcco
            range = class_match.group(1)
            label = class_match.group(2)
            parts = re.split(r' +', label)
            label = ' '.join(l.lower().capitalize() for l in parts).strip()
            position = 0
        else:
            parts = line.split("\t")
            label = parts.pop().strip()
            range = parts.pop(0).strip()
            position = len(parts) + 1

        # if there's no range then we've got a chunk of text that needs 
        # to be added to the last concept we added to the graph 
        if not range:
            uri = range_uri(lc_class[-1][0])
            old_label = list(g.objects(uri, SKOS.prefLabel))[0]
            new_label = "%s %s" % (old_label, label)
            g.remove((uri, SKOS.prefLabel, old_label))
            g.add((uri, SKOS.prefLabel, Literal(new_label, 'en')))
            continue

        lc_class = lc_class[0:position]
        lc_class.insert(position, (range, label))

        label = '--'.join([c[1] for c in lc_class])
        uri = range_uri(range)

        g.add((uri, RDF.type, SKOS.Concept))
        g.add((uri, SKOS.prefLabel, Literal(label, 'en')))
        g.add((uri, SKOS.notation, Literal(range, datatype=LCC)))
Ejemplo n.º 33
0
class Store:
    """In-memory RDF store assembled from a struct file, a store file and
    a set of sensor classes, with convenience methods for movies/reviews.

    Python 2 code (iteritems, print statement).
    """

    def __init__(self, 
            storefile  = config['STORE_FILE'],
            storeuri   = config['STORE_URI'], 
            structfile = config['STRUCT_FILE'],
            structuri  = config['STRUCT_URI'],
            namespaces = config['NAMESPACES'],
            sensors    = config['SENSORS'],
            format     = config['FORMAT']):
        # All defaults come from the module-level config mapping.
        self.storeuri   = storeuri
        self.storefile  = storefile
        self.structuri  = structuri
        self.structfile = structfile
        self.namespaces = namespaces
        self.sensors    = sensors
        self.format     = format

        # NOTE(review): self.namespaces IS the namespaces argument, so this
        # loop rewrites the dict with its own items — effectively a no-op.
        for namespace, uri in namespaces.iteritems():
            self.namespaces[namespace] = uri

        self.reset()

    def reset(self, new_graph=None):
        """Rebuild self.graph (or adopt new_graph), re-bind namespaces and
        reload struct/store data when the corresponding files exist."""
        if new_graph is not None:
            self.graph = new_graph
        else:
            self.graph = ConjunctiveGraph()
        for namespace, uri in self.namespaces.iteritems():
            self.graph.bind(namespace, uri)
        # existence is checked on the *file* paths but loading uses the URIs
        if os.path.exists(self.structfile):
            self.graph.load(self.structuri, format=self.format)
        if os.path.exists(self.storefile):
            self.graph.load(self.storeuri, format=self.format)

    def save(self, format=None):
        """Serialize the graph back to the store URI (default self.format)."""
        if not format: format = self.format
        self.graph.serialize(self.storeuri, format=format)

    def get(self, something): 
        # unimplemented stub
        pass

    def snapshot(self):
        """Reset the graph and merge in a fresh graph from every sensor.

        Sensor classes are looked up by name in this module's globals().
        """
        self.reset()
        for sensor in self.sensors:
            constructor = globals()[sensor]
            instance = constructor()
            instance.snapshot()
            self.graph = self.graph + instance.graph

    @property
    def queries(self):
        """Base queries merged with every sensor class's `queries` dict."""
        q = self.basequeries
        for sensor in self.sensors:
            constructor = globals()[sensor]
            q = dict(q, **constructor.queries)
        return q

    @property
    def local(self):
        """Idea: local config only (unfinished — `query` is built but unused)."""
        hostname = getHostname()
        query = "?m a :Machine . ?m location %s" % hostname
        return hostname

    def on(self, ip=None):
        """Idea: store.on('192.168.1.43').query('SELECT..') (unimplemented)."""

    def query(self, sparql):
        """Run a SPARQL query against the graph and return the result."""
        return self.graph.query(sparql)

    def register(self):
        # unimplemented stub: computes values but stores nothing
        name = getHostname()
        location = ""



    def who(self, who=None):
        """With an argument, record author name/email triples; without one,
        return the recorded author names.

        NOTE(review): uses the bare module-level `storeuri` (and `title`),
        not self.storeuri — confirm this is intentional.
        """
        if who is not None:
            name, email = (r_who.match(who).group(1), r_who.match(who).group(2))
            self.graph.add((URIRef(storeuri), DC['title'], Literal(title % name)))
            self.graph.add((URIRef(storeuri+'#author'), RDF.type, FOAF['Person']))
            self.graph.add((URIRef(storeuri+'#author'), FOAF['name'], Literal(name)))
            self.graph.add((URIRef(storeuri+'#author'), FOAF['mbox'], Literal(email)))
            self.save()
        else:
            return self.graph.objects(URIRef(storeuri+'#author'), FOAF['name'])

    def new_movie(self, movie):
        """Add type/title/year triples for an IMDb movie object and save."""
        movieuri = URIRef('http://www.imdb.com/title/tt%s/' % movie.movieID)
        self.graph.add((movieuri, RDF.type, IMDB['Movie']))
        self.graph.add((movieuri, DC['title'], Literal(movie['title'])))
        self.graph.add((movieuri, IMDB['year'], Literal(int(movie['year']))))
        self.save()

    def new_review(self, movie, date, rating, comment=None):
        """Attach a review (blank node) with date/rating/reviewer to a movie."""
        review = BNode() # @@ humanize the identifier (something like #rev-$date)
        movieuri = URIRef('http://www.imdb.com/title/tt%s/' % movie.movieID)
        self.graph.add((movieuri, REV['hasReview'], URIRef('%s#%s' % (storeuri, review))))
        self.graph.add((review, RDF.type, REV['Review']))
        self.graph.add((review, DC['date'], Literal(date)))
        self.graph.add((review, REV['maxRating'], Literal(5)))
        self.graph.add((review, REV['minRating'], Literal(0)))
        self.graph.add((review, REV['reviewer'], URIRef(storeuri+'#author')))
        self.graph.add((review, REV['rating'], Literal(rating)))
        print comment
        if comment is not None:
            self.graph.add((review, REV['text'], Literal(comment)))
        self.save()

    def movie_is_in(self, uri):
        """True if uri is already recorded as an IMDB Movie in the graph."""
        return (URIRef(uri), RDF.type, IMDB['Movie']) in self.graph

    # queries available to every Store regardless of configured sensors
    basequeries = {
        "Trifle Entities": """
            SELECT ?Subject ?Object 
            WHERE { ?Subject rdfs:subClassOf ?Object }
        """
    }
Ejemplo n.º 34
0
  def handle(self, *args, **options):
    """Index every non-PlaceName SKOS concept from the local Sleepycat
    triple store into an Elasticsearch 'thesaurus' index.

    Each concept becomes one ES document carrying its preferred labels,
    alternate labels and scope notes.
    """
    es = settings.ELASTIC_SEARCH_URL
    db = os.path.join(settings.BASE_DIR, "db")
    print(es)

    # open the persistent Sleepycat-backed graph (must already exist)
    graph = ConjunctiveGraph('Sleepycat')
    graph.open(db, create=False)
    graph.bind('skos', SKOS)

    # NOTE(review): EU and UNBIST are bound but never used below; the
    # unbist prefix in the query presumably resolves via graph bindings.
    EU = Namespace('http://eurovoc.europa.eu/schema#')
    UNBIST = Namespace('http://unontologies.s3-website-us-east-1.amazonaws.com/unbist#')

    # all skos:Concepts except those typed unbist:PlaceName
    querystring = "select ?uri where { ?uri rdf:type skos:Concept filter not exists { ?uri rdf:type unbist:PlaceName } . }"

    # running counter used as the ES document id
    index = 1
    
    # make the index:
    thes_index = {
      "mappings": {
        "terms": {
          "properties": {
            "scope_notes": {
              "type": "string"
            },
            "uri": {
              "type": "string"
            },
            "alt_labels": {
              "type": "string"
            },
            "alt_labels_orig": {
              "type": "string",
              "index": "not_analyzed"
            },
            "labels": {
              "type": "string"
            },
            "labels_orig": {
              "type": "string",
              "index": "not_analyzed"
            }
          }
        }
      }
    }
    
    # create the index with the mapping above (response is ignored)
    r = requests.put(es + 'thesaurus/', data=json.dumps(thes_index))
    

    for uri in graph.query(querystring):
      # query rows are 1-tuples; unwrap the concept URI
      this_uri = uri[0]
      doc = { 
        "uri": this_uri
      }
      pref_labels = []
      labels_orig_lc = []
      print("Getting preferred labels")
      # preferredLabel yields (property, label) pairs; label is at [1]
      for label in graph.preferredLabel(URIRef(this_uri)):
        pref_labels.append(label[1])
        if label[1].language in ['en','fr','es']:
          labels_orig_lc.append(label[1].lower())
  
      doc.update({"labels": pref_labels})
      doc.update({"labels_orig": pref_labels + labels_orig_lc})
      
      alt_labels = []
      alt_labels_orig_lc = []
      print("Getting alternate labels")
      for label in graph.objects(URIRef(this_uri), SKOS.altLabel):
        alt_labels.append(label)
        # lowercased copies of en/fr/es labels feed the not_analyzed field
        if label.language in ['en','fr','es']:
          alt_labels_orig_lc.append(label.lower())

      doc.update({"alt_labels": alt_labels})
      doc.update({"alt_labels_orig": alt_labels + alt_labels_orig_lc})
        
      scope_notes = []
      print("Getting scope notes")
      for sn in graph.objects(URIRef(this_uri), SKOS.scopeNote):
        scope_notes.append(sn)
      
      doc.update({"scope_notes": scope_notes})
      
      payload = json.dumps(doc)
      
      # PUT each document under a sequential numeric id
      r = requests.put(es + 'thesaurus/terms/' + str(index), data=payload)
      index += 1
Ejemplo n.º 35
0
def test_multiple_value_urls_in_virtual():
    """End-to-end check of list-valued virtual columns in CSVW output:
    subjects, definitions, ranges, per-range OWL restriction lists,
    constant-value lists, and rdf:nil list terminators."""
    csvw = CSVW(csv_path="tests/value_urls.csv",
                metadata_path="tests/value_urls.csv-metadata.json")
    rdf_contents = csvw.to_rdf(fmt="nt")
    g = ConjunctiveGraph()
    g.parse(data=rdf_contents, format="nt")

    # Test subjects
    all_subjects = list(g.subjects())
    s_amount = NS['amount']
    s_desc = NS['description']
    s_id = NS['id']
    assert s_amount in all_subjects
    assert s_desc in all_subjects
    assert s_id in all_subjects

    # Test descriptions
    p_def = NS['definition']
    assert len(list(g.triples(
        (s_amount, p_def, Literal("the amount paid"))))) == 1
    assert len(
        list(g.triples(
            (s_desc, p_def, Literal("description of the expense"))))) == 1
    assert len(list(g.triples((s_id, p_def, Literal("transaction id"))))) == 1

    # Test each is a element type
    o_element = NS['element']
    assert len(list(g.triples((s_amount, RDF.type, o_element)))) == 1
    assert len(list(g.triples((s_desc, RDF.type, o_element)))) == 1
    assert len(list(g.triples((s_id, RDF.type, o_element)))) == 1

    # Test that range is specified
    r_amount = NS['element/amount-RANGE']
    r_desc = NS['element/description-RANGE']
    r_id = NS['element/id-RANGE']

    assert len(list(g.triples((s_amount, RDFS.range, r_amount)))) == 1
    assert len(list(g.triples((s_desc, RDFS.range, r_desc)))) == 1
    assert len(list(g.triples((s_id, RDFS.range, r_id)))) == 1

    # Range is another subject
    assert r_amount in all_subjects
    assert r_desc in all_subjects
    assert r_id in all_subjects

    # Range is a OWL datatype of specified type
    assert len(list(g.triples((r_amount, OWL.onDatatype, XSD.decimal)))) == 1
    assert len(list(g.triples((r_desc, OWL.onDatatype, XSD.string)))) == 1
    assert len(list(g.triples((r_id, OWL.onDatatype, XSD.integer)))) == 1

    # Check the restrictions for amount: walk the RDF collection
    # [decimal, MaxLength, 10, MinLength, 1] node by node
    rest_amount_node = list(g.triples((r_amount, OWL.withRestrictions, None)))
    rest_amount_node = rest_amount_node[0][2]
    assert isinstance(rest_amount_node, BNode)
    assert len(list(g.triples(
        (rest_amount_node, RDF.first, XSD.decimal)))) == 1
    rest_amount_node = next(
        g.objects(subject=rest_amount_node, predicate=RDF.rest))
    assert len(list(g.triples(
        (rest_amount_node, RDF.first, XSD.MaxLength)))) == 1
    rest_amount_node = next(
        g.objects(subject=rest_amount_node, predicate=RDF.rest))
    assert len(
        list(
            g.triples((rest_amount_node, RDF.first,
                       Literal(10, datatype=XSD.nonNegativeInteger))))) == 1
    rest_amount_node = next(
        g.objects(subject=rest_amount_node, predicate=RDF.rest))
    assert len(list(g.triples(
        (rest_amount_node, RDF.first, XSD.MinLength)))) == 1
    rest_amount_node = next(
        g.objects(subject=rest_amount_node, predicate=RDF.rest))
    assert len(
        list(
            g.triples((rest_amount_node, RDF.first,
                       Literal(1, datatype=XSD.nonNegativeInteger))))) == 1
    rest_amount_node = next(
        g.objects(subject=rest_amount_node, predicate=RDF.rest))
    assert len(list(g.triples((rest_amount_node, RDF.first, None)))) == 0
    assert len(list(g.triples((rest_amount_node, RDF.rest, None)))) == 0

    # Check the restrictions for description
    rest_desc_node = list(g.triples((r_desc, OWL.withRestrictions, None)))
    rest_desc_node = rest_desc_node[0][2]
    assert isinstance(rest_desc_node, BNode)
    assert len(list(g.triples((rest_desc_node, RDF.first, XSD.string)))) == 1
    rest_desc_node = next(g.objects(subject=rest_desc_node,
                                    predicate=RDF.rest))
    assert len(list(g.triples(
        (rest_desc_node, RDF.first, XSD.MaxLength)))) == 1
    rest_desc_node = next(g.objects(subject=rest_desc_node,
                                    predicate=RDF.rest))
    assert len(
        list(
            g.triples((rest_desc_node, RDF.first,
                       Literal(100, datatype=XSD.nonNegativeInteger))))) == 1
    rest_desc_node = next(g.objects(subject=rest_desc_node,
                                    predicate=RDF.rest))
    assert len(list(g.triples((rest_desc_node, RDF.first, None)))) == 0
    assert len(list(g.triples((rest_desc_node, RDF.rest, None)))) == 0

    # Check the restrictions for id
    rest_id_node = list(g.triples((r_id, OWL.withRestrictions, None)))
    rest_id_node = rest_id_node[0][2]
    assert isinstance(rest_id_node, BNode)
    assert len(list(g.triples((rest_id_node, RDF.first, XSD.integer)))) == 1
    rest_id_node = next(g.objects(subject=rest_id_node, predicate=RDF.rest))
    assert len(list(g.triples((rest_id_node, RDF.first, XSD.MinLength)))) == 1
    rest_id_node = next(g.objects(subject=rest_id_node, predicate=RDF.rest))
    assert len(
        list(
            g.triples((rest_id_node, RDF.first,
                       Literal(0, datatype=XSD.nonNegativeInteger))))) == 1
    rest_id_node = next(g.objects(subject=rest_id_node, predicate=RDF.rest))
    assert len(list(g.triples((rest_id_node, RDF.first, None)))) == 0
    assert len(list(g.triples((rest_id_node, RDF.rest, None)))) == 0

    # Check constant value for each range subject
    const_prop = NS['another-list-value-with-constants']
    for s in [r_amount, r_id, r_desc]:
        # bugfix: query the loop subject `s`, not r_amount — previously
        # only r_amount's constant list was ever verified
        constant_node = list(g.triples((s, const_prop, None)))
        constant_node = constant_node[0][2]
        assert isinstance(constant_node, BNode)
        assert len(list(g.triples(
            (constant_node, RDF.first, XSD.Length)))) == 1
        constant_node = next(
            g.objects(subject=constant_node, predicate=RDF.rest))
        assert len(
            list(
                g.triples((constant_node, RDF.first,
                           Literal(1, datatype=XSD.nonNegativeInteger))))) == 1
        constant_node = next(
            g.objects(subject=constant_node, predicate=RDF.rest))
        assert len(list(g.triples((constant_node, RDF.first, None)))) == 0
        assert len(list(g.triples((constant_node, RDF.rest, None)))) == 0

    # Verify that empty valueUrl does not end up in graph or rdf contents
    assert NS['empty-list-predicate1'] not in list(g.objects())
    assert "empty-list-predicate1" not in rdf_contents

    # Verify that empty valueUrl does not end up in graph
    assert NS['empty-list-predicate2'] not in list(g.objects())
    assert "empty-list-predicate2" not in rdf_contents

    # Test total number of lists through rdf:nils in order to verify each list
    # ends up with a nil
    test_num_lists = 3 * 3  # 3 rows and 3 virtual list valued columns
    nil_text = "<http://www.w3.org/1999/02/22-rdf-syntax-ns#nil> ."
    assert rdf_contents.count(nil_text) == test_num_lists
Ejemplo n.º 36
0
    def handle(self, **options):
        """Link Place records to dbpedia and dump the links as a fixture.

        For each Place without a dbpedia link, guess a dbpedia resource URI
        from its city/state, fetch the RDF graph at that URI, and - when the
        graph holds enough assertions to look real - save the dbpedia URI,
        coordinates and geonames link back on the Place.  Finally every
        linked Place is serialized to core/fixtures/place_links.json.
        """
        LOGGER.debug("linking places")
        for place in models.Place.objects.filter(dbpedia__isnull=True):
            # Can't formulate a dbpedia URI without both city and state.
            if not place.city or not place.state:
                continue

            # formulate a dbpedia place uri
            path = urllib2.quote('%s,_%s' % (_clean(place.city), _clean(place.state)))
            url = URIRef('http://dbpedia.org/resource/%s' % path)

            # attempt to get a graph from it
            graph = ConjunctiveGraph()
            try:
                LOGGER.debug("looking up %s", url)
                graph.load(url)
            except urllib2.HTTPError:
                # Best-effort: log and fall through; the (empty) graph then
                # fails the assertion-count check below.
                LOGGER.exception("Error fetching %s", url)

            # if we've got more than 3 assertions extract some stuff from
            # the graph and save back some info to the db, would be nice
            # to have a triple store underneath where we could persist
            # all the facts eh?
            if len(graph) >= 3:
                place.dbpedia = url
                place.latitude = graph.value(url, geo['lat'])
                place.longitude = graph.value(url, geo['long'])
                # `url` is already a URIRef; also avoid shadowing builtin `object`
                for same_as in graph.objects(url, owl['sameAs']):
                    if same_as.startswith('http://sws.geonames.org'):
                        place.geonames = same_as
                place.save()
                LOGGER.info("found dbpedia resource %s", url)
            else:
                LOGGER.warning("couldn't find dbpedia resource for %s", url)

            reset_queries()
        LOGGER.info("finished looking up places in dbpedia")

        LOGGER.info("dumping place_links.json fixture")

        # so it would be nice to use django.core.serializer here
        # but it serializes everything about the model, including
        # titles that are linked to ... and this could theoretically
        # change over time, so we only preserve the facts that have
        # been harvested from dbpedia, so they can overlay over
        # the places that have been extracted during title load

        json_src = []
        places_qs = models.Place.objects.filter(dbpedia__isnull=False)
        for p in places_qs.order_by('name'):
            json_src.append(
                {
                    'name': p.name,
                    'dbpedia': p.dbpedia,
                    'geonames': p.geonames,
                    'longitude': p.longitude,
                    'latitude': p.latitude,
                }
            )
            reset_queries()
        # Fix: close the fixture file instead of leaking the handle from
        # json.dump(..., open(...)).
        with open('core/fixtures/place_links.json', 'w') as fixture_file:
            json.dump(json_src, fixture_file, indent=2)
        LOGGER.info("finished dumping place_links.json fixture")
Ejemplo n.º 37
0
class FairMetricData():
    """Accessor for a single FAIR-metric nanopublication.

    The TriG document at *id* is parsed on construction; most getters read
    properties of the nanopublication's assertion graph (<id>#assertion).
    The topic helpers additionally consult the module-level ``fairGraph``
    and ``fairTermGraph`` graphs defined elsewhere in this module.
    """
    def __init__(self, id):
        self.base = 'https://purl.org/fair-metrics/'
        self.id = URIRef(id)
        # The nanopublication's assertion graph is named <id>#assertion
        self.assertion = URIRef(id+'#assertion')

        self.g = ConjunctiveGraph()
        self.g.parse(id, format='trig')

    def getID(self):
        """Return the metric's full URIRef."""
        return self.id

    def getShortID(self):
        """Return the metric id with the common fair-metrics base stripped."""
        return self.id.replace(self.base, '')

    def getAuthors(self):
        """Return all dcterms:author values, sorted, joined by a LaTeX line break."""
        authors = [o.toPython() for o in self.g.objects(subject=self.assertion, predicate=DCTERMS.author)]
        authors.sort()
        return ' \\\\ '.join(authors)

    def getTitle(self):
        """Return the assertion's rdfs:comment values, comma-joined."""
        return ', '.join([o.toPython() for o in self.g.objects(subject=self.assertion, predicate=RDFS.comment)])

    def getShortTitle(self):
        """Return the assertion's dcterms:title values, comma-joined."""
        return ', '.join([o.toPython() for o in self.g.objects(subject=self.assertion, predicate=DCTERMS.title)])

    def getTopicDescription(self):
        """Return dcterms:description of each foaf:primaryTopic, resolved via fairGraph."""
        descs = []
        for topic in self.g.objects(subject=self.id, predicate=FOAF.primaryTopic):
            # topic should be a FAIR principle such as fair:A1.1
            for desc in fairGraph.objects(subject=topic, predicate=DCTERMS.description):
                descs.append(desc.toPython())
        return ' '.join(descs)

    def getTopicTitle(self):
        """Return dcterms:title of each foaf:primaryTopic, resolved via fairGraph."""
        titles = []
        for topic in self.g.objects(subject=self.id, predicate=FOAF.primaryTopic):
            # topic should be a FAIR principle such as fair:A1.1
            for title in fairGraph.objects(subject=topic, predicate=DCTERMS.title):
                titles.append(title.toPython())
        return ' '.join(titles)

    def getMeasuring(self):
        """Return the fm:measuring value(s)."""
        return self.getFMPropertyValue(FM.measuring)

    def getRationale(self):
        """Return the fm:rationale value(s)."""
        return self.getFMPropertyValue(FM.rationale)

    def getRequirements(self):
        """Return the fm:requirements value(s)."""
        return self.getFMPropertyValue(FM.requirements)

    def getProcedure(self):
        """Return the fm:procedure value(s)."""
        return self.getFMPropertyValue(FM.procedure)

    def getValidation(self):
        """Return the fm:validation value(s)."""
        return self.getFMPropertyValue(FM.validation)

    def getRelevance(self):
        """Return the fm:relevance value(s)."""
        return self.getFMPropertyValue(FM.relevance)

    def getExamples(self):
        """Return the fm:examples value(s)."""
        return self.getFMPropertyValue(FM.examples)

    def getComments(self):
        """Return the fm:comments value(s)."""
        return self.getFMPropertyValue(FM.comments)

    def getFMPropertyLabel(self, property):
        """Return rdfs:label of FM[property] from fairTermGraph.

        Note: unlike getFMPropertyValue, *property* here is a string key
        that gets resolved against the FM namespace.
        """
        return ', '.join([o.toPython() for o in fairTermGraph.objects(subject=FM[property], predicate=RDFS['label'])])

    def getFMPropertyValue(self, property):
        """Return comma-joined values of *property* (a URIRef) on the assertion graph."""
        return ', '.join([o.toPython() for o in self.g.objects(subject=self.assertion, predicate=property)])
Ejemplo n.º 38
0
    # NOTE(review): `primer` (an rdflib graph) and `myNS` (a Namespace) are
    # defined earlier in this script, above this excerpt.
    primer.add((myNS['pat'], myNS['age'], Literal(24)))

    # Now, with just that, lets see how the system
    # recorded *way* too many details about what
    # you just asserted as fact.
    #

    from pprint import pprint
    pprint(list(primer))

    # just think .whatever((s, p, o))
    # here we report on what we know:
    # iterating the graph yields every (subject, predicate, object) triple

    pprint(list(primer.subjects()))
    pprint(list(primer.predicates()))
    pprint(list(primer.objects()))

    # and other things that make sense

    # what do we know about pat?
    pprint(list(primer.predicate_objects(myNS.pat)))

    # who is what age?
    pprint(list(primer.subject_objects(myNS.age)))

    # Okay, so lets now work with a bigger
    # dataset from the example, and start
    # with a fresh new graph.

    primer = ConjunctiveGraph()
Ejemplo n.º 39
0
class ManifestHelper(object):
    """Convenience wrapper around an rdflib graph for manifest manipulation.

    The query/delete helpers accept '*' as a wildcard for any term, and
    plain strings are coerced to URIRefs/Literals through URIHelper.
    (This module targets Python 2: basestring/unicode/iteritems.)
    """
    def __init__(self, uri=None):
        # uri -- optional identifier for the underlying ConjunctiveGraph
        self.uri = None
        if uri:
            self.uri = uri
        self.reset()

    def reset(self):
        """Discard the current graph and rebuild it, re-binding default namespaces."""
        self.g = None
        if self.uri:
            self.g = ConjunctiveGraph(identifier=self.uri)
        else:
            self.g = ConjunctiveGraph()
        self.namespaces = {}
        self.urihelper = URIHelper(self.namespaces)
        # add defaults
        for prefix, ns in NAMESPACES.iteritems():
            self.add_namespace(prefix, ns)

    def from_string(self, textfile, format="xml", encoding="utf-8"):
        """Reset the graph and populate it by parsing *textfile*.

        Bug fix: *format* must be passed by keyword.  Passed positionally it
        was bound to Graph.parse's second parameter (publicID), so the
        requested serialization format was silently ignored.
        """
        self.reset()
        self.g.parse(textfile, format=format)
        return

    def triple_exists(self, s, p, o):
        """Return True when at least one triple matches (s, p, o); '*' matches anything."""
        if not type(self.g).__name__ in ['ConjunctiveGraph', 'Graph']:
            return False
        if s == '*':
            s = None
        if p == '*':
            p = None
        if o == '*':
            o = None

        # Coerce non-rdflib terms into proper URIRefs / Literals
        if not isinstance(s, URIRef) and not isinstance(s, BNode) and not s == None:
            s = self.urihelper.get_uriref(s)

        if not isinstance(p, URIRef) and not p == None:
            p = self.urihelper.parse_uri(p)

        if not isinstance(o, URIRef) and not isinstance(o, Literal) and not isinstance(o, BNode) and not o == None:
            if not isinstance(o, basestring):
                o = unicode(o)
            o = self.urihelper.parse_uri(o, return_Literal_not_Exception=True)

        # Stop at the first match instead of counting every matching triple.
        for _ in self.g.triples((s, p, o)):
            return True
        return False

    def list_objects(self, s, p):
        """Return all objects of triples matching subject *s* and predicate *p* ('*' = any)."""
        objects = []
        if not type(self.g).__name__ in ['ConjunctiveGraph', 'Graph']:
            return objects
        if s == '*':
            s = None
        if p == '*':
            p = None

        if not isinstance(s, URIRef) and not isinstance(s, BNode) and not s == None:
            s = self.urihelper.get_uriref(s)

        if not isinstance(p, URIRef) and not p == None:
            p = self.urihelper.parse_uri(p)

        return list(self.g.objects(s, p))

    def add_triple(self, s, p, o):
        """Coerce (s, p, o) to rdflib terms, add the triple, and commit."""
        if not isinstance(s, URIRef) and not isinstance(s, BNode):
            s = self.urihelper.get_uriref(s)

        if not isinstance(p, URIRef):
            p = self.urihelper.parse_uri(p)

        if not isinstance(o, URIRef) and not isinstance(o, Literal) and not isinstance(o, BNode):
            if not isinstance(o, basestring):
                o = unicode(o)
            o = self.urihelper.parse_uri(o, return_Literal_not_Exception=True)

        self.g.add((s, p, o))
        self.g.commit()
        return

    def add_namespace(self, prefix, uri):
        """Register *prefix* -> *uri* on this helper, the URIHelper and the graph.

        Raises TypeError when prefix or uri is not string-like (or an rdflib
        URIRef/Namespace for uri).
        """
        if not isinstance(prefix, basestring):
            raise TypeError('Add namespace: prefix is not of type string or unicode')

        if not isinstance(uri, (URIRef, Namespace)):
            if not isinstance(uri, basestring):
                raise TypeError('Add namespace: namespace is not of type string or unicode')

        if not isinstance(prefix, unicode):
            prefix = unicode(prefix)

        if isinstance(uri, basestring) and not isinstance(uri, unicode):
            uri = unicode(uri)

        self.namespaces[prefix] = self.urihelper.get_namespace(uri)
        if prefix not in self.urihelper.namespaces:
            self.urihelper.namespaces[prefix] = self.urihelper.get_namespace(uri)
        self.g.bind(prefix, self.namespaces[prefix])
        return

    def del_namespace(self, prefix, ns):
        """Forget *prefix*.  (*ns* is unused but kept for interface compatibility.)"""
        if prefix in self.namespaces:
            del self.namespaces[prefix]
        return

    def del_triple(self, s, p, o=None):
        """Remove triples matching (s, p, o); '*' or None act as wildcards."""
        if not type(self.g).__name__ in ['ConjunctiveGraph', 'Graph']:
            return
        if s == '*':
            s = None
        if p == '*':
            p = None
        if o == '*':
            o = None

        if not isinstance(s, URIRef) and not isinstance(s, BNode) and not s == None:
            s = self.urihelper.get_uriref(s)

        if not isinstance(p, URIRef) and not p == None:
            p = self.urihelper.parse_uri(p)

        if not isinstance(o, URIRef) and not isinstance(o, Literal) and not isinstance(o, BNode) and not o == None:
            if not isinstance(o, basestring):
                o = unicode(o)
            o = self.urihelper.parse_uri(o, return_Literal_not_Exception=True)
        self.g.remove((s, p, o))
        return

    def get_graph(self):
        """Return the underlying rdflib graph."""
        return self.g

    def to_string(self, format="xml"):
        """Serialize the graph in *format*; returns a bare XML header when empty."""
        if type(self.g).__name__ in ['ConjunctiveGraph', 'Graph'] and len(self.g) > 0:
            self.g.commit()
            ans_str = self.g.serialize(format=format, encoding="utf-8") + "\n"
            return ans_str
        else:
            return u'<?xml version="1.0" encoding="UTF-8"?>\n'
Ejemplo n.º 40
0
class RDFCrawler:
    """Crawler that recursively loads linked RDF resources into a
    persistent Sleepycat-backed ConjunctiveGraph."""

    logger = logging.getLogger(__name__)

    def __init__(self, uri, domains=None):
        """
        :param uri: root URI to start crawling.
        :param domains: iterable of permitted domains to crawl
                        (default: only the root URI itself).
        """
        self.root = uri
        self.graph_route = 'graph_store_%s' % hash(self.root)
        self.graph = ConjunctiveGraph('Sleepycat')
        self.graph.open(self.graph_route, create=True)
        # Bug fix: the old signature used a shared mutable default
        # (domains=set()) that was mutated below, leaking filter entries
        # between crawler instances.
        self._filter_domains = set(domains) if domains is not None else set()
        self._filter_domains.add(uri)
        self.last_process_time = 0.0
        self.lock = RLock()

    def filter_uris(self, uri_list):
        """
        :param uri_list: list of URIs to be filtered.
        :return: URIs whose text contains one of the permitted domains
                 (may contain duplicates if several domains match).
        """
        return [uri for uri in uri_list for match in self._filter_domains
                if match in str(uri)]

    def _has_context(self, graph, subject):
        """
        :param subject: the URIRef or URI to check if it has current context.
        :return: True if subject has a current context.
        """
        return len(graph.get_context(self._get_context_id(subject))) > 1

    @staticmethod
    def _get_context_id(subject):
        """
        :param subject: URIRef or URI from which to get the context id.
        :return: context id of the resource (URI up to the '#' fragment).
        Example:
            subject -> http://www.example.org/#fragment
            context_id -> http://www.example.org/
        """
        return str(subject).split('#')[0]

    def start(self):
        """Crawl from the root URI, replacing any previously stored data."""
        self.lock.acquire(True)
        try:
            # Erase old graph.  Bug fix: the old loop removed quads while
            # iterating them (mutating the store mid-iteration) and passed
            # 4-tuples to remove(); a wildcard remove clears everything.
            self.graph.remove((None, None, None))

            # Crawl for data
            logging.info('Start crawling: %s' % self.root)
            start_time = time.time()
            self._crawl([self.root])
            end_time = time.time()

            self.last_process_time = end_time - start_time
            logging.info('Crawling complete after: %s seconds with %s predicates.'
                         % (self.last_process_time, len(self.graph)))
        finally:
            # Always release, even when crawling raised.
            self.lock.release()

    def _crawl(self, uri_list):
        """
        Recursive method that crawls RDF objects.
        :param uri_list: list of URIs to crawl
        """
        if len(uri_list) > 0:

            for uri in uri_list:
                try:
                    # A few considerations about parsing params.
                    #   publicID = uri due to redirection issues
                    #   Format = None due to default params use 'XML'
                    self.graph.parse(uri, publicID=uri, format=None)
                    logging.info('[OK]: %s' % uri)
                except Exception as e:
                    # Best-effort crawl: log the failure and keep going.
                    logging.info('[Error]: %s: %s' % (uri, e))

            # Check that there are contexts that remain without parsing
            objects = set([self._get_context_id(o)
                           for o in set(self.graph.objects(None, None))
                           if isinstance(o, URIRef) and
                           not self._has_context(self.graph, o)])

            self._crawl(self.filter_uris(objects))
Ejemplo n.º 41
0
class TabLinker(object):
    # Base URI prefix for minted data resources
    defaultNamespacePrefix = 'http://example.org/resource/'
    # Base URI prefix for annotation resources
    annotationsNamespacePrefix = 'http://example.org/annotation/'
    # Prefix -> Namespace map bound to the data graph.
    # NOTE(review): these dicts are class attributes and setScope() assigns
    # into them ('scope' key), so state is shared across TabLinker
    # instances -- confirm single-instance use.
    namespaces = {
      'dcterms':Namespace('http://purl.org/dc/terms/'), 
      'skos':Namespace('http://www.w3.org/2004/02/skos/core#'), 
      'tablink':Namespace('http://example.org/ns#'), 
      'qb':Namespace('http://purl.org/linked-data/cube#'), 
      'owl':Namespace('http://www.w3.org/2002/07/owl#')
    }
    # Prefix -> Namespace map bound to the annotation graph
    annotationNamespaces = {
      'np':Namespace('http://www.nanopub.org/nschema#'),
      'oa':Namespace('http://www.w3.org/ns/openannotation/core/'),
      'xsd':Namespace('http://www.w3.org/2001/XMLSchema#'),
      'dct':Namespace('http://purl.org/dc/terms/')
    }

    def __init__(self, filename, config, level = logging.DEBUG):
        """TabLinker constructor
        
        Keyword arguments:
        filename -- String containing the name of the current Excel file being examined
        config -- Configuration object, loaded from .ini file
        level -- A logging level as defined in the logging module
        """
        self.config = config
        self.filename = filename

        self.log = logging.getLogger("TabLinker")
        self.log.setLevel(level)

        self.log.debug('Initializing Graphs')
        self.initGraphs()

        self.log.debug('Setting Scope')
        # Scope (URI base) is derived from the file name minus the .xls suffix.
        # NOTE(review): re.search('(.*)\.xls', ...) returns None for files
        # without '.xls' in the name, so .group(1) would raise
        # AttributeError -- confirm inputs are always .xls/.xlsx.
        basename = os.path.basename(filename)
        basename = re.search('(.*)\.xls',basename).group(1)
        self.setScope(basename)

        self.log.debug('Loading Excel file {0}.'.format(filename))
        # formatting_info=True is required so cell styles are available later
        self.rb = open_workbook(filename, formatting_info=True)

        self.log.debug('Reading styles')
        self.styles = Styles(self.rb)

        self.log.debug('Copied Workbook to writable copy')
        self.wb = copy(self.rb)
        
    def initGraphs(self):
        """Create the data and annotation graphs, bind their namespaces,
        and add the basic schema triples."""
        self.graph = ConjunctiveGraph()
        # Annotations live in a separate graph from the data
        self.annotationGraph = ConjunctiveGraph()

        self.log.debug('Adding namespaces to graphs')
        # Bind each prefix/namespace pair to its graph
        for prefix, ns in self.namespaces.items():
            self.graph.namespace_manager.bind(prefix, ns)
        for prefix, ns in self.annotationNamespaces.items():
            self.annotationGraph.namespace_manager.bind(prefix, ns)

        # Add schema information
        self.log.debug('Adding some schema information (dimension and measure properties) ')
        self.addDataCellProperty()

        # Declare the generic dimension property used for data cube output
        self.graph.add((self.namespaces['tablink']['dimension'],
                        RDF.type,
                        self.namespaces['qb']['DimensionProperty']))
    
    def addDataCellProperty(self):
        """
        Add the definition of the data cell (measure) property to the graph.

        Reads propertyName, labels and literalType from the [dataCell]
        section of the configuration; falls back to 'hasValue' when no
        property name is configured.
        """
        if len(self.config.get('dataCell', 'propertyName')) > 0 :
            self.dataCellPropertyName = self.config.get('dataCell', 'propertyName')
        else :
            self.dataCellPropertyName = 'hasValue'

        self.graph.add((self.namespaces['tablink'][self.dataCellPropertyName],
                        RDF.type,
                        self.namespaces['qb']['MeasureProperty']))

        # Take labels from config; entries are ':::'-separated "<lang>--><label>" pairs
        if len(self.config.get('dataCell', 'labels')) > 0 :
            labels = self.config.get('dataCell', 'labels').split(':::')
            for label in labels :
                labelProperties = label.split('-->')
                # Guard against malformed entries without a '-->' separator,
                # which previously raised IndexError on labelProperties[1].
                if len(labelProperties) >= 2 and len(labelProperties[0]) > 0 and len(labelProperties[1]) > 0 :
                    self.graph.add((self.namespaces['tablink'][self.dataCellPropertyName],
                                    RDFS.label,
                                    Literal(labelProperties[1],labelProperties[0])))

        if len(self.config.get('dataCell', 'literalType')) > 0 :
            self.graph.add((self.namespaces['tablink'][self.dataCellPropertyName],
                            RDFS.range,
                            URIRef(self.config.get('dataCell', 'literalType'))))
            
    def setScope(self, fileBasename):
        """Set the default namespace and base for all URIs of the current workbook."""
        self.fileBasename = fileBasename
        scope_ns = self.defaultNamespacePrefix + fileBasename + '/'
        # Annotations go to a different namespace
        annotation_scope_ns = self.annotationsNamespacePrefix + fileBasename + '/'

        self.log.debug('Adding namespace for {0}: {1}'.format(fileBasename, scope_ns))

        # Record the scoped namespaces and bind them as each graph's default
        self.namespaces['scope'] = Namespace(scope_ns)
        self.annotationNamespaces['scope'] = Namespace(annotation_scope_ns)
        self.graph.namespace_manager.bind('', self.namespaces['scope'])
        self.annotationGraph.namespace_manager.bind('', self.annotationNamespaces['scope'])
        
    def doLink(self):
        """Run the tablinker over every sheet in the workbook."""
        self.log.info('Starting TabLinker for all sheets in workbook')

        for sheet_index in range(self.rb.nsheets):
            self.log.info('Starting with sheet {0}'.format(sheet_index))
            # Keep read and write views of the current sheet on the instance
            self.r_sheet = self.rb.sheet_by_index(sheet_index)
            self.w_sheet = self.wb.get_sheet(sheet_index)

            self.rowns, self.colns = self.getValidRowsCols()

            # QNames are rooted in the (URL-quoted, underscored) sheet name
            self.sheet_qname = urllib.quote(re.sub('\s','_',self.r_sheet.name))
            self.log.info('Base for QName generator set to: {0}'.format(self.sheet_qname))

            self.log.debug('Starting parser')
            self.parseSheet()
    
    ###
    #    Utility Functions
    ### 
    
    def insideMergeBox(self, i, j):
        """
        Return True when cell (i, j) lies within any merged-cell range
        of the current sheet.

        Arguments:
        i -- row
        j -- column
        """
        # Refresh the cached ranges from the sheet (kept as an instance
        # attribute, as before)
        self.merged_cells = self.r_sheet.merged_cells
        return any(
            rlo <= i <= rhi - 1 and clo <= j <= chi - 1
            for (rlo, rhi, clo, chi) in self.merged_cells
        )
        

    def getMergeBoxCoord(self, i, j):
        """
        Return the (row, col) of the top-left corner of the merge box
        containing cell (i, j), or (-1, -1) when the cell is not merged.

        Arguments:
        i -- row
        j -- column
        """
        if not self.insideMergeBox(i, j):
            return (-1, -1)

        self.merged_cells = self.r_sheet.merged_cells
        for (rlo, rhi, clo, chi) in self.merged_cells:
            if rlo <= i <= rhi - 1 and clo <= j <= chi - 1:
                return (rlo, clo)
         
    def getType(self, style):
        """Map an Excel style name prefixed with 'TL ' to its cell type.

        Arguments:
        style -- style name (string) to extract the type from

        Returns:
        String -- the type of this field, or 'Unknown' when the prefix is absent
        """
        typematch = re.search('TL\s(.*)', style)
        return typematch.group(1) if typematch else 'Unknown'
    
    def isEmpty(self, i, j):
        """
        Return True when cell (i, j) is typed empty/blank or holds an
        empty string.

        Arguments:
        i -- row
        j -- column
        """
        cell = self.r_sheet.cell(i, j)
        return cell.ctype in (XL_CELL_EMPTY, XL_CELL_BLANK) or cell.value == ''
        
    def isEmptyRow(self, i, colns):
        """
        Return True when every cell of row *i* (columns 0..colns-1) is empty.

        Arguments:
        i     -- index of the row to check
        colns -- number of columns to check
        """
        return all(self.isEmpty(i, j) for j in range(colns))
    
    def isEmptyColumn(self, j, rowns):
        """
        Return True when every cell of column *j* (rows 0..rowns-1) is empty.

        Arguments:
        j     -- index of the column to check
        rowns -- number of rows to check
        """
        return all(self.isEmpty(i, j) for i in range(rowns))
    
    def getValidRowsCols(self):
        """
        Determine the number of non-empty rows and columns in the Excel sheet.

        Returns:
        (rowns, colns) -- counts of rows and columns that hold content
        """
        colns = number_of_good_cols(self.r_sheet)
        rowns = number_of_good_rows(self.r_sheet)

        # Trim any trailing empty rows/columns the heuristics above missed
        while self.isEmptyRow(rowns - 1, colns):
            rowns -= 1
        while self.isEmptyColumn(colns - 1, rowns):
            colns -= 1

        self.log.debug('Number of rows with content:    {0}'.format(rowns))
        self.log.debug('Number of columns with content: {0}'.format(colns))
        return rowns, colns
    
    def getQName(self, names):
        """
        Mint a QName, rooted in the sheet QName, from a name string or a
        dictionary of names.

        Arguments:
        names -- dictionary of names, or a single name string

        Returns:
        qname -- a valid QName for the input
        """
        if type(names) == dict :
            pieces = [self.sheet_qname] + [self.processString(names[k]) for k in names]
            qname = '_'.join(pieces)
        else :
            qname = '_'.join([self.sheet_qname, self.processString(names)])

        self.log.debug('Minted new QName: {}'.format(qname))
        return qname

    def getColHeaderLabel(self, colheaders):
        """Join the column-header parts into a single underscore-separated label."""
        return '_'.join(colheaders)
        
    def getColHeaderValueURI(self, colheaders):
        """Return the scoped URI for the joined column-header label."""
        return self.namespaces['scope'][self.processString(self.getColHeaderLabel(colheaders))]
        
    def getColHeaderPropertyURI(self, index):
        """Return the scoped URI of the hierarchical column-header property at *index*."""
        return self.namespaces['scope']['HColHeader' + str(index)]
    
    def processString(self, string):
        """
        Sanitize *string* for use in a URI: strip it, turn '/' into '-',
        replace whitespace/brackets/commas/periods with underscores, then
        percent-encode the UTF-8 bytes.

        Arguments:
        string -- the string representing the value of the source cell

        Returns:
        processedString -- the processed string
        """
        # TODO accents too
        cleaned = unicode(string).strip().replace('/', '-')
        underscored = re.sub('\s|\(|\)|,|\.', '_', cleaned)
        return urllib.quote(underscored.encode('utf-8', 'ignore'))

            
    def addValue(self, source_cell_value, altLabel=None):
        """
        Add a "value" + optional label to the graph for a cell in the source
        Excel sheet.  The value is typically the value stored in the source
        cell itself, but may also be a copy of another cell (e.g. in the
        case of 'idem.').

        Arguments:
        source_cell_value -- the string (or dict, for hierarchical row
                             headers) representing the value of the source cell

        Returns:
        source_cell_value_qname -- a valid QName for the value of the source cell
        """
        source_cell_value_qname = self.getQName(source_cell_value)
        subject = self.namespaces['scope'][source_cell_value_qname]

        # For HierarchicalRowHeaders the value is a dict; its last element
        # serves as the preferred label.  Otherwise the value itself does.
        if type(source_cell_value) == dict :
            pref_label = source_cell_value.values()[-1]
        else :
            pref_label = source_cell_value

        self.graph.add((subject, self.namespaces['skos'].prefLabel, Literal(pref_label, 'nl')))

        # A differing altLabel (typically for HierarchicalRowHeaders) is
        # recorded as an alternative label.
        if altLabel and altLabel != pref_label:
            self.graph.add((subject, self.namespaces['skos'].altLabel, Literal(altLabel, 'nl')))

        return source_cell_value_qname
    
    def parseSheet(self):
        """
        Parses the currently selected sheet in the workbook, takes no arguments. Iterates over all cells in the Excel sheet and produces relevant RDF Triples. 

        Dispatches each cell, by its 'TL '-prefixed style type, to the
        matching parse* handler (defined elsewhere in this class) and
        finally links up the column-header hierarchy.
        """
        self.log.info("Parsing {0} rows and {1} columns.".format(self.rowns,self.colns))
        
        # Per-sheet accumulators consumed by the parse* handlers
        self.column_dimensions = {}
        self.property_dimensions = {}
        self.row_dimensions = {}
        self.rowhierarchy = {}

        # Get dictionary of annotations
        self.annotations = self.r_sheet.cell_note_map
        
        for i in range(0,self.rowns):
            self.rowhierarchy[i] = {}
            
            for j in range(0, self.colns):
                # Parse cell data
                self.source_cell = self.r_sheet.cell(i,j)
                self.source_cell_name = cellname(i,j)
                self.style = self.styles[self.source_cell].name
                self.cellType = self.getType(self.style)
                self.source_cell_qname = self.getQName(self.source_cell_name)
                
                self.log.debug("({},{}) {}/{}: \"{}\"". format(i,j,self.cellType, self.source_cell_name, self.source_cell.value))

                # Try to parse ints to avoid ugly _0 URIs
                # NOTE(review): int() may also raise TypeError for
                # non-numeric cell values; only ValueError is caught -- confirm.
                try:
                    if int(self.source_cell.value) == self.source_cell.value:
                        self.source_cell.value = int(self.source_cell.value)
                except ValueError:
                    self.log.debug("(%s.%s) No parseable int" % (i,j))

                                            
                # Parse annotation (if any)
                if self.config.get('annotations', 'enabled') == "1":
                    if (i,j) in self.annotations:
                        self.parseAnnotation(i, j)

                # Parse cell even if empty
                if self.cellType == 'Data':
                    self.parseData(i, j)
                elif (self.cellType == 'HRowHeader') :
                    self.updateRowHierarchy(i, j)
                elif self.cellType == 'ColHeader' :
                    self.parseColHeader(i, j)
                elif self.cellType == 'RowProperty' :
                    self.parseRowProperty(i, j)
                
                # If cell not empty, check for more types
                if not self.isEmpty(i,j) :
                    #self.graph.add((self.namespaces['scope'][self.source_cell_qname],RDF.type,self.namespaces['tablink'][self.cellType]))
                    #self.graph.add((self.namespaces['scope'][self.source_cell_qname],self.namespaces['tablink']['cell'],Literal(self.source_cell_name)))
                    #self.graph.add((self.namespaces['scope'][self.source_cell_qname],self.namespaces['tablink']['col'],Literal(colname(j))))
                    #self.graph.add((self.namespaces['scope'][self.source_cell_qname],self.namespaces['tablink']['row'],Literal(i+1)))
                    #self.graph.add((self.namespaces['scope'][self.source_cell_qname] isrow row
                    if self.cellType == 'Title' :
                        self.parseTitle(i, j)
    
                    elif self.cellType == 'RowHeader' :
                        self.parseRowHeader(i, j)
                    
                    elif self.cellType == 'HRowHeader' :
                        self.parseHierarchicalRowHeader(i, j)
                         
                    elif self.cellType == 'RowLabel' :
                        self.parseRowLabel(i, j)
        
        # Add additional information about the hierarchy of column headers
        for value in self.column_dimensions.values():
            for index in range(1, len(value)):
                uri_sub = self.getColHeaderValueURI(value[:index+1])
                uri_top = self.getColHeaderValueURI(value[:index])
                self.graph.add((uri_sub, self.namespaces['tablink']['subColHeaderOf'], uri_top))
                self.graph.add((uri_sub, self.namespaces['tablink']['depth'], Literal(index)))
                self.graph.add((uri_top, self.namespaces['tablink']['depth'], Literal(index-1)))
        
        self.log.info("Done parsing...")

    def updateRowHierarchy(self, i, j):
        """
        Build up lists for hierarchical row headers.

        Cells marked as hierarchical row header are often empty, meaning that
        their intended value is stored somewhere else in the Excel sheet
        (usually in the row directly above them).

        Keyword arguments:
        int i -- row number
        int j -- col number

        Returns:
        New row hierarchy dictionary
        """
        if (self.isEmpty(i,j) or str(self.source_cell.value).lower().strip() == 'id.') :
            # If the cell is empty, and a HierarchicalRowHeader, add the value of the row header above it.
            # If the cell above is not in the rowhierarchy, don't do anything.
            # If the cell is exactly 'id.', add the value of the row header above it.
            try :
                self.rowhierarchy[i][j] = self.rowhierarchy[i-1][j]
                self.log.debug("({},{}) Copied from above\nRow hierarchy: {}".format(i,j,self.rowhierarchy[i]))
            except KeyError :
                # Narrowed from a bare `except:`; a missing entry above this
                # cell is the only expected failure here.
                # REMOVED because of double slashes in uris
                # self.rowhierarchy[i][j] = self.source_cell.value
                self.log.debug("({},{}) Top row, added nothing\nRow hierarchy: {}".format(i,j,self.rowhierarchy[i]))
        elif str(self.source_cell.value).lower().startswith('id.') or str(self.source_cell.value).lower().startswith('id '):
            # If the cell starts with 'id.', add the value of the row above it,
            # and append the rest of the cell's value.
            suffix = self.source_cell.value[3:]
            try :
                self.rowhierarchy[i][j] = self.rowhierarchy[i-1][j]+suffix
                self.log.debug("({},{}) Copied from above+suffix\nRow hierarchy {}".format(i,j,self.rowhierarchy[i]))
            except (KeyError, TypeError) :
                # KeyError: nothing above to extend; TypeError: the value
                # above is not concatenable with the suffix (e.g. a number).
                # Fall back to the cell's own raw value.
                self.rowhierarchy[i][j] = self.source_cell.value
                self.log.debug("({},{}) Top row, added value\nRow hierarchy {}".format(i,j,self.rowhierarchy[i]))
        elif not self.isEmpty(i,j) :
            # Regular non-empty header cell: store its value directly.
            self.rowhierarchy[i][j] = self.source_cell.value
            self.log.debug("({},{}) Added value\nRow hierarchy {}".format(i,j,self.rowhierarchy[i]))
        return self.rowhierarchy
    
    def parseHierarchicalRowHeader(self, i, j) :
        """
        Create relevant triples for the cell marked as HierarchicalRowHeader (i, j are row and column)
        """

        # Use the rowhierarchy to create a unique qname for the cell's contents,
        # give the source_cell's original value as extra argument
        self.log.debug("Parsing HierarchicalRowHeader")

        # Add all the values
        for (index, value) in self.rowhierarchy[i].items():
            prop = self.property_dimensions[index]
            self.row_dimensions.setdefault(i,{})
            self.row_dimensions[i][self.namespaces['scope'][prop]] = Literal(value)

        # Relate the hierarchical headers.
        # Materialize the keys as a list (dict.keys() is a non-indexable view
        # on Python 3) and use a dedicated loop variable: the original loop
        # reused `i`, clobbering the row-index parameter.
        keys = list(self.rowhierarchy[i].keys())
        for level in range(len(keys) - 1):
            prop_top = self.namespaces['scope'][self.property_dimensions[keys[level]]]
            prop_sub = self.namespaces['scope'][self.property_dimensions[keys[level + 1]]]
            self.graph.add((prop_sub, self.namespaces['tablink']['subPropertyOf'], prop_top))
        

    def parseRowLabel(self, i, j):
        """
        Create relevant triples for the cell marked as Label (i, j are row and column)
        """

        self.log.debug("Parsing Row Label")

        # Get the QName of the HierarchicalRowHeader cell that this label belongs to,
        # based on the rowhierarchy for this row (i)
        hierarchicalRowHeader_value_qname = self.getQName(self.rowhierarchy[i])

        # Materialize the generator before mutating the graph: removing
        # triples while iterating graph.objects() can skip or corrupt results.
        prefLabels = list(self.graph.objects(self.namespaces['scope'][hierarchicalRowHeader_value_qname], self.namespaces['skos'].prefLabel))
        for label in prefLabels :
            # If the hierarchicalRowHeader QName already has a preferred label, turn it into a skos:altLabel
            self.graph.remove((self.namespaces['scope'][hierarchicalRowHeader_value_qname],self.namespaces['skos'].prefLabel,label))
            self.graph.add((self.namespaces['scope'][hierarchicalRowHeader_value_qname],self.namespaces['skos'].altLabel,label))
            self.log.debug("Turned skos:prefLabel {} for {} into a skos:altLabel".format(label, hierarchicalRowHeader_value_qname))

        # Add the value of the label cell as skos:prefLabel to the header cell
        # self.graph.add((self.namespaces['scope'][hierarchicalRowHeader_value_qname], self.namespaces['skos'].prefLabel, Literal(self.source_cell.value, 'nl')))

        # Record that this source_cell_qname is the label for the HierarchicalRowHeader cell
        # self.graph.add((self.namespaces['scope'][self.source_cell_qname], self.namespaces['tablink']['isLabel'], self.namespaces['scope'][hierarchicalRowHeader_value_qname]))
    
    def parseRowHeader(self, i, j) :
        """
        Create relevant triples for the cell marked as RowHeader (i, j are row and column)
        """
        # Don't attach the cell value to the namespace if it's already a URI
        parsed = urlparse(str(self.source_cell.value))
        if parsed.scheme and parsed.netloc:
            rowHeaderValue = URIRef(self.source_cell.value)
        else:
            self.source_cell_value_qname = self.source_cell.value
            rowHeaderValue = Literal(self.source_cell_value_qname)

        # Store the value under the property registered for this column
        prop = self.property_dimensions[j]
        self.row_dimensions.setdefault(i, {})[self.namespaces['scope'][prop]] = rowHeaderValue

        return
    
    def parseColHeader(self, i, j) :
        """
        Create relevant triples for the cell marked as Header (i, j are row and column)
        """
        cell_content = self.processString(self.source_cell.value)

        if self.isEmpty(i,j):
            # An empty cell only contributes a dimension when it sits inside
            # a merge box whose anchor cell is in another column.
            if not self.insideMergeBox(i,j):
                return
            k, l = self.getMergeBoxCoord(i,j)

            # If we are in a vertical merge box, skip adding the dimension
            if l == j:
                return

            # Take the content from the merge box's anchor cell instead
            cell_content = self.processString(self.r_sheet.cell(k,l).value)

        # Add the value qname to the column_dimensions list for that column
        self.column_dimensions.setdefault(j, [self.sheet_qname]).append(cell_content)

        # Describe the column header in the graph
        resource = self.getColHeaderValueURI(self.column_dimensions[j])
        tablink = self.namespaces['tablink']
        self.graph.add((resource, RDF.type, tablink['ColumnHeader']))
        self.graph.add((resource, self.namespaces['skos']['prefLabel'], Literal(cell_content)))
        self.graph.add((resource, tablink['cell'], Literal(self.source_cell_name)))
        return
    
    def parseRowProperty(self, i, j) :
        """
        Create relevant triples for the cell marked as Property (i, j are row and column)
        """
        # Resolve the property value: an empty cell is only usable when it is
        # part of a merge box, in which case the anchor cell's value is taken.
        if not self.isEmpty(i,j):
            self.source_cell_value_qname = self.addValue(self.source_cell.value)
        elif self.insideMergeBox(i,j):
            k, l = self.getMergeBoxCoord(i,j)
            self.source_cell_value_qname = self.addValue(self.r_sheet.cell(k,l).value)
        else:
            return
        #self.graph.add((self.namespaces['scope'][self.source_cell_qname],self.namespaces['tablink']['isDimensionProperty'],self.namespaces['scope'][self.source_cell_value_qname]))
        #self.graph.add((self.namespaces['scope'][self.source_cell_value_qname],RDF.type,self.namespaces['qb']['DimensionProperty']))
        #self.graph.add((self.namespaces['scope'][self.source_cell_value_qname],RDF.type,RDF['Property']))

        #self.property_dimensions.setdefault(j,[]).append(self.source_cell_value_qname)
        self.property_dimensions[j] = self.source_cell_value_qname

        # Register the property as a RowProperty in the graph
        resource = self.namespaces['scope'][self.property_dimensions[j]]
        self.graph.add((resource, RDF.type, self.namespaces['tablink']['RowProperty']))

        return
    
    def parseTitle(self, i, j) :
        """
        Create relevant triples for the cell marked as Title (i, j are row and column)
        """
        # Attach the title text directly to the sheet resource
        sheet = self.namespaces['scope'][self.sheet_qname]
        self.graph.add((sheet,
                        self.namespaces['tablink']['title'],
                        Literal(self.source_cell.value)))
        return
        
        
    def parseData(self, i,j) :
        """
        Create relevant triples for the cell marked as Data (i, j are row and column)
        """
        implicit_zeros = self.config.get('dataCell', 'implicitZeros')
        cell_is_empty = self.isEmpty(i,j)

        # Empty data cells are dropped entirely unless implicit zeros are on
        if cell_is_empty and implicit_zeros == '0':
            return

        # Use the fully qualified name of the cell for the resource name
        observation = self.namespaces['scope'][self.source_cell_qname]

        # It's an observation
        self.graph.add((observation,
                        RDF.type,
                        self.namespaces['qb']['Observation']))

        # It's in the data set defined by the current sheet
        self.graph.add((observation,
                        self.namespaces['qb']['dataSet'],
                        self.namespaces['scope'][self.sheet_qname]))

        # Add its value; an empty cell counts as 0 when implicitZeros is on
        # TODO type the value
        if cell_is_empty and implicit_zeros == '1':
            cell_value = Literal(0)
        else:
            cell_value = Literal(self.source_cell.value)
        self.graph.add((observation,
                        self.namespaces['scope'][self.dataCellPropertyName],
                        cell_value))

        # Use the row dimensions dictionary to find the properties that link
        # data values to row headers
        try :
            for (prop, value) in self.row_dimensions[i].iteritems() :
                self.graph.add((observation, prop, value))
        except KeyError :
            self.log.debug("({}.{}) No row dimension for cell".format(i,j))

        # Use the column dimensions dictionary to find the objects of the
        # d2s:dimension property
        self.graph.add((observation,
                        self.namespaces['tablink']['dimension'],
                        self.getColHeaderValueURI(self.column_dimensions[j])))

    def parseAnnotation(self, i, j) :
        """
        Create relevant triples for the annotation attached to cell (i, j)
        """

        if self.config.get('annotations', 'model') == 'oa':
            # Create triples according to Open Annotation model

            body = BNode()

            self.annotationGraph.add((self.annotationNamespaces['scope'][self.source_cell_qname], 
                                      RDF.type, 
                                      self.annotationNamespaces['oa']['Annotation']
                                      ))
            self.annotationGraph.add((self.annotationNamespaces['scope'][self.source_cell_qname], 
                                      self.annotationNamespaces['oa']['hasBody'], 
                                      body
                                      ))
            self.annotationGraph.add((body,
                                      RDF.value, 
                                      Literal(self.annotations[(i,j)].text.replace("\n", " ").replace("\r", " ").replace("\r\n", " ").encode('utf-8'))
                                      ))
            self.annotationGraph.add((self.annotationNamespaces['scope'][self.source_cell_qname], 
                                      self.annotationNamespaces['oa']['hasTarget'], 
                                      self.namespaces['scope'][self.source_cell_qname]
                                      ))
            self.annotationGraph.add((self.annotationNamespaces['scope'][self.source_cell_qname], 
                                      self.annotationNamespaces['oa']['annotator'], 
                                      Literal(self.annotations[(i,j)].author.encode('utf-8'))
                                      ))
            self.annotationGraph.add((self.annotationNamespaces['scope'][self.source_cell_qname], 
                                      self.annotationNamespaces['oa']['annotated'], 
                                      Literal(datetime.datetime.fromtimestamp(os.path.getmtime(self.filename)).strftime("%Y-%m-%d"),datatype=self.annotationNamespaces['xsd']['date'])
                                      ))
            self.annotationGraph.add((self.annotationNamespaces['scope'][self.source_cell_qname], 
                                      self.annotationNamespaces['oa']['generator'], 
                                      URIRef("https://github.com/Data2Semantics/TabLinker")
                                      ))
            self.annotationGraph.add((self.annotationNamespaces['scope'][self.source_cell_qname], 
                                      self.annotationNamespaces['oa']['generated'], 
                                      Literal(datetime.datetime.now().strftime("%Y-%m-%d"), datatype=self.annotationNamespaces['xsd']['date'])
                                      ))
            self.annotationGraph.add((self.annotationNamespaces['scope'][self.source_cell_qname], 
                                      self.annotationNamespaces['oa']['modelVersion'], 
                                      URIRef("http://www.openannotation.org/spec/core/20120509.html")
                                      ))
        else:
            # Create triples according to Nanopublications model
            print "Nanopublications not implemented yet!"
Ejemplo n.º 42
0
class CondorFastqExtract(object):
    """Build condor submit scripts that extract fastq files from archived
    sequencing results, using an RDF model of sequencing metadata to decide
    which target files still need to be generated."""

    def __init__(self, host, sequences_path,
                 log_path='log',
                 model=None,
                 compression=None,
                 force=False):
        """Extract fastqs from results archive

        Args:
          host (str): root of the htsworkflow api server
          sequences_path (str): root of the directory tree to scan for files
          log_path (str): where to put condor log files
          model: RDF graph holding sequencing metadata; a fresh
            ConjunctiveGraph is created when None
          compression (str): one of 'gzip', 'bzip2'
          force (bool): do we force overwriting current files?
        """
        self.host = host
        if model is None:
            self.model = ConjunctiveGraph()
        else:
            self.model = model
        self.sequences_path = sequences_path
        self.log_path = log_path
        self.compression = compression
        self.force = force
        LOGGER.info("CondorFastq host={0}".format(self.host))
        LOGGER.info("CondorFastq sequences_path={0}".format(self.sequences_path))
        LOGGER.info("CondorFastq log_path={0}".format(self.log_path))
        LOGGER.info("Compression {0}".format(self.compression))

    def create_scripts(self, result_map):
        """
        Generate condor scripts to build any needed fastq files

        Args:
          result_map: htsworkflow.submission.results.ResultMap()
        """
        # Each conversion type renders into its own condor submit template.
        template_map = {'srf': 'srf.condor',
                        'qseq': 'qseq.condor',
                        'split_fastq': 'split_fastq.condor',
                        }

        # Propagate PYTHONPATH into the condor jobs so they can find our code.
        env = None
        pythonpath = os.environ.get('PYTHONPATH', None)
        if pythonpath is not None:
            env = "PYTHONPATH=%s" % (pythonpath,)
        condor_entries = self.build_condor_arguments(result_map)
        for script_type in template_map.keys():
            template = loader.get_template(template_map[script_type])
            context = {'python': sys.executable,
                       'logdir': self.log_path,
                       'env': env,
                       'args': condor_entries[script_type],
                       'root_url': self.host,
            }

            with open(script_type + '.condor', 'w+') as outstream:
                outstream.write(template.render(context))

    def build_condor_arguments(self, result_map):
        """Collect per-job template arguments for each conversion type.

        Returns:
          dict mapping 'srf' / 'qseq' / 'split_fastq' to lists of
          argument dictionaries, one per condor job.
        """
        condor_entries = {'srf': [],
                          'qseq': [],
                          'split_fastq': []}

        conversion_funcs = {'srf': self.condor_srf_to_fastq,
                            'qseq': self.condor_qseq_to_fastq,
                            'split_fastq': self.condor_desplit_fastq
                            }
        sequences = self.find_archive_sequence_files(result_map)
        needed_targets = self.update_fastq_targets(result_map, sequences)

        for target_pathname, available_sources in needed_targets.items():
            LOGGER.debug(' target : %s' % (target_pathname,))
            LOGGER.debug(' candidate sources: %s' % (available_sources,))
            for condor_type in available_sources.keys():
                conversion = conversion_funcs.get(condor_type, None)
                if conversion is None:
                    errmsg = "Unrecognized type: {0} for {1}"
                    LOGGER.error(errmsg.format(condor_type,
                                        pformat(available_sources)))
                    continue
                sources = available_sources.get(condor_type, None)

                if sources is not None:
                    condor_entries.setdefault(condor_type, []).append(
                        conversion(sources, target_pathname))
                else:
                    # BUG FIX: this `else` was previously aligned with the
                    # `for` loop above (a for/else with no `break`), so the
                    # warning fired for every target even when sources were
                    # found. It belongs to the `if sources is not None`.
                    LOGGER.warning(" need file %s", target_pathname)

        return condor_entries

    def find_archive_sequence_files(self,  result_map):
        """
        Find archived sequence files associated with our results.
        """
        self.import_libraries(result_map)
        flowcell_ids = self.find_relevant_flowcell_ids()
        self.import_sequences(flowcell_ids)

        query_text = """
        prefix libns: <http://jumpgate.caltech.edu/wiki/LibraryOntology#>
        prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
        prefix xsd: <http://www.w3.org/2001/XMLSchema#>

        select ?filenode ?filetype ?cycle ?lane_number ?read
               ?library  ?library_id
               ?flowcell ?flowcell_id ?read_length
               ?flowcell_type ?flowcell_status
        where {
            ?filenode libns:cycle ?cycle ;
                      libns:lane_number ?lane_number ;
                      libns:read ?read ;
                      libns:flowcell ?flowcell ;
                      libns:flowcell_id ?flowcell_id ;
                      libns:library ?library ;
                      libns:library_id ?library_id ;
                      libns:file_type ?filetype ;
                      a libns:IlluminaResult .
            ?flowcell libns:read_length ?read_length ;
                      libns:flowcell_type ?flowcell_type .
            OPTIONAL { ?flowcell libns:flowcell_status ?flowcell_status }
            FILTER(?filetype != libns:sequencer_result)
        }
        """
        LOGGER.debug("find_archive_sequence_files query: %s",
                     query_text)
        results = []
        for r in self.model.query(query_text):
            library_id = r['library_id'].toPython()
            # Only keep sequences for libraries we were asked about.
            if library_id in result_map:
                seq = SequenceResult(r)
                LOGGER.debug("Creating sequence result for library %s: %s",
                             library_id,
                             repr(seq))
                results.append(seq)
        return results

    def import_libraries(self, result_map):
        """Import metadata for every library in result_map into our model."""
        for lib_id in result_map.keys():
            liburl = urljoin(self.host, 'library/%s/' % (lib_id,))
            library = URIRef(liburl)
            self.import_library(library)

    def import_library(self, library):
        """Import library data into our model if we don't have it already
        """
        q = (library, RDF['type'], libraryOntology['Library'])
        newly_imported = False
        if q not in self.model:
            newly_imported = True
            self.model.parse(source=library, format='rdfa')
        LOGGER.debug("Did we import %s: %s", str(library), newly_imported)

    def find_relevant_flowcell_ids(self):
        """Generate set of flowcell ids that had samples of interest on them
        """
        flowcell_query = """
prefix libns: <http://jumpgate.caltech.edu/wiki/LibraryOntology#>

select distinct ?flowcell ?flowcell_id
WHERE {
  ?library a libns:Library ;
           libns:has_lane ?lane .
  ?lane libns:flowcell ?flowcell .
  ?flowcell libns:flowcell_id ?flowcell_id .
}"""
        flowcell_ids = set()
        for r in self.model.query(flowcell_query):
            flowcell_ids.add(r['flowcell_id'].toPython())
            imported = False
            a_lane = list(self.model.objects(r['flowcell'],
                                             libraryOntology['has_lane']))
            if len(a_lane) == 0:
                imported = True
                # we lack information about which lanes were on this flowcell
                self.model.parse(r['flowcell'], format='rdfa')
            LOGGER.debug("Did we imported %s: %s" % (str(r['flowcell']),
                                                     imported))

        return flowcell_ids

    def import_sequences(self, flowcell_ids):
        """Scan the sequence directories for the given flowcells and record
        what we find in the model."""
        seq_dirs = []
        for f in flowcell_ids:
            seq_dirs.append(os.path.join(self.sequences_path, str(f)))
        sequences = scan_for_sequences(seq_dirs)
        for seq in sequences:
            seq.save_to_model(self.model, self.host)
        update_model_sequence_library(self.model, self.host)

    def update_fastq_targets(self, result_map, raw_files):
        """Return list of fastq files that need to be built.

        Also update model with link between illumina result files
        and our target fastq file.
        """
        # find what targets we're missing
        needed_targets = {}
        for seq in raw_files:
            if not seq.isgood:
                continue
            filename_attributes = {
                'flowcell': seq.flowcell_id,
                'lib_id': seq.library_id,
                'lane': seq.lane_number,
                'read': seq.read,
                'cycle': seq.cycle,
                'compression_extension': COMPRESSION_EXTENSIONS[self.compression],
                'is_paired': seq.ispaired
            }

            fqName = FastqName(**filename_attributes)

            result_dir = result_map[seq.library_id]
            target_pathname = os.path.join(result_dir, fqName.filename)
            # Only schedule targets that are missing, unless force is set.
            if self.force or not os.path.exists(target_pathname):
                t = needed_targets.setdefault(target_pathname, {})
                t.setdefault(seq.filetype, []).append(seq)
            self.add_target_source_links(target_pathname, seq)
        return needed_targets

    def add_target_source_links(self, target, seq):
        """Add link between target pathname and the 'lane' that produced it
        (note lane objects are now post demultiplexing.)
        """
        target_uri = 'file://' + smart_str(target)
        target_node = URIRef(target_uri)
        source_stmt = (target_node, DC['source'], seq.filenode)
        self.model.add(source_stmt)

    def condor_srf_to_fastq(self, sources, target_pathname):
        """Build template arguments for an srf -> fastq conversion job."""
        if len(sources) > 1:
            raise ValueError("srf to fastq can only handle one file")

        # NOTE(review): flowcell-specific special case preserved as-is;
        # presumably a historical run needing truncation at cycle 76.
        mid_point = None
        if sources[0].flowcell_id == '30DY0AAXX':
            mid_point = 76

        return {
            'sources': [sources[0].path],
            'pyscript': srf2fastq.__file__,
            'flowcell': sources[0].flowcell_id,
            'ispaired': sources[0].ispaired,
            'target': target_pathname,
            'target_right': target_pathname.replace('_r1.fastq', '_r2.fastq'),
            'mid': mid_point,
            'force': self.force,
        }

    def condor_qseq_to_fastq(self, sources, target_pathname):
        """Build template arguments for a qseq -> fastq conversion job."""
        paths = []
        for source in sources:
            paths.append(source.path)
        paths.sort()
        compression_argument = self.format_compression_flag()

        return {
            'pyscript': qseq2fastq.__file__,
            'flowcell': sources[0].flowcell_id,
            'target': target_pathname,
            'compression': compression_argument,
            'sources': paths,
            'ispaired': sources[0].ispaired,
            'istar': len(sources) == 1,
        }

    def condor_desplit_fastq(self, sources, target_pathname):
        """Build template arguments for merging split fastq files."""
        paths = []
        for source in sources:
            paths.append(source.path)
        paths.sort()
        compression_argument = self.format_compression_flag()

        return {
            'pyscript': desplit_fastq.__file__,
            'target': target_pathname,
            'compression': compression_argument,
            'sources': paths,
            'ispaired': sources[0].ispaired,
        }

    def format_compression_flag(self):
        """Return the command-line flag for self.compression ('' when unset)."""
        return '--' + self.compression if self.compression else ''
Ejemplo n.º 43
0
class TabLinker(object):
    """Convert a style-annotated Excel workbook into RDF triples.

    Cells in the source workbook are marked with styles named
    'TL <Type>' (Title, Data, ColHeader, RowHeader, HRowHeader,
    RowProperty, RowLabel).  TabLinker walks every sheet, interprets
    those markings and emits Data Cube (qb) / SKOS triples into an
    rdflib graph, plus Open Annotation triples for cell comments into
    a separate annotation graph.

    NOTE(review): this class is Python 2 only (print statement,
    unicode(), urllib.quote, list-returning dict.values()).
    """

    # Base URI prefixes under which workbook-scoped resources and
    # annotations are minted (see setScope).
    defaultNamespacePrefix = "http://lod.cedar-project.nl/resource/"
    annotationsNamespacePrefix = "http://lod.cedar-project.nl/annotations/"
    # Namespaces bound on the main data graph.
    namespaces = {
        "dcterms": Namespace("http://purl.org/dc/terms/"),
        "skos": Namespace("http://www.w3.org/2004/02/skos/core#"),
        "d2s": Namespace("http://lod.cedar-project.nl/core/"),
        "qb": Namespace("http://purl.org/linked-data/cube#"),
        "owl": Namespace("http://www.w3.org/2002/07/owl#"),
    }
    # Namespaces bound on the annotation graph only.
    annotationNamespaces = {
        "np": Namespace("http://www.nanopub.org/nschema#"),
        "oa": Namespace("http://www.w3.org/ns/openannotation/core/"),
        "xsd": Namespace("http://www.w3.org/2001/XMLSchema#"),
        "dct": Namespace("http://purl.org/dc/terms/"),
    }

    def __init__(self, filename, config, level=logging.DEBUG):
        """TabLinker constructor
        
        Keyword arguments:
        filename -- String containing the name of the current Excel file being examined
        config -- Configuration object, loaded from .ini file
        level -- A logging level as defined in the logging module
        """
        self.config = config
        self.filename = filename

        self.log = logging.getLogger("TabLinker")
        self.log.setLevel(level)

        self.log.debug("Initializing Graphs")
        self.initGraphs()

        self.log.debug("Setting Scope")
        basename = os.path.basename(filename)
        # NOTE(review): raises AttributeError if the filename does not
        # contain '.xls' (re.search returns None).
        basename = re.search("(.*)\.xls", basename).group(1)
        self.setScope(basename)

        self.log.debug("Loading Excel file {0}.".format(filename))
        # formatting_info=True makes xlrd retain cell styles, which is
        # how TabLinker recognises the 'TL ...' cell markings.
        self.rb = open_workbook(filename, formatting_info=True)

        self.log.debug("Reading styles")
        self.styles = Styles(self.rb)

        self.log.debug("Copied Workbook to writable copy")
        self.wb = copy(self.rb)

    def initGraphs(self):
        """Initialize the graphs, set default namespaces, and add schema information"""

        self.graph = ConjunctiveGraph()
        # Create a separate graph for annotations
        self.annotationGraph = ConjunctiveGraph()

        self.log.debug("Adding namespaces to graphs")
        # Bind namespaces to graphs
        for namespace in self.namespaces:
            self.graph.namespace_manager.bind(namespace, self.namespaces[namespace])

        # Same for annotation graph
        for namespace in self.annotationNamespaces:
            self.annotationGraph.namespace_manager.bind(namespace, self.annotationNamespaces[namespace])

        self.log.debug("Adding some schema information (dimension and measure properties) ")
        self.addDataCellProperty()

        # Declare the generic d2s:dimension / d2s:label properties used
        # throughout the generated triples.
        self.graph.add((self.namespaces["d2s"]["dimension"], RDF.type, self.namespaces["qb"]["DimensionProperty"]))

        self.graph.add((self.namespaces["d2s"]["label"], RDF.type, RDF["Property"]))

    def addDataCellProperty(self):
        """Add definition of data cell resource to graph"""

        # Property name is configurable; falls back to 'hasValue'.
        if len(self.config.get("dataCell", "propertyName")) > 0:
            self.dataCellPropertyName = self.config.get("dataCell", "propertyName")
        else:
            self.dataCellPropertyName = "hasValue"

        self.graph.add(
            (self.namespaces["d2s"][self.dataCellPropertyName], RDF.type, self.namespaces["qb"]["MeasureProperty"])
        )

        # Take labels from config
        # Config format: entries separated by ':::', each entry being
        # '<langtag>--><label text>' (first part is the language tag).
        if len(self.config.get("dataCell", "labels")) > 0:
            labels = self.config.get("dataCell", "labels").split(":::")
            for label in labels:
                labelProperties = label.split("-->")
                if len(labelProperties[0]) > 0 and len(labelProperties[1]) > 0:
                    self.graph.add(
                        (
                            self.namespaces["d2s"][self.dataCellPropertyName],
                            RDFS.label,
                            Literal(labelProperties[1], labelProperties[0]),
                        )
                    )

        if len(self.config.get("dataCell", "literalType")) > 0:
            self.graph.add(
                (
                    self.namespaces["d2s"][self.dataCellPropertyName],
                    RDFS.range,
                    URIRef(self.config.get("dataCell", "literalType")),
                )
            )

    def setScope(self, fileBasename):
        """Set the default namespace and base for all URIs of the current workbook"""
        self.fileBasename = fileBasename
        scopeNamespace = self.defaultNamespacePrefix + fileBasename + "/"

        # Annotations go to a different namespace
        annotationScopeNamespace = self.annotationsNamespacePrefix + fileBasename + "/"

        self.log.debug("Adding namespace for {0}: {1}".format(fileBasename, scopeNamespace))

        self.namespaces["scope"] = Namespace(scopeNamespace)
        self.annotationNamespaces["scope"] = Namespace(annotationScopeNamespace)
        # Bind as the default (empty-prefix) namespace of each graph.
        self.graph.namespace_manager.bind("", self.namespaces["scope"])
        self.annotationGraph.namespace_manager.bind("", self.annotationNamespaces["scope"])

    def doLink(self):
        """Start tablinker for all sheets in workbook"""
        self.log.info("Starting TabLinker for all sheets in workbook")

        for n in range(self.rb.nsheets):
            self.log.debug("Starting with sheet {0}".format(n))
            # r_sheet is the read-only xlrd sheet, w_sheet its writable
            # xlwt copy; parseSheet reads exclusively from r_sheet.
            self.r_sheet = self.rb.sheet_by_index(n)
            self.w_sheet = self.wb.get_sheet(n)

            self.rowns, self.colns = self.getValidRowsCols()

            # Whitespace in sheet names is replaced before URL-quoting so
            # the sheet name can serve as the base of every QName.
            self.sheet_qname = urllib.quote(re.sub("\s", "_", self.r_sheet.name))
            self.log.debug("Base for QName generator set to: {0}".format(self.sheet_qname))

            self.log.debug("Starting parser")
            self.parseSheet()

    ###
    #    Utility Functions
    ###

    def insideMergeBox(self, i, j):
        """
        Check if the specified cell is inside a merge box

        Arguments:
        i -- row
        j -- column

        Returns:
        True/False -- depending on whether the cell is inside a merge box
        """
        self.merged_cells = self.r_sheet.merged_cells
        for crange in self.merged_cells:
            # rhi and chi are treated as exclusive upper bounds, hence
            # the comparisons against rhi - 1 / chi - 1.
            rlo, rhi, clo, chi = crange
            if i <= rhi - 1 and i >= rlo and j <= chi - 1 and j >= clo:
                return True
        return False

    def getMergeBoxCoord(self, i, j):
        """
        Get the top-left corner cell of the merge box containing the specified cell

        Arguments:
        i -- row
        j -- column

        Returns:
        (k, l) -- Coordinates of the top-left corner of the merge box
        """
        # Sentinel (-1, -1) signals "not inside any merge box".
        if not self.insideMergeBox(i, j):
            return (-1, -1)

        self.merged_cells = self.r_sheet.merged_cells
        for crange in self.merged_cells:
            rlo, rhi, clo, chi = crange
            if i <= rhi - 1 and i >= rlo and j <= chi - 1 and j >= clo:
                return (rlo, clo)

    def getType(self, style):
        """Get type for a given excel style. Style name must be prefixed by 'TL '
    
        Arguments:
        style -- Style (string) to check type for
        
        Returns:
        String -- The type of this field. In case none is found, 'unknown'
        """
        typematch = re.search("TL\s(.*)", style)
        if typematch:
            cellType = typematch.group(1)
        else:
            cellType = "Unknown"
        return cellType

    def isEmpty(self, i, j):
        """Check whether cell is empty.
        
        Arguments:
        i -- row
        j -- column
        
        Returns:
        True/False -- depending on whether the cell is empty
        """
        # Empty/blank cell types and empty-string values all count.
        if (
            self.r_sheet.cell(i, j).ctype == XL_CELL_EMPTY or self.r_sheet.cell(i, j).ctype == XL_CELL_BLANK
        ) or self.r_sheet.cell(i, j).value == "":
            return True
        else:
            return False

    def isEmptyRow(self, i, colns):
        """
        Determine whether the row 'i' is empty by iterating over all its cells
        
        Arguments:
        i     -- The index of the row to be checked.
        colns -- The number of columns to be checked
        
        Returns:
        true  -- if the row is empty
        false -- if the row is not empty
        """
        for j in range(0, colns):
            if not self.isEmpty(i, j):
                return False
        return True

    def isEmptyColumn(self, j, rowns):
        """
        Determine whether the column 'j' is empty by iterating over all its cells
        
        Arguments:
        j     -- The index of the column to be checked.
        rowns -- The number of rows to be checked
        
        Returns:
        true  -- if the column is empty
        false -- if the column is not empty
        """
        for i in range(0, rowns):
            if not self.isEmpty(i, j):
                return False
        return True

    def getValidRowsCols(self):
        """
        Determine the number of non-empty rows and columns in the Excel sheet
        
        Returns:
        rowns -- number of rows
        colns -- number of columns
        """
        colns = number_of_good_cols(self.r_sheet)
        rowns = number_of_good_rows(self.r_sheet)

        # Check whether the number of good columns and rows are correct
        # NOTE(review): on an entirely empty sheet these loops would run
        # the counters below zero; assumed not to occur in practice.
        while self.isEmptyRow(rowns - 1, colns):
            rowns = rowns - 1
        while self.isEmptyColumn(colns - 1, rowns):
            colns = colns - 1

        self.log.debug("Number of rows with content:    {0}".format(rowns))
        self.log.debug("Number of columns with content: {0}".format(colns))
        return rowns, colns

    def getQName(self, names):
        """
        Create a valid QName from a string or dictionary of names
        
        Arguments:
        names -- Either dictionary of names or string of a name.
        
        Returns:
        qname -- a valid QName for the dictionary or string
        """

        # A dict (row hierarchy) yields a path of its values appended to
        # the sheet QName; a plain string yields a single path segment.
        if type(names) == dict:
            qname = self.sheet_qname
            for k in names:
                qname = qname + "/" + self.processString(names[k])
        else:
            qname = self.sheet_qname + "/" + self.processString(names)

        self.log.debug("Minted new QName: {}".format(qname))
        return qname

    def processString(self, string):
        """
        Remove illegal characters (comma, brackets, etc) from string, and replace it with underscore. Useful for URIs
        
        Arguments:
        string -- The string representing the value of the source cell
        
        Returns:
        processedString -- The processed string
        """

        return urllib.quote(re.sub("\s|\(|\)|,|\.", "_", unicode(string).strip()).encode("utf-8", "ignore"))

    def addValue(self, source_cell_value, altLabel=None):
        """
        Add a "value" + optional label to the graph for a cell in the source Excel sheet. The value is typically the value stored in the source cell itself, but may also be a copy of another cell (e.g. in the case of 'idem.').
        
        Arguments:
        source_cell_value -- The string representing the value of the source cell
        
        Returns:
        source_cell_value_qname -- a valid QName for the value of the source cell
        """
        source_cell_value_qname = self.getQName(source_cell_value)
        self.graph.add(
            (
                self.namespaces["scope"][source_cell_value_qname],
                self.namespaces["qb"]["dataSet"],
                self.namespaces["scope"][self.sheet_qname],
            )
        )

        # Link the originating cell (self.source_cell_qname, set by
        # parseSheet) to the minted value resource.
        self.graph.add(
            (
                self.namespaces["scope"][self.source_cell_qname],
                self.namespaces["d2s"]["value"],
                self.namespaces["scope"][source_cell_value_qname],
            )
        )

        # If the source_cell_value is actually a dictionary (e.g. in the case of HierarchicalRowHeaders), then use the last element of the row hierarchy as prefLabel
        # Otherwise just use the source_cell_value as prefLabel
        if type(source_cell_value) == dict:
            self.graph.add(
                (
                    self.namespaces["scope"][source_cell_value_qname],
                    self.namespaces["skos"].prefLabel,
                    Literal(source_cell_value.values()[-1], "nl"),
                )
            )

            if altLabel and altLabel != source_cell_value.values()[-1]:
                # If altLabel has a value (typically for HierarchicalRowHeaders) different from the last element in the row hierarchy, we add it as alternative label.
                self.graph.add(
                    (
                        self.namespaces["scope"][source_cell_value_qname],
                        self.namespaces["skos"].altLabel,
                        Literal(altLabel, "nl"),
                    )
                )
        else:
            # Try to parse a date to add the appropriate datatype to the literal
            try:
                isodate.parse_datetime(source_cell_value)
                self.log.debug("Datetime on this cell: %s" % source_cell_value)
                self.graph.add(
                    (
                        self.namespaces["scope"][source_cell_value_qname],
                        self.namespaces["skos"].prefLabel,
                        Literal(source_cell_value, datatype=XSD.datetime),
                    )
                )
            except (ValueError, isodate.isoerror.ISO8601Error, AttributeError):
                # Not a datetime: fall back to a Dutch-tagged plain literal.
                self.log.debug("No datetime on this cell")
                self.graph.add(
                    (
                        self.namespaces["scope"][source_cell_value_qname],
                        self.namespaces["skos"].prefLabel,
                        Literal(source_cell_value, "nl"),
                    )
                )

            if altLabel and altLabel != source_cell_value:
                # If altLabel has a value (typically for HierarchicalRowHeaders) different from the source_cell_value, we add it as alternative label.
                self.graph.add(
                    (
                        self.namespaces["scope"][source_cell_value_qname],
                        self.namespaces["skos"].altLabel,
                        Literal(altLabel, "nl"),
                    )
                )

        return source_cell_value_qname

    def parseSheet(self):
        """
        Parses the currently selected sheet in the workbook, takes no arguments. Iterates over all cells in the Excel sheet and produces relevant RDF Triples. 
        """
        self.log.info("Parsing {0} rows and {1} columns.".format(self.rowns, self.colns))

        # Per-sheet state consumed by the parse* helpers below.
        self.column_dimensions = {}
        self.property_dimensions = {}
        self.row_dimensions = {}
        self.rowhierarchy = {}

        # Get dictionary of annotations
        self.annotations = self.r_sheet.cell_note_map

        for i in range(0, self.rowns):
            self.rowhierarchy[i] = {}

            for j in range(0, self.colns):
                # Parse cell data
                self.source_cell = self.r_sheet.cell(i, j)
                self.source_cell_name = cellname(i, j)
                self.style = self.styles[self.source_cell].name
                self.cellType = self.getType(self.style)
                self.source_cell_qname = self.getQName(self.source_cell_name)

                self.log.debug(
                    '({},{}) {}/{}: "{}"'.format(i, j, self.cellType, self.source_cell_name, self.source_cell.value)
                )

                # Try to parse ints to avoid ugly _0 URIs
                # NOTE(review): only ValueError is caught; int(None) would
                # raise an uncaught TypeError.
                try:
                    if int(self.source_cell.value) == self.source_cell.value:
                        self.source_cell.value = int(self.source_cell.value)
                except ValueError:
                    self.log.debug("(%s.%s) No parseable int" % (i, j))

                # Parse annotation (if any)
                if self.config.get("annotations", "enabled") == "1":
                    if (i, j) in self.annotations:
                        self.parseAnnotation(i, j)

                # Parse even if empty
                if self.cellType == "HRowHeader":
                    self.updateRowHierarchy(i, j)
                if self.cellType == "Data":
                    self.parseData(i, j)
                if self.cellType == "ColHeader":
                    self.parseColHeader(i, j)
                if self.cellType == "RowProperty":
                    self.parseRowProperty(i, j)

                # Only non-empty cells get type/cell-name triples and the
                # remaining (header/title/label) handlers.
                if not self.isEmpty(i, j):
                    self.graph.add(
                        (
                            self.namespaces["scope"][self.source_cell_qname],
                            RDF.type,
                            self.namespaces["d2s"][self.cellType],
                        )
                    )
                    self.graph.add(
                        (
                            self.namespaces["scope"][self.source_cell_qname],
                            self.namespaces["d2s"]["cell"],
                            Literal(self.source_cell_name),
                        )
                    )
                    # self.graph.add((self.namespaces['scope'][self.source_cell_qname],self.namespaces['d2s']['col'],Literal(colname(j))))
                    # self.graph.add((self.namespaces['scope'][self.source_cell_qname],self.namespaces['d2s']['row'],Literal(i+1)))
                    # self.graph.add((self.namespaces['scope'][self.source_cell_qname] isrow row
                    if self.cellType == "Title":
                        self.parseTitle(i, j)

                    elif self.cellType == "RowHeader":
                        self.parseRowHeader(i, j)

                    elif self.cellType == "HRowHeader":
                        self.parseHierarchicalRowHeader(i, j)

                    elif self.cellType == "RowLabel":
                        self.parseRowLabel(i, j)

        self.log.info("Done parsing...")

    def updateRowHierarchy(self, i, j):
        """
        Build up lists for hierarchical row headers. Cells marked as hierarchical row header are often empty meaning that their intended value is stored somewhere else in the Excel sheet.
        
        Keyword arguments:
        int i -- row number
        int j -- col number
        
        Returns:
        New row hierarchy dictionary
        """
        if self.isEmpty(i, j) or str(self.source_cell.value).lower().strip() == "id.":
            # If the cell is empty, and a HierarchicalRowHeader, add the value of the row header above it.
            # If the cell above is not in the rowhierarchy, don't do anything.
            # If the cell is exactly 'id.', add the value of the row header above it.
            # NOTE(review): bare except; intended to catch the KeyError
            # for the top row but will also hide unrelated errors.
            try:
                self.rowhierarchy[i][j] = self.rowhierarchy[i - 1][j]
                self.log.debug("({},{}) Copied from above\nRow hierarchy: {}".format(i, j, self.rowhierarchy[i]))
            except:
                # REMOVED because of double slashes in uris
                # self.rowhierarchy[i][j] = self.source_cell.value
                self.log.debug("({},{}) Top row, added nothing\nRow hierarchy: {}".format(i, j, self.rowhierarchy[i]))
        elif str(self.source_cell.value).lower().startswith("id.") or str(self.source_cell.value).lower().startswith(
            "id "
        ):
            # If the cell starts with 'id.', add the value of the row  above it, and append the rest of the cell's value.
            suffix = self.source_cell.value[3:]
            try:
                self.rowhierarchy[i][j] = self.rowhierarchy[i - 1][j] + suffix
                self.log.debug("({},{}) Copied from above+suffix\nRow hierarchy {}".format(i, j, self.rowhierarchy[i]))
            except:
                self.rowhierarchy[i][j] = self.source_cell.value
                self.log.debug("({},{}) Top row, added value\nRow hierarchy {}".format(i, j, self.rowhierarchy[i]))
        elif not self.isEmpty(i, j):
            self.rowhierarchy[i][j] = self.source_cell.value
            self.log.debug("({},{}) Added value\nRow hierarchy {}".format(i, j, self.rowhierarchy[i]))
        return self.rowhierarchy

    def parseHierarchicalRowHeader(self, i, j):
        """
        Create relevant triples for the cell marked as HierarchicalRowHeader (i, j are row and column)
        """

        # Use the rowhierarchy to create a unique qname for the cell's contents, give the source_cell's original value as extra argument
        self.log.debug("Parsing HierarchicalRowHeader")

        self.source_cell_value_qname = self.addValue(self.rowhierarchy[i], altLabel=self.source_cell.value)

        # Now that we know the source cell's value qname, add a d2s:isDimension link and the skos:Concept type
        self.graph.add(
            (
                self.namespaces["scope"][self.source_cell_qname],
                self.namespaces["d2s"]["isDimension"],
                self.namespaces["scope"][self.source_cell_value_qname],
            )
        )
        self.graph.add((self.namespaces["scope"][self.source_cell_qname], RDF.type, self.namespaces["skos"].Concept))

        # All hierarchy levels except the last form the parent, linked
        # via skos:broader; relies on Python 2 dict.items() lists.
        hierarchy_items = self.rowhierarchy[i].items()
        try:
            parent_values = dict(hierarchy_items[:-1])
            # NOTE(review): log.debug(i, j, msg) misuses the logging API --
            # the first argument is taken as the format string.
            self.log.debug(i, j, "Parent value: " + str(parent_values))
            parent_value_qname = self.getQName(parent_values)
            self.graph.add(
                (
                    self.namespaces["scope"][self.source_cell_value_qname],
                    self.namespaces["skos"]["broader"],
                    self.namespaces["scope"][parent_value_qname],
                )
            )
        except:
            self.log.debug(i, j, "Top of hierarchy")

        # Get the properties to use for the row headers
        try:
            properties = []
            for dim_qname in self.property_dimensions[j]:
                properties.append(dim_qname)
        except KeyError:
            self.log.debug("({}.{}) No row dimension for cell".format(i, j))

        self.row_dimensions.setdefault(i, []).append((self.source_cell_value_qname, properties))

    def parseRowLabel(self, i, j):
        """
        Create relevant triples for the cell marked as Label (i, j are row and column)
        """

        self.log.debug("Parsing Row Label")

        # Get the QName of the HierarchicalRowHeader cell that this label belongs to, based on the rowhierarchy for this row (i)
        hierarchicalRowHeader_value_qname = self.getQName(self.rowhierarchy[i])

        prefLabels = self.graph.objects(
            self.namespaces["scope"][hierarchicalRowHeader_value_qname], self.namespaces["skos"].prefLabel
        )
        for label in prefLabels:
            # If the hierarchicalRowHeader QName already has a preferred label, turn it into a skos:altLabel
            self.graph.remove(
                (self.namespaces["scope"][hierarchicalRowHeader_value_qname], self.namespaces["skos"].prefLabel, label)
            )
            self.graph.add(
                (self.namespaces["scope"][hierarchicalRowHeader_value_qname], self.namespaces["skos"].altLabel, label)
            )
            self.log.debug(
                "Turned skos:prefLabel {} for {} into a skos:altLabel".format(label, hierarchicalRowHeader_value_qname)
            )

        # Add the value of the label cell as skos:prefLabel to the header cell
        self.graph.add(
            (
                self.namespaces["scope"][hierarchicalRowHeader_value_qname],
                self.namespaces["skos"].prefLabel,
                Literal(self.source_cell.value, "nl"),
            )
        )

        # Record that this source_cell_qname is the label for the HierarchicalRowHeader cell
        self.graph.add(
            (
                self.namespaces["scope"][self.source_cell_qname],
                self.namespaces["d2s"]["isLabel"],
                self.namespaces["scope"][hierarchicalRowHeader_value_qname],
            )
        )

    def parseRowHeader(self, i, j):
        """
        Create relevant triples for the cell marked as RowHeader (i, j are row and column)
        """
        rowHeaderValue = ""

        # Don't attach the cell value to the namespace if it's already a URI
        isURI = urlparse(str(self.source_cell.value))
        if isURI.scheme and isURI.netloc:
            rowHeaderValue = URIRef(self.source_cell.value)
        else:
            self.source_cell_value_qname = self.addValue(self.source_cell.value)
            rowHeaderValue = self.namespaces["scope"][self.source_cell_value_qname]

        self.graph.add(
            (self.namespaces["scope"][self.source_cell_qname], self.namespaces["d2s"]["isDimension"], rowHeaderValue)
        )
        self.graph.add((rowHeaderValue, RDF.type, self.namespaces["d2s"]["Dimension"]))
        self.graph.add((rowHeaderValue, RDF.type, self.namespaces["skos"].Concept))

        # Get the properties to use for the row headers
        try:
            properties = []
            for dim_qname in self.property_dimensions[j]:
                properties.append(dim_qname)
        except KeyError:
            self.log.debug("({}.{}) No properties for cell".format(i, j))
        self.row_dimensions.setdefault(i, []).append((rowHeaderValue, properties))

        # Use the column dimensions dictionary to find the objects of the d2s:dimension property
        try:
            for dim_qname in self.column_dimensions[j]:
                self.graph.add(
                    (rowHeaderValue, self.namespaces["d2s"]["dimension"], self.namespaces["scope"][dim_qname])
                )
        except KeyError:
            self.log.debug("({}.{}) No column dimension for cell".format(i, j))

        return

    def parseColHeader(self, i, j):
        """
        Create relevant triples for the cell marked as Header (i, j are row and column)
        """
        # An empty header cell inherits its value from the merge box it
        # belongs to (top-left corner); otherwise it is skipped.
        if self.isEmpty(i, j):
            if self.insideMergeBox(i, j):
                k, l = self.getMergeBoxCoord(i, j)
                self.source_cell_value_qname = self.addValue(self.r_sheet.cell(k, l).value)
            else:
                return
        else:
            self.source_cell_value_qname = self.addValue(self.source_cell.value)

        self.graph.add(
            (
                self.namespaces["scope"][self.source_cell_qname],
                self.namespaces["d2s"]["isDimension"],
                self.namespaces["scope"][self.source_cell_value_qname],
            )
        )
        self.graph.add(
            (self.namespaces["scope"][self.source_cell_value_qname], RDF.type, self.namespaces["d2s"]["Dimension"])
        )
        self.graph.add((self.namespaces["scope"][self.source_cell_qname], RDF.type, self.namespaces["skos"].Concept))

        # Add the value qname to the column_dimensions list for that column
        self.column_dimensions.setdefault(j, []).append(self.source_cell_value_qname)

        return

    def parseRowProperty(self, i, j):
        """
        Create relevant triples for the cell marked as Property (i, j are row and column)
        """
        # Same merge-box fallback as parseColHeader.
        if self.isEmpty(i, j):
            if self.insideMergeBox(i, j):
                k, l = self.getMergeBoxCoord(i, j)
                self.source_cell_value_qname = self.addValue(self.r_sheet.cell(k, l).value)
            else:
                return
        else:
            self.source_cell_value_qname = self.addValue(self.source_cell.value)
        self.graph.add(
            (
                self.namespaces["scope"][self.source_cell_qname],
                self.namespaces["d2s"]["isDimensionProperty"],
                self.namespaces["scope"][self.source_cell_value_qname],
            )
        )
        self.graph.add(
            (
                self.namespaces["scope"][self.source_cell_value_qname],
                RDF.type,
                self.namespaces["qb"]["DimensionProperty"],
            )
        )
        self.graph.add((self.namespaces["scope"][self.source_cell_value_qname], RDF.type, RDF["Property"]))

        self.property_dimensions.setdefault(j, []).append(self.source_cell_value_qname)

        return

    def parseTitle(self, i, j):
        """
        Create relevant triples for the cell marked as Title (i, j are row and column)
        """

        self.source_cell_value_qname = self.addValue(self.source_cell.value)
        self.graph.add(
            (
                self.namespaces["scope"][self.sheet_qname],
                self.namespaces["d2s"]["title"],
                self.namespaces["scope"][self.source_cell_value_qname],
            )
        )
        self.graph.add(
            (self.namespaces["scope"][self.source_cell_value_qname], RDF.type, self.namespaces["d2s"]["Dimension"])
        )

        return

    def parseData(self, i, j):
        """
        Create relevant triples for the cell marked as Data (i, j are row and column)
        """

        # Empty data cells are dropped unless implicitZeros is enabled,
        # in which case they produce a 0-valued observation below.
        if self.isEmpty(i, j) and self.config.get("dataCell", "implicitZeros") == "0":
            return

        observation = BNode()

        self.graph.add(
            (self.namespaces["scope"][self.source_cell_qname], self.namespaces["d2s"]["isObservation"], observation)
        )
        self.graph.add((observation, RDF.type, self.namespaces["qb"]["Observation"]))
        self.graph.add((observation, self.namespaces["qb"]["dataSet"], self.namespaces["scope"][self.sheet_qname]))
        if self.isEmpty(i, j) and self.config.get("dataCell", "implicitZeros") == "1":
            self.graph.add((observation, self.namespaces["d2s"][self.dataCellPropertyName], Literal(0)))
        else:
            self.graph.add(
                (observation, self.namespaces["d2s"][self.dataCellPropertyName], Literal(self.source_cell.value))
            )

        # Use the row dimensions dictionary to find the properties that link data values to row headers
        try:
            for (dim_qname, properties) in self.row_dimensions[i]:
                for p in properties:
                    self.graph.add((observation, self.namespaces["d2s"][p], dim_qname))
        except KeyError:
            self.log.debug("({}.{}) No row dimension for cell".format(i, j))

        # Use the column dimensions dictionary to find the objects of the d2s:dimension property
        try:
            for dim_qname in self.column_dimensions[j]:
                self.graph.add((observation, self.namespaces["d2s"]["dimension"], self.namespaces["scope"][dim_qname]))
        except KeyError:
            self.log.debug("({}.{}) No column dimension for cell".format(i, j))

    def parseAnnotation(self, i, j):
        """
        Create relevant triples for the annotation attached to cell (i, j)
        """

        if self.config.get("annotations", "model") == "oa":
            # Create triples according to Open Annotation model

            body = BNode()

            self.annotationGraph.add(
                (
                    self.annotationNamespaces["scope"][self.source_cell_qname],
                    RDF.type,
                    self.annotationNamespaces["oa"]["Annotation"],
                )
            )
            self.annotationGraph.add(
                (
                    self.annotationNamespaces["scope"][self.source_cell_qname],
                    self.annotationNamespaces["oa"]["hasBody"],
                    body,
                )
            )
            # NOTE(review): '\r\n' is replaced after '\n' and '\r' have
            # already been handled individually, so that branch is dead.
            self.annotationGraph.add(
                (
                    body,
                    RDF.value,
                    Literal(
                        self.annotations[(i, j)]
                        .text.replace("\n", " ")
                        .replace("\r", " ")
                        .replace("\r\n", " ")
                        .encode("utf-8")
                    ),
                )
            )
            self.annotationGraph.add(
                (
                    self.annotationNamespaces["scope"][self.source_cell_qname],
                    self.annotationNamespaces["oa"]["hasTarget"],
                    self.namespaces["scope"][self.source_cell_qname],
                )
            )
            self.annotationGraph.add(
                (
                    self.annotationNamespaces["scope"][self.source_cell_qname],
                    self.annotationNamespaces["oa"]["annotator"],
                    Literal(self.annotations[(i, j)].author.encode("utf-8")),
                )
            )
            # 'annotated' date is taken from the file's mtime; 'generated'
            # is the time of this run.
            self.annotationGraph.add(
                (
                    self.annotationNamespaces["scope"][self.source_cell_qname],
                    self.annotationNamespaces["oa"]["annotated"],
                    Literal(
                        datetime.datetime.fromtimestamp(os.path.getmtime(self.filename)).strftime("%Y-%m-%d"),
                        datatype=self.annotationNamespaces["xsd"]["date"],
                    ),
                )
            )
            self.annotationGraph.add(
                (
                    self.annotationNamespaces["scope"][self.source_cell_qname],
                    self.annotationNamespaces["oa"]["generator"],
                    URIRef("https://github.com/Data2Semantics/TabLinker"),
                )
            )
            self.annotationGraph.add(
                (
                    self.annotationNamespaces["scope"][self.source_cell_qname],
                    self.annotationNamespaces["oa"]["generated"],
                    Literal(
                        datetime.datetime.now().strftime("%Y-%m-%d"), datatype=self.annotationNamespaces["xsd"]["date"]
                    ),
                )
            )
            self.annotationGraph.add(
                (
                    self.annotationNamespaces["scope"][self.source_cell_qname],
                    self.annotationNamespaces["oa"]["modelVersion"],
                    URIRef("http://www.openannotation.org/spec/core/20120509.html"),
                )
            )
        else:
            # Create triples according to Nanopublications model
            print "Nanopublications not implemented yet!"
Ejemplo n.º 44
0
def graph_plan(plan, fountain, agp):
    """Translate a dict-based Agora search plan into an RDF ConjunctiveGraph.

    :param plan: dict with 'prefixes' (qname prefix -> namespace URI) and
        'plan' (one entry per triple pattern, each with paths/cycles/hints).
    :param fountain: type index used to expand type hierarchies
        (via ``_type_subtree`` and ``fountain.get_type``).
    :param agp: agora graph pattern; only used to extract its root subjects.
    :return: a ConjunctiveGraph describing search trees, cycles, spaces and
        triple patterns, with per-cycle and 'trees' named contexts.
    """

    def extract_cycle_roots():
        # Map each described cycle id to the set of type qnames (plus their
        # subtree) from which the cycle may start.
        c_roots = {}
        for c_id, c_node in described_cycles.items():
            c_root_types = set({})
            for crt in plan_graph.objects(c_node, AGORA.expectedType):
                crt_qname = plan_graph.qname(crt)
                c_root_types.update(_type_subtree(fountain, crt_qname))
            c_roots[c_id] = c_root_types
        return c_roots

    def inc_tree_length(tree, l):
        # Accumulate the total number of path steps attached to a search tree.
        if tree not in tree_lengths:
            tree_lengths[tree] = 0
        tree_lengths[tree] += l

    def add_variable(p_node, vid, subject=True):
        # Attach a labelled agora:Variable node as the subject (default) or
        # object of the given pattern node.
        sub_node = BNode(str(vid).replace('?', 'var_'))
        if subject:
            plan_graph.add((p_node, AGORA.subject, sub_node))
        else:
            plan_graph.add((p_node, AGORA.object, sub_node))
        plan_graph.set((sub_node, RDF.type, AGORA.Variable))
        plan_graph.set((sub_node, RDFS.label, Literal(str(vid), datatype=XSD.string)))

    def describe_cycle(cycle_id, cg):
        # Describe a cycle as an agora:next chain of property steps inside the
        # cycle's own named context; returns the cycle's root node.
        c_node = BNode('cycle{}'.format(cycle_id))
        cg = cg.get_context(c_node)
        cg.add((c_node, RDF.type, AGORA.Cycle))
        previous_node = c_node
        c_steps = cycles[cycle_id]
        cycle_type = c_steps[0].get('type')
        for et in _type_subtree(fountain, cycle_type):
            cg.add((c_node, AGORA.expectedType, __extend_uri(prefixes, et)))
        for step in c_steps:
            prop = step.get('property')
            b_node = BNode(previous_node.n3() + '/' + prop)
            cg.add((b_node, AGORA.onProperty, __extend_uri(prefixes, prop)))
            c_expected_type = step.get('type')
            cg.add((b_node, AGORA.expectedType, __extend_uri(prefixes, c_expected_type)))
            cg.add((previous_node, AGORA.next, b_node))
            previous_node = b_node
        return c_node

    def is_extensible(node, node_patterns):
        # A node is extensible unless it (or one of its tree siblings) is bound
        # by a pattern whose subject is not one of the AGP roots.
        extensible = True
        near_patterns = node_patterns.copy()
        for prev in tree_graph.subjects(AGORA.next, node):
            for sib_node in tree_graph.objects(prev, AGORA.next):
                # Fixed: previously compared against ``res.n`` leaked from the
                # enclosing loop; the function's own ``node`` parameter is meant.
                if sib_node != node:
                    near_patterns.update(set(tree_graph.objects(sib_node, AGORA.byPattern)))

        subjects = set()
        for p_node in near_patterns:
            p_subject = list(plan_graph.objects(p_node, AGORA.subject)).pop()
            if not isinstance(p_subject, URIRef):
                # Variable subject: use its label instead of the blank node id.
                subject_str = list(plan_graph.objects(p_subject, RDFS.label)).pop().toPython()
            else:
                subject_str = str(p_subject)
            subjects.add(subject_str)

        if subjects and set.difference(subjects, roots):
            extensible = False

        return extensible

    def enrich_type_patterns(node_patterns):
        # Propagate the type subtree of each rdf:type pattern onto the current
        # tree node. NOTE: relies on ``res`` and ``expected_types`` from the
        # enclosing per-node loop below.
        for p_node in node_patterns:
            p_pred = list(plan_graph.objects(p_node, AGORA.predicate)).pop()
            if p_pred == RDF.type:
                p_type = list(plan_graph.objects(p_node, AGORA.object)).pop()
                if isinstance(p_type, URIRef):
                    for et in [et for et in expected_types if et == p_type]:
                        q_expected_types = _type_subtree(fountain, tree_graph.qname(et))
                        for et_q in q_expected_types:
                            tree_graph.add((res.n, AGORA.expectedType, __extend_uri(prefixes, et_q)))
            else:
                for et in expected_types:
                    q_expected_types = _type_subtree(fountain, tree_graph.qname(et))
                    for et_q in q_expected_types:
                        tree_graph.add((res.n, AGORA.expectedType, __extend_uri(prefixes, et_q)))

    def apply_cycle_extensions(c_roots, node_types):
        # Link each cycle to every extensible node whose expected types overlap
        # the cycle's root types; cycles that can start nowhere are dropped.
        for c_id, root_types in c_roots.items():
            found_extension = False
            for n, expected in node_types.items():
                if set.intersection(set(root_types), set(expected)):
                    tree_graph.add((n, AGORA.isCycleStartOf, described_cycles[c_id]))
                    found_extension = True

            if not found_extension:
                plan_graph.remove_context(plan_graph.get_context(described_cycles[c_id]))

    def include_path(elm, p_seeds, p_steps, cycles, check):
        # Create (or extend) the search tree identified by the seed set,
        # chaining one node per step and attaching the triple pattern at the end.
        m = hashlib.md5()
        for s in p_seeds:
            m.update(s)
        elm_uri = __extend_uri(prefixes, elm)
        # Tree identity is derived from the seeds, so paths sharing seeds merge.
        b_tree = BNode(m.digest().encode('base64').strip())
        s_trees.add(b_tree)
        tree_graph.set((b_tree, RDF.type, AGORA.SearchTree))
        tree_graph.add((b_tree, AGORA.fromType, elm_uri))

        for seed in p_seeds:
            tree_graph.add((b_tree, AGORA.hasSeed, URIRef(seed)))

        for cycle_id in filter(lambda x: x not in described_cycles.keys(), cycles):
            c_node = describe_cycle(cycle_id, plan_graph)
            described_cycles[cycle_id] = c_node
            plan_graph.get_context(c_node).add((b_tree, AGORA.goesThroughCycle, c_node))

        previous_node = b_tree
        inc_tree_length(b_tree, len(p_steps))

        root_index = -1
        pp = []
        for j, step in enumerate(p_steps):
            prop = step.get('property')
            pp.append(prop)
            path_root = step.get('root', None)
            if path_root and root_index < 0:
                root_index = j
            base_id = path_root or b_tree
            base_id += '/'

            b_node = BNode(base_id + '/'.join(pp))
            # The last step only carries onProperty when the pattern is an
            # rdf:type pattern with a concrete (URIRef) type.
            if j < len(p_steps) - 1 or (pattern[1] == RDF.type and isinstance(pattern[2], URIRef)):
                tree_graph.add((b_node, AGORA.onProperty, __extend_uri(prefixes, prop)))

            tree_graph.add((b_node, AGORA.expectedType, __extend_uri(prefixes, step.get('type'))))
            tree_graph.add((previous_node, AGORA.next, b_node))
            previous_node = b_node

        p_node = _get_pattern_node(pattern, patterns)
        if pattern[1] == RDF.type and isinstance(pattern[2], URIRef):
            # Concrete-type patterns get a dedicated, deterministic final node.
            b_id = '{}_{}_{}'.format(pattern[0].n3(plan_graph.namespace_manager),
                                     pattern[1].n3(plan_graph.namespace_manager),
                                     pattern[2].n3(plan_graph.namespace_manager))

            b_node = BNode(b_id)
            tree_graph.add((b_node, AGORA.expectedType, pattern[2]))
            tree_graph.add((previous_node, AGORA.next, b_node))
            tree_graph.add((b_node, AGORA.byPattern, p_node))
            if check:
                tree_graph.add((b_node, AGORA.checkType, Literal(check)))
        else:
            tree_graph.add((previous_node, AGORA.byPattern, p_node))

    plan_graph = ConjunctiveGraph()
    plan_graph.bind('agora', AGORA)
    prefixes = plan.get('prefixes')
    ef_plan = plan.get('plan')
    tree_lengths = {}
    s_trees = set([])
    patterns = {}
    described_cycles = {}

    for (prefix, u) in prefixes.items():
        plan_graph.bind(prefix, u)

    # All search-tree triples live in a dedicated 'trees' context.
    tree_graph = plan_graph.get_context('trees')

    for i, tp_plan in enumerate(ef_plan):
        paths = tp_plan.get('paths')
        pattern = tp_plan.get('pattern')
        hints = tp_plan.get('hints')
        cycles = {}
        for c in tp_plan.get('cycles'):
            cid = str(c['cycle'])
            c_steps = c['steps']
            cycles[cid] = c_steps
            if len(c_steps) > 1:
                # Multi-step cycles are also registered in reverse order.
                cycles[cid + 'r'] = list(reversed(c_steps))
        context = BNode('space_{}'.format(tp_plan.get('context')))

        for path in paths:
            steps = path.get('steps')
            seeds = path.get('seeds')
            check = path.get('check', None)
            ty = None
            if not len(steps) and len(seeds):
                # Seed-only path: the pattern object itself is the tree type.
                ty = pattern[2]
            elif len(steps):
                ty = steps[0].get('type')
            if ty:
                include_path(ty, seeds, steps, cycles, check)

        for t in s_trees:
            tree_graph.set((t, AGORA.length, Literal(tree_lengths.get(t, 0), datatype=XSD.integer)))

        pattern_node = _get_pattern_node(pattern, patterns)
        plan_graph.add((context, AGORA.definedBy, pattern_node))
        plan_graph.set((context, RDF.type, AGORA.SearchSpace))
        plan_graph.add((pattern_node, RDF.type, AGORA.TriplePattern))
        plan_graph.add((pattern_node, RDFS.label, Literal(pattern_node.toPython())))
        (sub, pred, obj) = pattern

        if isinstance(sub, BNode):
            add_variable(pattern_node, str(sub))
        elif isinstance(sub, URIRef):
            plan_graph.add((pattern_node, AGORA.subject, sub))

        if isinstance(obj, BNode):
            add_variable(pattern_node, str(obj), subject=False)
        elif isinstance(obj, Literal):
            # Literal objects get an intermediate agora:Literal node.
            node = BNode(str(obj).replace(' ', '').replace(':', ''))
            plan_graph.add((pattern_node, AGORA.object, node))
            plan_graph.set((node, RDF.type, AGORA.Literal))
            plan_graph.set((node, AGORA.value, obj))
        else:
            plan_graph.add((pattern_node, AGORA.object, obj))

        plan_graph.add((pattern_node, AGORA.predicate, pred))
        if pred == RDF.type:
            if 'check' in hints:
                plan_graph.add((pattern_node, AGORA.checkType, Literal(hints['check'], datatype=XSD.boolean)))

    expected_res = tree_graph.query("""SELECT DISTINCT ?n WHERE {
                                          ?n agora:expectedType ?type
                                       }""")
    node_types = {}
    roots = set(_extract_roots(agp))

    for res in expected_res:
        expected_types = list(tree_graph.objects(res.n, AGORA.expectedType))

        # Keep only the most specific expected types (drop any type whose
        # superclasses are also expected).
        q_expected_types = set(map(lambda x: tree_graph.qname(x), expected_types))
        q_expected_types = filter(
            lambda x: not set.intersection(set(fountain.get_type(x)['super']), q_expected_types), q_expected_types)
        type_hierarchy = len(q_expected_types) == 1
        tree_graph.add((res.n, AGORA.typeHierarchy, Literal(type_hierarchy)))

        direct_patterns = set(tree_graph.objects(res.n, AGORA.byPattern))
        enrich_type_patterns(direct_patterns)
        if is_extensible(res.n, direct_patterns):
            node_types[res.n] = q_expected_types

    c_roots = extract_cycle_roots()
    apply_cycle_extensions(c_roots, node_types)

    for t in s_trees:
        tree_graph.set((t, AGORA.length, Literal(tree_lengths.get(t, 0), datatype=XSD.integer)))
        # Keep only the most specific fromType(s) for each tree.
        from_types = set([plan_graph.qname(x) for x in plan_graph.objects(t, AGORA.fromType)])
        def_from_types = filter(lambda x: not set.intersection(set(fountain.get_type(x)['sub']), from_types),
                                from_types)
        for dft in def_from_types:
            tree_graph.set((t, AGORA.fromType, __extend_uri(prefixes, dft)))

    # Trees whose first pattern subject is a concrete URI take it as their seed
    # and can no longer be cycle start points.
    for res in plan_graph.query("""SELECT ?tree ?sub ?nxt WHERE {
                           ?tree a agora:SearchTree ;                              
                                 agora:next ?nxt .
                           ?nxt agora:byPattern [
                                   agora:subject ?sub 
                                ]                    
                        }"""):
        if isinstance(res.sub, URIRef):
            plan_graph.set((res.tree, AGORA.hasSeed, res.sub))
            plan_graph.remove((res.nxt, AGORA.isCycleStartOf, None))

    _inform_on_inverses(plan_graph, fountain, prefixes)

    return plan_graph
Ejemplo n.º 45
0
def get_vocab_base(vocabfile):
    """Guess the (identifier, base URI, prefix) of an RDF vocabulary file.

    Parsing is attempted with rdflib's format guessing first, then as N3.
    Returns ``(None, None, None)`` when the file cannot be parsed at all.

    :param vocabfile: path or URL of the vocabulary to inspect.
    :return: tuple (identifier, base, prefix); any element may be None.
    """
    graph = Graph()
    try:
        graph.parse(vocabfile)
    except Exception:
        # Retry as N3; bare `except:` narrowed so Ctrl-C still works.
        graph = Graph()
        try:
            graph.parse(vocabfile, format="n3")
        except Exception:
            return (None, None, None)

    # Identifier: dc:identifier first, dcterms:identifier as fallback
    # (last matching value wins, as before).
    identifier = None
    for v in graph.objects(None, namespaces['dc']['identifier']):
        identifier = v
    if not identifier:
        for v in graph.objects(None, namespaces['dcterms']['identifier']):
            identifier = v

    # Base URI: probe subject positions in decreasing order of reliability.
    base = None
    subject_probes = [
        (namespaces['rdf']['type'], namespaces['owl']['Ontology']),
        (namespaces['dc']['title'], None),
        (namespaces['dcterms']['title'], None),
        (namespaces['dc']['creator'], None),
        (namespaces['dcterms']['creator'], None),
    ]
    for pred, obj in subject_probes:
        if base:
            break
        for s in graph.subjects(pred, obj):
            base = s
            break
    if not base:
        for v in graph.objects(None, namespaces['vann']['preferredNamespaceUri']):
            base = v
            break
    if not base:
        # Last resort: the namespace bound to the empty prefix.
        for pfx, ns_uri in graph.namespaces():
            if pfx == '':
                base = ns_uri
                break

    # Prefix: vann:preferredNamespacePrefix (last value wins), then a prefix
    # bound to the base namespace, then the last path segment of the base.
    prefix = None
    for vp in graph.objects(None, namespaces['vann']['preferredNamespacePrefix']):
        prefix = vp
    if not prefix and base:
        for pfx, ns_uri in graph.namespaces():
            if str(ns_uri) == str(base):
                prefix = pfx
                break
    if not prefix and base:
        prefix = base.strip().strip('/').split('/')[-1].strip('#').strip(' ')

    # Normalize the base so it always ends in '/' or '#'.
    if base:
        base = base.strip()
        if base[-1] != "/" and base[-1] != "#":
            base += "#"
    return (identifier, base, prefix)
Ejemplo n.º 46
0
class Inspector(object):
    """ Class that includes methods for querying an RDFS/OWL ontology """
    def __init__(self, uri, language=""):
        # Parse the ontology at `uri`: RDF/XML first, then N3 as a fallback.
        # NOTE(review): `language` is never used in the visible code — confirm.
        # NOTE(review): the bare `except` clauses also swallow SystemExit and
        # KeyboardInterrupt; narrowing them would be safer.
        super(Inspector, self).__init__()
        self.rdfGraph = ConjunctiveGraph()
        try:
            self.rdfGraph.parse(uri, format="application/rdf+xml")
        except:
            try:
                self.rdfGraph.parse(uri, format="n3")
            except:
                raise exceptions.Error(
                    "Could not parse the file! Is it a valid RDF/OWL ontology?"
                )
        finally:
            # Runs even on the raise above; baseURI falls back to the input uri
            # when the graph declares no owl:Ontology.
            self.baseURI = self.get_OntologyURI() or uri
            self.allclasses = self.__getAllClasses(includeDomainRange=True,
                                                   includeImplicit=True,
                                                   removeBlankNodes=False,
                                                   excludeRDF_OWL=False)

    def get_OntologyURI(self, return_as_string=True):
        """Return the first subject typed as an Ontology, or None."""
        test = [
            x for x, y, z in self.rdfGraph.triples((None, RDF.type, Ontology))
        ]
        if test:
            if return_as_string:
                return str(test[0])
            else:
                return test[0]
        else:
            return None

    def __getAllClasses(self,
                        classPredicate="",
                        includeDomainRange=False,
                        includeImplicit=False,
                        removeBlankNodes=True,
                        addOWLThing=True,
                        excludeRDF_OWL=True):
        """Collect candidate class URIs from several sources and sort them.

        classPredicate: "" (both), "rdfs" or "owl" — which class vocabulary
        to scan for explicit class declarations.
        """
        rdfGraph = self.rdfGraph
        # NOTE(review): `exit` shadows the builtin of the same name; it is used
        # here as an ordered-unique collector (dict keyed by URI).
        exit = {}

        def addIfYouCan(x, mydict):
            # Record x unless it belongs to the OWL/RDF/RDFS vocabularies
            # (when excludeRDF_OWL is set).
            if excludeRDF_OWL:
                if x.startswith('http://www.w3.org/2002/07/owl#') or  \
                   x.startswith("http://www.w3.org/1999/02/22-rdf-syntax-ns#") or \
                   x.startswith("http://www.w3.org/2000/01/rdf-schema#"):
                    return mydict
            if x not in mydict:
                mydict[x] = None
            return mydict

        if addOWLThing:
            exit = addIfYouCan(Thing, exit)

        # Explicitly declared classes (rdfs:Class and/or owl:Class).
        if classPredicate == "rdfs" or classPredicate == "":
            for s in rdfGraph.subjects(RDF.type, RDFS.Class):
                exit = addIfYouCan(s, exit)

        if classPredicate == "owl" or classPredicate == "":
            for s in rdfGraph.subjects(RDF.type, Class):
                exit = addIfYouCan(s, exit)

        # Classes implied by being a property's domain or range.
        if includeDomainRange:
            for o in rdfGraph.objects(None, RDFS.domain):
                exit = addIfYouCan(o, exit)
            for o in rdfGraph.objects(None, RDFS.range):
                exit = addIfYouCan(o, exit)

        # Classes implied by subclass axioms or by being used as an rdf:type.
        if includeImplicit:
            for s, v, o in rdfGraph.triples((None, RDFS.subClassOf, None)):
                exit = addIfYouCan(s, exit)
                exit = addIfYouCan(o, exit)
            for o in rdfGraph.objects(None, RDF.type):
                exit = addIfYouCan(o, exit)

        # get a list
        # NOTE(review): Python 2 — dict.keys() returns a list here.
        exit = exit.keys()
        if removeBlankNodes:
            exit = [x for x in exit if not isBlankNode(x)]
        return sort_uri_list_by_name(exit)

    def __getTopclasses(self, classPredicate=''):
        """Return the classes that have no direct superclass, sorted by name."""
        returnlist = []

        for eachclass in self.__getAllClasses(classPredicate):
            x = self.get_classDirectSupers(eachclass)
            if not x:
                returnlist.append(eachclass)
        return sort_uri_list_by_name(returnlist)

    def __getTree(self, father=None, out=None):
        """Build {parent: [children]} recursively; key 0 holds the top layer.

        NOTE(review): relies on self.toplayer, which is not assigned anywhere
        in the visible code — presumably set elsewhere; verify.
        Recursive calls mutate `out` in place and return None.
        """
        if not father:
            out = {}
            topclasses = self.toplayer
            out[0] = topclasses

            for top in topclasses:
                children = self.get_classDirectSubs(top)
                out[top] = children
                for potentialfather in children:
                    self.__getTree(potentialfather, out)

            return out

        else:
            children = self.get_classDirectSubs(father)
            out[father] = children
            for ch in children:
                self.__getTree(ch, out)

    def __buildClassTree(self, father=None, out=None):
        """Like __getTree but rooted at owl:Thing with name-sorted children."""
        if not father:
            out = {}
            topclasses = self.toplayer
            out[0] = [Thing]
            out[Thing] = sort_uri_list_by_name(topclasses)
            for top in topclasses:
                children = self.get_classDirectSubs(top)
                out[top] = sort_uri_list_by_name(children)
                for potentialfather in children:
                    self.__buildClassTree(potentialfather, out)
            return out
        else:
            children = self.get_classDirectSubs(father)
            out[father] = sort_uri_list_by_name(children)
            for ch in children:
                self.__buildClassTree(ch, out)

    # methods for getting ancestores and descendants of classes: by default, we do not include blank nodes
    def get_classDirectSupers(self,
                              aClass,
                              excludeBnodes=True,
                              sortUriName=False):
        """Return the direct superclasses of aClass (owl:Thing excluded)."""
        returnlist = []
        for o in self.rdfGraph.objects(aClass, RDFS.subClassOf):
            if not (o == Thing):
                if excludeBnodes:
                    if not isBlankNode(o):
                        returnlist.append(o)
                else:
                    returnlist.append(o)
        if sortUriName:
            return sort_uri_list_by_name(remove_duplicates(returnlist))
        else:
            return remove_duplicates(returnlist)

    def get_classDirectSubs(self, aClass, excludeBnodes=True):
        """Return the direct subclasses of aClass, sorted by name."""
        returnlist = []
        for s, v, o in self.rdfGraph.triples((None, RDFS.subClassOf, aClass)):
            if excludeBnodes:
                if not isBlankNode(s):
                    returnlist.append(s)
            else:
                returnlist.append(s)
        return sort_uri_list_by_name(remove_duplicates(returnlist))

    def get_classSiblings(self, aClass, excludeBnodes=True):
        """Return classes sharing a direct superclass with aClass."""
        returnlist = []
        for father in self.get_classDirectSupers(aClass, excludeBnodes):
            for child in self.get_classDirectSubs(father, excludeBnodes):
                if child != aClass:
                    returnlist.append(child)

        return sort_uri_list_by_name(remove_duplicates(returnlist))

    def entitySynonyms(self, anEntity, language=DEFAULT_LANGUAGE, getall=True):
        """Return synonym literals for anEntity.

        With getall=True, collects all values; otherwise returns the first
        synonym whose language tag matches `language`, or "".
        """
        if getall:
            temp = []
            # Uberon synonyms
            for o in self.rdfGraph.objects(anEntity, Synonym):
                temp += [o]
            # EFO synonyms
            # NOTE(review): same predicate as the Uberon loop above, so every
            # value is collected twice — an EFO-specific predicate was likely
            # intended; verify against the predicate definitions.
            for o in self.rdfGraph.objects(anEntity, Synonym):
                temp += [o]
            # OBI synonyms
            for o in self.rdfGraph.objects(anEntity, OBO_Synonym):
                temp += [o]
            return temp
        else:
            for o in self.rdfGraph.objects(anEntity, Synonym):
                if getattr(o, 'language') and getattr(o,
                                                      'language') == language:
                    return o
            return ""

    def classFind(self, name, exact=False):
        """Find classes by name: exact (case-insensitive) match returns a
        single-element list; otherwise all substring matches are collected."""
        temp = []
        if name:
            for x in self.allclasses:
                if exact:
                    if x.__str__().lower() == str(name).lower():
                        return [x]
                else:
                    if x.__str__().lower().find(str(name).lower()) >= 0:
                        temp.append(x)
        return temp
Ejemplo n.º 47
0
def graph_plan(plan, fountain):
    """Translate a dict-based search plan into an RDF ConjunctiveGraph.

    Simpler variant without cycle handling: builds one named context per
    path root type, plus pattern/search-space triples in the default graph.

    :param plan: dict with 'prefixes' and 'plan' entries.
    :param fountain: type index; fountain.get_type(qname)['super'] is used to
        prune redundant expected types.
    :return: the populated ConjunctiveGraph.
    """
    plan_graph = ConjunctiveGraph()
    plan_graph.bind('agora', AGORA)
    prefixes = plan.get('prefixes')
    ef_plan = plan.get('plan')
    tree_lengths = {}
    s_trees = set([])
    patterns = {}

    for (prefix, u) in prefixes.items():
        plan_graph.bind(prefix, u)

    def __get_pattern_node(p):
        # One BNode per distinct triple pattern, allocated on first use.
        if p not in patterns:
            patterns[p] = BNode('tp_{}'.format(len(patterns)))
        return patterns[p]

    def __inc_tree_length(tree, l):
        # Accumulate the total number of steps attached to a search tree.
        if tree not in tree_lengths:
            tree_lengths[tree] = 0
        tree_lengths[tree] += l

    def __add_variable(p_node, vid, subject=True):
        # Attach a labelled agora:Variable node as subject/object of a pattern.
        sub_node = BNode(str(vid).replace('?', 'var_'))
        if subject:
            plan_graph.add((p_node, AGORA.subject, sub_node))
        else:
            plan_graph.add((p_node, AGORA.object, sub_node))
        plan_graph.set((sub_node, RDF.type, AGORA.Variable))
        plan_graph.set((sub_node, RDFS.label, Literal(str(vid), datatype=XSD.string)))

    def include_path(elm, p_seeds, p_steps):
        # Build the search tree for type `elm` in its own named context,
        # chaining one node per step and attaching the pattern at the end.
        # NOTE: reads `pattern` from the enclosing per-tp_plan loop.
        elm_uri = __extend_uri(prefixes, elm)
        path_g = plan_graph.get_context(elm_uri)
        b_tree = BNode(elm_uri)
        s_trees.add(b_tree)
        path_g.set((b_tree, RDF.type, AGORA.SearchTree))
        path_g.set((b_tree, AGORA.fromType, elm_uri))

        for seed in p_seeds:
            path_g.add((b_tree, AGORA.hasSeed, URIRef(seed)))

        previous_node = b_tree
        __inc_tree_length(b_tree, len(p_steps))
        for j, step in enumerate(p_steps):
            prop = step.get('property')
            b_node = BNode(previous_node.n3() + prop)
            # Last step carries onProperty only for rdf:type patterns.
            if j < len(p_steps) - 1 or pattern[1] == RDF.type:
                path_g.add((b_node, AGORA.onProperty, __extend_uri(prefixes, prop)))
            path_g.add((b_node, AGORA.expectedType, __extend_uri(prefixes, step.get('type'))))
            path_g.add((previous_node, AGORA.next, b_node))
            previous_node = b_node

        p_node = __get_pattern_node(pattern)
        path_g.add((previous_node, AGORA.byPattern, p_node))

    for i, tp_plan in enumerate(ef_plan):
        paths = tp_plan.get('paths')
        pattern = tp_plan.get('pattern')
        hints = tp_plan.get('hints')
        context = BNode('space_{}'.format(tp_plan.get('context')))
        for path in paths:
            steps = path.get('steps')
            seeds = path.get('seeds')
            if not len(steps) and len(seeds):
                # Seed-only path: the pattern object is the tree type.
                include_path(pattern[2], seeds, steps)
            elif len(steps):
                ty = steps[0].get('type')
                include_path(ty, seeds, steps)

        for t in s_trees:
            plan_graph.set((t, AGORA.length, Literal(tree_lengths.get(t, 0), datatype=XSD.integer)))

        pattern_node = __get_pattern_node(pattern)
        plan_graph.add((context, AGORA.definedBy, pattern_node))
        plan_graph.set((context, RDF.type, AGORA.SearchSpace))
        plan_graph.add((pattern_node, RDF.type, AGORA.TriplePattern))
        (sub, pred, obj) = pattern

        if isinstance(sub, BNode):
            __add_variable(pattern_node, str(sub))
        elif isinstance(sub, URIRef):
            plan_graph.add((pattern_node, AGORA.subject, sub))

        if isinstance(obj, BNode):
            __add_variable(pattern_node, str(obj), subject=False)
        elif isinstance(obj, Literal):
            # Literal objects get an intermediate agora:Literal node.
            node = BNode(str(obj).replace(' ', ''))
            plan_graph.add((pattern_node, AGORA.object, node))
            plan_graph.set((node, RDF.type, AGORA.Literal))
            plan_graph.set((node, AGORA.value, Literal(str(obj), datatype=XSD.string)))
        else:
            plan_graph.add((pattern_node, AGORA.object, obj))

        plan_graph.add((pattern_node, AGORA.predicate, pred))
        if pred == RDF.type:
            if 'check' in hints:
                plan_graph.add((pattern_node, AGORA.checkType, Literal(hints['check'], datatype=XSD.boolean)))

        # Prune redundant expected types: keep only those with no expected
        # superclass. NOTE(review): this block runs once per tp_plan iteration
        # over the whole graph — re-pruning already-pruned nodes; confirm that
        # running it inside the loop (vs. after it) is intended.
        sub_expected = plan_graph.subjects(predicate=AGORA.expectedType)
        for s in sub_expected:
            expected_types = list(plan_graph.objects(s, AGORA.expectedType))
            for et in expected_types:
                plan_graph.remove((s, AGORA.expectedType, et))
            q_expected_types = [plan_graph.qname(t) for t in expected_types]
            expected_types = [d for d in expected_types if
                              not set.intersection(set(fountain.get_type(plan_graph.qname(d)).get('super')),
                                                   set(q_expected_types))]
            for et in expected_types:
                plan_graph.add((s, AGORA.expectedType, et))

    return plan_graph
Ejemplo n.º 48
0
def retrieve(request, graph):
    """Django view: rank PubMed publications relevant to a patient description.

    *graph* is an N3/Turtle document describing (exactly) one patient. The
    patient's clinical features are expanded through owl:sameAs links on the
    AERS SPARQL endpoint, then LinkedLifeData is queried for publications
    mentioning each feature; a tally of mentions ranks the publications.

    :param request: the Django request (unused beyond the view contract).
    :param graph: serialized N3/Turtle patient description.
    :return: HttpResponse with a JSON list of the 50 most frequent
        (pubmed_uri, count) pairs, or an error response.
    """
    try:
        cg = ConjunctiveGraph().parse(data=graph, format='n3')
    except Exception:
        return not_turtle_response(graph)

    DRUG = Namespace('http://aers.data2semantics.org/resource/drug/')
    PO = Namespace('http://www.data2semantics.org/ontology/patient/')
    UMLS = Namespace('http://linkedlifedata.com/resource/umls/id/')
    LS = Namespace('http://linkedlifedata.com/resource/lifeskim/')

    cg.bind('drug', DRUG)
    cg.bind('po', PO)
    cg.bind('umls', UMLS)
    cg.bind('lifeskim', LS)

    try:
        # any=False makes rdflib raise when several matches exist.
        patient = cg.value(predicate=RDF.type, object=PO['Patient'], any=False)
    except Exception:
        # More than one patient
        return multiple_patients_response(cg.serialize(format='turtle'))

    if (cg.value(predicate=PO['hasIndication'], object=UMLS['C0027947']) and
            cg.value(predicate=PO['hasMeasurement'], object=UMLS['C0015967'])):
        # We now know the patient has Febrile Neutropenia
        cg.add((patient, PO['hasIndication'], UMLS['C0746883']))

    aers_sparql = SPARQLWrapper("http://eculture2.cs.vu.nl:5020/sparql/")
    aers_sparql.setReturnFormat(JSON)

    lld_sparql = SPARQLWrapper("http://linkedlifedata.com/sparql")
    lld_sparql.setReturnFormat(JSON)

    ranking = Counter()

    # Chain generators for all values for the attributes of the patient
    features = itertools.chain(cg.objects(subject=patient, predicate=PO['hasIndication']), \
        cg.objects(subject=patient, predicate=PO['hasMeasurement']), \
        cg.objects(subject=patient, predicate=PO['usesMedication']), \
        cg.objects(subject=patient, predicate=PO['hadPreviousIndication']), \
        cg.objects(subject=patient, predicate=PO['hadRecentTreatment']))

    exp_features = set()
    q_part = ""

    # First get all sameAs uris for the values.
    # NOTE(review): feature URIs are interpolated directly into the SPARQL
    # text; they originate from the parsed RDF, but proper escaping would be
    # safer against malformed input.
    for f in features:
        if str(f).startswith('http://linkedlifedata.com'):
            exp_features.add(str(f))

        q_part += "{?altname owl:sameAs <"+f+"> .} UNION { <"+f+"> owl:sameAs ?altname .} UNION \n"

    # Drop the trailing " UNION \n" left by the last iteration.
    q_part = q_part[:-8]

    q = """
        PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
        PREFIX owl: <http://www.w3.org/2002/07/owl#>
        SELECT ?altname
        WHERE { """ + q_part + """ }
    """

    aers_sparql.setQuery(q)

    results = aers_sparql.query().convert()

    # Only query LLD for stuff that LLD knows about (saves quite some time)
    for result in results["results"]["bindings"]:
        if result["altname"]["value"].startswith('http://linkedlifedata.com'):
            exp_features.add(result["altname"]["value"])

    # Then lookup the publications that mention these, and add them to a tally (Counter)
    for ef in exp_features:
        q = """
            PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
            PREFIX lifeskim: <http://linkedlifedata.com/resource/lifeskim/>
            SELECT ?pubmed
            WHERE { ?pubmed lifeskim:mentions <"""+ef+"""> . }
            LIMIT 250
        """
        lld_sparql.setQuery(q)

        results = lld_sparql.query().convert()

        for result in results["results"]["bindings"]:
            ranking[result["pubmed"]["value"]] += 1

    # Return only the 50 most frequent publications
    ranking_json = json.dumps(ranking.most_common(50))
    return HttpResponse(ranking_json, mimetype='application/json')
Ejemplo n.º 49
0
# Create the Elasticsearch mapping for the thesaurus index, then walk every
# concept URI returned by `querystring` and index its SKOS labels per language.
print("creating mapping ...")
res = es_con.indices.put_mapping(index=index_name, doc_type="doc", body=thesaurus_mapping)
# Fixed typo in the log message ("resonse" -> "response").
print("response: {}".format(res))

i = 0
for uri in graph.query(querystring):
    this_uri = uri[0]
    doc = {"uri": this_uri}
    j = 0
    for lang in ['ar', 'zh', 'en', 'fr', 'ru', 'es']:
        # skos:prefLabel values for this language (label[1] is the literal).
        pref_labels = []
        for label in graph.preferredLabel(URIRef(this_uri), lang):
            pref_labels.append(label[1])
        doc.update({"labels_{}".format(lang): pref_labels})

        # skos:altLabel values filtered by their language tag.
        alt_labels = []
        for label in graph.objects(URIRef(this_uri), SKOS.altLabel):
            if label.language == lang:
                alt_labels.append(label)
        doc.update({"alt_labels_{}".format(lang): alt_labels})

        payload = json.dumps(doc)

        # NOTE(review): indexing happens inside the language loop, producing a
        # separate document per (uri, language) pair and resetting `doc` after
        # each index call — confirm this is intended rather than one document
        # carrying all six languages.
        res = es_con.index(index=index_name, doc_type='doc', body=payload)
        doc = {"uri": this_uri}
        j += 1
    i += j
    if i % 50 == 0:
        print("{} fields indexed".format(i))
Ejemplo n.º 50
0
class PreProcessor(object):
    """Load a knowledge graph and derive the entity/relation id dictionaries
    and event sequences needed by downstream embedding models.

    NOTE(review): ``update_entity_relation_dictionaries`` uses the
    Python-2-only ``unicode`` builtin, matching the rest of this codebase.
    """

    def __init__(self, kg_path):
        """
        :param kg_path: path to the serialized knowledge-graph file
        """
        self.kg_path = kg_path
        self.ent_dict = dict()   # entity URI -> integer embedding id
        self.rel_dict = dict()   # relation URI -> integer id
        self.g = ConjunctiveGraph()
        # Event-message subset of ent_dict; copied so later mutation of
        # ent_dict does not leak into unique_msgs.
        self.unique_msgs = self.ent_dict.copy()

    def load_knowledge_graph(self,
                             format='xml',
                             exclude_rels=None,
                             clean_schema=True,
                             amberg_params=None,
                             excluded_entities=None):
        """
        Load the graph from ``self.kg_path`` and strip unwanted triples,
        then (re)build the id dictionaries.

        :param format: rdflib serialization format of the input file
        :param exclude_rels: relations whose triples are removed
            (default changed from a shared mutable ``[]`` to ``None``;
            behavior for callers is unchanged)
        :param clean_schema: also drop triples between class-level constructs
        :param amberg_params: optional ``(path_to_events, max_events)`` pair
            enabling the Amberg-specific ontology update
        :param excluded_entities: entities whose triples are removed
        """
        self.g.load(self.kg_path, format=format)
        # remove triples with excluded relation
        remove_rel_triples(self.g, exclude_rels if exclude_rels is not None else [])
        # remove triples with relations between class-level constructs
        if clean_schema:
            remove_rel_triples(self.g, schema_relations)
        if excluded_entities is not None:
            remove_ent_triples(self.g, excluded_entities)
        if amberg_params:
            path_to_events, max_events = amberg_params
            self.merged = get_merged_dataframe(path_to_events, max_events)
            self.unique_msgs, unique_vars, unique_mods, unique_fes = get_unique_entities(
                self.merged)
            update_amberg_ontology(self.g, self.ent_dict, self.unique_msgs,
                                   unique_mods, unique_fes, unique_vars,
                                   self.merged)

        self.update_entity_relation_dictionaries()

    def update_entity_relation_dictionaries(self):
        """
        Assign integer ids to every entity and relation found in ``self.g``,
        preserving any ids already fixed in ``self.ent_dict``.
        """
        ent_counter = 0
        fixed_ids = set(self.ent_dict.values())
        # sorting ensures equal random splits on equal seeds
        for h in sorted(
                set(self.g.subjects(None, None)).union(
                    set(self.g.objects(None, None)))):
            uni_h = unicode(h)
            if uni_h not in self.ent_dict:
                # skip over ids already reserved by the fixed dictionary
                while ent_counter in fixed_ids:
                    ent_counter += 1
                self.ent_dict[uni_h] = ent_counter
                ent_counter += 1
        # add new relations to dict; ids follow insertion order
        for r in sorted(set(self.g.predicates(None, None))):
            uni_r = unicode(r)
            if uni_r not in self.rel_dict:
                self.rel_dict[uni_r] = len(self.rel_dict)

    def load_unique_msgs_from_txt(self, path, max_events=None):
        """
        Read a two-column csv text file of ``entity,embedding_id`` lines
        into ``self.ent_dict`` and ``self.unique_msgs``.

        :param path: path to the csv file
        :param max_events: if given, keep only the ``max_events`` entries
            with the smallest ids in ``self.unique_msgs``
        :return: the excluded entity names when ``max_events`` is set,
            otherwise ``None``
        """
        # Text mode (was "rb"): splitting bytes with a str separator
        # breaks under Python 3; text mode behaves the same on Python 2.
        with open(path, "r") as f:
            for line in f:
                split = line.split(',')
                try:
                    emb_id = int(split[1].strip())
                except (IndexError, ValueError):
                    # was a bare except:; now only malformed lines are skipped
                    print("Error reading id of {0} in given dictionary".format(
                        line))
                    # skip this event entity, treat it as common entity later on
                    continue
                self.ent_dict[split[0]] = emb_id
        # sort ascending w.r.t. embedding id, in case of later stripping
        self.unique_msgs = self.ent_dict.copy()
        if max_events is not None:
            all_msgs = sorted(self.unique_msgs.items(),
                              key=operator.itemgetter(1),
                              reverse=False)
            self.unique_msgs = dict(all_msgs[:max_events])
            excluded_events = dict(all_msgs[max_events:]).keys()
            return excluded_events

    def prepare_sequences(self, path_to_input, use_dict=True):
        """
        Read comma-separated integer sequences, one sequence per line.

        :param path_to_input: path to the sequence text file
        :param use_dict: if True, keep only ids present in
            ``self.unique_msgs``
        :return: list of integer lists, one per input line
        """
        print("Preparing sequential data...")
        # Hoisted set: membership test was an O(n) scan of dict.values()
        # for every element of every line.
        allowed = set(self.unique_msgs.values())
        # Text mode (was "rb") for Python-3-safe str splitting.
        with open(path_to_input, "r") as f:
            result = []
            for line in f:
                # parse once instead of calling int(e.strip()) twice per entry
                ids = [int(e.strip()) for e in line.split(',')]
                if use_dict:
                    result.append([v for v in ids if v in allowed])
                else:
                    result.append(ids)
        print("Processed {0} sequences".format(len(result)))
        return result

    def get_vocab_size(self):
        """Number of unique event messages."""
        return len(self.unique_msgs)

    def get_ent_dict(self):
        """Entity-to-id dictionary."""
        return self.ent_dict

    def get_rel_dict(self):
        """Relation-to-id dictionary."""
        return self.rel_dict

    def get_kg(self):
        """The loaded rdflib graph."""
        return self.g

    def get_unique_msgs(self):
        """Event-message subset of the entity dictionary."""
        return self.unique_msgs

    def get_merged(self):
        # NOTE(review): self.merged exists only after load_knowledge_graph
        # ran with amberg_params; otherwise this raises AttributeError.
        return self.merged
Ejemplo n.º 51
0
Archivo: rdfdb.py Proyecto: t00m/KB4IT
class KB4ITGraph:
    """
    RDF graph holding per-document attributes, with convenience helpers
    for querying it.
    """
    def __init__(self, path=None):
        """
        Build an in-memory graph when *path* is None; otherwise open a
        persistent Sleepycat-backed graph stored under *path*.
        """
        if path is None:
            # Volatile graph living only in memory
            self.graph = ConjunctiveGraph('IOMemory')
        else:
            # Durable graph persisted on disk
            self.path = path
            self.graph = ConjunctiveGraph('Sleepycat', URIRef("kb4it://"))
            self.graph.store.open(path + SEP + 'kb4it.graph')

        # Register the project's namespace bindings on this graph
        nsmgr = NamespaceManager(ConjunctiveGraph())
        for prefix in NSBINDINGS:
            nsmgr.bind(prefix, NSBINDINGS[prefix])
        self.graph.namespace_manager = nsmgr


    def __uniq_sort(self, result):
        # Deduplicate, then return the values sorted ascending.
        return sorted(set(result))


    def subjects(self, predicate, object):
        """
        Sorted, duplicate-free subjects matching a predicate and object.
        """
        return self.__uniq_sort(self.graph.subjects(predicate, object))


    def predicates(self, subject=None, object=None):
        """
        Sorted, duplicate-free predicates matching a subject and object.
        """
        return self.__uniq_sort(self.graph.predicates(subject, object))


    def objects(self, subject, predicate):
        """
        Sorted, duplicate-free objects matching a subject and predicate.
        """
        return self.__uniq_sort(self.graph.objects(subject, predicate))


    def value(self, subject=None, predicate=None, object=None, default=None, any=True):
        """
        Single value for the given subject/predicate/object pattern.
        """
        return self.graph.value(subject, predicate, object, default, any)


    def add_document(self, doc):
        """
        Register *doc* in the graph as a KB4IT Document.
        """
        self.graph.add((URIRef(doc), RDF['type'], URIRef(KB4IT['Document'])))


    def add_document_attribute(self, doc, attribute, value):
        """
        Attach attribute/value to a document via a has<Attribute> predicate.
        """
        pred = KB4IT['has%s' % attribute]
        self.graph.add((URIRef(doc), pred, Literal(value)))


    def get_attributes(self):
        """
        All predicates used in the graph except RDF.type and hasTitle.
        """
        excluded = {RDF['type'], KB4IT['hasTitle']}
        return sorted(set(self.graph.predicates(None, None)) - excluded)


    def serialize(self):
        """
        Serialize the graph to pretty xml format.
        """
        return self.graph.serialize(format='pretty-xml')


    def close(self):
        """
        Close the underlying store.
        FIXME: check if it is open
        """
        self.graph.store.close()