Ejemplo n.º 1
0
def compute_dataset(dataset):
    """Return the graph for *dataset*, building and caching it on first use.

    On a cache miss the graph is loaded with ``get_eml``, vectorized with
    ``vectorize_ontology`` (using the module-level ``idf`` and ``lsa_model``),
    and the resulting classes are attached to the graph as ``classes``
    before it is stored in ``dataset_cache``.
    """
    # Fast path: this dataset has already been processed.
    if dataset in dataset_cache:
        return dataset_cache[dataset]

    graph = get_eml(dataset)
    # Only the vectorized classes are kept; the other two results are unused.
    vectorized_classes, _local_idf, _lsa = vectorize_ontology(
        graph, idf, lsa_model)
    graph.classes = vectorized_classes

    dataset_cache[dataset] = graph
    return dataset_cache[dataset]
def compute_dataset(dataset):
    """Fetch the processed graph for *dataset*, memoized in ``dataset_cache``.

    A missing entry is created by loading the graph through ``get_eml``,
    running ``vectorize_ontology`` on it with the module-level ``idf`` and
    ``lsa_model``, and storing the returned classes on the graph's
    ``classes`` attribute.
    """
    already_cached = dataset in dataset_cache
    if not already_cached:
        eml_graph = get_eml(dataset)
        # vectorize_ontology yields three values; only the classes are used.
        class_vectors, _unused_idf, _unused_lsa = vectorize_ontology(
            eml_graph, idf, lsa_model)
        eml_graph.classes = class_vectors
        dataset_cache[dataset] = eml_graph

    return dataset_cache[dataset]
Ejemplo n.º 3
0
        def view(name=None):
            """Serialize suggested subject annotations for dataset *name*.

            Loads the source graph with ``get_eml(name)``, vectorizes its
            classes and computes distances between the source classes that
            are transitive subclasses of ``oboe:MeasurementType`` (excluding
            ``MeasurementType`` itself) and ``self.target_class_subtree``.
            Every source class in the distance table becomes a ``csvw:Column``
            resource with an XPointer fragment selector; the closest targets
            (at most ``config['top_hits']``, each below
            ``config['max_distance']``) are attached as ``dcterms:subject``.

            Args:
                name: Dataset identifier handed to ``get_eml`` and used to
                    build the dataset URI from ``self.NS.dataset``.

            Returns:
                Whatever ``sadi.serialize`` returns for the result graph.
            """
            # Debug trace; parenthesised so it parses on Python 2 and 3 alike.
            print(name)

            # NOTE(review): the request's Content-Type (not Accept) header is
            # what ends up as the ``accept`` argument below -- confirm intended.
            content_type = request.headers.get('Content-Type', '*/*')

            source_graph = get_eml(name)
            classes, _local_idf = vectorize_ontology(source_graph, self.idf)
            source_graph.classes = classes

            measurement_type = self.NS.oboe.MeasurementType
            source_subtree = set(
                source_graph.transitive_subjects(self.NS.RDFS.subClassOf,
                                                 measurement_type))
            source_class_subtree = [
                x for x in source_graph.classes
                if x.identifier in source_subtree
                and x.identifier != measurement_type
            ]

            distances = pairwise_sparsedist(source_class_subtree,
                                            self.target_class_subtree)

            result = rdflib.Graph()
            dataset = result.resource(self.NS.dataset[name])
            dataset.add(self.NS.RDF.type, self.NS.dcat.Dataset)
            for c, dist in distances.items():
                source_class = source_graph.resource(c)
                attr_id = source_class.value(self.urn.attributeId)
                entity_id = source_class.value(self.urn.entityId)
                selector = "xpointer(/eml/dataset/dataTable[%s]/attributeList/attribute[%s])" % (
                    entity_id, attr_id)

                # The column is identified by the dataset URI plus the
                # selector as fragment.
                attribute = result.resource(dataset.identifier + '#' +
                                            selector)
                attribute.add(self.NS.RDF.type, self.NS.csvw.Column)
                attribute.add(self.NS.RDFS.label, source_class.label())

                # Describe the same selector as an oa:FragmentSelector node.
                sel = result.resource(rdflib.BNode())
                attribute.add(self.NS.oa.hasSelector, sel)
                sel.add(self.NS.RDF.type, self.NS.oa.FragmentSelector)
                sel.add(self.NS.dcterms.conformsTo,
                        rdflib.URIRef("http://tools.ietf.org/rfc/rfc3023"))
                sel.add(self.NS.RDF.value, Literal(selector))

                # Smallest distances first, capped at the configured hit count.
                nearest = sorted(
                    dist.items(),
                    key=lambda x: x[1])[:self.config['top_hits']]
                for target, score in nearest:
                    if score < self.config['max_distance']:
                        attribute.add(self.NS.dcterms.subject, target)
                        result.add((target, self.NS.RDFS.label,
                                    self.target_graph.label(target)))

            return sadi.serialize(result, accept=content_type)
Ejemplo n.º 4
0
        def view(name=None):
            """Return serialized subject suggestions for the dataset *name*.

            The source graph comes from ``get_eml(name)`` and is vectorized
            with ``vectorize_ontology``. Source classes that are transitive
            subclasses of ``oboe:MeasurementType`` (``MeasurementType`` itself
            excluded) are compared against ``self.target_class_subtree`` via
            ``pairwise_sparsedist``. For each source class in the result a
            ``csvw:Column`` resource with an XPointer fragment selector is
            emitted, linked by ``dcterms:subject`` to up to
            ``config['top_hits']`` targets scoring below
            ``config['max_distance']``.

            Args:
                name: Dataset identifier passed to ``get_eml`` and used to
                    build the dataset URI from ``self.NS.dataset``.

            Returns:
                The result of ``sadi.serialize`` applied to the built graph.
            """
            # Debug trace; the parenthesised form is valid on Python 2 and 3.
            print(name)

            # NOTE(review): ``accept`` below is fed from the request's
            # Content-Type header rather than Accept -- confirm intended.
            content_type = request.headers.get('Content-Type', '*/*')

            source_graph = get_eml(name)
            classes, _local_idf = vectorize_ontology(source_graph, self.idf)
            source_graph.classes = classes

            measurement_type = self.NS.oboe.MeasurementType
            source_subtree = set(source_graph.transitive_subjects(self.NS.RDFS.subClassOf, measurement_type))
            source_class_subtree = [x for x in source_graph.classes
                                    if x.identifier in source_subtree and x.identifier != measurement_type]

            distances = pairwise_sparsedist(source_class_subtree, self.target_class_subtree)

            result = rdflib.Graph()
            dataset = result.resource(self.NS.dataset[name])
            dataset.add(self.NS.RDF.type, self.NS.dcat.Dataset)
            for c, dist in distances.items():
                source_class = source_graph.resource(c)
                attr_id = source_class.value(self.urn.attributeId)
                entity_id = source_class.value(self.urn.entityId)
                selector = "xpointer(/eml/dataset/dataTable[%s]/attributeList/attribute[%s])" % (entity_id, attr_id)

                # Column URI = dataset URI + '#' + selector.
                attribute = result.resource(dataset.identifier + '#' + selector)
                attribute.add(self.NS.RDF.type, self.NS.csvw.Column)
                attribute.add(self.NS.RDFS.label, source_class.label())

                # The selector is also attached as an oa:FragmentSelector node.
                sel = result.resource(rdflib.BNode())
                attribute.add(self.NS.oa.hasSelector, sel)
                sel.add(self.NS.RDF.type, self.NS.oa.FragmentSelector)
                sel.add(self.NS.dcterms.conformsTo, rdflib.URIRef("http://tools.ietf.org/rfc/rfc3023"))
                sel.add(self.NS.RDF.value, Literal(selector))

                # Ascending by distance, truncated to the configured hit count.
                nearest = sorted(dist.items(), key=lambda x: x[1])[:self.config['top_hits']]
                for target, score in nearest:
                    if score < self.config['max_distance']:
                        attribute.add(self.NS.dcterms.subject, target)
                        result.add((target, self.NS.RDFS.label, self.target_graph.label(target)))

            return sadi.serialize(result, accept=content_type)