class PropertiesAll(RDFStatInterface):
    """Per-property statistics: usage histogram plus distinct
    triple/subject/object counts, serialised as VoID property partitions."""

    def __init__(self, results):
        super(PropertiesAll, self).__init__(results)
        # every counter is also published through the shared results dict
        self.histogram = self.results['histogram'] = {}
        self.distinct = self.results['distinct'] = {}
        self.distinct_subject = self.results['distinct_subject'] = {}
        self.distinct_object = self.results['distinct_object'] = {}
        # bounded cache of already-seen subject/object combinations
        # FIXME: make limit configurable
        self.distinct_seen = LimitedSizeDict(size_limit=300000)

    @staticmethod
    def _seen_key(text):
        """Compact cache key: md5 digest for strings longer than 16 chars,
        the string itself otherwise."""
        return hashlib.md5(text).digest() if len(text) > 16 else text

    def count(self, s, p, o, s_blank, o_l, o_blank, statement):
        """Update all per-property counters for one triple (s, p, o)."""
        # overall usage histogram
        self.histogram[p] = self.histogram.get(p, 0) + 1

        # distinct triples per property (global spo cache lives in dh)
        triple_key = s + p + o
        if not dh.query_distinct_spo(triple_key, 0):
            dh.set_distinct_spo(triple_key, 0)
            self.distinct[p] = self.distinct.get(p, 0) + 1

        # distinct subjects per property
        subj_key = self._seen_key(s + p)
        if not self.distinct_seen.has_key(subj_key):
            self.distinct_seen[subj_key] = 1
            self.distinct_subject[p] = self.distinct_subject.get(p, 0) + 1

        # distinct objects per property
        obj_key = self._seen_key(p + o)
        if not self.distinct_seen.has_key(obj_key):
            self.distinct_seen[obj_key] = 1
            self.distinct_object[p] = self.distinct_object.get(p, 0) + 1

    def voidify(self, void_model, dataset):
        """Append the collected statistics to void_model as VoID triples."""
        # total number of distinct properties
        total_node = RDF.Node(literal=str(len(self.histogram)),
                              datatype=ns_xs.integer.uri)
        void_model.append(RDF.Statement(dataset, ns_void.properties, total_node))
        # one void:propertyPartition blank node per property
        for property_uri, triple_count in self.distinct.iteritems():
            partition = RDF.Node()
            void_model.append(RDF.Statement(dataset, ns_void.propertyPartition,
                                            partition))
            void_model.append(RDF.Statement(partition, ns_void.property,
                                            RDF.Uri(property_uri)))
            node = RDF.Node(literal=str(triple_count), datatype=ns_xs.integer.uri)
            void_model.append(RDF.Statement(partition, ns_void.triples, node))
            if self.distinct_subject.has_key(property_uri):
                node = RDF.Node(literal=str(self.distinct_subject[property_uri]),
                                datatype=ns_xs.integer.uri)
                void_model.append(RDF.Statement(partition,
                                                ns_void.distinctSubjects, node))
            if self.distinct_object.has_key(property_uri):
                node = RDF.Node(literal=str(self.distinct_object[property_uri]),
                                datatype=ns_xs.integer.uri)
                void_model.append(RDF.Statement(partition,
                                                ns_void.distinctObjects, node))

    def sparql(self, endpoint):
        pass
def __init__(self, results):
    """Create the per-property counters, each shared with the results dict."""
    super(PropertiesAll, self).__init__(results)
    for name in ('histogram', 'distinct', 'distinct_subject', 'distinct_object'):
        counter = {}
        setattr(self, name, counter)
        self.results[name] = counter
    # bounded cache of already-seen subject/object keys
    # FIXME: make limit configurable
    self.distinct_seen = LimitedSizeDict(size_limit=300000)
class PropertiesAll(RDFStatInterface):
    """Per-property statistics.

    Besides the plain property histogram this variant tracks, per property,
    the number of distinct triples/subjects/objects and the minimum/maximum
    value of numeric, date and geo literals.
    """

    # geo properties whose literal values are treated as floats even
    # without an explicit numeric datatype
    _GEO_PROPERTIES = ['http://www.w3.org/2003/01/geo/wgs84_pos#long',
                       'http://www.w3.org/2003/01/geo/wgs84_pos#lat',
                       'http://www.w3.org/2003/01/geo/wgs84_pos#alt']

    def __init__(self, results):
        super(PropertiesAll, self).__init__(results)
        # every counter is also published through the shared results dict
        self.histogram = self.results['histogram'] = {}
        self.distinct = self.results['distinct'] = {}
        self.distinct_subject = self.results['distinct_subject'] = {}
        self.distinct_object = self.results['distinct_object'] = {}
        self.min_value = self.results['min_value'] = {}
        self.max_value = self.results['max_value'] = {}
        # bounded cache of already-seen subject/object combinations
        # FIXME: make limit configurable
        self.distinct_seen = LimitedSizeDict(size_limit=300000)

    def _literal_value(self, p, o, statement):
        """Convert literal o into a comparable Python value (float, int or
        date string), or None if the datatype is unsupported or the literal
        is malformed."""
        datatype = str(statement.object.literal[2])
        try:
            if datatype in [str(ns_xs.decimal), str(ns_xs.float),
                            str(ns_xs.double)] or p in self._GEO_PROPERTIES:
                return float(o)
            if datatype in [str(ns_xs.int), str(ns_xs.integer)]:
                return int(o)
            if datatype in [str(ns_xs.dateTime), str(ns_xs.date)]:
                return o
        except (ValueError, TypeError):
            # BUGFIX: a malformed literal (e.g. "abc"^^xsd:int) used to raise
            # and abort the whole statistics run; skip it instead
            pass
        return None

    def count(self, s, p, o, s_blank, o_l, o_blank, statement):
        """Update all per-property counters for one triple (s, p, o)."""
        # count all properties
        self.histogram[p] = self.histogram.get(p, 0) + 1
        # distinct triples per property (global spo cache lives in dh)
        spo = s + p + o
        if not dh.query_distinct_spo(spo, 0):
            dh.set_distinct_spo(spo, 0)
            self.distinct[p] = self.distinct.get(p, 0) + 1
        # per subject
        sp = s + p
        if len(sp) > 16:
            sp_hash = hashlib.md5(sp).digest()
        else:
            sp_hash = sp
        if not self.distinct_seen.has_key(sp_hash):
            self.distinct_seen[sp_hash] = 1
            self.distinct_subject[p] = self.distinct_subject.get(p, 0) + 1
        # min/max value tracking for typed literals
        if o_l:
            value = self._literal_value(p, o, statement)
            if value is not None:
                if self.min_value.has_key(p):
                    self.min_value[p] = min(self.min_value[p], value)
                else:
                    self.min_value[p] = value
                if self.max_value.has_key(p):
                    self.max_value[p] = max(self.max_value[p], value)
                else:
                    self.max_value[p] = value
        # per object
        po = p + o
        if len(po) > 16:
            po_hash = hashlib.md5(po).digest()
        else:
            po_hash = po
        if not self.distinct_seen.has_key(po_hash):
            self.distinct_seen[po_hash] = 1
            self.distinct_object[p] = self.distinct_object.get(p, 0) + 1

    @staticmethod
    def _value_node(value):
        """Build a typed RDF literal node for a min/max value, or None when
        the value's type is not one we know how to serialise."""
        if isinstance(value, float):
            return RDF.Node(literal=str(value), datatype=ns_xs.decimal.uri)
        if isinstance(value, int):
            return RDF.Node(literal=str(value), datatype=ns_xs.integer.uri)
        # NOTE(review): date/dateTime values are kept as the raw literal
        # string; under Python 2 this may be unicode rather than str — the
        # original isinstance(value, str) check would then silently reuse a
        # stale node. We return None for unmatched types instead.
        if isinstance(value, str):
            return RDF.Node(literal=value, datatype=ns_xs.dateTime.uri)
        return None

    def voidify(self, void_model, dataset):
        """Append the collected statistics to void_model as VoID triples."""
        # total number of distinct properties
        result_node = RDF.Node(literal=str(len(self.histogram)),
                               datatype=ns_xs.integer.uri)
        void_model.append(RDF.Statement(dataset, ns_void.properties, result_node))
        # property partition: one blank node per property
        for property_uri, result in self.distinct.iteritems():
            pr_id = RDF.Node()
            void_model.append(RDF.Statement(dataset, ns_void.propertyPartition,
                                            pr_id))
            void_model.append(RDF.Statement(pr_id, ns_void.property,
                                            RDF.Uri(property_uri)))
            result_node = RDF.Node(literal=str(result),
                                   datatype=ns_xs.integer.uri)
            void_model.append(RDF.Statement(pr_id, ns_void.triples, result_node))
            if self.distinct_subject.has_key(property_uri):
                result_node = RDF.Node(
                    literal=str(self.distinct_subject[property_uri]),
                    datatype=ns_xs.integer.uri)
                void_model.append(RDF.Statement(pr_id, ns_void.distinctSubjects,
                                                result_node))
            if self.distinct_object.has_key(property_uri):
                result_node = RDF.Node(
                    literal=str(self.distinct_object[property_uri]),
                    datatype=ns_xs.integer.uri)
                void_model.append(RDF.Statement(pr_id, ns_void.distinctObjects,
                                                result_node))
            if self.min_value.has_key(property_uri):
                # BUGFIX: previously, an unmatched type fell through and
                # re-appended the stale result_node from above
                node = self._value_node(self.min_value[property_uri])
                if node is not None:
                    void_model.append(RDF.Statement(pr_id, ns_dstats.minValue,
                                                    node))
            if self.max_value.has_key(property_uri):
                node = self._value_node(self.max_value[property_uri])
                if node is not None:
                    void_model.append(RDF.Statement(pr_id, ns_dstats.maxValue,
                                                    node))

    def sparql(self, endpoint):
        pass
LODStats is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with LODStats. If not, see <http://www.gnu.org/licenses/>. """ import bitarray from hashlib import md5 from LimitedSizeDict import LimitedSizeDict # FIXME: does it help to build some small "md5-cache" for the last 1, 2, 3 strings?! # subjects distinct_subjects = LimitedSizeDict(size_limit=300000) # FIXME: make limit configurable # 0: entities, 1: typed subjects, 2: labeled subjects def query_distinct_subject(s, num_id): if len(s) > 16: s_hash = md5(s).digest() else: s_hash = s if distinct_subjects.has_key(s_hash): return distinct_subjects[s_hash][num_id] else: return False def set_distinct_subject(s, num_id): if len(s) > 16: s_hash = md5(s).digest()