def PredicateJoinCount(cursor, graph, realTables, joinField1, joinField2):
    d = {}
    # predField = 'predicate'
    # tableName = graph.store._internedId + '_all'
    for table1 in realTables:
        t1 = realTables[table1]
        if t1.columnNames[PREDICATE] is None:
            predStr = t1.hardCodedResultFields[PREDICATE]
            # BE: assuming this will only be the case for a URI (i.e. for rdf:type)
            pred1 = normalizeValue(predStr, 'U')
        else:
            pred1 = "t1.%s" % (t1.columnNames[PREDICATE])
        for table2 in realTables:
            t2 = realTables[table2]
            if t2.columnNames[PREDICATE] is None:
                predStr = t2.hardCodedResultFields[PREDICATE]
                # BE: assuming this will only be the case for a URI (i.e. for rdf:type)
                pred2 = normalizeValue(predStr, 'U')
            else:
                pred2 = "t2.%s" % (t2.columnNames[PREDICATE])
            cursor.execute("""
                SELECT %s AS pred1, %s AS pred2, COUNT(*) AS tupleCount
                FROM %s t1, %s t2
                WHERE t1.%s = t2.%s
                GROUP BY pred1, pred2""" % (
                    pred1, pred2, t1, t2,
                    t1.columnNames[joinField1], t2.columnNames[joinField2]))
            for (p1, p2, tupleCount) in cursor.fetchall():
                d[(p1, p2)] = tupleCount
    print(' Entries for join type (%s-%s) index: %s'
          % (joinField1, joinField2, len(d)))
    return d

def viewUnionSelectExpression(self, relations_only=False):
    """
    Return a SQL statement which creates a view of all the RDF statements
    from all the contributing partitions
    """
    rt = []
    if relations_only and self.objectPropertyTable:
        return "select * from %s" % repr(self)
    if self.useSignedInts:
        int_cast = 'BIGINT'
    else:
        int_cast = 'UNSIGNED BIGINT'
    for idx in range(len(POSITION_LIST)):
        rdfTermLabel = SlotPrefixes[idx]
        if idx < len(self.columnNames) and self.columnNames[idx]:
            # there is a matching column
            rt.append(self.columnNames[idx] + ' as %s' % rdfTermLabel)
            if self.termEnumerations[idx]:
                # there is a corresponding term enumeration
                rt.append(self.columnNames[idx] +
                          '_term as %s_term' % (rdfTermLabel,))
            else:
                # no corresponding term enumeration (hardcoded)
                rt.append("CAST('%s' as CHAR) as %s_term" %
                          (self.hardCodedResultTermsTypes[idx], rdfTermLabel))
        else:
            assert self.hardCodedResultFields[idx] == RDF.type
            if not self.store.can_cast_bigint:
                rt.append("%s as %s" % (
                    normalizeValue(self.hardCodedResultFields[idx], 'U',
                                   self.useSignedInts),
                    rdfTermLabel))
            else:
                rt.append("CAST('%s' as %s) as %s" % (
                    normalizeValue(self.hardCodedResultFields[idx], 'U',
                                   self.useSignedInts),
                    int_cast, rdfTermLabel))
            if self.hardCodedResultTermsTypes[idx]:
                rt.append("CAST('%s' as CHAR) as %s_term" %
                          (self.hardCodedResultTermsTypes[idx], rdfTermLabel))
    if not relations_only:
        if self.literalTable:
            for i in self.columnNames[-2:]:
                rt.append(i[0])
        else:
            if not self.store.can_cast_bigint:
                rt.append('NULL as data_type')
            else:
                rt.append('CAST(NULL as %s) as data_type' % (int_cast,))
            rt.append('CAST(NULL as char(3)) as language')
    return "select %s from %s" % (', '.join(rt), repr(self))

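def _exampleCreateUnionViews(store, cursor):
    """
    Hedged usage sketch (not part of the original module; the names follow
    the commented-out block in test_dType_encoding below): union each
    partition's select expression into one view per suffix.
    """
    for suffix, (relations_only, tables) in store.viewCreationDict.items():
        query = 'create view %s%s as %s' % (
            store._internedId,
            suffix,
            ' union all '.join(t.viewUnionSelectExpression(relations_only)
                               for t in tables))
        cursor.execute(query)
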
def GetCachedStats(graph, cacheFolder, alwaysGen=False, genMissing=True,
                   doJoins=False):
    fileName = os.path.join(cacheFolder, graph.store.identifier + "-" + str(
        normalizeValue(graph.store.configuration, "L"))) + ".cache"
    version = "0.1"
    genStats = genMissing
    stats = LoadCachedStats(fileName, version)
    if stats is not None:
        genStats = alwaysGen
    if genStats:
        print('Generating data statistics...')
        startTime = time.time()
        # this version of GetDatabaseStats takes the store, not the graph
        stats = GetDatabaseStats(graph.store, stats, doJoins)  # update stats
        print(' done in %s s' % (time.time() - startTime))
        # save stats to disk
        print('Saving data statistics...')
        startTime = time.time()
        f = open(fileName, 'wb')  # binary mode for pickle
        cPickle.dump(version, f)
        cPickle.dump(stats, f)
        f.close()
        #print(' done in %s s' % (time.time() - startTime))
    return stats

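def _exampleGetCachedStats(cacheFolder='/tmp/rdf-stats-cache'):
    """
    Hedged usage sketch (not part of the original module): open a MySQL
    store as test_dType_encoding below does and fetch (or build) its
    statistics. The cache folder path is a hypothetical example.
    """
    store = plugin.get('MySQL', Store)()
    store.open(configString)
    graph = Graph(store)
    stats = GetCachedStats(graph, cacheFolder)
    print(stats['triples'], stats['subjects'], stats['predicates'])
    store.close()
    return stats
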
def CountDistinctForColumn(cursor, table, mainColumn, countColumn):
    d = {}
    if table.columnNames[mainColumn] is None:
        pred = table.hardCodedResultFields[mainColumn]
        # BE: assuming this will only be the case for a URI (i.e. for rdf:type)
        predInt = normalizeValue(pred, 'U')
        cursor.execute("""
            SELECT COUNT(DISTINCT %s) AS objCount
            FROM %s;""" % (table.columnNames[countColumn], table))
        for (objCount,) in cursor.fetchall():  # note the one-tuple unpacking
            d[predInt] = objCount
    else:
        cursor.execute("""
            SELECT %s AS pred, COUNT(DISTINCT %s) AS objCount
            FROM %s
            GROUP BY %s;""" % (table.columnNames[mainColumn],
                               table.columnNames[countColumn],
                               table,
                               table.columnNames[mainColumn]))
        for (pred, objCount) in cursor.fetchall():
            d[pred] = objCount
    print(' Distinct value entries in %s for column %s: %s'
          % (countColumn, mainColumn, len(d)))
    return d

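def _exampleCountDistinctForColumn(store, cursor):
    """
    Hedged usage sketch: distinct objects per predicate on the
    binary-relations partition, as GetDatabaseStats below does for each
    partition. Keys are predicate half-hash integers.
    """
    return CountDistinctForColumn(cursor, store.binaryRelations,
                                  PREDICATE, OBJECT)
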
def makeRowComponents(self, quadSlots):
    # Build the flat row tuple for one quad: half-hash integers and term
    # letters, plus the object's datatype hash and language tag (or None).
    subjSlot, predSlot, objSlot, conSlot = quadSlots
    dTypeParam = objSlot.term.datatype and normalizeValue(
        objSlot.term.datatype, 'U', self.useSignedInts) or None
    langParam = objSlot.term.language or None
    return (subjSlot.md5Int, subjSlot.termType,
            predSlot.md5Int, predSlot.termType,
            objSlot.md5Int,
            conSlot.md5Int, conSlot.termType,
            dTypeParam, langParam)

def defaultStatements(self):
    """
    Since rdf:type is modeled explicitly (in the ABOX partition) it must
    be inserted as a 'default' identifier.
    """
    return ["INSERT INTO %s VALUES (%s, 'U', '%s');" %
            (self,
             normalizeValue(RDF.type, 'U', self.useSignedInts),
             RDF.type)]

def compileQuadToParams(self, quadSlots):
    subjSlot, predSlot, objSlot, conSlot = quadSlots
    dTypeParam = objSlot.term.datatype and normalizeValue(
        objSlot.term.datatype, 'U', self.useSignedInts) or None
    langParam = objSlot.term.language or None
    rtList = [subjSlot.md5Int,
              term2Letter(subjSlot.term),
              predSlot.md5Int,
              term2Letter(predSlot.term),
              objSlot.md5Int,
              conSlot.md5Int,
              term2Letter(conSlot.term)]
    for item in [dTypeParam, langParam]:
        if item:
            rtList.append(item)
    return tuple(rtList)

def GetCachedStats(graph, cacheFolder):
    fileName = os.path.join(cacheFolder, graph.store.identifier + "-" +
                            str(normalizeValue(graph.store.configuration,
                                               "L"))) + ".cache"
    version = "0.1"
    genStats = True
    if os.path.exists(fileName):
        print('Reloading data statistics from cache file...')
        # reload previously created data stats cache file
        f = open(fileName, 'rb')  # binary mode for pickle
        loadVersion = cPickle.load(f)
        if version == loadVersion:
            startTime = time.time()
            stats = cPickle.load(f)
            genStats = False
            f.close()
            print(' done in %s s' % (time.time() - startTime))
        else:
            f.close()
            print('Saved statistics are in the wrong version! '
                  'They must be re-generated.')
            os.remove(fileName)
    if genStats:
        print('Generating data statistics...')
        startTime = time.time()
        stats = GetDatabaseStats(graph)
        print(' done in %s s' % (time.time() - startTime))
        # save stats to disk
        print('Saving data statistics...')
        startTime = time.time()
        f = open(fileName, 'wb')
        cPickle.dump(version, f)
        cPickle.dump(stats, f)
        f.close()
        #print(' done in %s s' % (time.time() - startTime))
    return stats

def test_dType_encoding():
    # storetest = True
    # correct = normalizeValue('http://www.w3.org/2001/XMLSchema#integer', 'U')
    wrong = normalizeValue('http://www.w3.org/2001/XMLSchema#integer', 'L')
    store = plugin.get('MySQL', Store)()
    store.destroy(configString)
    store.open(configString, create=True)
    Graph(store).add((BNode(), URIRef('foo'), Literal(1)))
    db = store._db
    cursor = db.cursor()
    cursor.execute("select * from %s where data_type = '%s'" %
                   (store.literalProperties, wrong))
    assert not cursor.fetchone(), "Datatype encoding bug!"
    # for suffix, (relations_only, tables) in store.viewCreationDict.items():
    #     query = 'create view %s%s as %s' % (
    #         store._internedId,
    #         suffix,
    #         ' union all '.join([t.viewUnionSelectExpression(relations_only)
    #                             for t in tables]))
    #     print "## Creating View ##\n", query
    store.rollback()
    store.destroy(configString)
    store.close()

def GetDatabaseStats(graph):
    print('Gathering statistics...')
    startTime = time.time()
    stats = dict()
    # ISSUE: len(graph) only gives the count for the default graph???
    stats['triples'] = 0  # len(graph)
    stats['cacheName'] = graph.store.identifier + "-" + \
        str(normalizeValue(graph.store.configuration, "L"))
    stats['storeName'] = graph.store.identifier
    stats['internedId'] = graph.store._internedId
    stats['config'] = graph.store.configuration
    tables = dict(type=graph.store.aboxAssertions,
                  lit=graph.store.literalProperties,
                  rel=graph.store.binaryRelations,
                  all=graph.store._internedId + '_all')
    # FIXME Unused code
    realTables = dict(type=graph.store.aboxAssertions,
                      lit=graph.store.literalProperties,
                      rel=graph.store.binaryRelations)
    cursor = graph.store._db.cursor()

    # distinct num. of subjects, predicates, & objects
    tableType = 'all'
    statStartTime = time.time()
    stats['subjects'] = CountDistint(cursor, tables[tableType], 'subject')
    stats['predicates'] = CountDistint(cursor, tables[tableType], 'predicate')
    stats['objects'] = CountDistint(cursor, tables[tableType], 'object')
    stats['distTime'] = time.time() - statStartTime

    for tableType in ['lit', 'rel', 'type']:
        table = tables[tableType]

        # total # of triples
        cursor.execute("""
            SELECT COUNT(*)
            FROM %s """ % table)
        triples = cursor.fetchone()[0]
        stats[tableType + '_triples'] = triples
        stats['triples'] = stats['triples'] + triples
        print(' Processing table %s: %s triples...' % (tableType, triples))

        # distinct num. of subjects, predicates, & objects
        statStartTime = time.time()
        stats[tableType + '_subjects'] = CountDistint(
            cursor, table, table.columnNames[SUBJECT])
        stats[tableType + '_predicates'] = CountDistint(
            cursor, table, table.columnNames[PREDICATE])
        stats[tableType + '_objects'] = CountDistint(
            cursor, table, table.columnNames[OBJECT])
        stats[tableType + '_distTime'] = time.time() - statStartTime

        # subject/object counts for predicates
        statStartTime = time.time()
        stats[tableType + '_colDist'] = {}
        stats[tableType + '_colDist']['obj_for_pred'] = \
            CountDistinctForColumn(cursor, table, PREDICATE, OBJECT)
        stats[tableType + '_colDist']['sub_for_pred'] = \
            CountDistinctForColumn(cursor, table, PREDICATE, SUBJECT)
        stats[tableType + '_colDistTime'] = time.time() - statStartTime

        # triple pattern occurrence counts
        statStartTime = time.time()
        stats[tableType + '_pat'] = {}
        stats[tableType + '_pat']['(%s,?,?)' % (SUBJECT)] = CountTriples(
            cursor, table, [SUBJECT], [PREDICATE, OBJECT])
        stats[tableType + '_pat']['(?,%s,?)' % (PREDICATE)] = CountTriples(
            cursor, table, [PREDICATE], [SUBJECT, OBJECT])
        stats[tableType + '_pat']['(?,?,%s)' % (OBJECT)] = CountTriples(
            cursor, table, [OBJECT], [SUBJECT, PREDICATE])
        stats[tableType + '_pat']['(%s,%s,?)' % (SUBJECT, PREDICATE)] = \
            CountTriples(cursor, table, [SUBJECT, PREDICATE], [OBJECT])
        stats[tableType + '_pat']['(?,%s,%s)' % (PREDICATE, OBJECT)] = \
            CountTriples(cursor, table, [PREDICATE, OBJECT], [SUBJECT])
        stats[tableType + '_pat']['(%s,?,%s)' % (SUBJECT, OBJECT)] = \
            CountTriples(cursor, table, [SUBJECT, OBJECT], [PREDICATE])
        stats[tableType + '_patTime'] = time.time() - statStartTime

    # predicate co-occurrence
    statStartTime = time.time()
    # stats['join_s-s'] = PredicateJoinCount(cursor, graph, realTables,
    #                                        SUBJECT, SUBJECT)
    # stats['join_s-o'] = PredicateJoinCount(cursor, graph, realTables,
    #                                        SUBJECT, OBJECT)
    # stats['join_o-s'] = PredicateJoinCount(cursor, graph, realTables,
    #                                        OBJECT, SUBJECT)
    # stats['join_o-o'] = PredicateJoinCount(cursor, graph, realTables,
    #                                        OBJECT, OBJECT)
    stats['joinTime'] = time.time() - statStartTime

    cursor.close()
    endTime = time.time() - startTime
    print('Statistics gathered in %s s' % (endTime))
    stats['elapsedTime'] = endTime
    return stats

def CountTriples(cursor, table, specifiedColumns, variableColumns):
    d = {}
    specCols = []
    hardCodedSpecCols = []
    varCols = []
    # hardCodedVarCols = []  # not needed
    indexPos = {}
    for i in specifiedColumns:
        if table.columnNames[i] is not None:
            indexPos[i] = ('spec', len(specCols))
            specCols.append(table.columnNames[i])
        else:
            indexPos[i] = ('hard', len(hardCodedSpecCols))
            # BE: assuming this will only be the case for a URI (i.e. for rdf:type)
            hardCodedSpecCols.append(
                normalizeValue(table.hardCodedResultFields[i], 'U'))
    for i in variableColumns:
        indexPos[i] = ('var', -1)
        if table.columnNames[i] is not None:
            varCols.append(table.columnNames[i])
        # else:
        #     hardCodedVarCols.append(
        #         normalizeValue(table.hardCodedResultFields[i], 'U'))

    # Assumes column lists in (s,p,o) order
    if len(specCols) == 0:
        cursor.execute("""
            SELECT COUNT(*) AS tripleCount
            FROM %s;""" % (table))
    if len(specCols) == 1:
        cursor.execute("""
            SELECT %s AS givenCol, COUNT(*) AS tripleCount
            FROM %s
            GROUP BY %s;""" % (specCols[0], table, specCols[0]))
    elif len(specCols) == 2:
        cursor.execute("""
            SELECT %s AS givenCol1, %s AS givenCol2, COUNT(*) AS tripleCount
            FROM %s
            GROUP BY %s, %s;""" % (specCols[0], specCols[1], table,
                                   specCols[0], specCols[1]))
    elif len(specCols) == 3:
        # note the comma before COUNT(*), which was missing
        cursor.execute("""
            SELECT %s AS givenCol1, %s AS givenCol2, %s AS givenCol3,
                   COUNT(*) AS tripleCount
            FROM %s
            GROUP BY %s, %s, %s;""" % (specCols[0], specCols[1], specCols[2],
                                       table,
                                       specCols[0], specCols[1], specCols[2]))
    for t in cursor.fetchall():
        key = []
        for i in (SUBJECT, PREDICATE, OBJECT, CONTEXT):
            if i in indexPos:  # dict.has_key() was removed in Python 3
                (colType, pos) = indexPos[i]
                if colType == 'spec':
                    key.append('%s=%s' % (i, t[pos]))
                elif colType == 'hard':
                    key.append('%s=%s' % (i, hardCodedSpecCols[pos]))
        d[','.join(key)] = t[len(t) - 1]
    names = []
    for i in (SUBJECT, PREDICATE, OBJECT, CONTEXT):
        if i in indexPos:
            (colType, pos) = indexPos[i]
            if colType == 'var':
                names.append('?')
            elif colType in ('spec', 'hard'):
                names.append(str(i))
    print(' Entries for triple pattern (%s) index: %s'
          % (','.join(names), len(d)))
    return d

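def _exampleCountTriples(store, cursor):
    """
    Hedged usage sketch: occurrence counts for the (?,p,?) pattern on the
    literal-properties partition. Keys are '<position>=<half-hash>'
    strings and values are the matching triple counts, mirroring the key
    construction in CountTriples above.
    """
    counts = CountTriples(cursor, store.literalProperties,
                          [PREDICATE], [SUBJECT, OBJECT])
    for key, n in counts.items():
        print(key, n)
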
def GetDatabaseStats(store, stats=None, doJoins=False):
    print('Gathering statistics...')
    startTime = time.time()
    if stats is None:
        stats = dict()
        stats['triples'] = 0
    stats['cacheName'] = store.identifier + "-" + str(
        normalizeValue(store.configuration, "L"))
    stats['storeName'] = store.identifier
    stats['internedId'] = store._internedId
    stats['config'] = store.configuration
    tables = dict(type=store.aboxAssertions,
                  lit=store.literalProperties,
                  rel=store.binaryRelations,
                  all=store._internedId + '_all')
    realTables = dict(type=store.aboxAssertions,
                      lit=store.literalProperties,
                      rel=store.binaryRelations)
    cursor = store._db.cursor()

    # distinct num. of subjects, predicates, & objects
    # (NOTE: we always want these!)
    statStartTime = time.time()
    stats['subjects'] = CountDistint(cursor, tables['all'], 'subject')
    stats['predicates'] = CountDistint(cursor, tables['all'], 'predicate')
    stats['objects'] = CountDistint(cursor, tables['all'], 'object')
    stats['distTime'] = time.time() - statStartTime

    if 'colDistTime' not in stats:
        stats['colDistTime'] = 0
        stats['(%s,?,?)_patTime' % (SUBJECT)] = 0
        stats['(?,%s,?)_patTime' % (PREDICATE)] = 0
        stats['(?,?,%s)_patTime' % (OBJECT)] = 0
        stats['(%s,%s,?)_patTime' % (SUBJECT, PREDICATE)] = 0
        stats['(?,%s,%s)_patTime' % (PREDICATE, OBJECT)] = 0
        stats['(%s,?,%s)_patTime' % (SUBJECT, OBJECT)] = 0
    tableType = 'lit'
    for h in histogramSizes:
        if (tableType + '_pat') not in stats \
                or '(%sh%s,?,?)' % (SUBJECT, h) not in stats[tableType + '_pat']:
            stats['(%sh%s,?,?)_patTime' % (SUBJECT, h)] = 0
            stats['(?,?,%sh%s)_patTime' % (OBJECT, h)] = 0
            stats['(?,%s,%sh%s)_patTime' % (PREDICATE, OBJECT, h)] = 0

    for tableType in ['lit', 'rel', 'type']:
        table = tables[tableType]

        # Statistics on the ENTIRE DATABASE
        # (completely unspecified triple pattern)

        # total # of triples (NOTE: we always want these!)
        if (tableType + '_triples') not in stats:
            cursor.execute("""
                SELECT COUNT(*)
                FROM %s """ % table)
            triples = cursor.fetchone()[0]
            stats[tableType + '_triples'] = triples
            print(' Processing table %s: %s triples...'
                  % (tableType, triples))
            stats['triples'] += stats[tableType + '_triples']

        # distinct num. of subjects, predicates, & objects
        if (tableType + '_subjects') not in stats:
            statStartTime = time.time()
            stats[tableType + '_subjects'] = CountDistint(
                cursor, table, table.columnNames[SUBJECT])
            stats[tableType + '_predicates'] = CountDistint(
                cursor, table, table.columnNames[PREDICATE])
            stats[tableType + '_objects'] = CountDistint(
                cursor, table, table.columnNames[OBJECT])
            stats[tableType + '_distTime'] = time.time() - statStartTime

        # subject/object counts for predicates (NOTE: used for the greedy
        # ordering algorithm and some cost formulas; always want)
        # if (tableType + '_colDist') not in stats:
        statStartTime = time.time()
        stats[tableType + '_colDist'] = {}
        stats[tableType + '_colDist']['obj_for_pred'] = \
            CountDistinctForColumn(cursor, table, PREDICATE, OBJECT)
        stats[tableType + '_colDist']['sub_for_pred'] = \
            CountDistinctForColumn(cursor, table, PREDICATE, SUBJECT)
        stats[tableType + '_colDistTime'] = time.time() - statStartTime
        stats['colDistTime'] += stats[tableType + '_colDistTime']

        # triple pattern occurrence counts
        # (NOTE: takes too much space to store all of these! Choose wisely)
        if (tableType + '_pat') not in stats:
            stats[tableType + '_pat'] = {}
        if '(%s,?,?)' % (SUBJECT) not in stats[tableType + '_pat']:
            statStartTime = time.time()
            # may be useful if lots of queries ask for everything about a
            # particular subject (but only if you are joining the object,
            # etc.); suggest a histogram or even an average
            stats[tableType + '_pat']['(%s,?,?)' % (SUBJECT)] = CountTriples(
                cursor, table, [SUBJECT], [PREDICATE, OBJECT])
            # NOTE: always want this!! Small and very useful!
            stats[tableType + '_pat']['(?,%s,?)' % (PREDICATE)] = CountTriples(
                cursor, table, [PREDICATE], [SUBJECT, OBJECT])
            stats[tableType + '_pat']['(?,?,%s)' % (OBJECT)] = CountTriples(
                cursor, table, [OBJECT], [SUBJECT, PREDICATE])
            # if wanted, suggest a histogram
            stats[tableType + '_pat']['(%s,%s,?)' % (SUBJECT, PREDICATE)] = \
                CountTriples(cursor, table, [SUBJECT, PREDICATE], [OBJECT])
            # NOTE: 2nd most useful, but needs ~1/3T space; suggest a
            # histogram instead
            stats[tableType + '_pat']['(?,%s,%s)' % (PREDICATE, OBJECT)] = \
                CountTriples(cursor, table, [PREDICATE, OBJECT], [SUBJECT])
            # NOTE: basically useless!
            # stats[tableType + '_pat']['(%s,?,%s)' % (SUBJECT, OBJECT)] = \
            #     CountTriples(cursor, table, [SUBJECT, OBJECT], [PREDICATE])
            stats[tableType + '_patTime'] = time.time() - statStartTime
            stats['(%s,?,?)_patTime' % (SUBJECT)] += \
                stats[tableType + '_pat']['(%s,?,?)' % (SUBJECT)]['countTime']
            stats['(?,%s,?)_patTime' % (PREDICATE)] += \
                stats[tableType + '_pat']['(?,%s,?)' % (PREDICATE)]['countTime']
            stats['(?,?,%s)_patTime' % (OBJECT)] += \
                stats[tableType + '_pat']['(?,?,%s)' % (OBJECT)]['countTime']
            stats['(%s,%s,?)_patTime' % (SUBJECT, PREDICATE)] += \
                stats[tableType + '_pat'][
                    '(%s,%s,?)' % (SUBJECT, PREDICATE)]['countTime']
            stats['(?,%s,%s)_patTime' % (PREDICATE, OBJECT)] += \
                stats[tableType + '_pat'][
                    '(?,%s,%s)' % (PREDICATE, OBJECT)]['countTime']
            # disabled along with the (s,?,o) count above, which would
            # otherwise raise a KeyError here
            # stats['(%s,?,%s)_patTime' % (SUBJECT, OBJECT)] += \
            #     stats[tableType + '_pat'][
            #         '(%s,?,%s)' % (SUBJECT, OBJECT)]['countTime']

        # histograms: (class(s),-,-), (-,-,class(o)), (-,p,class(o))
        for h in histogramSizes:
            if '(%sh%s,?,?)' % (SUBJECT, h) not in stats[tableType + '_pat']:
                # NOTE: if using the real value for a particular triple
                # pattern, then disable the histogram version
                # NOTE: these can move out of the loop, with different
                # histogram sizes for each type of triple pattern
                # (modify the formulas accordingly)
                stats[tableType + '_pat']['(%sh%s,?,?)' % (SUBJECT, h)] = \
                    CountTriples(cursor, table, [SUBJECT],
                                 [PREDICATE, OBJECT], [SUBJECT], h)
                stats['(%sh%s,?,?)_patTime' % (SUBJECT, h)] += \
                    stats[tableType + '_pat'][
                        '(%sh%s,?,?)' % (SUBJECT, h)]['countTime']
                stats[tableType + '_pat']['(?,?,%sh%s)' % (OBJECT, h)] = \
                    CountTriples(cursor, table, [OBJECT],
                                 [SUBJECT, PREDICATE], [OBJECT], h)
                stats['(?,?,%sh%s)_patTime' % (OBJECT, h)] += \
                    stats[tableType + '_pat'][
                        '(?,?,%sh%s)' % (OBJECT, h)]['countTime']
                stats[tableType + '_pat'][
                    '(?,%s,%sh%s)' % (PREDICATE, OBJECT, h)] = \
                    CountTriples(cursor, table, [PREDICATE, OBJECT],
                                 [SUBJECT], [OBJECT], h)
                stats['(?,%s,%sh%s)_patTime' % (PREDICATE, OBJECT, h)] += \
                    stats[tableType + '_pat'][
                        '(?,%s,%sh%s)' % (PREDICATE, OBJECT, h)]['countTime']
                # NOTE: may want to add a subject-predicate histogram here
                # (if using that pattern frequently)

    # predicate co-occurrence
    # NOTE: this is very expensive and only used by the Stocker WWW2008
    # method (i.e. we don't need these)
    if False:  # doJoins:
        if 'joinTime' not in stats:
            statStartTime = time.time()
            stats['join_s-s'] = PredicateJoinCount(cursor, realTables,
                                                   SUBJECT, SUBJECT)
            stats['join_s-o'] = PredicateJoinCount(cursor, realTables,
                                                   SUBJECT, OBJECT)
            stats['join_o-s'] = PredicateJoinCount(cursor, realTables,
                                                   OBJECT, SUBJECT)
            stats['join_o-o'] = PredicateJoinCount(cursor, realTables,
                                                   OBJECT, OBJECT)
            stats['joinTime'] = time.time() - statStartTime

    cursor.close()
    endTime = time.time() - startTime
    print('Statistics gathered in %s s' % (endTime))
    stats['elapsedTime'] = endTime
    return stats

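def _exampleSelectivityFromStats(stats, pred):
    """
    Hedged sketch: estimate the selectivity of a (?,p,?) pattern on the
    literal-properties partition from the gathered stats. The key format
    mirrors the one built in CountTriples ('<position>=<half-hash>');
    pred must already be a half-hash integer, e.g.
    normalizeValue(RDF.type, 'U').
    """
    key = '%s=%s' % (PREDICATE, pred)
    count = stats['lit_pat']['(?,%s,?)' % (PREDICATE)].get(key, 0)
    return count / float(stats['triples'])
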
def generateWhereClause(self, queryPattern):
    """
    Takes a query pattern (a list of quad terms -
    subject, predicate, object, context) and generates a SQL WHERE clause
    which works in conjunction with the intersections to filter the
    result set by partial matching (by REGEX), full matching (by integer
    half-hash), and term types. For maximally efficient SELECT queries
    """
    whereClauses = []
    whereParameters = []
    asserted = dereferenceQuad(CONTEXT, queryPattern) is None
    for idx in SlotPrefixes.keys():
        queryTerm = dereferenceQuad(idx, queryPattern)
        lookupAlias = 'rt_' + SlotPrefixes[idx]
        if idx == CONTEXT and asserted:
            whereClauses.append("%s.%s_term != 'F'" %
                                (self, self.columnNames[idx]))
        if idx < len(POSITION_LIST) and isinstance(queryTerm, REGEXTerm):
            whereClauses.append("%s.lexical REGEXP " % lookupAlias + "%s")
            whereParameters.append(queryTerm)
        elif idx == CONTEXT \
                and isinstance(queryTerm, Graph) \
                and isinstance(queryTerm.identifier, REGEXTerm):
            whereClauses.append("%s.lexical REGEXP " % lookupAlias + "%s")
            whereParameters.append(queryTerm.identifier)
        elif idx < len(POSITION_LIST) and queryTerm is not Any:
            if self.columnNames[idx]:
                if isinstance(queryTerm, list):
                    whereClauses.append(
                        "%s.%s" % (self, self.columnNames[idx]) +
                        " in (%s)" % ','.join(
                            ['%s' for item in range(len(queryTerm))]))
                    whereParameters.extend(
                        [normalizeValue(item, term2Letter(item),
                                        self.useSignedInts)
                         for item in queryTerm])
                else:
                    whereClauses.append(
                        "%s.%s" % (self, self.columnNames[idx]) + " = %s")
                    whereParameters.append(
                        normalizeValue(queryTerm, term2Letter(queryTerm),
                                       self.useSignedInts))
            if idx not in self.hardCodedResultTermsTypes \
                    and self.termEnumerations[idx] \
                    and not isinstance(queryTerm, list):
                whereClauses.append(
                    "%s.%s_term" % (self, self.columnNames[idx]) + " = %s")
                whereParameters.append(term2Letter(queryTerm))
        elif idx >= len(POSITION_LIST) \
                and len(self.columnNames) > len(POSITION_LIST) \
                and queryTerm is not None:
            compVal = idx == DATATYPE_INDEX and normalizeValue(
                queryTerm, term2Letter(queryTerm),
                self.useSignedInts) or queryTerm
            whereClauses.append(
                "%s.%s" % (self, self.columnNames[idx][0]) + " = %s")
            whereParameters.append(compVal)
    return ' AND '.join(whereClauses), whereParameters

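def _exampleGenerateWhereClause(table, cursor, subject, predicate):
    """
    Hedged usage sketch: build the WHERE fragment for an asserted
    (s, p, ?) quad pattern against one partition (Any marks unconstrained
    slots; a context of None means asserted statements only) and run a
    bare SELECT with it. A real caller would combine this with the
    identifier-lookup joins mentioned in the docstring above.
    """
    queryPattern = [subject, predicate, Any, None]
    clause, params = table.generateWhereClause(queryPattern)
    cursor.execute("SELECT * FROM %s WHERE %s" % (table, clause), params)
    return cursor.fetchall()
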
def updateIdentifierQueue(self, termList):
    for term, termType in termList:
        md5Int = normalizeValue(term, termType, self.useSignedInts)
        self.hashUpdateQueue[md5Int] = self.normalizeTerm(term)

def GarbageCollectionQUERY(idHash, valueHash, aBoxPart, binRelPart, litPart):
    """
    Performs garbage collection on interned identifiers and their
    references. Joins the given KB partitions against the identifiers and
    values and removes the 'danglers'. This must be performed after every
    removal of an assertion and so becomes a primary bottleneck.
    """
    purgeQueries = ["drop temporary table if exists danglingIds"]
    rdfTypeInt = normalizeValue(RDF.type, 'U')
    idHashKeyName = idHash.columns[0][0]
    valueHashKeyName = valueHash.columns[0][0]
    idHashJoinees = [aBoxPart, binRelPart, litPart]
    idJoinClauses = []
    idJoinColumnCandidates = []
    explicitJoins = []
    for part in idHashJoinees:
        partJoinClauses = []
        for colName in part.columnNames:
            if part.columnNames.index(colName) >= 4:
                colName, sqlType, index = colName
                if sqlType.lower()[:6] == 'bigint':
                    partJoinClauses.append(
                        "%s.%s = %s.%s" % (part, colName,
                                           idHash, idHashKeyName))
                    idJoinColumnCandidates.append("%s.%s" % (part, colName))
            elif colName:
                partJoinClauses.append(
                    "%s.%s = %s.%s" % (part, colName, idHash, idHashKeyName))
                idJoinColumnCandidates.append("%s.%s" % (part, colName))
        explicitJoins.append(
            "left join %s on (%s)" % (part, ' or '.join(partJoinClauses)))
        idJoinClauses.extend(partJoinClauses)
    intersectionClause = " and ".join(
        [col + " is NULL" for col in idJoinColumnCandidates])
    idGCQuery = IDENTIFIER_GARBAGE_COLLECTION_SQL % (
        idHash,
        idHashKeyName,
        idHash,
        ' '.join(explicitJoins),
        intersectionClause,
        idHash,
        idHashKeyName,
        rdfTypeInt)
    idPurgeQuery = PURGE_KEY_SQL % (idHash, idHash, idHashKeyName,
                                    idHash, idHashKeyName)
    purgeQueries.append(idGCQuery)
    purgeQueries.append(idPurgeQuery)
    partJoinClauses = []
    idJoinColumnCandidates = []
    explicitJoins = []
    partJoinClauses.append(
        "%s.%s = %s.%s" % (litPart, litPart.columnNames[OBJECT],
                           valueHash, valueHashKeyName))
    idJoinColumnCandidates.append(
        "%s.%s" % (litPart, litPart.columnNames[OBJECT]))
    intersectionClause = " and ".join(
        [col + " is NULL" for col in idJoinColumnCandidates])
    valueGCQuery = VALUE_GARBAGE_COLLECTION_SQL % (
        valueHash,
        valueHashKeyName,
        valueHash,
        "left join %s on (%s)" % (litPart, ' or '.join(partJoinClauses)),
        intersectionClause)
    valuePurgeQuery = PURGE_KEY_SQL % (valueHash, valueHash,
                                       valueHashKeyName,
                                       valueHash, valueHashKeyName)
    purgeQueries.append("drop temporary table if exists danglingIds")
    purgeQueries.append(valueGCQuery)
    purgeQueries.append(valuePurgeQuery)
    return purgeQueries

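def _exampleGarbageCollection(store, cursor):
    """
    Hedged usage sketch: run the generated GC statements right after a
    removal; the store/partition attribute names follow the rest of this
    module.
    """
    for query in GarbageCollectionQUERY(store.idHash,
                                        store.valueHash,
                                        store.aboxAssertions,
                                        store.binaryRelations,
                                        store.literalProperties):
        cursor.execute(query)
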
def CountTriples(cursor, table, specifiedColumns, variableColumns,
                 hashColumns=[], hashBuckets=200):
    d = {}
    statStartTime = time.time()
    specCols = []
    hardCodedSpecCols = []
    varCols = []
    # hardCodedVarCols = []  # not needed
    indexPos = {}
    distinctCount = []
    for i in specifiedColumns:
        if table.columnNames[i] is not None:
            indexPos[i] = ('spec', len(specCols))
            if i not in hashColumns:
                # use all values of this specified column
                specCols.append(table.columnNames[i])
                if len(hashColumns) > 0:
                    distinctCount.append(table.columnNames[i])
            else:
                # use k hash buckets as a string histogram
                specCols.append("MOD(%s,%s)" % (table.columnNames[i],
                                                hashBuckets))
                distinctCount.append(table.columnNames[i])
        else:
            indexPos[i] = ('hard', len(hardCodedSpecCols))
            # BE: assuming this will only be the case for a URI (i.e. for rdf:type)
            hardCodedSpecCols.append(
                normalizeValue(table.hardCodedResultFields[i], 'U'))
    for i in variableColumns:
        indexPos[i] = ('var', -1)
        if table.columnNames[i] is not None:
            varCols.append(table.columnNames[i])

    distinctClause = ""
    if len(distinctCount) > 0:
        distinctClause = " COUNT(DISTINCT %s) AS distinctCount," % (
            ",".join(distinctCount))

    # Assumes column lists in (s,p,o) order
    if len(specCols) == 0:
        cursor.execute("""
            SELECT COUNT(*) AS tripleCount
            FROM %s;""" % (table))
    if len(specCols) == 1:
        cursor.execute("""
            SELECT %s AS givenCol, %s COUNT(*) AS tripleCount
            FROM %s
            GROUP BY %s;""" % (specCols[0], distinctClause, table,
                               specCols[0]))
    elif len(specCols) == 2:
        cursor.execute("""
            SELECT %s AS givenCol1, %s AS givenCol2, %s COUNT(*) AS tripleCount
            FROM %s
            GROUP BY %s, %s;""" % (specCols[0], specCols[1], distinctClause,
                                   table, specCols[0], specCols[1]))
    elif len(specCols) == 3:
        cursor.execute("""
            SELECT %s AS givenCol1, %s AS givenCol2, %s AS givenCol3,
                   %s COUNT(*) AS tripleCount
            FROM %s
            GROUP BY %s, %s, %s;""" % (specCols[0], specCols[1], specCols[2],
                                       distinctClause, table,
                                       specCols[0], specCols[1], specCols[2]))
    for t in cursor.fetchall():
        key = []
        for i in (SUBJECT, PREDICATE, OBJECT, CONTEXT):
            if i in indexPos:
                (colType, pos) = indexPos[i]
                if colType == 'spec':
                    key.append('%s=%s' % (i, t[pos]))
                elif colType == 'hard':
                    key.append('%s=%s' % (i, hardCodedSpecCols[pos]))
        d[','.join(key)] = t[len(t) - 1]
        if len(distinctCount) > 0:
            # for histograms, also keep the # of distinct values per bucket
            d[','.join(key) + 'dist'] = t[len(t) - 2]
    names = []
    for i in (SUBJECT, PREDICATE, OBJECT, CONTEXT):
        if i in indexPos:
            (colType, pos) = indexPos[i]
            if colType == 'var':
                names.append('?')
            elif colType in ('spec', 'hard'):
                names.append(str(i))
    d['countTime'] = time.time() - statStartTime
    if len(distinctCount) < 1:
        print(' Entries for triple pattern (%s) index: %s'
              % (','.join(names), len(d)))
    else:
        print(' Entries for histogram triple pattern (%s) index: %s '
              '(%s counts)' % (','.join(names),
                               (len(d) - 1) // 2, len(d) - 1))
    return d

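def _exampleHistogramCountTriples(store, cursor):
    """
    Hedged usage sketch: a 200-bucket histogram for the (?,p,class(o))
    pattern, as GetDatabaseStats above builds per histogram size. Bucket
    keys pair the predicate half-hash with MOD(object, 200); each has a
    parallel '...dist' entry holding the distinct-object count.
    """
    hist = CountTriples(cursor, store.literalProperties,
                        [PREDICATE, OBJECT], [SUBJECT],
                        hashColumns=[OBJECT], hashBuckets=200)
    print('histogram computed in %s s' % hist['countTime'])
    return hist
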