Example #1
def PredicateJoinCount(cursor, graph, realTables, joinField1, joinField2):
    d = {}
    #predField = 'predicate'
    #tableName = graph.store._internedId + '_all'
    
    for table1 in realTables:
        t1 = realTables[table1]
        if t1.columnNames[PREDICATE] == None:
            predStr = t1.hardCodedResultFields[PREDICATE]
            pred1 = normalizeValue(predStr,'U') #BE: assuming this will only be the case for a URI (i.e. for rdf:Type)
        else:
            pred1 = "t1.%s"%(t1.columnNames[PREDICATE])
        
        for table2 in realTables:
            t2 = realTables[table2]
            if t2.columnNames[PREDICATE] == None:
                predStr = t2.hardCodedResultFields[PREDICATE]
                pred2 = normalizeValue(predStr,'U') #BE: assuming this will only be the case for a URI (i.e. for rdf:Type)
            else:
                pred2 = "t2.%s"%(t2.columnNames[PREDICATE])
            
            cursor.execute("""
                SELECT %s AS pred1, %s AS pred2, COUNT(*) as tupleCount
                FROM %s t1, %s t2
                WHERE t1.%s = t2.%s
                GROUP BY pred1, pred2
            """%(pred1, pred2,
                 t1, t2,
                 t1.columnNames[joinField1], t2.columnNames[joinField2]))
            for (p1,p2,tupleCount) in cursor.fetchall():
                d[(p1,p2)] = tupleCount
    
    print('    Entries for join type (%s-%s) index: %s' % (joinField1, joinField2, len(d)))
    return d
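Note: every example in this listing calls normalizeValue without showing its definition. As orientation only, here is a minimal sketch of what such a function plausibly does in this store: derive a fixed-width integer key from the MD5 digest of a term, shifted into the signed BIGINT range when useSignedInts is set. The digest slicing and offset are assumptions, not the library's actual implementation.

import hashlib

def normalizeValueSketch(value, termType, useSignedInts=False):
    # Hash the term-type letter ('U' for URI, 'L' for Literal, ...) together
    # with the term's lexical form; keep 64 bits of the digest as the key.
    digest = hashlib.md5((termType + str(value)).encode('utf-8')).hexdigest()
    unsigned = int(digest[:16], 16)
    if useSignedInts:
        # Re-center into the signed BIGINT range for back ends that cannot
        # store unsigned 64-bit integers (compare int_cast in Example #2).
        return unsigned - 2**63
    return unsigned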
Example #2
    def viewUnionSelectExpression(self, relations_only=False):
        """
        Return a SQL statement which creates a view of all the RDF statements
        from all the contributing partitions
        """
        rt = []
        if relations_only and self.objectPropertyTable:
            return "select * from %s" % repr(self)

        if self.useSignedInts:
            int_cast = 'BIGINT'
        else:
            int_cast = 'UNSIGNED BIGINT'

        for idx in range(len(POSITION_LIST)):
            rdfTermLabel = SlotPrefixes[idx]
            if idx < len(self.columnNames) and self.columnNames[idx]:
                #there is a matching column
                rt.append(self.columnNames[idx] + ' as %s' % rdfTermLabel)
                if self.termEnumerations[idx]:
                    #there is a corresponding term enumeration
                    rt.append(self.columnNames[idx] + '_term as %s_term' %
                              (rdfTermLabel, ))
                else:
                    #no corresponding term enumeration (hardcoded)
                    rt.append(
                        "CAST('%s' as CHAR) as %s_term" %
                        (self.hardCodedResultTermsTypes[idx], rdfTermLabel))
            else:
                assert self.hardCodedResultFields[idx] == RDF.type
                if not self.store.can_cast_bigint:
                    rt.append(
                        "%s as %s" %
                        (normalizeValue(self.hardCodedResultFields[idx], 'U',
                                        self.useSignedInts), rdfTermLabel))
                else:
                    rt.append("CAST('%s' as %s) as %s" % (normalizeValue(
                        self.hardCodedResultFields[idx], 'U',
                        self.useSignedInts), int_cast, rdfTermLabel))
                if self.hardCodedResultTermsTypes[idx]:
                    rt.append(
                        "CAST('%s' as CHAR) as %s_term" %
                        (self.hardCodedResultTermsTypes[idx], rdfTermLabel))
        if not relations_only:
            if self.literalTable:
                for i in self.columnNames[-2:]:
                    rt.append(i[0])
            else:
                if not self.store.can_cast_bigint:
                    rt.append('NULL as data_type')
                else:
                    rt.append('CAST(NULL as %s) as data_type' % (int_cast, ))
                rt.append('CAST(NULL as char(3)) as language')
        return "select %s from %s" % (', '.join(rt), repr(self))
Example #4
def GetCachedStats(graph, cacheFolder, alwaysGen=False, genMissing=True, doJoins=False):
    fileName = os.path.join(cacheFolder, graph.store.identifier + "-" + str(
                        normalizeValue(graph.store.configuration, "L"))) + ".cache"
    version = "0.1"
    genStats = genMissing
    stats = LoadCachedStats(fileName, version)
    
    if stats != None:
        genStats = alwaysGen
    
    if genStats:
        print('Generating data statistics...')
        startTime = time.time()
        stats = GetDatabaseStats(graph, stats, doJoins) # update stats
        print(' done in %s s' % (time.time()-startTime))
        
        # save stats to disk
        print('Saving data statistics...')
        startTime = time.time()
        f = open(fileName, 'w')
        cPickle.dump(version, f)
        cPickle.dump(stats, f)
        f.close()
        #print ' done in %s s' % (time.time()-startTime)
        
    return stats    
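GetCachedStats above delegates to LoadCachedStats, which is not shown in this listing. A hedged sketch of what it would have to do, mirroring the inline reload logic of the older variant in Example #14: return the cached stats dict, or None when the file is missing or carries the wrong version.

import os
import cPickle

def LoadCachedStats(fileName, version):
    # Hypothetical companion to GetCachedStats: reload the pickled stats,
    # or return None so the caller regenerates them.
    if not os.path.exists(fileName):
        return None
    f = open(fileName, 'r')
    try:
        if cPickle.load(f) != version:
            return None
        return cPickle.load(f)
    finally:
        f.close()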
Example #5
def CountDistinctForColumn(cursor, table, mainColumn, countColumn):
    d = {}

    if table.columnNames[mainColumn] == None:
        pred = table.hardCodedResultFields[mainColumn]
        predInt = normalizeValue(
            pred, 'U'
        )  #BE: assuming this will only be the case for a URI (i.e. for rdf:Type)
        cursor.execute("""
            SELECT COUNT(DISTINCT %s) AS objCount
            FROM %s;""" % (table.columnNames[countColumn], table))
        for (objCount,) in cursor.fetchall():
            d[predInt] = objCount
    else:
        cursor.execute(
            """
            SELECT %s AS pred, COUNT(DISTINCT %s) AS objCount
            FROM %s
            GROUP BY %s;""" %
            (table.columnNames[mainColumn], table.columnNames[countColumn],
             table, table.columnNames[mainColumn]))
        for (pred, objCount) in cursor.fetchall():
            d[pred] = objCount

    print('    Distinct value entries in %s for column %s: %s' %
          (countColumn, mainColumn, len(d)))
    return d
Example #6
 def makeRowComponents(self, quadSlots):
     subjSlot, predSlot, objSlot, conSlot = quadSlots
     dTypeParam = objSlot.term.datatype and normalizeValue(
         objSlot.term.datatype, 'U', self.useSignedInts) or None
     langParam = objSlot.term.language and objSlot.term.language or None
     return (subjSlot.md5Int, subjSlot.termType, predSlot.md5Int,
             predSlot.termType, objSlot.md5Int, conSlot.md5Int,
             conSlot.termType, dTypeParam, langParam)
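A hedged usage sketch: the nine-tuple above lines up one-for-one with the partition's columns, so a batch load can hand it straight to executemany with nine parameter placeholders. The names partition, quadSlotsBatch, and cursor are illustrative assumptions.

# Hypothetical batch insert driven by makeRowComponents; the inner join
# expands to nine DB-API '%s' placeholders, one per tuple component.
rows = [partition.makeRowComponents(slots) for slots in quadSlotsBatch]
cursor.executemany(
    "INSERT INTO %s VALUES (%s)" % (partition, ','.join(['%s'] * 9)),
    rows)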
Example #7
 def defaultStatements(self):
     """
     Since rdf:type is modeled explicitly (in the ABOX partition) it
     must be inserted as a 'default' identifier.
     """
     return ["INSERT INTO %s VALUES (%s, 'U', '%s');" %
               (self, normalizeValue(RDF.type, 'U', self.useSignedInts),
                RDF.type)]
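A short usage sketch, assuming these statements run once when the store is created, so that hard-coded identifiers such as rdf:type always resolve; the idHashTable and cursor names are assumed for illustration.

# Hypothetical initialization step for a freshly created store.
for statement in idHashTable.defaultStatements():
    cursor.execute(statement)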
Example #8
 def defaultStatements(self):
     """
     Since rdf:type is modeled explicitly (in the ABOX partition) it
     must be inserted as a 'default' identifier.
     """
     return ["INSERT INTO %s VALUES (%s, 'U', '%s');" %
               (self, normalizeValue(RDF.type, 'U', self.useSignedInts),
                RDF.type)]
 def makeRowComponents(self, quadSlots):
     subjSlot, predSlot, objSlot, conSlot = quadSlots
     dTypeParam = objSlot.term.datatype and normalizeValue(
       objSlot.term.datatype, 'U', self.useSignedInts) or None
     langParam  = objSlot.term.language and objSlot.term.language or None
     return (subjSlot.md5Int, subjSlot.termType,
             predSlot.md5Int, predSlot.termType,
             objSlot.md5Int,
             conSlot.md5Int, conSlot.termType,
             dTypeParam, langParam)
Example #10
def PredicateJoinCount(cursor, graph, realTables, joinField1, joinField2):
    d = {}
    #predField = 'predicate'
    #tableName = graph.store._internedId + '_all'

    for table1 in realTables:
        t1 = realTables[table1]
        if t1.columnNames[PREDICATE] == None:
            predStr = t1.hardCodedResultFields[PREDICATE]
            pred1 = normalizeValue(
                predStr, 'U'
            )  #BE: assuming this will only be the case for a URI (i.e. for rdf:Type)
        else:
            pred1 = "t1.%s" % (t1.columnNames[PREDICATE])

        for table2 in realTables:
            t2 = realTables[table2]
            if t2.columnNames[PREDICATE] == None:
                predStr = t2.hardCodedResultFields[PREDICATE]
                pred2 = normalizeValue(
                    predStr, 'U'
                )  #BE: assuming this will only be the case for a URI (i.e. for rdf:Type)
            else:
                pred2 = "t2.%s" % (t2.columnNames[PREDICATE])

            cursor.execute("""
                SELECT %s AS pred1, %s AS pred2, COUNT(*) as tupleCount
                FROM %s t1, %s t2
                WHERE t1.%s = t2.%s
                GROUP BY pred1, pred2
            """ % (pred1, pred2, t1, t2, t1.columnNames[joinField1],
                   t2.columnNames[joinField2]))
            for (p1, p2, tupleCount) in cursor.fetchall():
                d[(p1, p2)] = tupleCount

    print('    Entries for join type (%s-%s) index: %s' %
          (joinField1, joinField2, len(d)))
    return d
Example #11
 def compileQuadToParams(self, quadSlots):
     subjSlot, predSlot, objSlot, conSlot = quadSlots
     dTypeParam = objSlot.term.datatype and normalizeValue(
         objSlot.term.datatype, 'U', self.useSignedInts) or None
     langParam = objSlot.term.language and objSlot.term.language or None
     rtList = [
         subjSlot.md5Int,
         term2Letter(subjSlot.term), predSlot.md5Int,
         term2Letter(predSlot.term), objSlot.md5Int, conSlot.md5Int,
         term2Letter(conSlot.term)
     ]
     for item in [dTypeParam, langParam]:
         if item:
             rtList.append(item)
     return tuple(rtList)
Example #13
def CountDistinctForColumn(cursor, table, mainColumn, countColumn):
    d = {}
    
    if table.columnNames[mainColumn] == None:
        pred = table.hardCodedResultFields[mainColumn]
        predInt = normalizeValue(pred,'U') #BE: assuming this will only be the case for a URI (i.e. for rdf:Type)
        cursor.execute("""
            SELECT COUNT(DISTINCT %s) AS objCount
            FROM %s;""" % (table.columnNames[countColumn], table))
        for (objCount,) in cursor.fetchall():
            d[predInt] = objCount
    else:
        cursor.execute("""
            SELECT %s AS pred, COUNT(DISTINCT %s) AS objCount
            FROM %s
            GROUP BY %s;""" % (table.columnNames[mainColumn], table.columnNames[countColumn], table, table.columnNames[mainColumn]))
        for (pred,objCount) in cursor.fetchall():
            d[pred] = objCount
    
    print('    Distinct value entries in %s for column %s: %s' % (countColumn, mainColumn, len(d)))
    return d
Example #14
def GetCachedStats(graph, cacheFolder):
    fileName = os.path.join(cacheFolder, graph.store.identifier + "-" + \
        str(normalizeValue(graph.store.configuration, "L"))) + ".cache"
    version = "0.1"
    genStats = True

    if os.path.exists(fileName):
        print('Reloading data statistics from cache file...')
        # reload a previously created data stats cache file
        f = open(fileName, 'r')
        loadVersion = cPickle.load(f)
        if (version == loadVersion):
            startTime = time.time()
            stats = cPickle.load(f)
            genStats = False
            f.close()
            print(' done in %s s' % (time.time() - startTime))
        else:
            f.close()
            print('Saved statistics are in the wrong version! Must be re-generated.')
            os.remove(fileName)

    if genStats:
        print('Generating data statistics...')
        startTime = time.time()
        stats = GetDatabaseStats(graph)
        print(' done in %s s' % (time.time() - startTime))

        # save stats to disk
        print('Saving data statistics...')
        startTime = time.time()
        f = open(fileName, 'w')
        cPickle.dump(version, f)
        cPickle.dump(stats, f)
        f.close()
        #print(' done in %s s' % (time.time()-startTime))

    return stats
Example #15
def GetCachedStats(graph, cacheFolder):
    fileName = os.path.join(cacheFolder, graph.store.identifier + "-" + \
        str(normalizeValue(graph.store.configuration, "L"))) + ".cache"
    version = "0.1"
    genStats = True
    
    if os.path.exists(fileName):
        print('Reloading data statistics from cache file...')
        # reload a previously created data stats cache file
        f = open(fileName, 'r')
        loadVersion = cPickle.load(f)
        if (version == loadVersion):
            startTime = time.time()
            stats = cPickle.load(f)
            genStats = False
            f.close()
            print(' done in %s s' % (time.time()-startTime))
        else:
            f.close()
            print('Saved statistics are in the wrong version! Must be re-generated.')
            os.remove(fileName)
    
    if genStats:
        print('Generating data statistics...')
        startTime = time.time()
        stats = GetDatabaseStats(graph)
        print(' done in %s s' % (time.time()-startTime))
        
        # save stats to disk
        print('Saving data statistics...')
        startTime = time.time()
        f = open(fileName, 'w')
        cPickle.dump(version, f)
        cPickle.dump(stats, f)
        f.close()
        #print(' done in %s s' % (time.time()-startTime))
    
    return stats
Example #16
def test_dType_encoding():
    # storetest = True
    # correct = normalizeValue('http://www.w3.org/2001/XMLSchema#integer', 'U')
    wrong = normalizeValue('http://www.w3.org/2001/XMLSchema#integer', 'L')

    store = plugin.get('MySQL', Store)()
    store.destroy(configString)
    store.open(configString, create=True)
    Graph(store).add((BNode(), URIRef('foo'), Literal(1)))
    db = store._db
    cursor = db.cursor()
    cursor.execute("select * from %s where data_type = '%s'" %
                   (store.literalProperties, wrong))
    assert not cursor.fetchone(), "Datatype encoding bug!"
    # for suffix,(relations_only,tables) in store.viewCreationDict.items():
    #     query='create view %s%s as %s'%(store._internedId,
    #                                     suffix,
    #     ' union all '.join([t.viewUnionSelectExpression(relations_only)
    #                         for t in tables]))
    #     # print "## Creating View ##\n",query

    store.rollback()
    store.destroy(configString)
    store.close()
Example #17
def GetDatabaseStats(graph):
    print('Gathering statistics...')
    startTime = time.time()
    
    stats = dict()
    stats['triples'] = 0 #len(graph) #ISSUE: len(graph) only gives count for default graph???
    
    stats['cacheName'] = graph.store.identifier + "-" + \
                        str(normalizeValue(graph.store.configuration, "L"))
    stats['storeName'] = graph.store.identifier
    stats['internedId'] = graph.store._internedId
    stats['config'] = graph.store.configuration
    
    tables = dict(type = graph.store.aboxAssertions,
                  lit = graph.store.literalProperties,
                  rel = graph.store.binaryRelations,
                  all = graph.store._internedId + '_all')
    # FIXME Unused code
    realTables = dict(type = graph.store.aboxAssertions,
                      lit = graph.store.literalProperties,
                      rel = graph.store.binaryRelations)
    # columnNames[OBJECT]
    
    cursor = graph.store._db.cursor()
    
    # distinct num. of subjects, predicates, & objects
    tableType = 'all'
    statStartTime = time.time()
    stats['subjects'] = CountDistint(cursor, tables[tableType], 'subject')
    stats['predicates'] = CountDistint(cursor, tables[tableType], 'predicate')
    stats['objects'] = CountDistint(cursor, tables[tableType], 'object')
    stats['distTime'] = time.time()-statStartTime
    
    for tableType in ['lit', 'rel', 'type']:
        
        table = tables[tableType]
        
        # total # triples
        cursor.execute(""" SELECT COUNT(*) FROM %s """ % table)
        triples = cursor.fetchone()[0]
        stats[tableType + '_triples'] = triples
        stats['triples'] = stats['triples'] + triples
        
        print('  Processing table %s: %s triples...' %(tableType,triples))
        
        # distinct num. of subjects, predicates, & objects
        statStartTime = time.time()
        stats[tableType + '_subjects'] = CountDistint(cursor, table, table.columnNames[SUBJECT])
        stats[tableType + '_predicates'] = CountDistint(cursor, table, table.columnNames[PREDICATE])
        stats[tableType + '_objects'] = CountDistint(cursor, table, table.columnNames[OBJECT])
        stats[tableType + '_distTime'] = time.time()-statStartTime
        
        # subject/object counts for predicates
        statStartTime = time.time()
        stats[tableType + '_colDist'] = {}
        stats[tableType + '_colDist']['obj_for_pred'] = CountDistinctForColumn(cursor, table, PREDICATE, OBJECT)
        stats[tableType + '_colDist']['sub_for_pred'] = CountDistinctForColumn(cursor, table, PREDICATE, SUBJECT)
        stats[tableType + '_colDistTime'] = time.time()-statStartTime
        
        # triple pattern occurrence counts
        statStartTime = time.time()
        stats[tableType + '_pat'] = {}
        stats[tableType + '_pat']['(%s,?,?)' % (SUBJECT)] = CountTriples(cursor, table, [SUBJECT], [PREDICATE,OBJECT])
        stats[tableType + '_pat']['(?,%s,?)' % (PREDICATE)] = CountTriples(cursor, table, [PREDICATE], [SUBJECT,OBJECT])
        stats[tableType + '_pat']['(?,?,%s)' % (OBJECT)] = CountTriples(cursor, table, [OBJECT], [SUBJECT,PREDICATE])
        stats[tableType + '_pat']['(%s,%s,?)' % (SUBJECT, PREDICATE)] = CountTriples(cursor, table, [SUBJECT,PREDICATE], [OBJECT])
        stats[tableType + '_pat']['(?,%s,%s)' % (PREDICATE, OBJECT)] = CountTriples(cursor, table, [PREDICATE, OBJECT], [SUBJECT])
        stats[tableType + '_pat']['(%s,?,%s)' % (SUBJECT, OBJECT)] = CountTriples(cursor, table, [SUBJECT, OBJECT], [PREDICATE])
        stats[tableType + '_patTime'] = time.time()-statStartTime
    
    # predicate co-occurrence
    statStartTime = time.time()
    # stats['join_s-s'] = PredicateJoinCount(cursor, graph, realTables, SUBJECT, SUBJECT)
    # stats['join_s-o'] = PredicateJoinCount(cursor, graph, realTables, SUBJECT, OBJECT)
    # stats['join_o-s'] = PredicateJoinCount(cursor, graph, realTables, OBJECT, SUBJECT)
    # stats['join_o-o'] = PredicateJoinCount(cursor, graph, realTables, OBJECT, OBJECT)
    stats['joinTime'] = time.time()-statStartTime
    
    cursor.close()
    
    endTime = time.time()-startTime
    print('Statistics gathered in %s s' % (endTime))
    stats['elapsedTime'] = endTime
    
    return stats
Example #18
def CountTriples(cursor, table, specifiedColumns, variableColumns):
    d = {}

    specCols = []
    hardCodedSpecCols = []
    varCols = []
    # hardCodedVarCols = [] # not needed
    indexPos = {}

    for i in specifiedColumns:
        if table.columnNames[i] != None:
            indexPos[i] = ('spec', len(specCols))
            specCols.append(table.columnNames[i])
        else:
            indexPos[i] = ('hard', len(hardCodedSpecCols))
            hardCodedSpecCols.append(
                normalizeValue(table.hardCodedResultFields[i], 'U')
            )  #BE: assuming this will only be the case for a URI (i.e. for rdf:Type)
    for i in variableColumns:
        indexPos[i] = ('var', -1)
        if table.columnNames[i] != None:
            varCols.append(table.columnNames[i])
        # else
        #     hardCodedVarCols.append(normalizeValue(table.hardCodedResultFields[i],'U') #BE: assuming this will only be the case for a URI (i.e. for rdf:Type)

    #Assumes column lists in (s,p,o) order
    if len(specCols) == 0:
        cursor.execute("""
            SELECT COUNT(*) AS tripleCount
            FROM %s;""" % (table))
    if len(specCols) == 1:
        cursor.execute("""
            SELECT %s AS givenCol, COUNT(*) AS tripleCount
            FROM %s
            GROUP BY %s;""" % (specCols[0], table, specCols[0]))
        # for (givenCol,tripleCount) in cursor.fetchall():
        #     d['%s=%s'%(specCols[0],givenCol) ] = tripleCount
    elif len(specCols) == 2:
        cursor.execute(
            """
            SELECT %s AS givenCol1, %s AS givenCol2, COUNT(*) AS tripleCount
            FROM %s
            GROUP BY %s, %s;""" %
            (specCols[0], specCols[1], table, specCols[0], specCols[1]))
        # for (givenCol,tripleCount) in cursor.fetchall():
        #     d['%s_triples_%s=%s'%(table,givenCols[0],givenCol) ] = tripleCount
    elif len(specCols) == 3:
        cursor.execute("""
            SELECT %s AS givenCol1, %s AS givenCol2, %s AS givenCol3, COUNT(*) AS tripleCount
            FROM %s
            GROUP BY %s, %s, %s;""" %
                       (specCols[0], specCols[1], specCols[2], table,
                        specCols[0], specCols[1], specCols[2]))

    for t in cursor.fetchall():
        key = []
        for i in (SUBJECT, PREDICATE, OBJECT, CONTEXT):
            if indexPos.has_key(i):
                (type, pos) = indexPos[i]
                if type == 'spec':
                    key.append('%s=%s' % (i, t[pos]))
                elif type == 'hard':
                    key.append('%s=%s' % (i, hardCodedSpecCols[pos]))
        d[','.join(key)] = t[len(t) - 1]

    names = []
    for i in (SUBJECT, PREDICATE, OBJECT, CONTEXT):
        if indexPos.has_key(i):
            (type, pos) = indexPos[i]
            if type == 'var':
                names.append('?')
            elif type == 'spec' or type == 'hard':
                names.append(str(i))

    print('    Entries for triple pattern (%s) index: %s' %
          (','.join(names), len(d)))
    return d
Example #19
def GetDatabaseStats(store, stats=None, doJoins=False):
    print('Gathering statistics...')
    startTime = time.time()
    
    if stats is None:
        stats = dict()
    stats['triples'] = 0
    
    stats['cacheName']  = store.identifier + "-" + str(
                    normalizeValue(store.configuration, "L"))
    stats['storeName']  = store.identifier
    stats['internedId'] = store._internedId
    stats['config']     = store.configuration    
    
    tables = dict(type = store.aboxAssertions,
                  lit = store.literalProperties,
                  rel = store.binaryRelations,
                  all = store._internedId + '_all')
    realTables = dict(type = store.aboxAssertions,
                      lit = store.literalProperties,
                      rel = store.binaryRelations)    
    
    cursor = store._db.cursor()
    
    # distinct num. of subjects, predicates, & objects (NOTE: we always want these!)
    statStartTime = time.time()
    stats['subjects'] = CountDistint(cursor, tables['all'], 'subject')   
    stats['predicates'] = CountDistint(cursor, tables['all'], 'predicate') 
    stats['objects'] = CountDistint(cursor, tables['all'], 'object')
    stats['distTime'] = time.time()-statStartTime
    
    if not stats.has_key('colDistTime'):
        stats['colDistTime'] = 0
    
    stats['(%s,?,?)_patTime' % (SUBJECT)] = 0
    stats['(?,%s,?)_patTime' % (PREDICATE)] = 0
    stats['(?,?,%s)_patTime' % (OBJECT)] = 0
    stats['(%s,%s,?)_patTime' % (SUBJECT, PREDICATE)] = 0
    stats['(?,%s,%s)_patTime' % (PREDICATE, OBJECT)] = 0
    stats['(%s,?,%s)_patTime' % (SUBJECT, OBJECT)] = 0
                
    tableType = 'lit'
    for h in histogramSizes:
        if not stats.has_key(tableType + '_pat') \
            or  not stats[tableType + '_pat'].has_key(
                '(%sh%s,?,?)' % (SUBJECT, h)):
            stats['(%sh%s,?,?)_patTime' % (SUBJECT, h)] = 0
            stats['(?,?,%sh%s)_patTime' % (OBJECT,h)] = 0        
            stats['(?,%s,%sh%s)_patTime' % (PREDICATE, OBJECT, h)] = 0             
    
    for tableType in ['lit', 'rel', 'type']:
        
        table = tables[tableType]

        # Statistics on ENTIRE DATABASE (completely unspecified triple pattern)

        # total # triples (NOTE: we always want these!)
        if not stats.has_key(tableType + '_triples'):
            cursor.execute(""" SELECT COUNT(*) FROM %s """ % table) 
            triples = cursor.fetchone()[0]
            stats[tableType + '_triples'] = triples
        
            print('  Processing table %s: %s triples...' % (
                                                    tableType, triples))
        stats['triples'] += stats[tableType + '_triples']
        
        # distinct num. of subjects, predicates, & objects
        if not stats.has_key(tableType + '_subjects'):
            statStartTime = time.time()
            stats[tableType + '_subjects'] = CountDistint(cursor, table, table.columnNames[SUBJECT])
            stats[tableType + '_predicates'] = CountDistint(cursor, table, table.columnNames[PREDICATE])
            stats[tableType + '_objects'] = CountDistint(cursor, table, table.columnNames[OBJECT])
            stats[tableType + '_distTime'] = time.time()-statStartTime
            
        # subject/object counts for predicates (NOTE: used for greedy ordering algorithm; some cost formulas; always want)
        #if not stats.has_key(tableType + '_colDist'):
        statStartTime = time.time()
        stats[tableType + '_colDist'] = {}
        stats[tableType + '_colDist']['obj_for_pred'] = CountDistinctForColumn(cursor, table, PREDICATE, OBJECT)       
        stats[tableType + '_colDist']['sub_for_pred'] = CountDistinctForColumn(cursor, table, PREDICATE, SUBJECT)
        stats[tableType + '_colDistTime'] = time.time()-statStartTime
        stats['colDistTime'] += stats[tableType + '_colDistTime']

        # triple pattern occurrence counts (NOTE: takes too much space to store all of these! Choose wisely)
        if not stats.has_key(tableType + '_pat'):
            stats[tableType + '_pat'] = {}
        
        if not stats[tableType + '_pat'].has_key('(%s,?,?)' % (SUBJECT)):
            statStartTime = time.time()
            stats[tableType + '_pat']['(%s,?,?)' % (SUBJECT)] = CountTriples(cursor, table, [SUBJECT], [PREDICATE,OBJECT]) # may be useful if lots of queries asking for everything about a particular subject (but only if you are joining the object, etc.); suggest histogram or even average
            stats[tableType + '_pat']['(?,%s,?)' % (PREDICATE)] = CountTriples(cursor, table, [PREDICATE], [SUBJECT,OBJECT]) #NOTE: always want this!! Small and very useful!
            stats[tableType + '_pat']['(?,?,%s)' % (OBJECT)] = CountTriples(cursor, table, [OBJECT], [SUBJECT,PREDICATE]) 
            stats[tableType + '_pat']['(%s,%s,?)' % (SUBJECT, PREDICATE)] = CountTriples(cursor, table, [SUBJECT,PREDICATE], [OBJECT])  # if wanted, suggest histogram
            stats[tableType + '_pat']['(?,%s,%s)' % (PREDICATE, OBJECT)] = CountTriples(cursor, table, [PREDICATE, OBJECT], [SUBJECT]) #NOTE: 2nd most useful; but needs ~ 1/3T space; suggest histogram instead
#            stats[tableType + '_pat']['(%s,?,%s)' % (SUBJECT, OBJECT)] = CountTriples(cursor, table, [SUBJECT, OBJECT], [PREDICATE]) #NOTE: basically useless!
            stats[tableType + '_patTime'] = time.time()-statStartTime
        
            stats['(%s,?,?)_patTime' % (SUBJECT)] += stats[tableType + '_pat']['(%s,?,?)' % (SUBJECT)]['countTime']
            stats['(?,%s,?)_patTime' % (PREDICATE)] += stats[tableType + '_pat']['(?,%s,?)' % (PREDICATE)]['countTime']
            stats['(?,?,%s)_patTime' % (OBJECT)] += stats[tableType + '_pat']['(?,?,%s)' % (OBJECT)]['countTime']
            stats['(%s,%s,?)_patTime' % (SUBJECT, PREDICATE)] += stats[tableType + '_pat']['(%s,%s,?)' % (SUBJECT, PREDICATE)]['countTime']
            stats['(?,%s,%s)_patTime' % (PREDICATE, OBJECT)] += stats[tableType + '_pat']['(?,%s,%s)' % (PREDICATE, OBJECT)]['countTime']
            stats['(%s,?,%s)_patTime' % (SUBJECT, OBJECT)] += stats[tableType + '_pat']['(%s,?,%s)' % (SUBJECT, OBJECT)]['countTime']
        
        # histograms (class(s),-,-), (-,-,class(o)), (-,p,class(o))
        for h in histogramSizes:
            if not stats[tableType + '_pat'].has_key('(%sh%s,?,?)' % (SUBJECT, h)):
                #NOTE: if using real value for a particular triple pattern, then disable the histogram version
                #NOTE: can move these out of the loop and put in different histogram sizes for each type of triple pattern (& modify in the formulas)
                stats[tableType + '_pat']['(%sh%s,?,?)' % (SUBJECT, h)] = CountTriples(cursor, table, [SUBJECT], [PREDICATE,OBJECT], [SUBJECT], h)
                stats['(%sh%s,?,?)_patTime' % (SUBJECT, h)] += stats[tableType + '_pat']['(%sh%s,?,?)' % (SUBJECT, h)]['countTime']
                stats[tableType + '_pat']['(?,?,%sh%s)' % (OBJECT, h)] = CountTriples(cursor, table, [OBJECT], [SUBJECT,PREDICATE], [OBJECT], h)
                stats['(?,?,%sh%s)_patTime' % (OBJECT,h)] += stats[tableType + '_pat']['(?,?,%sh%s)' % (OBJECT, h)]['countTime']        
                stats[tableType + '_pat']['(?,%s,%sh%s)' % (PREDICATE, OBJECT, h)] = CountTriples(cursor, table, [PREDICATE, OBJECT], [SUBJECT], [OBJECT], h)
                stats['(?,%s,%sh%s)_patTime' % (PREDICATE, OBJECT, h)] += stats[tableType + '_pat']['(?,%s,%sh%s)' % (PREDICATE, OBJECT, h)]['countTime']            
                #NOTE: may want to add a subject-predicate histogram here (if using the pattern frequently)

    # predicate co-occurrence
    if False:  # doJoins
        # Note: this is very expensive and only used by Stocker WWW2008 method! (i.e. we don't need them)
        if not stats.has_key('joinTime'):
            statStartTime = time.time()
            stats['join_s-s'] = PredicateJoinCount(cursor, realTables, SUBJECT, SUBJECT)
            stats['join_s-o'] = PredicateJoinCount(cursor, realTables, SUBJECT, OBJECT)
            stats['join_o-s'] = PredicateJoinCount(cursor, realTables, OBJECT, SUBJECT)
            stats['join_o-o'] = PredicateJoinCount(cursor, realTables, OBJECT, OBJECT)
            stats['joinTime'] = time.time()-statStartTime

    cursor.close()

    endTime = time.time()-startTime
    print('Statistics gathered in %s s' % (endTime))
    stats['elapsedTime'] = endTime
    
    return stats
Example #20
 def generateWhereClause(self,queryPattern):
     """
     Takes a query pattern (a list of quad terms -
     subject,predicate,object,context) and generates a SQL WHERE clauses
     which works in conjunction to the intersections to filter the result
     set by partial matching (by REGEX), full matching (by integer
     half-hash), and term types. For maximally efficient SELECT queries
     """
     whereClauses = []
     whereParameters = []
     asserted = dereferenceQuad(CONTEXT,queryPattern) is None
     for idx in SlotPrefixes.keys():
         queryTerm = dereferenceQuad(idx,queryPattern)
         lookupAlias = 'rt_'+SlotPrefixes[idx]
         if idx == CONTEXT and asserted:
             whereClauses.append("%s.%s_term != 'F'" % \
                                         (self,self.columnNames[idx]))
         
         if idx < len(POSITION_LIST) and isinstance(queryTerm,REGEXTerm):
             whereClauses.append("%s.lexical REGEXP "%lookupAlias+"%s")
             whereParameters.append(queryTerm)
         elif idx == CONTEXT \
                 and isinstance(queryTerm,Graph) \
                 and isinstance(queryTerm.identifier,REGEXTerm):
             whereClauses.append("%s.lexical REGEXP "%lookupAlias+"%s")
             whereParameters.append(queryTerm.identifier)
         elif idx < len(POSITION_LIST) and queryTerm is not Any:
             if self.columnNames[idx]:
                 
                 if isinstance(queryTerm,list):
                     whereClauses.append("%s.%s" % \
                             (self,self.columnNames[idx])+" in (%s)" % \
                                 ','.join([
                                     '%s' for item in range(len(queryTerm))
                                     ]))
                     whereParameters.extend(
                       [normalizeValue(item, term2Letter(item),
                                       self.useSignedInts)
                        for item in queryTerm])
                 else:
                     whereClauses.append("%s.%s" % \
                                     (self,self.columnNames[idx])+" = %s")
                     whereParameters.append(normalizeValue(
                       queryTerm, term2Letter(queryTerm),
                       self.useSignedInts))
             
             if not idx in self.hardCodedResultTermsTypes \
                     and self.termEnumerations[idx] \
                     and not isinstance(queryTerm,list):
                 whereClauses.append("%s.%s_term" % \
                             (self,self.columnNames[idx])+" = %s")
                 whereParameters.append(term2Letter(queryTerm))
         elif idx >= len(POSITION_LIST) \
                 and len(self.columnNames) > len(POSITION_LIST) \
                 and queryTerm is not None:
             compVal = idx == DATATYPE_INDEX and normalizeValue(
               queryTerm, term2Letter(queryTerm),
               self.useSignedInts) or queryTerm
             whereClauses.append("%s.%s" % \
                                 (self,self.columnNames[idx][0])+" = %s")
             whereParameters.append(compVal)
     
     return ' AND '.join(whereClauses),whereParameters
Example #21
 def updateIdentifierQueue(self, termList):
     for term, termType in termList:
         md5Int = normalizeValue(term, termType, self.useSignedInts)
         self.hashUpdateQueue[md5Int] = self.normalizeTerm(term)
Example #22
def CountTriples(cursor, table, specifiedColumns, variableColumns):
    d = {}
    
    specCols = []
    hardCodedSpecCols = []
    varCols = []
    # hardCodedVarCols = [] # not needed
    indexPos = {}
    
    for i in specifiedColumns:
        if table.columnNames[i] != None:
            indexPos[i] = ('spec',len(specCols))
            specCols.append(table.columnNames[i])
        else:
            indexPos[i] = ('hard',len(hardCodedSpecCols))
            hardCodedSpecCols.append(normalizeValue(table.hardCodedResultFields[i],'U')) #BE: assuming this will only be the case for a URI (i.e. for rdf:Type)
    for i in variableColumns:
        indexPos[i] = ('var', -1)
        if table.columnNames[i] != None:
            varCols.append(table.columnNames[i])
        # else
        #     hardCodedVarCols.append(normalizeValue(table.hardCodedResultFields[i],'U') #BE: assuming this will only be the case for a URI (i.e. for rdf:Type)
    
    #Assumes column lists in (s,p,o) order
    if len(specCols) == 0:
        cursor.execute("""
            SELECT COUNT(*) AS tripleCount
            FROM %s;""" % (table))
    if len(specCols) == 1:
        cursor.execute("""
            SELECT %s AS givenCol, COUNT(*) AS tripleCount
            FROM %s
            GROUP BY %s;""" % (specCols[0], table, specCols[0]))
        # for (givenCol,tripleCount) in cursor.fetchall():
        #     d['%s=%s'%(specCols[0],givenCol) ] = tripleCount
    elif len(specCols) == 2:
        cursor.execute("""
            SELECT %s AS givenCol1, %s AS givenCol2, COUNT(*) AS tripleCount
            FROM %s
            GROUP BY %s, %s;""" % (specCols[0], specCols[1], table, specCols[0], specCols[1]))
        # for (givenCol,tripleCount) in cursor.fetchall():
        #     d['%s_triples_%s=%s'%(table,givenCols[0],givenCol) ] = tripleCount
    elif len(specCols) == 3:
        cursor.execute("""
            SELECT %s AS givenCol1, %s AS givenCol2, %s AS givenCol3, COUNT(*) AS tripleCount
            FROM %s
            GROUP BY %s, %s, %s;""" % (specCols[0], specCols[1], specCols[2], table, specCols[0], specCols[1], specCols[2]))
    
    for t in cursor.fetchall():
        key = []
        for i in (SUBJECT,PREDICATE,OBJECT,CONTEXT):
            if indexPos.has_key(i):
                (type,pos) = indexPos[i]
                if type == 'spec':
                    key.append('%s=%s'%(i,t[pos]))
                elif type == 'hard':
                    key.append('%s=%s'%(i,hardCodedSpecCols[pos]))
        d[','.join(key)] = t[len(t)-1]
    
    names = []
    for i in (SUBJECT,PREDICATE,OBJECT,CONTEXT):
        if indexPos.has_key(i):
            (type,pos) = indexPos[i]
            if type == 'var':
                names.append('?')
            elif type == 'spec' or type == 'hard':
                names.append(str(i))
    
    print('    Entries for triple pattern (%s) index: %s' % (','.join(names), len(d)))
    return d
Example #23
 def updateIdentifierQueue(self,termList):
     for term,termType in termList:
         md5Int = normalizeValue(term, termType, self.useSignedInts)
         self.hashUpdateQueue[md5Int]=self.normalizeTerm(term)
Example #24
    def generateWhereClause(self, queryPattern):
        """
        Takes a query pattern (a list of quad terms -
        subject,predicate,object,context) and generates a SQL WHERE clauses
        which works in conjunction to the intersections to filter the result
        set by partial matching (by REGEX), full matching (by integer
        half-hash), and term types. For maximally efficient SELECT queries
        """
        whereClauses = []
        whereParameters = []
        asserted = dereferenceQuad(CONTEXT, queryPattern) is None
        for idx in SlotPrefixes.keys():
            queryTerm = dereferenceQuad(idx, queryPattern)
            lookupAlias = 'rt_' + SlotPrefixes[idx]
            if idx == CONTEXT and asserted:
                whereClauses.append("%s.%s_term != 'F'" % \
                                            (self,self.columnNames[idx]))

            if idx < len(POSITION_LIST) and isinstance(queryTerm, REGEXTerm):
                whereClauses.append("%s.lexical REGEXP " % lookupAlias + "%s")
                whereParameters.append(queryTerm)
            elif idx == CONTEXT \
                    and isinstance(queryTerm,Graph) \
                    and isinstance(queryTerm.identifier,REGEXTerm):
                whereClauses.append("%s.lexical REGEXP " % lookupAlias + "%s")
                whereParameters.append(queryTerm.identifier)
            elif idx < len(POSITION_LIST) and queryTerm is not Any:
                if self.columnNames[idx]:

                    if isinstance(queryTerm, list):
                        whereClauses.append("%s.%s" % \
                                (self,self.columnNames[idx])+" in (%s)" % \
                                    ','.join([
                                        '%s' for item in range(len(queryTerm))
                                        ]))
                        whereParameters.extend([
                            normalizeValue(item, term2Letter(item),
                                           self.useSignedInts)
                            for item in queryTerm
                        ])
                    else:
                        whereClauses.append("%s.%s" % \
                                        (self,self.columnNames[idx])+" = %s")
                        whereParameters.append(
                            normalizeValue(queryTerm, term2Letter(queryTerm),
                                           self.useSignedInts))

                if not idx in self.hardCodedResultTermsTypes \
                        and self.termEnumerations[idx] \
                        and not isinstance(queryTerm,list):
                    whereClauses.append("%s.%s_term" % \
                                (self,self.columnNames[idx])+" = %s")
                    whereParameters.append(term2Letter(queryTerm))
            elif idx >= len(POSITION_LIST) \
                    and len(self.columnNames) > len(POSITION_LIST) \
                    and queryTerm is not None:
                compVal = idx == DATATYPE_INDEX and normalizeValue(
                    queryTerm, term2Letter(queryTerm),
                    self.useSignedInts) or queryTerm
                whereClauses.append("%s.%s" % \
                                    (self,self.columnNames[idx][0])+" = %s")
                whereParameters.append(compVal)

        return ' AND '.join(whereClauses), whereParameters
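A hedged sketch of consuming the return value: the clause string carries DB-API '%s' placeholders, so it can be appended to a SELECT and executed with the collected parameter list. The surrounding SELECT shape and the names partition and queryPattern are assumptions.

# Hypothetical use of generateWhereClause with a parameterized query.
whereClause, whereParameters = partition.generateWhereClause(queryPattern)
sql = "SELECT * FROM %s" % (partition,)
if whereClause:
    sql += " WHERE " + whereClause
cursor.execute(sql, tuple(whereParameters))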
Example #25
def GarbageCollectionQUERY(idHash,valueHash,aBoxPart,binRelPart,litPart):
    """
    Performs garbage collection on interned identifiers and their references.
    Joins the given KB partitions against the identifiers and values and
    removes the 'danglers'. This must be performed after every removal of an
    assertion and so becomes a primary bottleneck
    """
    purgeQueries = ["drop temporary table if exists danglingIds"]
    rdfTypeInt = normalizeValue(RDF.type,'U')
    idHashKeyName = idHash.columns[0][0]
    valueHashKeyName = valueHash.columns[0][0]
    idHashJoinees    = [aBoxPart,binRelPart,litPart]
    idJoinClauses = []
    idJoinColumnCandidates = []
    explicitJoins = []
    for part in idHashJoinees:
        partJoinClauses = []
        for colName in part.columnNames:
            if part.columnNames.index(colName) >= 4:
                colName,sqlType,index = colName
                if sqlType.lower()[:6]=='bigint':
                    partJoinClauses.append("%s.%s = %s.%s" % \
                            (part,colName,idHash,idHashKeyName))
                    idJoinColumnCandidates.append("%s.%s" % (part,colName))
            elif colName:
                partJoinClauses.append("%s.%s = %s.%s" % \
                                (part,colName,idHash,idHashKeyName))
                idJoinColumnCandidates.append("%s.%s" % (part,colName))
        explicitJoins.append("left join %s on (%s)" % \
                            (part,' or '.join(partJoinClauses)))
        idJoinClauses.extend(partJoinClauses)
    
    intersectionClause = " and ".join([col + " is NULL"
                                        for col in idJoinColumnCandidates])
    idGCQuery = IDENTIFIER_GARBAGE_COLLECTION_SQL%(
        idHash,
        idHashKeyName,
        idHash,
        ' '.join(explicitJoins),
        intersectionClause,
        idHash,
        idHashKeyName,
        rdfTypeInt
    )
    
    idPurgeQuery = PURGE_KEY_SQL % \
            (idHash,idHash,idHashKeyName,idHash,idHashKeyName)
    purgeQueries.append(idGCQuery)
    purgeQueries.append(idPurgeQuery)
    
    partJoinClauses = []
    idJoinColumnCandidates = []
    explicitJoins = []
    partJoinClauses.append("%s.%s = %s.%s" % \
            (litPart,litPart.columnNames[OBJECT],valueHash,valueHashKeyName))
    idJoinColumnCandidates.append("%s.%s" % \
            (litPart,litPart.columnNames[OBJECT]))
    
    intersectionClause = " and ".join([col + " is NULL" 
                            for col in idJoinColumnCandidates])
    valueGCQuery = VALUE_GARBAGE_COLLECTION_SQL%(
        valueHash,
        valueHashKeyName,
        valueHash,
        "left join %s on (%s)"%(litPart,' or '.join(partJoinClauses)),
        intersectionClause
    )
    
    valuePurgeQuery = PURGE_KEY_SQL % \
        (valueHash,valueHash,valueHashKeyName,valueHash,valueHashKeyName)
    purgeQueries.append("drop temporary table if exists danglingIds")
    purgeQueries.append(valueGCQuery)
    purgeQueries.append(valuePurgeQuery)
    return purgeQueries
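GarbageCollectionQUERY only builds SQL; the caller still has to execute the statements, and order matters because the later purge queries read the danglingIds temporary table created by the earlier ones. A minimal sketch, assuming a DB-API cursor:

# Hypothetical driver loop for the purge statements returned above.
for query in GarbageCollectionQUERY(idHash, valueHash,
                                    aBoxPart, binRelPart, litPart):
    cursor.execute(query)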
Example #26
def CountTriples(cursor, table, specifiedColumns, variableColumns, hashColumns=[], hashBuckets=200):
    d = {}
    statStartTime = time.time()
    
    specCols = []
    hardCodedSpecCols = []
    varCols = []
    #hardCodedVarCols = [] # not needed
    indexPos = {}
    distinctCount = []
        
    for i in specifiedColumns:
        if table.columnNames[i] != None:
            indexPos[i] = ('spec',len(specCols))
            
            if not i in hashColumns:        
                specCols.append(table.columnNames[i])  # use all values of this variable column
                if len(hashColumns) > 0:
                    distinctCount.append(table.columnNames[i])                                  
            else:
                # use k hash buckets as a string histogram
                specCols.append("MOD(%s,%s)"%(table.columnNames[i],hashBuckets))
                distinctCount.append(table.columnNames[i])              
        else:
            indexPos[i] = ('hard',len(hardCodedSpecCols))
            hardCodedSpecCols.append(normalizeValue(table.hardCodedResultFields[i],'U')) #BE: assuming this will only be the case for a URI (i.e. for rdf:Type)
    for i in variableColumns:
        indexPos[i] = ('var', -1)
        if table.columnNames[i] != None:
            varCols.append(table.columnNames[i])
        #else
        #    hardCodedVarCols.append(normalizeValue(table.hardCodedResultFields[i],'U') #BE: assuming this will only be the case for a URI (i.e. for rdf:Type)
        
    distinctClause = ""
    if len(distinctCount) > 0:
        distinctClause = " COUNT(DISTINCT %s) AS distinctCount," % (
            ",".join(distinctCount))
        
    #Assumes column lists in (s,p,o) order
    if len(specCols) == 0:
        cursor.execute(""" 
            SELECT COUNT(*) AS tripleCount
            FROM %s;""" % (table))        
    if len(specCols) == 1: 
        cursor.execute(""" 
            SELECT %s AS givenCol, %s COUNT(*) AS tripleCount
            FROM %s
            GROUP BY %s;""" % (specCols[0], distinctClause, table, specCols[0]))
#        for (givenCol,tripleCount) in cursor.fetchall():
#            d['%s=%s'%(specCols[0],givenCol) ] = tripleCount        
    elif len(specCols) == 2:
        cursor.execute(""" 
            SELECT %s AS givenCol1, %s AS givenCol2, %s COUNT(*) AS tripleCount
            FROM %s
            GROUP BY %s, %s;""" % (
                    specCols[0], specCols[1], distinctClause, 
                    table, specCols[0], specCols[1]))
#        for (givenCol,tripleCount) in cursor.fetchall():
#            d['%s_triples_%s=%s'%(table,givenCols[0],givenCol) ] = tripleCount
    elif len(specCols) == 3:
        cursor.execute(""" 
            SELECT %s AS givenCol1, %s AS givenCol2, %s AS givenCol3, %s COUNT(*) AS tripleCount
            FROM %s
            GROUP BY %s, %s, %s;""" % (
                specCols[0], specCols[1], specCols[2], distinctClause, 
                table, specCols[0], specCols[1], specCols[2]))
                
    for t in cursor.fetchall():
        key = []
        for i in (SUBJECT,PREDICATE,OBJECT,CONTEXT):  
            if indexPos.has_key(i):
                (type,pos) = indexPos[i]
                if type == 'spec':
                    key.append('%s=%s' % (i, t[pos]))
                elif type == 'hard':
                    key.append('%s=%s' % (i, hardCodedSpecCols[pos]))
        
        d[','.join(key)] = t[len(t)-1]
        if len(distinctCount) > 0: 
            # for histograms, also get the # distinct values in the bucket
            d[','.join(key)+'dist'] = t[len(t)-2]
 
    names = []        
    for i in (SUBJECT, PREDICATE, OBJECT, CONTEXT):            
        if indexPos.has_key(i):
            (type,pos) = indexPos[i]
            if type == 'var':
                names.append('?')
            elif type == 'spec' or type == 'hard':
                names.append(str(i))                
       
    d['countTime'] = time.time()-statStartTime
       
    if len(distinctCount) < 1: 
        print('    Entries for triple pattern (%s) index: %s' % (','.join(names), len(d)))
    else:
        print('    Entries for histogram triple pattern (%s) index: %s (%s counts)' % (
            ','.join(names), (len(d)-1)/2, len(d)-1))
    return d
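A hedged example of reading the histogram back: keys take the form 'position=bucket', with a parallel 'dist'-suffixed entry holding the distinct-value count, so a per-value selectivity estimate divides the bucket's triple count by its distinct count. The bucket size 200 matches the default above and is assumed to be among the configured histogramSizes; stats and objInt are assumed to come from GetDatabaseStats and normalizeValue respectively.

# Hypothetical selectivity lookup against the (?,?,o) histogram for the
# 'lit' partition; OBJECT is the slot index used to build the keys.
hist = stats['lit_pat']['(?,?,%sh%s)' % (OBJECT, 200)]
key = '%s=%s' % (OBJECT, objInt % 200)
bucketTriples = hist.get(key, 0)
bucketDistinct = hist.get(key + 'dist', 1)
estimatePerValue = float(bucketTriples) / max(bucketDistinct, 1)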
Example #27
def GarbageCollectionQUERY(idHash, valueHash, aBoxPart, binRelPart, litPart):
    """
    Performs garbage collection on interned identifiers and their references.
    Joins the given KB partitions against the identifiers and values and
    removes the 'danglers'. This must be performed after every removal of an
    assertion and so becomes a primary bottleneck
    """
    purgeQueries = ["drop temporary table if exists danglingIds"]
    rdfTypeInt = normalizeValue(RDF.type, 'U')
    idHashKeyName = idHash.columns[0][0]
    valueHashKeyName = valueHash.columns[0][0]
    idHashJoinees = [aBoxPart, binRelPart, litPart]
    idJoinClauses = []
    idJoinColumnCandidates = []
    explicitJoins = []
    for part in idHashJoinees:
        partJoinClauses = []
        for colName in part.columnNames:
            if part.columnNames.index(colName) >= 4:
                colName, sqlType, index = colName
                if sqlType.lower()[:6] == 'bigint':
                    partJoinClauses.append("%s.%s = %s.%s" % \
                            (part,colName,idHash,idHashKeyName))
                    idJoinColumnCandidates.append("%s.%s" % (part, colName))
            elif colName:
                partJoinClauses.append("%s.%s = %s.%s" % \
                                (part,colName,idHash,idHashKeyName))
                idJoinColumnCandidates.append("%s.%s" % (part, colName))
        explicitJoins.append("left join %s on (%s)" % \
                            (part,' or '.join(partJoinClauses)))
        idJoinClauses.extend(partJoinClauses)

    intersectionClause = " and ".join(
        [col + " is NULL" for col in idJoinColumnCandidates])
    idGCQuery = IDENTIFIER_GARBAGE_COLLECTION_SQL % (
        idHash, idHashKeyName, idHash, ' '.join(explicitJoins),
        intersectionClause, idHash, idHashKeyName, rdfTypeInt)

    idPurgeQuery = PURGE_KEY_SQL % \
            (idHash,idHash,idHashKeyName,idHash,idHashKeyName)
    purgeQueries.append(idGCQuery)
    purgeQueries.append(idPurgeQuery)

    partJoinClauses = []
    idJoinColumnCandidates = []
    explicitJoins = []
    partJoinClauses.append("%s.%s = %s.%s" % \
            (litPart,litPart.columnNames[OBJECT],valueHash,valueHashKeyName))
    idJoinColumnCandidates.append("%s.%s" % \
            (litPart,litPart.columnNames[OBJECT]))

    intersectionClause = " and ".join(
        [col + " is NULL" for col in idJoinColumnCandidates])
    valueGCQuery = VALUE_GARBAGE_COLLECTION_SQL % (
        valueHash, valueHashKeyName, valueHash, "left join %s on (%s)" %
        (litPart, ' or '.join(partJoinClauses)), intersectionClause)

    valuePurgeQuery = PURGE_KEY_SQL % \
        (valueHash,valueHash,valueHashKeyName,valueHash,valueHashKeyName)
    purgeQueries.append("drop temporary table if exists danglingIds")
    purgeQueries.append(valueGCQuery)
    purgeQueries.append(valuePurgeQuery)
    return purgeQueries
Example #28
def GetDatabaseStats(graph):
    print('Gathering statistics...')
    startTime = time.time()

    stats = dict()
    stats[
        'triples'] = 0  #len(graph) #ISSUE: len(graph) only gives count for default graph???

    stats['cacheName'] = graph.store.identifier + "-" + \
                        str(normalizeValue(graph.store.configuration, "L"))
    stats['storeName'] = graph.store.identifier
    stats['internedId'] = graph.store._internedId
    stats['config'] = graph.store.configuration

    tables = dict(type=graph.store.aboxAssertions,
                  lit=graph.store.literalProperties,
                  rel=graph.store.binaryRelations,
                  all=graph.store._internedId + '_all')
    # FIXME Unused code
    realTables = dict(type=graph.store.aboxAssertions,
                      lit=graph.store.literalProperties,
                      rel=graph.store.binaryRelations)
    # columnNames[OBJECT]

    cursor = graph.store._db.cursor()

    # distinct num. of subjects, predicates, & objects
    tableType = 'all'
    statStartTime = time.time()
    stats['subjects'] = CountDistint(cursor, tables[tableType], 'subject')
    stats['predicates'] = CountDistint(cursor, tables[tableType], 'predicate')
    stats['objects'] = CountDistint(cursor, tables[tableType], 'object')
    stats['distTime'] = time.time() - statStartTime

    for tableType in ['lit', 'rel', 'type']:

        table = tables[tableType]

        # total # triples
        cursor.execute(""" SELECT COUNT(*) FROM %s """ % table)
        triples = cursor.fetchone()[0]
        stats[tableType + '_triples'] = triples
        stats['triples'] = stats['triples'] + triples

        print('  Processing table %s: %s triples...' % (tableType, triples))

        # distinct num. of subjects, predicates, & objects
        statStartTime = time.time()
        stats[tableType + '_subjects'] = CountDistint(
            cursor, table, table.columnNames[SUBJECT])
        stats[tableType + '_predicates'] = CountDistint(
            cursor, table, table.columnNames[PREDICATE])
        stats[tableType + '_objects'] = CountDistint(cursor, table,
                                                     table.columnNames[OBJECT])
        stats[tableType + '_distTime'] = time.time() - statStartTime

        # subject/object counts for predicates
        statStartTime = time.time()
        stats[tableType + '_colDist'] = {}
        stats[tableType + '_colDist']['obj_for_pred'] = CountDistinctForColumn(
            cursor, table, PREDICATE, OBJECT)
        stats[tableType + '_colDist']['sub_for_pred'] = CountDistinctForColumn(
            cursor, table, PREDICATE, SUBJECT)
        stats[tableType + '_colDistTime'] = time.time() - statStartTime

        # triple pattern occurrence counts
        statStartTime = time.time()
        stats[tableType + '_pat'] = {}
        stats[tableType + '_pat']['(%s,?,?)' % (SUBJECT)] = CountTriples(
            cursor, table, [SUBJECT], [PREDICATE, OBJECT])
        stats[tableType + '_pat']['(?,%s,?)' % (PREDICATE)] = CountTriples(
            cursor, table, [PREDICATE], [SUBJECT, OBJECT])
        stats[tableType + '_pat']['(?,?,%s)' % (OBJECT)] = CountTriples(
            cursor, table, [OBJECT], [SUBJECT, PREDICATE])
        stats[tableType +
              '_pat']['(%s,%s,?)' % (SUBJECT, PREDICATE)] = CountTriples(
                  cursor, table, [SUBJECT, PREDICATE], [OBJECT])
        stats[tableType +
              '_pat']['(?,%s,%s)' % (PREDICATE, OBJECT)] = CountTriples(
                  cursor, table, [PREDICATE, OBJECT], [SUBJECT])
        stats[tableType +
              '_pat']['(%s,?,%s)' % (SUBJECT, OBJECT)] = CountTriples(
                  cursor, table, [SUBJECT, OBJECT], [PREDICATE])
        stats[tableType + '_patTime'] = time.time() - statStartTime

    # predicate co-occurrence
    statStartTime = time.time()
    # stats['join_s-s'] = PredicateJoinCount(cursor, graph, realTables, SUBJECT, SUBJECT)
    # stats['join_s-o'] = PredicateJoinCount(cursor, graph, realTables, SUBJECT, OBJECT)
    # stats['join_o-s'] = PredicateJoinCount(cursor, graph, realTables, OBJECT, SUBJECT)
    # stats['join_o-o'] = PredicateJoinCount(cursor, graph, realTables, OBJECT, OBJECT)
    stats['joinTime'] = time.time() - statStartTime

    cursor.close()

    endTime = time.time() - startTime
    print('Statistics gathered in %s s' % (endTime))
    stats['elapsedTime'] = endTime

    return stats