Example #1
class MannheimAtomizer(object):
    """
        Atomizes a table, based on its subject column, into N
        two-column tables, where N is (number of columns - 1).
    """
    def __init__(self):
        self.subjectColumnIdentifier = SimpleIdentifier()
        self.logger = Logger().getLogger(__name__)

    def atomizeTable(self, table):
        try:
            subjectColumnNumber = self.subjectColumnIdentifier.identifySubjectColumn(table)
        except SubjectColumnNotFoundError as e:
            self.logger.error("Subject column not found", exc_info=True)
            subjectColumnNumber = 0
        relations = table.getTable()
        atomicTables = []
        subjectCol = relations[subjectColumnNumber]
        # Pair the subject column with every other column.
        for index in range(len(relations)):
            if index != subjectColumnNumber:
                otherCol = relations[index]
                atomicTable = numpy.array([subjectCol, otherCol])
                atomicTables.append(atomicTable)
        if not atomicTables:
            # Log the failure details before raising.
            self.logger.error("Table could not be atomized!")
            self.logger.error("%s" % (relations,))
            raise CouldNotAtomizeError("Table could not be atomized!")
        return atomicTables
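A minimal sketch of what atomizeTable produces, using plain Python lists in place of the repo's Table object (the city data below is made up for illustration):

import numpy

# Columns of a toy table: the subject column plus two attribute columns.
subjectCol = ["Berlin", "Paris"]
population = ["3644826", "2165423"]
country = ["Germany", "France"]
relations = [subjectCol, population, country]

subjectColumnNumber = 0
atomicTables = [
    numpy.array([relations[subjectColumnNumber], relations[index]])
    for index in range(len(relations))
    if index != subjectColumnNumber
]
# -> two 2x2 arrays: (subject, population) and (subject, country)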
class SimpleCachePropertyMapper(object):
    """
        Performs quite poorly:
        the current precision is 47.6%, while the achievable maximum is 71%.
    """

    def __init__(self):
        self.logger = Logger().getLogger(__name__)
        self.dlIdentifier = DistantSupervisionIdentifier()

    def mapProperties(self, table):
        tableData = table.getData()
        tableHeader = table.getHeader()
        tableId = table.id
        cacheFile = os.path.join(cacheFolder, tableId + ".relations.cache")
        subjectColumn = self.dlIdentifier.identifySubjectColumn(table)

        self.logger.debug("Identifying properties for a table %s"%(tableId))

        if os.path.exists(cacheFile):
            with open(cacheFile, 'rb') as f:
                relations = pickle.load(f)
        else:
            raise RelationsDataStructureNotFound("Could not find relations cache for %s" % (str(tableId),))

        self.executionTimeFull = 0
        self.startTime = time.time()
        # Initialize the candidate property lists per non-subject column.
        nonSubjectColumns = range(0, len(relations[0]))
        nonSubjectColumns.remove(subjectColumn)
        properties = collections.defaultdict(list)
        for nonSubjectColumn in nonSubjectColumns:
            properties[nonSubjectColumn] = []

        # Aggregate all candidate properties per column.
        for row in relations:
            for nonSubjectColumn in nonSubjectColumns:
                # These are the properties for the atomic table with h_i, i = nonSubjectColumn.
                try:
                    properties[nonSubjectColumn].append(row[subjectColumn][nonSubjectColumn])
                except (KeyError, IndexError, TypeError):
                    # Skip cells for which no relations were extracted.
                    pass

        # Flatten the candidate lists and pick the most frequent property per column.
        topProperties = []
        for nonSubjectColumn in nonSubjectColumns:
            properties[nonSubjectColumn] = [item for sublist in properties[nonSubjectColumn] for item in sublist]
            try:
                topProperty = Counter(properties[nonSubjectColumn]).most_common(1)[0][0]
                topProperties.append((topProperty, nonSubjectColumn))
            except IndexError:
                self.logger.debug("No property identified for column %s" % (nonSubjectColumn,))

        self.endTime = time.time()
        self.executionTimeFull = self.endTime - self.startTime

        # Check whether the seed properties contain the properties we are trying to find.
        self.seedListContains = 0
        for _property in table.properties:
            if _property['uri'] in properties[_property['columnIndex']]:
                self.seedListContains += 1

        return topProperties
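The core of mapProperties is a majority vote: the candidate properties gathered per non-subject column are flattened and the most frequent one wins. A standalone sketch with hypothetical candidates:

from collections import Counter

# Hypothetical per-row candidate lists for two non-subject columns.
properties = {
    1: [["dbo:populationTotal"], ["dbo:populationTotal", "dbo:areaTotal"]],
    2: [["dbo:country"], ["dbo:country"]],
}
topProperties = []
for column in sorted(properties):
    # Flatten the per-row candidate lists, then take the most common entry.
    flattened = [item for sublist in properties[column] for item in sublist]
    if flattened:
        topProperty = Counter(flattened).most_common(1)[0][0]
        topProperties.append((topProperty, column))
# -> [("dbo:populationTotal", 1), ("dbo:country", 2)]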
Example #6
class SimplePropertyMapper(object):
    def __init__(self):
        self.logger = Logger().getLogger(__name__)
        self.agdistisIdentifier = AgdistisIdentifier()
        self.dbpediaSparql = SPARQLWrapper(dbpediaSparqlEndpointUri)
        self.dbpediaSparql.setReturnFormat(JSON)
        self.propertySearch = PropertySearchDbpediaSparql()

    def parseResults(self, results, variableName="property"):
        """
            TODO: refactor into a separate class.
        """
        properties = []
        for result in results:
            properties.append(result[variableName]["value"])
        return properties

    def getClassForEntity(self, entity):
        """
            TODO: refactor into a separate class.
        """
        self.dbpediaSparql.setQuery(
            u"""
            SELECT DISTINCT ?class
            WHERE {
                <%s> a ?class .
            }
        """
            % (entity,)
        )
        results = self.dbpediaSparql.query().convert()["results"]["bindings"]
        return self.parseResults(results, variableName="class")

    def getEntities(self, tableId):
        entitiesCacheFile = os.path.join(cacheFolder, tableId + ".entities.cache")
        if os.path.exists(entitiesCacheFile):
            with open(entitiesCacheFile, "rb") as f:
                return pickle.load(f)
        else:
            raise EntitiesDataStructureNotFound(
                "Entities data structure not available. Did you run subject column identification?"
            )

    def getEntitiesWithClasses(self, tableId):
        entities = self.getEntities(tableId)
        entitiesWithClassesCache = os.path.join(cacheFolder, tableId + ".entities.with.classes.cache")
        if os.path.exists(entitiesWithClassesCache):
            with open(entitiesWithClassesCache, "rb") as f:
                entities = pickle.load(f)
        else:
            # Replace each entity URL with a (classes, URL) tuple.
            for rowIndex, entityRow in enumerate(entities):
                for columnIndex, entity in enumerate(entityRow):
                    for entityIndex, _entity in enumerate(entity):
                        entity[entityIndex] = (self.getClassForEntity(_entity), _entity)
            with open(entitiesWithClassesCache, "wb") as f:
                pickle.dump(entities, f)
        return entities

    def getClasses(self, entities, numberOfColumns):
        # Build independent lists; [[]] * n would alias one list n times.
        classes = [[] for _ in range(numberOfColumns)]
        for rowIndex, entityRow in enumerate(entities):
            for columnIndex, entity in enumerate(entityRow):
                for entityIndex, _entity in enumerate(entity):
                    (_class, entityUrl) = _entity
                    try:
                        classes[columnIndex].append(_class)
                    except IndexError as e:
                        self.logger.debug("%s" % (str(e),))
        return classes

    def getMainClassForSubjectColumn(self, classes, subjectColumn):
        classesSubjectColumn = [item for sublist in classes[subjectColumn] for item in sublist]
        try:
            classCount = len(classesSubjectColumn)
            (mainClass, mainClassCount) = Counter(classesSubjectColumn).most_common(1)[0]
            mainClassScore = float(mainClassCount) / classCount * 100
        except IndexError:
            self.logger.debug("Main class could not be identified")
            mainClass = ""
        return mainClass

    def filterNonMainClassEntities(self, entities, mainClass, subjectColumn):
        for rowIndex, entityRow in enumerate(entities):
            for columnIndex, entity in enumerate(entityRow):
                if columnIndex != subjectColumn:
                    continue
                for entityIndex, _entity in enumerate(entity):
                    (_class, entityUrl) = _entity
                    if mainClass not in _class:
                        entities[rowIndex][columnIndex][entityIndex] = (None, None)
        return entities

    def findProperties(self, tableId, tableData, entities, subjectColumn, nonSubjectColumns):
        propertyCache = os.path.join(cacheFolder, tableId + ".property.star.cache")
        properties = collections.defaultdict(dict)
        if os.path.exists(propertyCache):
            with open(propertyCache, "rb") as f:
                properties = pickle.load(f)
        else:
            for rowIndex, entityRow in enumerate(entities):
                for columnIndex, entity in enumerate(entityRow):
                    if columnIndex != subjectColumn:
                        continue
                    if len(entity) <= 0:
                        continue
                    for entityIndex, _entity in enumerate(entity):
                        (_class, entityUrl) = _entity
                        if entityUrl is not None:
                            for nonSubjectColumn in nonSubjectColumns:
                                cellValue = tableData[rowIndex][nonSubjectColumn]
                                properties[rowIndex][nonSubjectColumn] = self.propertySearch.uriLiteralSearch(
                                    entityUrl, cellValue
                                )
            with open(propertyCache, "wb") as f:
                pickle.dump(properties, f)
        return properties

    def aggregateProperties(self, properties, nonSubjectColumns):
        propertiesAggregate = collections.defaultdict(dict)
        for nonSubjectColumn in nonSubjectColumns:
            propertiesAggregate[nonSubjectColumn] = []
        for row in properties:
            for nonSubjectColumn in nonSubjectColumns:
                propertiesAggregate[nonSubjectColumn].append(properties[row][nonSubjectColumn])

        for nonSubjectColumn in nonSubjectColumns:
            propertiesAggregate[nonSubjectColumn] = [
                item for sublist in propertiesAggregate[nonSubjectColumn] for item in sublist
            ]

        return propertiesAggregate

    def getTopProperties(self, propertiesAggregate, nonSubjectColumns, threshold):
        topProperties = []
        for nonSubjectColumn in nonSubjectColumns:
            try:
                (topProperty, support) = Counter(propertiesAggregate[nonSubjectColumn]).most_common(1)[0]
                # Convert the absolute support into a percentage.
                support = (float(support) / len(propertiesAggregate[nonSubjectColumn])) * 100
                if support > threshold:
                    topProperties.append({"uri": topProperty, "columnIndex": nonSubjectColumn})
            except IndexError:
                self.logger.debug("No property identified for column %s" % (nonSubjectColumn))
        return topProperties

    def calculateScores(self, propertiesAggregate, nonSubjectColumns):
        # Score each candidate property by its frequency in its column.
        scores = collections.defaultdict(dict)
        for nonSubjectColumn in nonSubjectColumns:
            scores[nonSubjectColumn] = Counter(propertiesAggregate[nonSubjectColumn])

        return scores

    def getScores(self, table, rowsToDisambiguate=20, threshold=10, support=0, connectivity=0):
        tableData = table.getData()
        tableHeader = table.getHeader()
        tableId = table.id
        numberOfRows = len(tableData)
        numberOfColumns = len(tableData[0])
        subjectColumn = table.subjectColumn
        if subjectColumn is None or subjectColumn == -1:
            return []

        nonSubjectColumns = range(0, len(tableData[0]))
        nonSubjectColumns.remove(subjectColumn)

        self.logger.debug("Identifying properties for a table %s" % (tableId))

        entities = self.getEntitiesWithClasses(tableId)
        classes = self.getClasses(entities, numberOfColumns)
        mainClass = self.getMainClassForSubjectColumn(classes, subjectColumn)
        entities = self.filterNonMainClassEntities(entities, mainClass, subjectColumn)
        properties = self.findProperties(tableId, tableData, entities, subjectColumn, nonSubjectColumns)
        propertiesAggregate = self.aggregateProperties(properties, nonSubjectColumns)
        propertyScores = self.calculateScores(propertiesAggregate, nonSubjectColumns)

        return propertyScores

    def mapProperties(self, table, rowsToDisambiguate=20, threshold=10, support=0, connectivity=0):
        tableData = table.getData()
        tableHeader = table.getHeader()
        tableId = table.id
        numberOfRows = len(tableData)
        numberOfColumns = len(tableData[0])
        subjectColumn = table.subjectColumn
        if subjectColumn is None or subjectColumn == -1:
            return []

        nonSubjectColumns = range(0, len(tableData[0]))
        nonSubjectColumns.remove(subjectColumn)

        self.logger.debug("Identifying properties for a table %s" % (tableId))

        entities = self.getEntitiesWithClasses(tableId)
        classes = self.getClasses(entities, numberOfColumns)
        mainClass = self.getMainClassForSubjectColumn(classes, subjectColumn)
        entities = self.filterNonMainClassEntities(entities, mainClass, subjectColumn)
        properties = self.findProperties(tableId, tableData, entities, subjectColumn, nonSubjectColumns)
        propertiesAggregate = self.aggregateProperties(properties, nonSubjectColumns)
        propertyScores = self.calculateScores(propertiesAggregate, nonSubjectColumns)

        topProperties = self.getTopProperties(propertiesAggregate, nonSubjectColumns, threshold)

        return topProperties
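One pitfall worth noting from getClasses above: in Python, [[]] * n replicates a single list object n times, so appending to one "column" would appear in all of them. A quick demonstration of the difference:

# Aliasing pitfall: all three entries share one underlying list.
broken = [[]] * 3
broken[0].append("dbo:Place")
# broken == [["dbo:Place"], ["dbo:Place"], ["dbo:Place"]]

# Independent lists, as used in getClasses.
correct = [[] for _ in range(3)]
correct[0].append("dbo:Place")
# correct == [["dbo:Place"], [], []]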
class DistantSupervisionIdentifier(object):
    def __init__(self):
        self.logger = Logger().getLogger(__name__)
        self.agdistis = AgdistisTableIdentifier()
        self.supportIdentifier = SupportIdentifier()
        # NOTE: identifySubjectColumn also uses self.propertyTableSearch,
        # which is not initialized in this excerpt.

    def identifySubjectColumn(
        self, table, rowsToAnalyze=20, rowsFromCache=None, support=0, connectivity=0, threshold=0
    ):
        """
            rowsToAnalyze -- how many rows should be evaluated
            rowsFromCache -- can be used to reduce the number of rows read from the cache
            connectivity -- the minimum number of relations the subject column must have (absolute number)
            threshold -- percentage of analyzed rows for which the same subject column must be identified, e.g. 80% means the same subject column was identified for 80% of the analyzed rows
        """
        tableData = table.getData()
        tableHeader = table.getHeader()
        tableId = table.id
        numberOfRows = len(tableData)
        numberOfColumns = len(tableData[0])

        self.logger.debug(tableId)

        self.executionStartTimePoint = 0
        self.executionEndTimePoint = 0
        self.executionTimeFull = 0
        self.executionTimePure = 0  # without querying and disambiguation
        self.queryTime = 0
        self.agdistisTime = 0

        self.executionStartTimePoint = time.time()
        # identify entities
        # TODO: get the score from agdistis
        agdistisStartTimePoint = time.time()
        entities = self.agdistis.disambiguateTable(table)
        agdistisEndTimePoint = time.time()
        self.agdistisTime = agdistisEndTimePoint - agdistisStartTimePoint

        # TODO: rename columnScores to supports
        columnScores = self.supportIdentifier.calculateSupport(entities)
        # The support-based approach ends here; refactor into its own class.
        relations = self.propertyTableSearch.findRelationsForTable(table, entities)

        # TODO: try a pure connectivity approach: calculate the connectivity
        # for all rows and take the average. What we have is a boolean
        # classifier; a linear combination is better. Use ten-fold cross
        # validation (or the inverse) and try different weights:
        # a*connectivity + (1-a)*support, equivalent to a*connectivity + b*support.

        # Vote per analyzed row. subjectColumns (the per-row subject column
        # identifications) is assumed to come from the steps above; it is
        # not defined in this excerpt.
        subjectColumnScores = [0] * numberOfColumns
        for subjectColumn in subjectColumns:
            if subjectColumn is not None:
                subjectColumnScores[subjectColumn] += 1

        # Normalize to percent of analyzed rows.
        for columnIndex, subjectColumnScore in enumerate(subjectColumnScores):
            subjectColumnScores[columnIndex] = float(subjectColumnScore) / numberOfRows * 100

        # FIXME (flagged as wrong in the source): select all columns whose
        # score meets the threshold, then return the first of them below.
        subjectColumn = [columnIndex for columnIndex, columnScore in enumerate(subjectColumnScores) if columnScore >= threshold]

        self.executionEndTimePoint = time.time()
        self.executionTimeFull = self.executionEndTimePoint - self.executionStartTimePoint
        self.executionTimePure = self.executionTimeFull - self.queryTime - self.agdistisTime

        if not subjectColumn:
            return None
        else:
            return subjectColumn[0]
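The voting and normalization step of identifySubjectColumn can be illustrated in isolation; the per-row votes below are hypothetical:

# Per-row votes: the column index identified as subject column for each
# analyzed row (None where no decision was possible).
subjectColumns = [0, 0, 1, 0, None]
numberOfColumns = 3
numberOfRows = len(subjectColumns)

subjectColumnScores = [0] * numberOfColumns
for vote in subjectColumns:
    if vote is not None:
        subjectColumnScores[vote] += 1

# Normalize to percent of analyzed rows.
subjectColumnScores = [float(s) / numberOfRows * 100 for s in subjectColumnScores]
# -> [60.0, 20.0, 0.0]

threshold = 50
candidates = [i for i, score in enumerate(subjectColumnScores) if score >= threshold]
subjectColumn = candidates[0] if candidates else None  # -> 0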
Example #9
class PropertySearchDbpediaSparql(object):
    """
        Takes two entities (a URI and a URI or literal)
        and returns all properties that (possibly) connect them.
    """
    def __init__(self):
        self.logger = Logger().getLogger(__name__)
        self.dbpediaSparql = SPARQLWrapper(dbpediaSparqlEndpointUri)
        self.dbpediaSparql.setReturnFormat(JSON)

    def search(self, s, o):
        if o.startswith('http'):
            return self.uriUriSimple(s, o)
        else:
            return self.uriLiteralSimple(s, o)

    def uriLiteralSearch(self, s, o):
        properties = []
        try:
            o = o.decode('utf-8')
        except UnicodeDecodeError as e:
            self.logger.debug("Could not decode o for uriLiteralSearch")
            self.logger.debug(str(e))
            o = ""
        properties.append(self.uriLiteralSimple(s,o))
        properties.append(self.uriLiteralRegex(s,o))
        properties.append(self.uriLiteralRegexReverse(s,o))
        properties = [item for sublist in properties for item in sublist]
        return list(set(properties))

    def uriUriSearch(self, s, o):
        properties = []
        properties.append(self.uriUriSimple(s,o))
        return properties

    def uriUriSimple(self, s, o):
        self.dbpediaSparql.setQuery(u"""
            SELECT DISTINCT ?property
            WHERE { <%s> ?property <%s> .}
        """ % (s, o,))
        self.queryDebugMessage("uriUriSimple", s, o, self.dbpediaSparql.queryString)
        results = self.dbpediaSparql.query().convert()['results']['bindings']
        return self.parseResults(results)

    def uriLiteralSimple(self, s, o):
        o = self.clearLiteral(o)
        if o == "" or o == None:
            return []
        self.dbpediaSparql.setQuery(u"""
            SELECT DISTINCT ?property
            WHERE { <%s> ?property "%s"@en .}
        """ % (s, o,))
        self.queryDebugMessage("uriLiteralSimple", s, o, self.dbpediaSparql.queryString)
        results = self.dbpediaSparql.query().convert()['results']['bindings']
        return self.parseResults(results)

    def uriLiteralRegex(self, s, o):
        o = self.clearLiteral(o)
        if o == "" or o == None:
            return []
        self.dbpediaSparql.setQuery(u"""
            SELECT DISTINCT ?property
            WHERE {
                <%s> ?property ?o .
                FILTER regex(?o, ".*%s.*", "i")
            }
        """ % (s, o,))
        self.queryDebugMessage("uriLiteralRegex", s, o, self.dbpediaSparql.queryString)
        results = self.dbpediaSparql.query().convert()['results']['bindings']
        return self.parseResults(results)

    def uriLiteralRegexReverse(self, s, o):
        o = self.clearLiteral(o)
        if o == "" or o == None:
            return []
        self.dbpediaSparql.setQuery(u"""
            SELECT DISTINCT ?property
            WHERE {
                ?o ?property <%s> .
                FILTER regex(?o, ".*%s.*", "i")
            }
        """ % (s, o,))
        self.queryDebugMessage("uriLiteralRegexReverse", s, o, self.dbpediaSparql.queryString)
        results = self.dbpediaSparql.query().convert()['results']['bindings']
        return self.parseResults(results)

    def uriLiteralPathRegex(self, s, o):
        """
            Due to the small diameter of the graph, searching for arbitrary paths introduces noise; the F-measure will most likely drop if this is used together with the simple property search.
        """
        o = self.clearLiteral(o)
        if o == "" or o == None:
            return []
        self.dbpediaSparql.setQuery(u"""
            SELECT DISTINCT ?property
            WHERE {
                <%s> ?property ?obj .
                ?obj ?p ?o .
                FILTER regex(?o, "%s", "i")
            }
        """ % (s, o,))
        self.queryDebugMessage("uriLiteralPathRegex", s, o, self.dbpediaSparql.queryString)
        results = self.dbpediaSparql.query().convert()['results']['bindings']
        return self.parseResults(results)

    def literalUriPathRegexReverse(self, s, o):
        o = self.clearLiteral(o)
        if o == "" or o == None:
            return []
        self.dbpediaSparql.setQuery(u"""
            SELECT DISTINCT ?property
            WHERE {
                ?obj ?property <%s> .
                ?obj ?p ?o .
                FILTER regex(?o, "%s", "i")
            }
        """ % (s, o,))
        self.queryDebugMessage("literalUriPathRegexReverse", s, o, self.dbpediaSparql.queryString)
        results = self.dbpediaSparql.query().convert()['results']['bindings']
        return self.parseResults(results)

    def clearLiteral(self, string):
        # Strip quotes and regex metacharacters that would break the SPARQL queries.
        string = re.sub(r'[{}|*?()\[\]!"]', '', string)
        string = string.replace('&nbsp;', '')
        string = string.strip()
        return string

    def queryDebugMessage(self, functionName, s, o, queryString):
        self.logger.debug("%s ?s: %s ?o: %s" %(functionName, s, o, ))
        self.logger.debug("SPARQL query: %s" %(queryString, ))

    def parseResults(self, results, variableName="property"):
        properties = []
        for result in results:
            properties.append(result[variableName]['value'])
        return properties
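For reference, the uriUriSimple pattern can be run standalone against the public DBpedia endpoint (the repo's dbpediaSparqlEndpointUri is assumed to point at a comparable SPARQL service):

from SPARQLWrapper import SPARQLWrapper, JSON

sparql = SPARQLWrapper("https://dbpedia.org/sparql")
sparql.setReturnFormat(JSON)
sparql.setQuery(u"""
    SELECT DISTINCT ?property
    WHERE { <http://dbpedia.org/resource/Berlin> ?property <http://dbpedia.org/resource/Germany> .}
""")
results = sparql.query().convert()['results']['bindings']
properties = [result['property']['value'] for result in results]
# e.g. [u'http://dbpedia.org/ontology/country', ...]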