Esempio n. 1
0
class SupportIdentifier(object):
    """
        Picks a subject column from the "support" of every column, i.e. the
        percentage of rows in which the column holds a non-empty entity.
    """

    def __init__(self):
        self.logger = Logger().getLogger(__name__)
        self.agdistis = AgdistisTableIdentifier()

    def calculateSupport(self, entities):
        """
            support -- percentage of entities to occur in a column to be considered a candidate for a subject column (columns without entities are not subject column per definition)

            entities -- sequence of rows; every row is a sequence with one item
                        per column and an item counts as "present" when its
                        len() is greater than zero

            Returns a list with one percentage (0.0 - 100.0) per column. The
            column count is taken from the first row. An empty `entities`
            sequence yields an empty list.
        """
        numberOfRows = len(entities)
        if numberOfRows == 0:
            # Nothing to measure; also avoids entities[0] and a division by zero.
            return []

        numberOfColumns = len(entities[0])
        hits = [0] * numberOfColumns
        for entityRow in entities:
            for columnIndex, entity in enumerate(entityRow):
                if len(entity) > 0:
                    hits[columnIndex] += 1

        # float() keeps the division exact under Python 2 as well.
        return [float(hitCount) / numberOfRows * 100 for hitCount in hits]

    def identifySubjectColumn(self, table, supportCeil, supportFloor):
        """
            Returns the index of the column with the highest support among the
            columns whose support lies strictly between supportFloor and
            supportCeil.

            Columns outside that open interval are scored 0, so when no column
            qualifies the result is index 0 (the first zero).

            Raises ValueError when the table yields no columns at all.
        """
        supports = self.getSupport(table)
        # Both bounds are exclusive: a support equal to the floor or the ceil is discarded.
        candidates = [support if supportFloor < support < supportCeil else 0 for support in supports]
        return candidates.index(max(candidates))

    def getSupport(self, table):
        """
            Disambiguates the table with self.agdistis and returns the
            per-column support of the resulting entities.
        """
        entities = self.agdistis.disambiguateTable(table)
        return self.calculateSupport(entities)
Esempio n. 2
0
 def __init__(self):
     """Create the logger and the AgdistisTableIdentifier instance this object works with."""
     self.logger = Logger().getLogger(__name__)
     self.agdistis = AgdistisTableIdentifier()
Esempio n. 3
0
 def __init__(self):
     """Create the logger plus the AgdistisTableIdentifier and PropertyTableSearch collaborators."""
     self.logger = Logger().getLogger(__name__)
     self.agdistis = AgdistisTableIdentifier()
     self.propertyTableSearch = PropertyTableSearch()
class DistantSupervisionIdentifier(object):
    """
        Identifies the subject column of a table by combining column support
        with the relations found between the columns of every row.

        After identifySubjectColumn() the timing attributes
        (executionTimeFull, executionTimePure, agdistisTime, queryTime, ...)
        describe the last call.
    """

    def __init__(self):
        self.logger = Logger().getLogger(__name__)
        self.agdistis = AgdistisTableIdentifier()
        self.supportIdentifier = SupportIdentifier()
        # identifySubjectColumn() looks relations up through this collaborator;
        # it was previously never assigned, which raised AttributeError.
        self.propertyTableSearch = PropertyTableSearch()

    def identifySubjectColumn(
        self, table, rowsToAnalyze=20, rowsFromCache=None, support=0, connectivity=0, threshold=0
    ):
        """
            rowsToAnalyze -- how many rows should be evaluated (currently not used, kept for interface compatibility)
            rowsFromCache -- can be used to reduce number of rows to be read from cache (currently not used, kept for interface compatibility)
            support -- minimum support (percentage, see SupportIdentifier.calculateSupport) a column needs to be a candidate
            connectivity -- a number of relations subject column should have at least (absolute number)
            threshold -- percentage of subject columns identified inside the analyzed part of the table (divided by the total number of rows), i.e. 80% means that the same subject column identified for 80% of rows

            Returns the index of the subject column, or None when the table is
            empty or no column collects enough row votes.

            NOTE(review): the previous revision stopped in two ipdb
            breakpoints and then referenced the undefined names
            `subjectColumns` / `subjectColumn`. The per-row voting and the
            final selection below are reconstructed from the parameter
            descriptions above -- confirm against the intended algorithm.
        """
        tableData = table.getData()
        tableId = table.id
        numberOfRows = len(tableData)

        self.logger.debug(tableId)

        self.executionStartTimePoint = 0
        self.executionEndTimePoint = 0
        self.executionTimeFull = 0
        self.executionTimePure = 0  # without querying and disambiguation
        # NOTE(review): queryTime is never measured here, so executionTimePure
        # only excludes the disambiguation time.
        self.queryTime = 0
        self.agdistisTime = 0

        if numberOfRows == 0 or len(tableData[0]) == 0:
            # No cells, hence no subject column; avoids tableData[0] and max([]) failures.
            return None
        numberOfColumns = len(tableData[0])

        self.executionStartTimePoint = time.time()
        # identify entities
        # TODO: get the score from agdistis
        agdistisStartTimePoint = time.time()
        entities = self.agdistis.disambiguateTable(table)
        agdistisEndTimePoint = time.time()
        self.agdistisTime = agdistisEndTimePoint - agdistisStartTimePoint

        # TODO: rename columnScores to supports
        columnScores = self.supportIdentifier.calculateSupport(entities)
        relations = self.propertyTableSearch.findRelationsForTable(table, entities)

        # Every row votes for at most one column.
        votes = [0] * numberOfColumns
        for relation in relations:
            rowSubjectColumn = self._findRowSubjectColumn(relation, columnScores, support, connectivity)
            if rowSubjectColumn is not None:
                votes[rowSubjectColumn] += 1

        # Normalize to a percentage of all rows of the table.
        subjectColumnScores = [float(vote) / numberOfRows * 100 for vote in votes]

        # Take the most voted column (lowest index on ties), but only if it
        # received at least one vote and reaches the requested threshold.
        subjectColumn = None
        bestScore = max(subjectColumnScores)
        if bestScore > 0 and bestScore >= threshold:
            subjectColumn = subjectColumnScores.index(bestScore)

        self.executionEndTimePoint = time.time()
        self.executionTimeFull = self.executionEndTimePoint - self.executionStartTimePoint
        self.executionTimePure = self.executionTimeFull - self.queryTime - self.agdistisTime

        return subjectColumn

    def _findRowSubjectColumn(self, relation, columnScores, support, connectivity):
        """
            Returns the column of a single row that has the most relations to
            the other columns, or None when no column qualifies.

            relation -- mapping columnIndex -> {otherColumnIndex -> sequence of relations}
                        (the shape ConnectivityIdentifier.calculateConnectivity
                        iterates over; presumably what findRelationsForTable
                        yields per row -- verify)

            A column qualifies when its support is at least `support`, it has
            at least one relation and at least `connectivity` relations.
        """
        bestColumn = None
        bestRelationCount = 0
        # sorted() makes ties deterministic: the lowest column index wins.
        for columnIndex in sorted(relation):
            if columnScores[columnIndex] < support:
                continue
            relationCount = sum(
                len(relation[columnIndex][otherColumnIndex]) for otherColumnIndex in relation[columnIndex]
            )
            if relationCount >= connectivity and relationCount > bestRelationCount:
                bestColumn = columnIndex
                bestRelationCount = relationCount
        return bestColumn
Esempio n. 5
0
class ConnectivityIdentifier(object):
    """
        Picks a subject column from the "connectivity" of every column, i.e.
        the share of columns it has at least one relation to, averaged over
        all rows.
    """

    def __init__(self):
        self.logger = Logger().getLogger(__name__)
        self.agdistis = AgdistisTableIdentifier()
        self.propertyTableSearch = PropertyTableSearch()

    def calculateConnectivity(self, relations, applyWeights):
        """
            relations -- sequence with one entry per row; every entry maps
                         columnIndex -> {otherColumnIndex -> sequence of relations}
            applyWeights -- when true, the connectivity of a column inside a
                            row is scaled by its number of relations divided
                            by the highest number of relations of any column
                            in that row

            Returns a list with one averaged connectivity value per column.
            The column count is taken from the first row. Empty `relations`
            or a first row without columns yield an empty list.
        """
        numberOfRows = len(relations)
        if numberOfRows == 0:
            # Avoids relations[0] and the division by zero in the normalization.
            return []
        numberOfColumns = len(relations[0])
        if numberOfColumns == 0:
            # max() below cannot handle an empty weight list.
            return []

        connectivity = [0] * numberOfColumns
        for relation in relations:
            rowWeights = [0] * numberOfColumns
            rowConnectivity = [0] * numberOfColumns
            for columnIndex in relation:
                connectedColumns = 0
                weight = 0
                for otherColumnIndex in relation[columnIndex]:
                    relationCount = len(relation[columnIndex][otherColumnIndex])
                    if relationCount > 0:
                        connectedColumns += 1
                    weight += relationCount
                # NOTE(review): the divisor is the total column count, so a
                # column linked to every *other* column scores (n-1)/n unless
                # the row mapping also lists the column itself -- confirm.
                rowConnectivity[columnIndex] += float(connectedColumns) / numberOfColumns
                rowWeights[columnIndex] += weight

            # A row without any relation has nothing to weight by; fall back
            # to the unweighted value instead of dividing by zero.
            maximumWeight = max(rowWeights)
            useWeights = applyWeights and maximumWeight != 0
            for columnIndex, weight in enumerate(rowWeights):
                if useWeights:
                    connectivity[columnIndex] += rowConnectivity[columnIndex] * (float(weight) / maximumWeight)
                else:
                    connectivity[columnIndex] += rowConnectivity[columnIndex]

        # Normalize by number of rows
        return [float(total) / numberOfRows for total in connectivity]

    def identifySubjectColumn(self, table, applyWeights=False, connectivityFloor=0, connectivityCeil=100):
        """
            Returns the index of the column with the highest connectivity
            among the columns whose connectivity lies strictly between
            connectivityFloor and connectivityCeil, or -1 when no column
            qualifies (including a table without rows or columns).
        """
        connectivity = self.getConnectivity(table, applyWeights)
        # Both bounds are exclusive; everything outside is scored 0.
        candidates = [score if connectivityFloor < score < connectivityCeil else 0 for score in connectivity]
        if not candidates:
            return -1
        # Return column with maximum connectivity
        bestScore = max(candidates)
        if bestScore == 0:
            return -1
        return candidates.index(bestScore)

    def getConnectivity(self, table, applyWeights=False):
        """
            Disambiguates the table with self.agdistis, looks the relations up
            through self.propertyTableSearch and returns the per-column
            connectivity.
        """
        entities = self.agdistis.disambiguateTable(table)
        relations = self.propertyTableSearch.findRelationsForTable(table, entities)
        return self.calculateConnectivity(relations, applyWeights)