class SupportIdentifier(object):
    """
        Pick the subject column of a table from entity support alone.

        The support of a column is the percentage of its rows for which
        the disambiguation step returned a non-empty entity.
    """

    def __init__(self):
        self.logger = Logger().getLogger(__name__)
        self.agdistis = AgdistisTableIdentifier()

    def calculateSupport(self, entities):
        """
            support -- percentage of entities to occur in a column
            to be considered a candidate for a subject column
            (columns without entities are not subject column per definition)

            entities -- list of rows, every row holding one sized value per
            column; an empty value means "no entity found for this cell"

            Returns a list with one percentage (0..100) per column of the
            first row, or an empty list when there are no rows at all.
        """
        numberOfRows = len(entities)
        if numberOfRows == 0:
            # Nothing to count; also avoids entities[0] and a division by zero
            return []

        numberOfColumns = len(entities[0])
        hits = [0] * numberOfColumns
        for entityRow in entities:
            for columnIndex, entity in enumerate(entityRow):
                if len(entity) > 0:
                    hits[columnIndex] += 1

        return [float(hitCount) / numberOfRows * 100 for hitCount in hits]

    def identifySubjectColumn(self, table, supportCeil, supportFloor):
        """
            supportCeil -- exclusive upper bound for the support of a candidate
            supportFloor -- exclusive lower bound for the support of a candidate

            Returns the index of the column with the highest support inside
            the (supportFloor, supportCeil) window, or -1 when the table
            yields no columns at all.

            NOTE(review): when no column falls inside the window every score
            is zeroed and index 0 is returned; this long-standing behaviour
            is kept as is -- confirm whether callers expect -1 here as well.
        """
        supports = self.getSupport(table)
        if len(supports) == 0:
            return -1

        # Columns outside the window are zeroed so they cannot win
        supports = [
            support if supportFloor < support < supportCeil else 0
            for support in supports
        ]
        return supports.index(max(supports))

    def getSupport(self, table):
        """
            Disambiguate the table and return the support of every column
        """
        entities = self.agdistis.disambiguateTable(table)
        return self.calculateSupport(entities)
def __init__(self):
    """
        Set up the logger for this module and the AGDISTIS table identifier
    """
    loggerFactory = Logger()
    self.logger = loggerFactory.getLogger(__name__)
    self.agdistis = AgdistisTableIdentifier()
def __init__(self):
    """
        Set up the logger for this module, the AGDISTIS table identifier
        and the property table search
    """
    loggerFactory = Logger()
    self.logger = loggerFactory.getLogger(__name__)
    self.agdistis = AgdistisTableIdentifier()
    self.propertyTableSearch = PropertyTableSearch()
class DistantSupervisionIdentifier(object):
    """
        Identify the subject column by letting every row vote for one column.

        For every row the column with the most relations to the other
        columns is selected; the column selected by the largest share of
        rows is returned as the subject column of the table.
    """

    def __init__(self):
        self.logger = Logger().getLogger(__name__)
        self.agdistis = AgdistisTableIdentifier()
        self.supportIdentifier = SupportIdentifier()
        # identifySubjectColumn() calls findRelationsForTable() on this attribute
        self.propertyTableSearch = PropertyTableSearch()

    def identifySubjectColumn(
        self,
        table,
        rowsToAnalyze=20,
        rowsFromCache=None,
        support=0,
        connectivity=0,
        threshold=0
    ):
        """
            rowsToAnalyze -- how many rows should be evaluated
            (accepted for compatibility, not used by this method)
            rowsFromCache -- can be used to reduce number of rows to be read from cache
            (accepted for compatibility, not used by this method)
            support -- minimal support (percentage) a column needs to be
            selected as the subject column of a row
            connectivity -- a number of relations subject column should have at least (absolute number)
            threshold -- percentage of subject columns identified inside the analyzed part of the table
            (divided by the total number of rows), i.e. 80% means that the same subject column identified for 80% of rows

            Returns the index of the column selected for the largest share
            of rows, provided this share is above zero and reaches
            threshold; returns None otherwise (also for a table without rows).

            Side effects: the execution*, queryTime and agdistisTime
            attributes are reset and refilled on every call. queryTime
            stays 0 because nothing in this method measures it.

            NOTE(review): the per-row selection was missing from this method
            (subjectColumns was read but never assigned, and the final
            filter was commented out as wrong). It was reconstructed from
            the parameter descriptions above -- confirm against the
            intended algorithm.
        """
        tableData = table.getData()
        tableId = table.id
        numberOfRows = len(tableData)

        self.logger.debug(tableId)

        self.executionStartTimePoint = 0
        self.executionEndTimePoint = 0
        self.executionTimeFull = 0
        self.executionTimePure = 0  # without querying and disambiguation
        self.queryTime = 0
        self.agdistisTime = 0

        self.executionStartTimePoint = time.time()

        subjectColumn = None
        if numberOfRows > 0:
            numberOfColumns = len(tableData[0])

            # identify entities
            # TODO: get the score from agdistis
            agdistisStartTimePoint = time.time()
            entities = self.agdistis.disambiguateTable(table)
            agdistisEndTimePoint = time.time()
            self.agdistisTime = agdistisEndTimePoint - agdistisStartTimePoint

            supports = self.supportIdentifier.calculateSupport(entities)
            relations = self.propertyTableSearch.findRelationsForTable(table, entities)

            # One vote per row for the column selected in that row
            subjectColumnScores = [0] * numberOfColumns
            for relation in relations:
                rowSubjectColumn = self._selectRowSubjectColumn(
                    relation, supports, support, connectivity
                )
                if rowSubjectColumn is not None:
                    subjectColumnScores[rowSubjectColumn] += 1

            # Normalize the votes to a percentage of all rows
            subjectColumnScores = [
                float(votes) / numberOfRows * 100
                for votes in subjectColumnScores
            ]

            # Best voted column wins; a column without any vote never
            # qualifies, so threshold=0 does not blindly return column 0
            bestScore = 0
            for columnIndex, columnScore in enumerate(subjectColumnScores):
                if columnScore >= threshold and columnScore > bestScore:
                    subjectColumn = columnIndex
                    bestScore = columnScore

        self.executionEndTimePoint = time.time()
        self.executionTimeFull = self.executionEndTimePoint - self.executionStartTimePoint
        self.executionTimePure = self.executionTimeFull - self.queryTime - self.agdistisTime

        return subjectColumn

    def _selectRowSubjectColumn(self, relation, supports, support, connectivity):
        """
            Return the column of one row with the most relations to other
            columns, or None if no column has a relation or passes the
            support and connectivity limits (ties go to the lowest index)

            relation -- mapping column index -> other column index ->
            collection of relations found between both cells
        """
        bestColumn = None
        bestCount = 0
        for columnIndex in sorted(relation):
            if supports[columnIndex] < support:
                continue
            linkedColumns = relation[columnIndex]
            relationCount = sum(
                len(linkedColumns[otherColumnIndex])
                for otherColumnIndex in linkedColumns
            )
            if relationCount < connectivity:
                continue
            if relationCount > bestCount:
                bestColumn = columnIndex
                bestCount = relationCount
        return bestColumn
class ConnectivityIdentifier(object):
    """
        Pick the subject column of a table from its connectivity, i.e. from
        how many other columns a column has relations to.
    """

    def __init__(self):
        self.logger = Logger().getLogger(__name__)
        self.agdistis = AgdistisTableIdentifier()
        self.propertyTableSearch = PropertyTableSearch()

    def calculateConnectivity(self, relations, applyWeights):
        """
            relations -- list with one entry per row; every entry maps
            column index -> other column index -> collection of relations
            applyWeights -- if true, the connectivity of a column inside a
            row is scaled by its number of relations relative to the
            column with the most relations in the same row

            Returns one value per column of the first row, averaged over
            all rows. Returns an empty list when there are no rows or the
            first row has no columns.
        """
        numberOfRows = len(relations)
        if numberOfRows == 0:
            # Avoids relations[0] and a division by zero below
            return []
        numberOfColumns = len(relations[0])
        if numberOfColumns == 0:
            # max() over the per-row weights would fail without columns
            return []

        connectivity = [0] * numberOfColumns

        for relation in relations:
            rowWeights = [0] * numberOfColumns
            rowConnectivity = [0] * numberOfColumns

            for columnIndex in relation:
                linkedColumns = relation[columnIndex]
                connectedColumns = 0
                relationCount = 0
                for otherColumnIndex in linkedColumns:
                    found = len(linkedColumns[otherColumnIndex])
                    if found > 0:
                        connectedColumns += 1
                        relationCount += found
                # Share of columns this column is connected to
                rowConnectivity[columnIndex] += float(connectedColumns) / numberOfColumns
                rowWeights[columnIndex] += relationCount

            maximumWeight = max(rowWeights)
            # A row without any relation has nothing to weight by
            useWeights = applyWeights and maximumWeight != 0
            for columnIndex, weight in enumerate(rowWeights):
                if useWeights:
                    connectivity[columnIndex] += (
                        rowConnectivity[columnIndex] * (float(weight) / maximumWeight)
                    )
                else:
                    connectivity[columnIndex] += rowConnectivity[columnIndex]

        # Normalize by number of rows
        return [float(columnTotal) / numberOfRows for columnTotal in connectivity]

    def identifySubjectColumn(self, table, applyWeights=False, connectivityFloor=0, connectivityCeil=100):
        """
            connectivityFloor -- exclusive lower bound for a candidate column
            connectivityCeil -- exclusive upper bound for a candidate column

            Returns the index of the column with the highest connectivity
            inside the (connectivityFloor, connectivityCeil) window, or -1
            when no column qualifies or the table yields no columns.
        """
        connectivity = self.getConnectivity(table, applyWeights)
        # Columns outside the window are zeroed so they cannot win
        connectivity = [
            value if connectivityFloor < value < connectivityCeil else 0
            for value in connectivity
        ]
        if len(connectivity) == 0:
            return -1

        # Return column with maximum connectivity
        bestConnectivity = max(connectivity)
        if bestConnectivity == 0:
            return -1
        return connectivity.index(bestConnectivity)

    def getConnectivity(self, table, applyWeights=False):
        """
            Disambiguate the table, look up the relations between its
            columns and return the connectivity of every column
        """
        entities = self.agdistis.disambiguateTable(table)
        relations = self.propertyTableSearch.findRelationsForTable(table, entities)
        return self.calculateConnectivity(relations, applyWeights)