class MannheimAtomizer(object):
    """
        Atomizes a table based on its subject column into N tables,
        where N is (number of columns - 1).
    """
    def __init__(self):
        self.subjectColumnIdentifier = SimpleIdentifier()
        self.logger = Logger().getLogger(__name__)

    def atomizeTable(self, table):
        try:
            subjectColumnNumber = self.subjectColumnIdentifier.identifySubjectColumn(table)
        except SubjectColumnNotFoundError:
            self.logger.error("Subject column not found", exc_info=True)
            subjectColumnNumber = 0
        relations = table.getTable()
        atomicTables = []
        subjectCol = relations[subjectColumnNumber]
        # pair the subject column with every other column
        for index in range(len(relations)):
            if index != subjectColumnNumber:
                otherCol = relations[index]
                atomicTable = numpy.array([subjectCol, otherCol])
                atomicTables.append(atomicTable)
        if len(atomicTables) < 1:
            # log before raising, otherwise the messages are never emitted
            self.logger.error("Table could not be atomized!")
            self.logger.error("%s" % (relations,))
            raise CouldNotAtomizeError("Table could not be atomized!")
        return atomicTables
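# --- A minimal standalone sketch of the atomization step above (no repository
# dependencies): pair the subject column with every other column, producing
# N-1 binary tables. The sample data is illustrative only.
import numpy

relations = [["Berlin", "Paris"],    # column 0: subject column
             ["Germany", "France"],  # column 1
             ["3.6M", "2.1M"]]       # column 2
subjectColumnNumber = 0
atomicTables = [numpy.array([relations[subjectColumnNumber], relations[index]])
                for index in range(len(relations))
                if index != subjectColumnNumber]
print len(atomicTables)  # 2 atomic tables, each of shape (2, 2)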
class SimpleCachePropertyMapper(object):
    """
        Performs quite poorly: the current precision is 47.6%,
        while the achievable maximum is 71%.
    """
    def __init__(self):
        self.logger = Logger().getLogger(__name__)
        self.dlIdentifier = DistantSupervisionIdentifier()

    def mapProperties(self, table):
        tableData = table.getData()
        tableHeader = table.getHeader()
        tableId = table.id
        cacheFile = os.path.join(cacheFolder, tableId + ".relations.cache")
        subjectColumn = self.dlIdentifier.identifySubjectColumn(table)
        self.logger.debug("Identifying properties for a table %s" % (tableId,))
        if os.path.exists(cacheFile):
            relations = pickle.load(open(cacheFile, 'rb'))
        else:
            raise RelationsDataStructureNotFound("Could not find relations structure for %s" % (str(tableId),))

        self.executionTimeFull = 0
        self.startTime = time.time()

        # init per-column property lists
        nonSubjectColumns = range(0, len(relations[0]))
        nonSubjectColumns.remove(subjectColumn)
        properties = collections.defaultdict(dict)
        for nonSubjectColumn in nonSubjectColumns:
            properties[nonSubjectColumn] = []

        # aggregate all properties
        for row in relations:
            for nonSubjectColumn in nonSubjectColumns:
                # properties for the atomic table with h_i, i = nonSubjectColumn
                try:
                    properties[nonSubjectColumn].append(row[subjectColumn][nonSubjectColumn])
                except (KeyError, IndexError, TypeError):
                    # the relations structure may be sparse; skip missing cells
                    pass

        # flatten the per-column lists and take the most frequent property
        topProperties = []
        for nonSubjectColumn in nonSubjectColumns:
            properties[nonSubjectColumn] = [item for sublist in properties[nonSubjectColumn] for item in sublist]
            try:
                topProperty = Counter(properties[nonSubjectColumn]).most_common(1)[0][0]
                topProperties.append((topProperty, nonSubjectColumn))
            except IndexError:
                self.logger.debug("No property identified for column %s" % (nonSubjectColumn,))

        self.endTime = time.time()
        self.executionTimeFull = self.endTime - self.startTime

        # check whether the seed properties contain the properties we are trying to find
        self.seedListContains = 0
        for _property in table.properties:
            if _property['uri'] in properties[_property['columnIndex']]:
                self.seedListContains += 1
        return topProperties
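# --- Standalone sketch of the aggregation performed above: per-column
# candidate property lists are flattened and the most frequent URI wins.
# Sample values are illustrative, not from the cache.
from collections import Counter

candidates = [["dbo:country", "dbo:locatedIn"],  # properties found for row 1
              ["dbo:country"],                   # row 2
              ["dbo:country", "dbo:capitalOf"]]  # row 3
flat = [item for sublist in candidates for item in sublist]
print Counter(flat).most_common(1)[0]  # ('dbo:country', 3)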
class SimplePropertyMapper(object):
    def __init__(self):
        self.logger = Logger().getLogger(__name__)
        self.agdistisIdentifier = AgdistisIdentifier()
        self.dbpediaSparql = SPARQLWrapper(dbpediaSparqlEndpointUri)
        self.dbpediaSparql.setReturnFormat(JSON)
        self.propertySearch = PropertySearchDbpediaSparql()

    def parseResults(self, results, variableName="property"):
        """ TODO: refactor into a separate class """
        properties = []
        for result in results:
            properties.append(result[variableName]["value"])
        return properties

    def getClassForEntity(self, entity):
        """ TODO: refactor into a separate class """
        self.dbpediaSparql.setQuery(u"""
            SELECT DISTINCT ?class
            WHERE { <%s> a ?class . }
        """ % (entity,))
        results = self.dbpediaSparql.query().convert()["results"]["bindings"]
        return self.parseResults(results, variableName="class")

    def getEntities(self, tableId):
        entitiesCacheFile = os.path.join(cacheFolder, tableId + ".entities.cache")
        if os.path.exists(entitiesCacheFile):
            return pickle.load(open(entitiesCacheFile, "rb"))
        else:
            raise EntitiesDataStructureNotFound(
                "Entities data structure not available. Did you run subject column identification?")

    def getEntitiesWithClasses(self, tableId):
        entities = self.getEntities(tableId)
        entitiesWithClassesCache = os.path.join(cacheFolder, tableId + ".entities.with.classes.cache")
        if os.path.exists(entitiesWithClassesCache):
            entities = pickle.load(open(entitiesWithClassesCache, "rb"))
        else:
            # annotate every disambiguated entity with its DBpedia classes
            for rowIndex, entityRow in enumerate(entities):
                for columnIndex, entity in enumerate(entityRow):
                    for entityIndex, _entity in enumerate(entity):
                        entity[entityIndex] = (self.getClassForEntity(_entity), _entity)
            pickle.dump(entities, open(entitiesWithClassesCache, "wb"))
        return entities

    def getClasses(self, entities, numberOfColumns):
        # one independent list per column ([[]] * n would alias a single list)
        classes = [[] for _ in range(numberOfColumns)]
        for rowIndex, entityRow in enumerate(entities):
            for columnIndex, entity in enumerate(entityRow):
                for entityIndex, _entity in enumerate(entity):
                    (_class, entityUrl) = _entity
                    try:
                        classes[columnIndex].append(_class)
                    except BaseException as e:
                        print "%s" % (str(e),)
        return classes

    def getMainClassForSubjectColumn(self, classes, subjectColumn):
        classesSubjectColumn = [item for sublist in classes[subjectColumn] for item in sublist]
        try:
            classCount = len(classesSubjectColumn)
            (mainClass, mainClassCount) = Counter(classesSubjectColumn).most_common(1)[0]
            mainClassScore = float(mainClassCount) / classCount * 100
        except IndexError:
            self.logger.debug("Main class could not be identified")
            mainClass = ""
        return mainClass

    def filterNonMainClassEntities(self, entities, mainClass, subjectColumn):
        # discard subject-column entities that do not belong to the main class
        for rowIndex, entityRow in enumerate(entities):
            for columnIndex, entity in enumerate(entityRow):
                if columnIndex != subjectColumn:
                    continue
                for entityIndex, _entity in enumerate(entity):
                    (_class, entityUrl) = _entity
                    if mainClass not in _class:
                        entities[rowIndex][columnIndex][entityIndex] = (None, None)
        return entities

    def findProperties(self, tableId, tableData, entities, subjectColumn, nonSubjectColumns):
        propertyCache = os.path.join(cacheFolder, tableId + ".property.star.cache")
        properties = collections.defaultdict(dict)
        if os.path.exists(propertyCache):
            properties = pickle.load(open(propertyCache, "rb"))
        else:
            for rowIndex, entityRow in enumerate(entities):
                for columnIndex, entity in enumerate(entityRow):
                    if columnIndex != subjectColumn:
                        continue
                    if len(entity) <= 0:
                        continue
                    for entityIndex, _entity in enumerate(entity):
                        (_class, entityUrl) = _entity
                        if entityUrl is not None:
                            for nonSubjectColumn in nonSubjectColumns:
                                cellValue = tableData[rowIndex][nonSubjectColumn]
                                properties[rowIndex][nonSubjectColumn] = self.propertySearch.uriLiteralSearch(
                                    entityUrl, cellValue)
            pickle.dump(properties, open(propertyCache, "wb"))
        return properties

    def aggregateProperties(self, properties, nonSubjectColumns):
        propertiesAggregate = collections.defaultdict(dict)
        for nonSubjectColumn in nonSubjectColumns:
            propertiesAggregate[nonSubjectColumn] = []
        for row in properties:
            for nonSubjectColumn in nonSubjectColumns:
                propertiesAggregate[nonSubjectColumn].append(properties[row][nonSubjectColumn])
        # flatten the per-column lists of result lists
        for nonSubjectColumn in nonSubjectColumns:
            propertiesAggregate[nonSubjectColumn] = [
                item for sublist in propertiesAggregate[nonSubjectColumn] for item in sublist]
        return propertiesAggregate

    def getTopProperties(self, propertiesAggregate, nonSubjectColumns, threshold):
        topProperties = []
        for nonSubjectColumn in nonSubjectColumns:
            try:
                (topProperty, support) = Counter(propertiesAggregate[nonSubjectColumn]).most_common(1)[0]
                # support in percent
                support = (float(support) / len(propertiesAggregate[nonSubjectColumn])) * 100
                if support > threshold:
                    topProperties.append({"uri": topProperty, "columnIndex": nonSubjectColumn})
            except IndexError:
                self.logger.debug("No property identified for column %s" % (nonSubjectColumn,))
        return topProperties

    def calculateScores(self, propertiesAggregate, nonSubjectColumns):
        scores = collections.defaultdict(dict)
        for nonSubjectColumn in nonSubjectColumns:
            scores[nonSubjectColumn] = Counter(propertiesAggregate[nonSubjectColumn])
        return scores

    def getScores(self, table, rowsToDisambiguate=20, threshold=10, support=0, connectivity=0):
        tableData = table.getData()
        tableHeader = table.getHeader()
        tableId = table.id
        numberOfRows = len(tableData)
        numberOfColumns = len(tableData[0])
        subjectColumn = table.subjectColumn
        if subjectColumn is None or subjectColumn == -1:
            return []
        nonSubjectColumns = range(0, len(tableData[0]))
        nonSubjectColumns.remove(subjectColumn)
        self.logger.debug("Identifying properties for a table %s" % (tableId,))
        entities = self.getEntitiesWithClasses(tableId)
        classes = self.getClasses(entities, numberOfColumns)
        mainClass = self.getMainClassForSubjectColumn(classes, subjectColumn)
        entities = self.filterNonMainClassEntities(entities, mainClass, subjectColumn)
        properties = self.findProperties(tableId, tableData, entities, subjectColumn, nonSubjectColumns)
        propertiesAggregate = self.aggregateProperties(properties, nonSubjectColumns)
        propertyScores = self.calculateScores(propertiesAggregate, nonSubjectColumns)
        return propertyScores

    def mapProperties(self, table, rowsToDisambiguate=20, threshold=10, support=0, connectivity=0):
        # NOTE: duplicates the getScores() preamble; a candidate for refactoring
        tableData = table.getData()
        tableHeader = table.getHeader()
        tableId = table.id
        numberOfRows = len(tableData)
        numberOfColumns = len(tableData[0])
        subjectColumn = table.subjectColumn
        if subjectColumn is None or subjectColumn == -1:
            return []
        nonSubjectColumns = range(0, len(tableData[0]))
        nonSubjectColumns.remove(subjectColumn)
        self.logger.debug("Identifying properties for a table %s" % (tableId,))
        entities = self.getEntitiesWithClasses(tableId)
        classes = self.getClasses(entities, numberOfColumns)
        mainClass = self.getMainClassForSubjectColumn(classes, subjectColumn)
        entities = self.filterNonMainClassEntities(entities, mainClass, subjectColumn)
        properties = self.findProperties(tableId, tableData, entities, subjectColumn, nonSubjectColumns)
        propertiesAggregate = self.aggregateProperties(properties, nonSubjectColumns)
        topProperties = self.getTopProperties(propertiesAggregate, nonSubjectColumns, threshold)
        return topProperties
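# --- Standalone sketch of the thresholding performed by getTopProperties():
# the winning property must account for more than `threshold` percent of all
# candidate hits for its column. Values below are illustrative only.
from collections import Counter

aggregate = ["http://dbpedia.org/ontology/populationTotal"] * 6 \
          + ["http://dbpedia.org/ontology/areaTotal"] * 4
(topProperty, support) = Counter(aggregate).most_common(1)[0]
support = (float(support) / len(aggregate)) * 100  # 60.0 percent
threshold = 10
if support > threshold:
    print {"uri": topProperty, "columnIndex": 1}  # columnIndex 1 is hypothetical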
class DistantSupervisionIdentifier(object):
    def __init__(self):
        self.logger = Logger().getLogger(__name__)
        self.agdistis = AgdistisTableIdentifier()
        self.supportIdentifier = SupportIdentifier()
        # TODO: self.propertyTableSearch is used below but never initialized here

    def identifySubjectColumn(self, table, rowsToAnalyze=20, rowsFromCache=None,
                              support=0, connectivity=0, threshold=0):
        """
            rowsToAnalyze -- how many rows should be evaluated
            rowsFromCache -- can be used to reduce the number of rows read from the cache
            connectivity  -- the minimum number of relations a subject column should have (absolute number)
            threshold     -- percentage of rows (out of all analyzed rows) for which the same
                             subject column must be identified, e.g. 80 means the same subject
                             column was identified for 80% of the rows
        """
        tableData = table.getData()
        tableHeader = table.getHeader()
        tableId = table.id
        numberOfRows = len(tableData)
        numberOfColumns = len(tableData[0])
        self.logger.debug(tableId)

        self.executionStartTimePoint = 0
        self.executionEndTimePoint = 0
        self.executionTimeFull = 0
        self.executionTimePure = 0  # without querying and disambiguation
        self.queryTime = 0
        self.agdistisTime = 0
        self.executionStartTimePoint = time.time()

        # identify entities
        # TODO: get the score from agdistis
        agdistisStartTimePoint = time.time()
        entities = self.agdistis.disambiguateTable(table)
        agdistisEndTimePoint = time.time()
        self.agdistisTime = agdistisEndTimePoint - agdistisStartTimePoint

        # TODO: rename columnScores to supports
        columnScores = self.supportIdentifier.calculateSupport(entities)
        # The support-based approach ends here: refactor into a separate class.
        relations = self.propertyTableSearch.findRelationsForTable(table, entities)
        # Ideas for a pure connectivity approach:
        # calculate the connectivity for all rows, then take the average.
        # What we have is a boolean classifier; a linear combination is better:
        # try different weights a*connectivity + (1-a)*support (equivalent to
        # a*connectivity + b*support) and evaluate with ten-fold cross
        # validation (or its inverse).

        # TODO: subjectColumns (the per-row subject column picks) is expected from
        # the support/connectivity step above but is not produced by the current code.
        subjectColumnScores = [0] * numberOfColumns
        for subjectColumn in subjectColumns:
            if subjectColumn is not None:
                subjectColumnScores[subjectColumn] += 1
        # normalize to the percentage of rows
        for columnIndex, subjectColumnScore in enumerate(subjectColumnScores):
            subjectColumnScores[columnIndex] = float(subjectColumnScore) / numberOfRows * 100
        # FIXME: thresholding on a percentage may select several columns; only the first is returned
        subjectColumn = [columnIndex for columnIndex, columnScore in enumerate(subjectColumnScores)
                         if columnScore >= threshold]

        self.executionEndTimePoint = time.time()
        self.executionTimeFull = self.executionEndTimePoint - self.executionStartTimePoint
        self.executionTimePure = self.executionTimeFull - self.queryTime - self.agdistisTime

        if len(subjectColumn) <= 0:
            return None
        else:
            return subjectColumn[0]
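# --- Standalone sketch of the linear combination discussed in the comments of
# identifySubjectColumn(): score each column as a*connectivity + (1-a)*support
# and pick the argmax. combineScores and all numbers are hypothetical and not
# part of the repository; both inputs are assumed to be on the same 0..1 scale.
def combineScores(supports, connectivities, a=0.5):
    scores = [a * connectivity + (1 - a) * support
              for support, connectivity in zip(supports, connectivities)]
    return scores.index(max(scores))

supports = [0.8, 0.4, 0.1]        # fraction of rows voting for each column
connectivities = [0.2, 0.9, 0.1]  # normalized relation counts per column
print combineScores(supports, connectivities, a=0.5)  # -> 1 (wins on connectivity)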
class PropertySearchDbpediaSparql(object):
    """
        Takes two entities (a URI and a URI or literal) and returns
        all properties which possibly connect them.
    """
    def __init__(self):
        self.logger = Logger().getLogger(__name__)
        self.dbpediaSparql = SPARQLWrapper(dbpediaSparqlEndpointUri)
        self.dbpediaSparql.setReturnFormat(JSON)

    def search(self, s, o):
        if o.startswith('http'):
            return self.uriUriSimple(s, o)
        else:
            return self.uriLiteralSimple(s, o)

    def uriLiteralSearch(self, s, o):
        properties = []
        try:
            o = o.decode('utf-8')
        except UnicodeDecodeError as e:
            self.logger.debug("Could not decode o for uriLiteralSearch")
            self.logger.debug(str(e))
            o = ""
        properties.append(self.uriLiteralSimple(s, o))
        properties.append(self.uriLiteralRegex(s, o))
        properties.append(self.uriLiteralRegexReverse(s, o))
        # flatten and deduplicate
        properties = [item for sublist in properties for item in sublist]
        return list(set(properties))

    def uriUriSearch(self, s, o):
        properties = []
        properties.append(self.uriUriSimple(s, o))
        return properties

    def uriUriSimple(self, s, o):
        self.dbpediaSparql.setQuery(u"""
            SELECT DISTINCT ?property
            WHERE { <%s> ?property <%s> . }
        """ % (s, o,))
        self.queryDebugMessage("uriUriSimple", s, o, self.dbpediaSparql.queryString)
        results = self.dbpediaSparql.query().convert()['results']['bindings']
        return self.parseResults(results)

    def uriLiteralSimple(self, s, o):
        o = self.clearLiteral(o)
        if o == "" or o is None:
            return []
        self.dbpediaSparql.setQuery(u"""
            SELECT DISTINCT ?property
            WHERE { <%s> ?property "%s"@en . }
        """ % (s, o,))
        self.queryDebugMessage("uriLiteralSimple", s, o, self.dbpediaSparql.queryString)
        results = self.dbpediaSparql.query().convert()['results']['bindings']
        return self.parseResults(results)

    def uriLiteralRegex(self, s, o):
        o = self.clearLiteral(o)
        if o == "" or o is None:
            return []
        self.dbpediaSparql.setQuery(u"""
            SELECT DISTINCT ?property
            WHERE {
                <%s> ?property ?o .
                FILTER regex(?o, ".*%s.*", "i")
            }
        """ % (s, o,))
        self.queryDebugMessage("uriLiteralRegex", s, o, self.dbpediaSparql.queryString)
        results = self.dbpediaSparql.query().convert()['results']['bindings']
        return self.parseResults(results)

    def uriLiteralRegexReverse(self, s, o):
        o = self.clearLiteral(o)
        if o == "" or o is None:
            return []
        self.dbpediaSparql.setQuery(u"""
            SELECT DISTINCT ?property
            WHERE {
                ?o ?property <%s> .
                FILTER regex(?o, ".*%s.*", "i")
            }
        """ % (s, o,))
        self.queryDebugMessage("uriLiteralRegexReverse", s, o, self.dbpediaSparql.queryString)
        results = self.dbpediaSparql.query().convert()['results']['bindings']
        return self.parseResults(results)

    def uriLiteralPathRegex(self, s, o):
        """
            Given the small diameter of the graph, searching for arbitrary paths
            leads to noise. The F-measure will most likely drop if this is used
            together with the simple property search.
        """
        o = self.clearLiteral(o)
        if o == "" or o is None:
            return []
        self.dbpediaSparql.setQuery(u"""
            SELECT DISTINCT ?property
            WHERE {
                <%s> ?property ?obj .
                ?obj ?p ?o .
                FILTER regex(?o, "%s", "i")
            }
        """ % (s, o,))
        self.queryDebugMessage("uriLiteralPathRegex", s, o, self.dbpediaSparql.queryString)
        results = self.dbpediaSparql.query().convert()['results']['bindings']
        return self.parseResults(results)

    def literalUriPathRegexReverse(self, s, o):
        o = self.clearLiteral(o)
        if o == "" or o is None:
            return []
        self.dbpediaSparql.setQuery(u"""
            SELECT DISTINCT ?property
            WHERE {
                ?obj ?property <%s> .
                ?obj ?p ?o .
                FILTER regex(?o, "%s", "i")
            }
        """ % (s, o,))
        self.queryDebugMessage("literalUriPathRegexReverse", s, o, self.dbpediaSparql.queryString)
        results = self.dbpediaSparql.query().convert()['results']['bindings']
        return self.parseResults(results)

    def clearLiteral(self, string):
        # strip characters that would break the SPARQL literal or the regex filter
        string = re.sub('[{}|*?()\[\]!"-]', '', string)
        string = re.sub(' ', '', string)  # remove all spaces
        string = string.strip()
        return string

    def queryDebugMessage(self, functionName, s, o, queryString):
        self.logger.debug("%s ?s: %s ?o: %s" % (functionName, s, o,))
        self.logger.debug("SPARQL query: %s" % (queryString,))

    def parseResults(self, results, variableName="property"):
        properties = []
        for result in results:
            properties.append(result[variableName]['value'])
        return properties
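# --- Usage sketch, assuming a reachable DBpedia SPARQL endpoint is configured
# via dbpediaSparqlEndpointUri (e.g. "http://dbpedia.org/sparql"); the results
# depend on the live endpoint and the resources below are illustrative.
searcher = PropertySearchDbpediaSparql()
# URI/URI: properties connecting Berlin and Germany
print searcher.search("http://dbpedia.org/resource/Berlin",
                      "http://dbpedia.org/resource/Germany")
# URI/literal: exact and regex-based matches against Berlin's literal values
print searcher.uriLiteralSearch("http://dbpedia.org/resource/Berlin", "Berlin")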