def predicatesForKeyColumn(self):
    """Pair every entity of the key column with its predicates.

    Returns a list with one ``[cellContent, predicates]`` entry per cell
    of the key column (``self.columns[self.key]``).
    """
    contents = [sparql.cellContent(cell) for cell in self.columns[self.key]]
    return [[content, sparql.predicates(content)] for content in contents]
def generate_data_for_columns(self, sub_column_name, obj_column_name): """Generate data for a pair of columns. Accumulate predicates already present in SPARQL endpoint and select candidates from this set. Apply those candidate predicates to all the rows where they were not already present. Return a pandas Table with the resulting triples and a) the frequency of the applied predicate b) the string proximity of the predicate and the object cell's column name c) and whether the subject cell is element of a key column.""" # Row-wise pairs of cells cell_pairs = list(zip(self[sub_column_name], self[obj_column_name])) # List of existing predicates for every pair of cells existing_row_predicates = [ sparql.predicates(sub, obj) for sub, obj in cell_pairs ] # Count occurrence of every predicate in every row counter = Counter([ predicate for row in existing_row_predicates for predicate in row ]) candidates = dict() for key, value in counter.items(): candidates[key] = { 'Frequency': value / len(self), # a) 'nameMatch': self.name_match(key, obj_column_name) # b) } row_candidates = [ candidates.keys() - row for row in existing_row_predicates ] is_key = self.is_key(sub_column_name) # c) # subjectColumn and objectColumn included for later testing data = DataFrame(columns=[ 'Subject', 'Predicate', 'Object', 'Frequency', 'isKey', 'nameMatch', 'subjectColumn', 'objectColumn' ]) for i, (sub, obj) in enumerate(cell_pairs): if not sparql.is_resource(sub): # skip subject literals continue for predicate in row_candidates[i]: data.loc[len(data)] = [ sub, predicate, obj, candidates[predicate]['Frequency'], is_key, candidates[predicate]['nameMatch'], self._str_column_name(sub_column_name), self._str_column_name(obj_column_name) ] return data
def generate_data_for_columns(self, sub_column_name, obj_column_name):
    """Generate data for a pair of columns.

    Accumulate predicates already present in SPARQL endpoint and select
    candidates from this set. Apply those candidate predicates to all the
    rows where they were not already present. Return a pandas Table with
    the resulting triples and
    a) the frequency of the applied predicate
    b) the string proximity of the predicate and the object cell's column name
    c) and whether the subject cell is element of a key column.
    """
    # Row-wise pairs of cells
    cell_pairs = list(zip(self[sub_column_name], self[obj_column_name]))
    # List of existing predicates for every pair of cells
    existing_row_predicates = [sparql.predicates(sub, obj)
                               for sub, obj in cell_pairs]
    # Count occurrence of every predicate in every row
    counter = Counter(predicate
                      for row in existing_row_predicates
                      for predicate in row)
    candidates = {
        predicate: {
            'Frequency': count / len(self),                          # a)
            'nameMatch': self.name_match(predicate, obj_column_name) # b)
        }
        for predicate, count in counter.items()
    }
    # Per-row candidates: every known predicate not already on that row.
    row_candidates = [candidates.keys() - row
                      for row in existing_row_predicates]
    is_key = self.is_key(sub_column_name)  # c)
    # Loop-invariant column labels, hoisted out of the row loop.
    sub_col = self._str_column_name(sub_column_name)
    obj_col = self._str_column_name(obj_column_name)
    # PERF FIX: the original appended one row at a time with
    # `data.loc[len(data)] = [...]`, which reallocates the frame on every
    # append (O(n^2)). Collect rows in a list and build the DataFrame
    # once. (Column dtypes are now inferred from the data instead of
    # being uniformly object — confirm downstream consumers are fine.)
    rows = []
    for i, (sub, obj) in enumerate(cell_pairs):
        if not sparql.is_resource(sub):  # skip subject literals
            continue
        for predicate in row_candidates[i]:
            rows.append([sub, predicate, obj,
                         candidates[predicate]['Frequency'], is_key,
                         candidates[predicate]['nameMatch'],
                         sub_col, obj_col])
    # subjectColumn and objectColumn included for later testing
    return DataFrame(rows, columns=['Subject', 'Predicate', 'Object',
                                    'Frequency', 'isKey', 'nameMatch',
                                    'subjectColumn', 'objectColumn'])
def predicates_for_columns(self, sub_column_name, obj_column_name,
                           relative=True):
    """Return all predicates with subColumn's cells as subjects and
    objColumn's cells as objects.

    Set 'relative' to True if you want relative frequencies.
    """
    counts = defaultdict(int)
    pairs = zip(self[sub_column_name], self[obj_column_name])
    for subject, obj in pairs:
        # Only resource subjects with a non-empty object can be linked.
        if not obj or not sparql.is_resource(subject):
            continue
        for predicate in sparql.predicates(subject, obj):
            counts[predicate] += 1
    if relative:
        total = len(self[sub_column_name])
        return {pred: round(count / total, 2)
                for pred, count in counts.items()}
    return dict(counts)
def predicatesForColumns(self, subColumn, objColumn, relative=True):
    """Return all predicates with subColumn's cells as subjects and
    objColumn's cells as objects.

    Set 'relative' to True if you want relative occurrences (counts
    divided by the number of rows, rounded to two decimals).
    """
    subData = self.column(subColumn)
    objData = self.column(objColumn)
    # IDIOM FIX: walk both columns in lockstep with zip instead of
    # indexing via range(len(...)), and count with defaultdict instead
    # of a hand-rolled if/else — consistent with the sibling
    # predicates_for_columns() implementation.
    predicates = defaultdict(int)
    for subCell, objCell in zip(subData, objData):
        subContent = sparql.cellContent(subCell)
        objContent = sparql.cellContent(objCell)
        # Skip rows with an empty object or a literal (non-resource) subject.
        if not (objContent and sparql.isResource(subContent)):
            continue
        for predicate in sparql.predicates(subContent, objContent):
            predicates[predicate] += 1
    if relative:
        for p in predicates:
            predicates[p] = round(predicates[p] / len(subData), 2)
    # Return a plain dict, matching the original interface.
    return dict(predicates)
def generateRDFs(self, columns=None, threshold=0.0, path=None):
    """Save RDF statements generated from table.

    For every ordered pair of columns, look up the predicates already
    linking the paired cells via SPARQL, keep those whose relative row
    frequency exceeds ``threshold``, and emit them for the rows where
    they are not yet present.

    Args:
        columns: iterable of column names to consider; defaults to all
            of ``self.columnNames``.
        threshold: minimum relative frequency (0..1) a predicate needs
            before it is applied to the remaining rows.
        path: CSV output path (export currently disabled for the demo).

    Returns:
        When no ``path`` is given, a list of rows
        ``[subject, '<predicate>', object, objColumnName, certainty,
        subIsKey, objIsKey, rowCount]`` (empty list if nothing was
        generated); ``None`` when ``path`` is given.

    Raises:
        Exception: if a requested column resolves to no content.
    """
    data = []
    keyIndex = self.key
    # Resolve the key column's name.
    # BUG FIX: the original guard was `keyIndex > 0 and
    # keyIndex <= len(self.columnNames)`, which both rejected a key
    # column at index 0 and accepted the out-of-range index
    # len(self.columnNames).
    if keyIndex is not None and 0 <= keyIndex < len(self.columnNames):
        keyColumnName = self.columnNames[keyIndex]
    else:
        keyColumnName = None
    for subColumnName, objColumnName in itertools.permutations(
            columns if columns else self.columnNames, 2):
        subColumn = self.column(subColumnName, content=True)
        objColumn = self.column(objColumnName, content=True)
        if len(subColumn) == 0 or len(objColumn) == 0:
            raise Exception("Table failed because of defective row formattings")
        # Predicates already present for each row's (subject, object) pair.
        existingPredicates = [sparql.predicates(subColumn[i], objColumn[i])
                              for i in range(len(subColumn))]
        absCount = defaultdict(int)
        for row in existingPredicates:
            for predicate in row:
                absCount[predicate] += 1
        if not absCount:
            continue
        # Relative frequency per predicate, filtered by threshold.
        relCount = {key: value / len(existingPredicates)
                    for key, value in absCount.items()
                    if value / len(existingPredicates) > threshold}
        predicates = set(relCount)
        # Candidate predicates per row: frequent ones not already present.
        # (Typo fix: local was named `generatedPreciates`.)
        generatedPredicates = [list(predicates - set(row))
                               for row in existingPredicates]
        # Additional info for analyzing the generated RDFs; these are
        # loop-invariant, so compute them once per column pair.
        subIsKey = (keyColumnName == subColumnName)
        objIsKey = (keyColumnName == objColumnName)
        rowCount = len(subColumn)
        for i, row in enumerate(generatedPredicates):
            for predicate in row:
                data.append([subColumn[i], predicate, objColumn[i],
                             objColumnName, relCount[predicate],
                             subIsKey, objIsKey, rowCount])
    # TODO: Bring back after demo
    # from pandas import DataFrame
    # df = DataFrame(data, columns=['subject', 'predicate', 'object', 'certainty'])
    # df['table'] = repr(self)
    # df['page'] = self.pageTitle
    # print("Generated %d statements with avg. certainty of %.0f%%."
    #       % (len(df.index), df['certainty'].mean() * 100))
    if path:
        # CSV export disabled for the demo:
        # df.to_csv(path, index=False)
        return None
    # TODO: Remove after demo.
    # NOTE(review): removed dead code that built a tab-aligned text table
    # (`s`, `lens`, `fmt`, `table`) but was never used before the return;
    # the original's trailing `else: return []` is subsumed here, since an
    # empty `data` yields an empty list.
    # Predicates are wrapped in angle brackets for display.
    return [[row[0], '<' + row[1] + '>', row[2], row[3], row[4],
             row[5], row[6], row[7]] for row in data]