Example n. 1
0
    def predicatesForKeyColumn(self):
        """Return a list of [cell, predicates] pairs, one per entity in
        the key column, where predicates come from the SPARQL endpoint."""
        # Resolve each key-column cell to its content, then look up its predicates.
        contents = (sparql.cellContent(cell) for cell in self.columns[self.key])
        return [[content, sparql.predicates(content)] for content in contents]
Example n. 2
0
    def generate_data_for_columns(self, sub_column_name, obj_column_name):
        """Generate data for a pair of columns.
        Accumulate predicates already present in SPARQL endpoint and select candidates from this set.
        Apply those candidate predicates to all the rows where they were not already present.
        Return a pandas DataFrame with the resulting triples and
            a) the frequency of the applied predicate
            b) the string proximity of the predicate and the object cell's column name
            c) and whether the subject cell is element of a key column."""

        # Row-wise pairs of cells
        cell_pairs = list(zip(self[sub_column_name], self[obj_column_name]))

        # List of existing predicates for every pair of cells
        existing_row_predicates = [
            sparql.predicates(sub, obj) for sub, obj in cell_pairs
        ]

        # Count in how many rows each predicate already occurs
        counter = Counter(
            predicate for row in existing_row_predicates for predicate in row
        )

        candidates = {
            key: {
                'Frequency': value / len(self),  # a)
                'nameMatch': self.name_match(key, obj_column_name)  # b)
            }
            for key, value in counter.items()
        }

        # Per row: candidate predicates not already present (set difference
        # between the dict-keys view and the row's predicate collection).
        row_candidates = [
            candidates.keys() - row for row in existing_row_predicates
        ]

        is_key = self.is_key(sub_column_name)  # c)

        # Hoist the loop-invariant column-name conversions out of the row loop.
        sub_col = self._str_column_name(sub_column_name)
        obj_col = self._str_column_name(obj_column_name)

        # Collect all rows first and build the DataFrame once: appending via
        # ``data.loc[len(data)]`` copies/re-allocates the frame per row (O(n^2)).
        rows = []
        for i, (sub, obj) in enumerate(cell_pairs):
            if not sparql.is_resource(sub):  # skip subject literals
                continue

            for predicate in row_candidates[i]:
                rows.append([
                    sub, predicate, obj, candidates[predicate]['Frequency'],
                    is_key, candidates[predicate]['nameMatch'],
                    sub_col, obj_col
                ])

        # subjectColumn and objectColumn included for later testing
        return DataFrame(rows, columns=[
            'Subject', 'Predicate', 'Object', 'Frequency', 'isKey',
            'nameMatch', 'subjectColumn', 'objectColumn'
        ])
Example n. 3
0
    def generate_data_for_columns(self, sub_column_name, obj_column_name):
        """Generate data for a pair of columns.
        Accumulate predicates already present in SPARQL endpoint and select candidates from this set.
        Apply those candidate predicates to all the rows where they were not already present.
        Return a pandas DataFrame with the resulting triples and
            a) the frequency of the applied predicate
            b) the string proximity of the predicate and the object cell's column name
            c) and whether the subject cell is element of a key column."""

        # Row-wise pairs of cells
        cell_pairs = list(zip(self[sub_column_name], self[obj_column_name]))

        # List of existing predicates for every pair of cells
        existing_row_predicates = [sparql.predicates(sub, obj) for sub, obj in cell_pairs]

        # Count occurrence of every predicate in every row
        counter = Counter(predicate
                          for row in existing_row_predicates
                          for predicate in row)

        candidates = dict()
        for key, value in counter.items():
            candidates[key] = {
                'Frequency': value / len(self),  # a)
                'nameMatch': self.name_match(key, obj_column_name)  # b)
            }

        # Candidates still missing in each row (dict-keys view set difference).
        row_candidates = [candidates.keys() - row for row in existing_row_predicates]

        is_key = self.is_key(sub_column_name)  # c)

        # Loop-invariant conversions hoisted out of the per-row loop.
        sub_col_str = self._str_column_name(sub_column_name)
        obj_col_str = self._str_column_name(obj_column_name)

        # Accumulate rows in a plain list and construct the DataFrame once at
        # the end; growing a frame with ``data.loc[len(data)]`` copies it on
        # every insert and is quadratic in the number of generated triples.
        triples = []
        for i, (sub, obj) in enumerate(cell_pairs):
            if not sparql.is_resource(sub):  # skip subject literals
                continue

            for predicate in row_candidates[i]:
                triples.append([sub, predicate, obj,
                                candidates[predicate]['Frequency'],
                                is_key,
                                candidates[predicate]['nameMatch'],
                                sub_col_str,
                                obj_col_str])

        # subjectColumn and objectColumn included for later testing
        return DataFrame(triples,
                         columns=['Subject', 'Predicate', 'Object', 'Frequency',
                                  'isKey', 'nameMatch', 'subjectColumn', 'objectColumn'])
Example n. 4
0
    def predicates_for_columns(self, sub_column_name, obj_column_name, relative=True):
        """Return all predicates with subColumn's cells as subjects and objColumn's cells as objects.
        Set 'relative' to True if you want relative frequencies."""
        counts = defaultdict(int)
        # Walk both columns in lockstep; only resource subjects with a
        # non-empty object can contribute predicates.
        for subject, obj in zip(self[sub_column_name], self[obj_column_name]):
            if not obj or not sparql.is_resource(subject):
                continue
            for predicate in sparql.predicates(subject, obj):
                counts[predicate] += 1

        if relative:
            total = len(self[sub_column_name])
            for predicate in counts:
                counts[predicate] = round(counts[predicate] / total, 2)

        return dict(counts)
Example n. 5
0
    def predicates_for_columns(self,
                               sub_column_name,
                               obj_column_name,
                               relative=True):
        """Return all predicates with subColumn's cells as subjects and objColumn's cells as objects.
        Set 'relative' to True if you want relative frequencies."""
        tally = defaultdict(int)
        for sub, obj in zip(self[sub_column_name], self[obj_column_name]):
            # Only resource subjects with a non-empty object are queried.
            if obj and sparql.is_resource(sub):
                for pred in sparql.predicates(sub, obj):
                    tally[pred] += 1

        if not relative:
            return dict(tally)

        # Normalise absolute counts to per-row frequencies, 2 decimals.
        row_count = len(self[sub_column_name])
        return {pred: round(count / row_count, 2) for pred, count in tally.items()}
Example n. 6
0
    def predicatesForColumns(self, subColumn, objColumn, relative=True):
        """Return all predicates with subColumn's cells as subjects and objColumn's cells as objects.
        Set 'relative' to True if you want relative occurrences (rounded to 2 decimals)."""
        subData = self.column(subColumn)
        objData = self.column(objColumn)
        predicates = {}
        # Iterate both columns in lockstep instead of indexing via range(len(...)).
        for subCell, objCell in zip(subData, objData):
            subContent = sparql.cellContent(subCell)
            objContent = sparql.cellContent(objCell)

            # Skip rows with an empty object or a literal (non-resource) subject.
            if not (objContent and sparql.isResource(subContent)):
                continue

            for predicate in sparql.predicates(subContent, objContent):
                # dict.get with a default replaces the manual if/else counter.
                predicates[predicate] = predicates.get(predicate, 0) + 1

        if relative:
            for p in predicates:
                predicates[p] = round(predicates[p] / len(subData), 2)

        return predicates
Example n. 7
0
    def generateRDFs(self, columns=None, threshold=0.0, path=None):
        """Generate RDF statements from the table.

        For every ordered pair of columns, collect the predicates already
        linking their cells via the SPARQL endpoint, keep those whose relative
        frequency exceeds ``threshold``, and apply them to the rows where they
        are missing.

        Returns a list of rows
        [subject, '<predicate>', object, objColumnName, certainty,
         subIsKey, objIsKey, rowCount], or [] when ``path`` is given
        (CSV export is currently disabled — see TODO below).

        Raises Exception when a selected column has no content.
        """
        data = []

        # Resolve the key column's name. NOTE(review): the previous guard
        # ``keyIndex > 0 and keyIndex <= len(...)`` both rejected the valid
        # index 0 and allowed an IndexError at ``keyIndex == len(...)``;
        # a proper half-open range check fixes both off-by-one errors.
        keyIndex = self.key
        if keyIndex is not None and 0 <= keyIndex < len(self.columnNames):
            keyColumnName = self.columnNames[keyIndex]
        else:
            keyColumnName = None

        for subColumnName, objColumnName in itertools.permutations(columns if columns else self.columnNames, 2):
            subColumn = self.column(subColumnName, content=True)
            objColumn = self.column(objColumnName, content=True)
            if len(subColumn) == 0 or len(objColumn) == 0:
                raise Exception("Table failed because of defective row formattings")

            # Predicates already present for each row's cell pair.
            existingPredicates = [sparql.predicates(subColumn[i], objColumn[i])
                                  for i in range(len(subColumn))]

            absCount = defaultdict(int)
            for row in existingPredicates:
                for predicate in row:
                    absCount[predicate] += 1

            if not absCount:
                continue

            # Keep predicates whose relative frequency clears the threshold.
            total = len(existingPredicates)
            relCount = {key: value / total
                        for key, value in absCount.items()
                        if value / total > threshold}
            predicates = set(relCount)

            # For each row: candidate predicates not yet present there.
            generatedPredicates = [list(predicates - set(row)) for row in existingPredicates]

            # Additional info for analyzing the generated RDFs —
            # invariant for the whole column pair, so computed once.
            subIsKey = (keyColumnName == subColumnName)
            objIsKey = (keyColumnName == objColumnName)
            rowCount = len(subColumn)

            for i, row in enumerate(generatedPredicates):
                for predicate in row:
                    data.append([subColumn[i], predicate, objColumn[i], objColumnName,
                                 relCount[predicate], subIsKey, objIsKey, rowCount])

        if path:
            # TODO: persist to CSV once the DataFrame export (disabled for the
            # demo) is reinstated; until then the saving branch yields nothing.
            return []

        # NOTE(review): the statements used to be returned from inside the
        # column-pair loop, cutting processing short after the first productive
        # pair; the result is now built after all pairs have been processed.
        return [[row[0], '<' + row[1] + '>', row[2], row[3],
                 row[4], row[5], row[6], row[7]] for row in data]