Ejemplo n.º 1
0
    def _estimate_cardinality(self, subject, predicate, obj):
        """
            Estimate the cardinality of a triple pattern using PostgreSQL histograms.

            Each bound term contributes the selectivity recorded for it in the
            histogram of its position. A bound term that is absent from the
            histogram's ``selectivities`` receives an equal share of the
            remaining frequency mass:
            ``(1 - sum_freqs) / (n_distinct - len(selectivities))``.
            The product of these selectivities is scaled by the average row count.
            Args:
                - subject ``string`` - Subject of the triple pattern
                - predicate ``string`` - Predicate of the triple pattern
                - obj ``string`` - Object of the triple pattern
            Returns:
                The estimated cardinality of the triple pattern, as an integer that is always >= 1
        """
        # pair each term with the NAME of the attribute holding its histogram,
        # so that a histogram is only read when its term is actually bound
        positions = (
            (subject, '_subject_histograms'),
            (predicate, '_predicate_histograms'),
            (obj, '_object_histograms'),
        )

        # estimate the selectivity of the triple pattern using PostgreSQL histograms
        selectivity = 1
        for term, histogram_attr in positions:
            # unbound positions (missing terms or SPARQL variables) filter nothing
            if term is None or is_variable(term):
                continue
            histogram = getattr(self, histogram_attr)
            known_selectivities = histogram['selectivities']
            if term in known_selectivities:
                selectivity *= known_selectivities[term]
                continue
            try:
                selectivity *= (1 - histogram['sum_freqs']) / (
                    histogram['n_distinct'] - len(known_selectivities))
            except ZeroDivisionError:
                # This histogram is not fully up-to-date. Ignore this position
                # only: the remaining positions must still narrow the estimate.
                continue

        # estimate the cardinality from the estimated selectivity
        cardinality = int(ceil(selectivity * self._avg_row_count))
        return cardinality if cardinality > 0 else 1
Ejemplo n.º 2
0
    def search(self, subject, predicate, obj, last_read=None, as_of=None):
        """
            Get an iterator over all RDF triples matching a triple pattern.
            Args:
                - subject ``string`` - Subject of the triple pattern
                - predicate ``string`` - Predicate of the triple pattern
                - obj ``string`` - Object of the triple pattern
                - last_read ``string=None`` ``optional`` -  OFFSET ID used to resume scan: a JSON object with the keys "s", "p" and "o". An empty string means that the scan has already been completed.
                - as_of ``datetime=None`` ``optional`` - Perform all reads against a consistent snapshot represented by a timestamp. NOTE: this argument is accepted but not used by this method.
            Returns:
                A tuple (`iterator`, `cardinality`), where `iterator` is a Python iterator over RDF triples matching the given triples pattern, and `cardinality` is the estimated cardinality of the triple pattern (0 when the iterator has nothing to yield)
        """
        # do warmup if necessary
        self.open()

        # format triple patterns for the PostgreSQL API:
        # unbound positions (missing terms or SPARQL variables) become None
        subject = None if subject is None or is_variable(subject) else subject
        predicate = None if predicate is None or is_variable(
            predicate) else predicate
        obj = None if obj is None or is_variable(obj) else obj
        pattern = {'subject': subject, 'predicate': predicate, 'object': obj}

        # create a SQL query to start a new index scan or to resume a previous one
        if last_read is None:
            start_query, start_params = get_start_query(
                subject, predicate, obj, self._table_name)
        elif not last_read:
            # empty last_read key => the scan has already been completed.
            # Return before any cursor is allocated, as nothing will be read.
            return EmptyIterator(pattern), 0
        else:
            resume_key = json.loads(last_read)
            resume_triple = (resume_key["s"], resume_key["p"], resume_key["o"])
            start_query, start_params = get_resume_query(
                subject, predicate, obj, resume_triple, self._table_name)

        # dedicated cursor used to scan this triple pattern
        # WARNING: we need to use a dedicated cursor per triple pattern iterator.
        # Otherwise, we might reset a cursor whose results were not fully consumed.
        # The connection is fetched once, so that the cursor and the iterator
        # are guaranteed to work on the same connection object.
        connection = self._manager.get_connection()
        cursor = connection.cursor(str(uuid.uuid4()))

        # create the iterator to yield the matching RDF triples
        iterator = PostgresIterator(cursor,
                                    connection,
                                    start_query,
                                    start_params,
                                    self._table_name,
                                    pattern,
                                    fetch_size=self._fetch_size)
        # only pay for a cardinality estimation when there is something to read
        if not iterator.has_next():
            return iterator, 0
        return iterator, self._estimate_cardinality(subject, predicate, obj)
Ejemplo n.º 3
0
 def _estimate_cardinality(self, subject, predicate, obj):
     """
         Estimate the cardinality of a triple pattern from pre-computed index statistics.

         Every bound term is expected to look like ``<prefix>_<integer>``: the
         integer after the first underscore is extracted, and the resulting
         triple is classified by ``get_kind``. The kind selects which stored
         row count is returned.
         Args:
             - subject ``string`` - Subject of the triple pattern
             - predicate ``string`` - Predicate of the triple pattern
             - obj ``string`` - Object of the triple pattern
         Returns:
             The row count stored in the index statistics for this kind of triple pattern
         Raises:
             Exception - when ``get_kind`` yields a kind that is not listed below
     """
     def to_id(term):
         # unbound positions (missing terms or SPARQL variables) become None
         if term is None or is_variable(term):
             return None
         return int(term.split('_')[1])

     kind = get_kind(to_id(subject), to_id(predicate), to_id(obj))

     # (pattern kind, attribute holding the index statistics, statistic to read)
     # The attribute is resolved lazily, so only the needed statistics are read.
     stats_by_kind = (
         ('spo', '_spo_index_stats', 'same_spo_row_count'),
         ('???', '_spo_index_stats', 'row_count'),
         ('s??', '_spo_index_stats', 'same_s_row_count'),
         ('sp?', '_spo_index_stats', 'same_sp_row_count'),
         ('?p?', '_pos_index_stats', 'same_p_row_count'),
         ('?po', '_pos_index_stats', 'same_po_row_count'),
         ('s?o', '_osp_index_stats', 'same_os_row_count'),
         ('??o', '_osp_index_stats', 'same_o_row_count'),
     )
     for known_kind, stats_attr, stat_name in stats_by_kind:
         if kind == known_kind:
             return getattr(self, stats_attr)[stat_name]
     raise Exception("Unkown pattern type: {}".format(kind))
Ejemplo n.º 4
0
def vars_positions(subject, predicate, obj):
    """Locate the SPARQL variables of a triple pattern.

    Returns a three-item list in (subject, predicate, object) order. Each
    item is the term itself when ``is_variable`` accepts it, and ``None``
    otherwise.
    """
    terms = (subject, predicate, obj)
    return [term if is_variable(term) else None for term in terms]