Beispiel #1
0
    def _compute(self, pairs, x, x_link=None):

        # start the timer for the comparing step
        start_time = time.time()

        sublabels_left = self._get_labels_left(validate=x)
        df_a_indexed = frame_indexing(x[sublabels_left], pairs, 0)

        if x_link is None:
            sublabels_right = self._get_labels_right(validate=x)
            df_b_indexed = frame_indexing(x[sublabels_right], pairs, 1)
        else:
            sublabels_right = self._get_labels_right(validate=x_link)
            df_b_indexed = frame_indexing(x_link[sublabels_right], pairs, 1)

        # log timing
        # index_time = time.time() - start_time

        features = []

        for feat in self.features:

            lbl1 = feat.labels_left
            lbl2 = feat.labels_right

            data1 = tuple([df_a_indexed[lbl] for lbl in listify(lbl1)])
            data2 = tuple([df_b_indexed[lbl] for lbl in listify(lbl2)])

            result = feat._compute(data1, data2)
            features.append((result, feat.label))

        features = self._union(features, pairs)

        # log timing
        n = pairs.shape[0]
        i_max = '?' if self._i_max is None else self._i_max
        eta = time.time() - start_time
        self._eta.append(eta)
        self._n.append(n)

        # log
        logging.info("comparing [{:d}/{}] - time: {:.2f}s - pairs: {}".format(
            self._i, i_max, eta, n))

        # log total
        if self._output_log_total:

            n_total = np.sum(self._n)
            eta_total = np.sum(self._eta)

            logging.info(
                "comparing [{:d}/{}] - time: {:.2f}s - pairs_total: {}".format(
                    self._i, i_max, eta_total, n_total))

        self._i += 1

        return features
Beispiel #2
0
    def compute(self, pairs, x, x_link=None):
        """Compare the records of each record pair.

        Calling this method starts the comparing of records.

        Parameters
        ----------
        pairs : pandas.MultiIndex
            A pandas MultiIndex with the record pairs to compare. The indices
            in the MultiIndex are indices of the DataFrame(s) to link.
        x : pandas.DataFrame
            The DataFrame to link. If `x_link` is given, the comparing is a
            linking problem. If `x_link` is not given, the problem is one of
            deduplication.
        x_link : pandas.DataFrame, optional
            The second DataFrame.

        Returns
        -------
        pandas.Series, pandas.DataFrame, numpy.ndarray
            The result of comparing record pairs (the features). Can be
            a tuple with multiple pandas.Series, pandas.DataFrame,
            numpy.ndarray objects.
        """

        if not is_pandas_2d_multiindex(pairs):
            raise ValueError(
                "expected pandas.MultiIndex with record pair indices "
                "as first argument"
            )

        if not isinstance(x, pandas.DataFrame):
            raise ValueError("expected pandas.DataFrame as second argument")

        if x_link is not None and not isinstance(x_link, pandas.DataFrame):
            raise ValueError("expected pandas.DataFrame as third argument")

        labels_left = listify(self.labels_left, [])
        labels_right = listify(self.labels_right, [])

        if x_link is None:
            df_a = frame_indexing(x[labels_left + labels_right], pairs, 0)
            data1 = tuple([df_a[lbl] for lbl in listify(self.labels_left)])
            data2 = tuple([df_a[lbl] for lbl in listify(self.labels_right)])
        else:
            df_a = frame_indexing(x[labels_left], pairs, 0)
            data1 = tuple([df_a[lbl] for lbl in listify(self.labels_left)])
            df_b = frame_indexing(x_link[labels_right], pairs, 1)
            data2 = tuple([df_b[lbl] for lbl in listify(self.labels_right)])

        results = self._compute(data1, data2)

        return results
Beispiel #3
0
    def _compute(self, pairs, x, x_link=None):

        # start the timer for the comparing step
        start_time = time.time()

        sublabels_left = self._get_labels_left(validate=x)
        df_a_indexed = frame_indexing(x[sublabels_left], pairs, 0)

        if x_link is None:
            sublabels_right = self._get_labels_right(validate=x)
            df_b_indexed = frame_indexing(x[sublabels_right], pairs, 1)
        else:
            sublabels_right = self._get_labels_right(validate=x_link)
            df_b_indexed = frame_indexing(x_link[sublabels_right], pairs, 1)

        # log timing
        # index_time = time.time() - start_time

        features = []

        for feat in self.features:

            # --- DATA1
            # None: no data passed to func
            if feat.labels_left is None:
                data1 = tuple()
            # empty array: empty df with index passed to func
            elif feat.labels_left == []:
                data1 = (df_a_indexed[[]], )
            # else: subset columns and pass tuple of series
            else:
                data1 = tuple(
                    [df_a_indexed[lbl] for lbl in listify(feat.labels_left)])

            # --- DATA2
            # None: no data passed to func
            if feat.labels_right is None:
                data2 = tuple()
            # empty array: empty df with index passed to func
            elif feat.labels_right == []:
                data2 = (df_b_indexed[[]], )
            # else: subset columns and pass tuple of series
            else:
                data2 = tuple(
                    [df_b_indexed[lbl] for lbl in listify(feat.labels_right)])

            result = feat._compute(data1, data2)
            features.append((result, feat.label))

        features = self._union(features, pairs)

        # log timing
        n = pairs.shape[0]
        i_max = '?' if self._i_max is None else self._i_max
        eta = time.time() - start_time
        self._eta.append(eta)
        self._n.append(n)

        # log
        logging.info("comparing [{:d}/{}] - time: {:.2f}s - pairs: {}".format(
            self._i, i_max, eta, n))

        # log total
        if self._output_log_total:

            n_total = np.sum(self._n)
            eta_total = np.sum(self._eta)

            logging.info(
                "comparing [{:d}/{}] - time: {:.2f}s - pairs_total: {}".format(
                    self._i, i_max, eta_total, n_total))

        self._i += 1

        return features
Beispiel #4
0
    def _compute(self, pairs, x, x_link=None):

        logging.info("Comparing - start comparing data")

        # start the timer for the comparing step
        start_time = time.time()

        sublabels_left = self._get_labels_left(validate=x)
        df_a_indexed = frame_indexing(x[sublabels_left], pairs, 0)

        if x_link is None:
            sublabels_right = self._get_labels_right(validate=x)
            df_b_indexed = frame_indexing(x[sublabels_right], pairs, 1)
        else:
            sublabels_right = self._get_labels_right(validate=x_link)
            df_b_indexed = frame_indexing(x_link[sublabels_right], pairs, 1)

        # log timing
        index_time = time.time() - start_time

        results = pandas.DataFrame(index=pairs)
        label_num = 0  # make a label is label is None

        for feat, label in self.features:

            lbl1 = feat.labels_left
            lbl2 = feat.labels_right

            data1 = tuple([df_a_indexed[lbl] for lbl in listify(lbl1)])
            data2 = tuple([df_b_indexed[lbl] for lbl in listify(lbl2)])

            c = feat._compute(*tuple(data1 + data2))

            if is_pandas_like(c):
                c = c.values  # convert pandas into numpy

            if label is not None:
                label = listify(label)

            n_cols = 1 if len(c.shape) == 1 else c.shape[1]

            labels = []
            for i in range(0, n_cols):

                label_val = label[i] if label is not None else label_num
                label_num += 1

                labels.append(label_val)

            results[label_val] = c

        # log timing
        total_time = time.time() - start_time

        # log timing
        logging.info("Comparing - computation time: ~{:.2f}s (from which "
                     "indexing: ~{:.2f}s)".format(total_time, index_time))

        # log results
        logf_result = "Comparing - summary shape={}"
        logging.info(logf_result.format(results.shape))

        return results