コード例 #1
0
def random_pairs_without_replacement_large_frames(
        n, shape, random_state=None):
    """Make a sample of random pairs without replacement.

    Candidate pair numbers are drawn in batches. After each batch the
    duplicates are dropped, and new batches are drawn until at least ``n``
    distinct pair numbers have been collected.

    Parameters
    ----------
    n : int
        The number of distinct pairs to sample.
    shape : tuple
        Shape of the pair space. A shape of length 1 is mapped with
        ``_map_triu_1d_on_2d``; any other shape with ``np.unravel_index``.
    random_state : int, numpy.random.RandomState or None, optional
        Seed or random state. Anything that is not a ``RandomState`` is
        passed to ``np.random.RandomState``.

    Returns
    -------
    The 2d indices belonging to the first ``n`` collected pair numbers.

    Raises
    ------
    ValueError
        When ``n`` is larger than the number of possible pairs.
    """
    # Accept a seed or None, like the other sampling functions do.
    if not isinstance(random_state, np.random.RandomState):
        random_state = np.random.RandomState(random_state)

    n_max = max_pairs(shape)

    # More distinct pairs than exist can never be collected; without this
    # guard the loop below would never terminate.
    if n > n_max:
        raise ValueError("n must be a integer satisfying 0<n<=%s" % n_max)

    # Integer dtype so the accumulated sample stays usable as an index.
    sample = np.array([], dtype=np.int64)

    # Run as long as the number of pairs is less than the requested number
    # of pairs n.
    while len(sample) < n:

        # The number of pairs to sample (sample twice as much record pairs
        # because the duplicates are dropped).
        n_sample_size = (n - len(sample)) * 2
        new_draws = random_state.randint(n_max, size=n_sample_size)

        # Add the new draws to the pairs collected so far and deduplicate.
        # NOTE(review): _unique_rows_numpy is assumed to return the distinct
        # entries of its argument -- confirm against its definition.
        pairs_non_unique = np.append(sample, new_draws)
        sample = _unique_rows_numpy(pairs_non_unique)

    # return 2d indices
    if len(shape) == 1:
        return _map_triu_1d_on_2d(sample[0:n], shape[0])
    else:
        return np.unravel_index(sample[0:n], shape)
コード例 #2
0
ファイル: indexing.py プロジェクト: kiran-raja/recordlinkage
    def _dedup_index(self, df_a):
        """Sample random record pairs within a single DataFrame.

        Parameters
        ----------
        df_a : pandas.DataFrame
            The DataFrame to make record pairs of.

        Returns
        -------
        pandas.MultiIndex
            The sampled record pairs. Both levels hold the index values of
            ``df_a``.

        Raises
        ------
        ValueError
            When sampling without replacement and ``self.n`` is not an int
            in the range ``0 < n <= n_max``.
        """
        shape = (len(df_a), )

        if self.replace:
            # sampling with replacement
            pairs = random_pairs_with_replacement(
                self.n, shape, self.random_state)
        else:
            # sampling without replacement
            n_max = max_pairs(shape)

            n_is_valid = isinstance(self.n, int) and not (
                self.n <= 0 or self.n > n_max)
            if not n_is_valid:
                raise ValueError("n must be a integer satisfying 0<n<=%s" %
                                 n_max)

            if n_max < 1e6:
                # small pair space
                pairs = random_pairs_without_replacement_small_frames(
                    self.n, shape, self.random_state)
            else:
                # large pair space
                pairs = random_pairs_without_replacement_large_frames(
                    self.n, shape, self.random_state)

        index_values = df_a.index.values
        index_name = df_a.index.name

        return pandas.MultiIndex(levels=[index_values, index_values],
                                 labels=pairs,
                                 names=[index_name, index_name],
                                 verify_integrity=False)
コード例 #3
0
ファイル: indexing.py プロジェクト: kiran-raja/recordlinkage
    def _link_index(self, df_a, df_b):
        """Make all record pairs between two DataFrames.

        Parameters
        ----------
        df_a : pandas.DataFrame
            The first DataFrame.
        df_b : pandas.DataFrame
            The second DataFrame.

        Returns
        -------
        pandas.MultiIndex
            The product of the index values of ``df_a`` and ``df_b``, with
            the index names of both frames as level names.
        """
        n_max = max_pairs((df_a, df_b))

        # A full index grows with the product of both lengths; warn the user
        # when it gets large. logging.warning replaces the deprecated
        # logging.warn alias.
        if n_max > 1e7:
            logging.warning(
                "The number of record pairs is large. Consider a different "
                "indexation algorithm for better performance. ")

        return pandas.MultiIndex.from_product(
            [df_a.index.values, df_b.index.values],
            names=[df_a.index.name, df_b.index.name])
コード例 #4
0
ファイル: indexing.py プロジェクト: kiran-raja/recordlinkage
    def _dedup_index(self, df_a):
        """Make all record pairs within a single DataFrame.

        Parameters
        ----------
        df_a : pandas.DataFrame
            The DataFrame to make record pairs of.

        Returns
        -------
        pandas.MultiIndex
            The record pairs. Both levels hold the index values of ``df_a``
            and the labels are the positions above the diagonal, so each
            pair of distinct records occurs once.
        """
        # The original wrote max_pairs((df_a)); those parentheses do not make
        # a tuple, so the DataFrame itself is what gets passed.
        n_max = max_pairs(df_a)

        # logging.warning replaces the deprecated logging.warn alias.
        if n_max > 1e7:
            logging.warning(
                "The number of record pairs is large. Consider a different "
                "indexation algorithm for better performance. ")

        levels = [df_a.index.values, df_a.index.values]
        # k=1 excludes the diagonal: a record is never paired with itself.
        labels = numpy.triu_indices(len(df_a.index), k=1)
        names = [df_a.index.name, df_a.index.name]

        return pandas.MultiIndex(levels=levels,
                                 labels=labels,
                                 names=names,
                                 verify_integrity=False)
コード例 #5
0
def random_pairs_with_replacement(n, shape, random_state=None):
    """Make a sample of random record pairs with replacement.

    Parameters
    ----------
    n : int
        The number of pairs to draw.
    shape : tuple
        Shape of the pair space. A shape of length 1 is mapped with
        ``_map_triu_1d_on_2d``; any other shape with ``np.unravel_index``.
    random_state : int, numpy.random.RandomState or None, optional
        Seed or random state. Anything that is not a ``RandomState`` is
        passed to ``np.random.RandomState``.

    Returns
    -------
    The 2d indices of the drawn pairs.

    Raises
    ------
    ValueError
        When the number of possible pairs is not larger than 0.
    """
    rng = random_state
    if not isinstance(rng, np.random.RandomState):
        rng = np.random.RandomState(rng)

    n_max = max_pairs(shape)
    if n_max <= 0:
        raise ValueError('n_max must be larger than 0')

    # draw n pair numbers from [0, n_max)
    drawn = rng.randint(0, n_max, n)

    # translate the pair numbers into 2d indices
    if len(shape) != 1:
        return np.unravel_index(drawn, shape)
    return _map_triu_1d_on_2d(drawn, shape[0])
コード例 #6
0
def random_pairs_without_replacement_small_frames(
        n, shape, random_state=None):
    """Make a sample of random record pairs without replacement.

    All possible pair numbers are enumerated in one array and ``n`` of them
    are chosen.

    Parameters
    ----------
    n : int
        The number of distinct pairs to sample.
    shape : tuple
        Shape of the pair space. A shape of length 1 is mapped with
        ``_map_triu_1d_on_2d``; any other shape with ``np.unravel_index``.
    random_state : int, numpy.random.RandomState or None, optional
        Seed or random state. Anything that is not a ``RandomState`` is
        passed to ``np.random.RandomState``.

    Returns
    -------
    The 2d indices of the sampled pairs.

    Raises
    ------
    ValueError
        When ``n`` is not an int in the range ``0 < n <= n_max``.
    """
    n_max = max_pairs(shape)

    rng = random_state
    if not isinstance(rng, np.random.RandomState):
        rng = np.random.RandomState(rng)

    n_is_valid = isinstance(n, int) and not (n <= 0 or n > n_max)
    if not n_is_valid:
        raise ValueError("n must be a integer satisfying 0<n<=%s" % n_max)

    # choose n distinct pair numbers out of all possible ones
    population = np.arange(n_max)
    chosen = rng.choice(population, n, replace=False)

    # translate the pair numbers into 2d indices
    if len(shape) != 1:
        return np.unravel_index(chosen, shape)
    return _map_triu_1d_on_2d(chosen, shape[0])
コード例 #7
0
ファイル: base.py プロジェクト: kwombach/recordlinkage
    def index(self, x, x_link=None):
        """Make an index of record pairs.

        Every algorithm in ``self.algorithms`` is run and the union of their
        results is returned. Timing and pair counts of the call are stored on
        the instance and logged.

        Parameters
        ----------
        x: pandas.DataFrame
            A pandas DataFrame. When `x_link` is None, the algorithm makes
            record pairs within the DataFrame. When `x_link` is not empty,
            the algorithm makes pairs between `x` and `x_link`.
        x_link: pandas.DataFrame, optional
            A second DataFrame to link with the DataFrame x.

        Returns
        -------
        pandas.MultiIndex
            A pandas.MultiIndex with record pairs. Each record pair contains
            the index labels of two records.

        Raises
        ------
        ValueError
            When no algorithms are set.

        """
        if not self.algorithms:
            raise ValueError("No algorithms given.")

        # start timing
        started_at = time.time()

        # union of the pairs of all algorithms
        pairs = None
        for cl_alg in self.algorithms:
            found = cl_alg.index(x, x_link)
            pairs = found if pairs is None else pairs.union(found)

        # number of pairs without any indexing
        if x_link is None:
            n_max = max_pairs(x)
        else:
            n_max = max_pairs((x, x_link))

        # statistics of this call
        n = pairs.shape[0]
        eta = time.time() - started_at
        rr = 1 - n / n_max
        if self._i_max is None:
            i_max = '?'
        else:
            i_max = self._i_max

        # keep the statistics for the running totals
        self._eta.append(eta)
        self._n.append(n)
        self._n_max.append(n_max)

        # log this call
        step_message = (
            "indexing [{:d}/{}] - time: {:.2f}s - pairs: {:d}/{:d} - "
            "rr: {:0.5f}")
        logging.info(step_message.format(self._i, i_max, eta, n, n_max, rr))

        # log the totals over all calls so far
        if self._output_log_total:

            n_total = np.sum(self._n)
            n_max_total = np.sum(self._n_max)
            rr_avg = 1 - n_total / n_max_total
            eta_total = np.sum(self._eta)

            total_message = (
                "indexing [{:d}/{}] - time: {:.2f}s - "
                "pairs_total: {:d}/{:d} - rr_total: {:0.5f}")
            logging.info(total_message.format(
                self._i, i_max, eta_total, n_total, n_max_total, rr_avg))

        self._i += 1

        return pairs
コード例 #8
0
ファイル: base.py プロジェクト: dmescherina/recordlinkage
    def index(self, x, x_link=None):
        """Make an index of record pairs.

        Dispatches to ``self._link_index`` (two DataFrames) or
        ``self._dedup_index`` (single DataFrame), renames the levels of the
        result, and logs the computation time and reduction ratios.

        Parameters
        ----------
        x: pandas.DataFrame
            A pandas DataFrame. When `x_link` is None, the algorithm makes
            record pairs within the DataFrame. When `x_link` is not empty,
            the algorithm makes pairs between `x` and `x_link`. A list or
            tuple of DataFrames is accepted as well.
        x_link: pandas.DataFrame, optional
            A second DataFrame to link with the DataFrame x.

        Returns
        -------
        pandas.MultiIndex
            A pandas.MultiIndex with record pairs. Each record pair contains
            the index labels of two records.

        Raises
        ------
        ValueError
            When `x` is None.

        """

        if x is None:
            raise ValueError("provide at least one dataframe")

        # normalise the input into a tuple of dataframes
        if x_link is not None:
            # linking, two arguments
            frames = (x, x_link)
        elif isinstance(x, (list, tuple)):
            # dedup or linking, passed as a single argument
            frames = tuple(x)
        else:
            # dedup, single dataframe
            frames = (x, )

        if self.verify_integrity:
            for frame in frames:
                self._verify_integrety(frame)

        # start timing
        started_at = time.time()

        if self._deduplication(frames):
            # deduplication
            logging.info("Indexing - start indexing single DataFrame")

            pairs = self._dedup_index(*frames)
            names = self._make_index_names(
                frames[0].index.name, frames[0].index.name)
        else:
            # linking
            logging.info("Indexing - start indexing two DataFrames")

            pairs = self._link_index(*frames)
            names = self._make_index_names(
                frames[0].index.name, frames[1].index.name)

        pairs.rename(names, inplace=True)

        # store the number of pairs
        self._n.append(pairs.shape[0])
        self._n_max.append(max_pairs(frames))

        # summary
        n = len(pairs)
        rr = 1 - self._n[-1] / self._n_max[-1]
        rr_avg = 1 - np.sum(self._n) / np.sum(self._n_max)

        # log timing
        elapsed = time.time() - started_at
        logging.info("Indexing - computation time: ~{:.2f}s".format(elapsed))

        # log results
        summary = ("Indexing - summary n={:d}, "
                   "reduction_ratio={:0.5f}, reduction_ratio_mean={:0.5f}")
        logging.info(summary.format(n, rr, rr_avg))

        return pairs