Ejemplo n.º 1
0
    def generate_negatives(self,
                           dataset1: 'Dataset',
                           dataset2: 'Dataset',
                           score_function: Callable,
                           num_of_negatives: int = -1,
                           range_in_gt: bool = False):
        """
        Score candidate pairs and register the best-scoring ones as negatives.

        Pairs come from ``get_record_pairs(dataset1, dataset2)``. A pair is a candidate
        when ``self.is_member`` is false for its ids and, if ``range_in_gt`` is set, both
        ids are known to this ground truth. The candidates with the largest scores are
        passed to ``self.add_negative``.

        Args:
            dataset1 (Dataset): First argument handed to ``get_record_pairs``.
            dataset2 (Dataset): Second argument handed to ``get_record_pairs``.
            score_function (Callable): Called as ``score_function(r1, r2)`` for each candidate pair.
                                    The result is used as the primary ordering key, so results
                                    must be mutually comparable.
            num_of_negatives (int, optional): Maximum number of negatives to keep.
                                    Default is -1, which means ``len(self)``.
            range_in_gt (bool, optional): If True, only pairs whose first id is in ``self._gt_id1s``
                                    and whose second id is in ``self._gt_id2s`` are considered.
                                    Default is False.
        """
        if num_of_negatives == -1:
            num_of_negatives = len(self)

        # heapq keeps the smallest tuple at the root, so evicting the root whenever the
        # heap grows past the limit leaves exactly the highest-scoring entries behind.
        kept = []

        for rec1, rec2 in get_record_pairs(dataset1, dataset2):
            if self.is_member(rec1.id, rec2.id):
                continue
            if range_in_gt and not (rec1.id in self._gt_id1s and rec2.id in self._gt_id2s):
                continue

            score = score_function(rec1, rec2)
            heapq.heappush(kept, (score, rec1.id, rec2.id))
            if len(kept) > num_of_negatives:
                heapq.heappop(kept)

        for _, id1, id2 in kept:
            self.add_negative(id1, id2)
Ejemplo n.º 2
0
    def generate_negatives(self, dataset1: 'Dataset', dataset2: 'Dataset',
                           score_function: Callable, num_of_negatives: int = -1,
                           range_in_gt: bool = False, exclude_from: 'GroundTruth' = None):
        """
        Score candidate pairs and register the best-scoring ones as negatives.

        Pairs come from ``get_record_pairs(dataset1, dataset2)``. Pairs that pass all
        filters below are scored, and the ``num_of_negatives`` pairs with the largest
        scores are passed to ``self.add_negative``.

        Args:
            dataset1 (Dataset): Dataset 1.
            dataset2 (Dataset): Dataset 2.
            score_function (Callable): User function, inputs are two :meth:`rltk.record.Record` s, return is a float.
                                    It is only called for pairs that pass every filter.
            num_of_negatives (int, optional): Number of negatives to generate.
                                    Default is -1, which means ``len(self)``.
            range_in_gt (bool, optional): If True, only pairs whose first id is in ``self._gt_id1s``
                                    and whose second id is in ``self._gt_id2s`` are considered;
                                    otherwise every pair of the two datasets is considered.
                                    Default is False.
            exclude_from (GroundTruth, optional): Pairs for which ``exclude_from.is_member`` is true are skipped.
                                    Defaults to None (no exclusion).
                                    Useful when generating negatives for a test set while the pairs
                                    of the train set need to be excluded.
        """
        if num_of_negatives == -1:
            num_of_negatives = len(self)

        # Min-heap of (score, id1, id2). Its root is the weakest entry kept so far, so
        # popping whenever the heap outgrows the limit retains the highest-scoring pairs.
        top_pairs = []

        for r1, r2 in get_record_pairs(dataset1, dataset2):
            if self.is_member(r1.id, r2.id):
                continue
            # Compare with None explicitly: an object that supports len() is falsy when
            # its length is 0, and a truthiness test would then silently skip the exclusion.
            if exclude_from is not None and exclude_from.is_member(r1.id, r2.id):
                continue
            if range_in_gt and not (r1.id in self._gt_id1s and r2.id in self._gt_id2s):
                continue

            heapq.heappush(top_pairs, (score_function(r1, r2), r1.id, r2.id))
            if len(top_pairs) > num_of_negatives:
                heapq.heappop(top_pairs)

        for _, r1_id, r2_id in top_pairs:
            self.add_negative(r1_id, r2_id)
Ejemplo n.º 3
0
 def generate_all_negatives(self,
                            dataset1: 'Dataset',
                            dataset2: 'Dataset',
                            range_in_gt: bool = False):
     """
     Register every eligible pair as a negative.

     Each pair produced by ``get_record_pairs(dataset1, dataset2)`` is passed to
     ``self.add_negative`` unless ``self.is_member`` is already true for its ids.

     Args:
         dataset1 (Dataset): First argument handed to ``get_record_pairs``.
         dataset2 (Dataset): Second argument handed to ``get_record_pairs``.
         range_in_gt (bool, optional): If True, a pair is only eligible when its first id is in
                                     ``self._gt_id1s`` and its second id is in ``self._gt_id2s``.
                                     Default is False.
     """
     for rec1, rec2 in get_record_pairs(dataset1, dataset2):
         if self.is_member(rec1.id, rec2.id):
             continue
         if range_in_gt and not (rec1.id in self._gt_id1s and rec2.id in self._gt_id2s):
             continue
         self.add_negative(rec1.id, rec2.id)
Ejemplo n.º 4
0
 def generate_all_negatives(self, dataset1: 'Dataset', dataset2: 'Dataset', range_in_gt: bool = False):
     """
     Register every eligible pair of the two datasets as a negative.

     Args:
         dataset1 (Dataset): Dataset 1.
         dataset2 (Dataset): Dataset 2.
         range_in_gt (bool, optional): If True, negatives are restricted to pairs whose ids are both
                                     known to this ground truth (``self._gt_id1s`` / ``self._gt_id2s``);
                                     otherwise every pair yielded by ``get_record_pairs`` is eligible.
                                     Default is False.
     """
     def outside_gt_range(id1, id2):
         # True when the pair must be dropped because of the range_in_gt restriction.
         return range_in_gt and not (id1 in self._gt_id1s and id2 in self._gt_id2s)

     # Lazy generator: pairs are still examined one at a time, interleaved with add_negative.
     id_pairs = ((r1.id, r2.id) for r1, r2 in get_record_pairs(dataset1, dataset2))

     for id1, id2 in id_pairs:
         if self.is_member(id1, id2) or outside_gt_range(id1, id2):
             continue
         self.add_negative(id1, id2)
Ejemplo n.º 5
0
 def generate_all_negatives(self, dataset1: 'Dataset', dataset2: 'Dataset',
                            range_in_gt: bool = False, exclude_from: 'GroundTruth' = None):
     """
     Register every eligible pair of the two datasets as a negative.

     A pair yielded by ``get_record_pairs(dataset1, dataset2)`` is passed to
     ``self.add_negative`` unless one of the filters below rejects it.

     Args:
         dataset1 (Dataset): Dataset 1.
         dataset2 (Dataset): Dataset 2.
         range_in_gt (bool, optional): If True, only pairs whose first id is in ``self._gt_id1s``
                                     and whose second id is in ``self._gt_id2s`` are eligible;
                                     otherwise every pair of the two datasets is eligible.
                                     Default is False.
         exclude_from (GroundTruth, optional): Pairs for which ``exclude_from.is_member`` is true are skipped.
                                     Defaults to None (no exclusion).
                                     Useful when generating negatives for a test set while the pairs
                                     of the train set need to be excluded.
     """
     for r1, r2 in get_record_pairs(dataset1, dataset2):
         if self.is_member(r1.id, r2.id):
             continue
         # Compare with None explicitly: a supplied object that happens to be falsy
         # (for instance because its length is 0) must still be consulted.
         if exclude_from is not None and exclude_from.is_member(r1.id, r2.id):
             continue
         if range_in_gt and not (r1.id in self._gt_id1s and r2.id in self._gt_id2s):
             continue
         self.add_negative(r1.id, r2.id)
Ejemplo n.º 6
0
    def generate_stratified_negatives(self,
                                      dataset1: 'Dataset',
                                      dataset2: 'Dataset',
                                      classify: Callable,
                                      num_of_strata: int,
                                      random_seed: int = None,
                                      num_of_negatives: int = -1,
                                      range_in_gt: bool = False,
                                      exclude_from: 'GroundTruth' = None):
        """
        Randomly sample negatives per stratum, favouring strata rich in positives.

        Every pair from ``get_record_pairs(dataset1, dataset2)`` is assigned to a stratum by
        ``classify`` and recorded as positive (``self.is_member`` is true) or negative.
        Each stratum gets the weight ``positives / negatives`` (0 if either side is empty).
        Strata are served in descending weight order: each receives a share of the remaining
        budget proportional to its weight among the strata not yet served, capped by the
        number of negatives it holds. The chosen negatives are passed to ``self.add_negative``.

        Args:
            dataset1 (Dataset): Dataset 1.
            dataset2 (Dataset): Dataset 2.
            classify (Callable): User function, inputs are two :meth:`rltk.record.Record` s,
                                return is an integer which identifies the stratum the pair belongs to.
                                The returned integer should be in range [0, num_of_strata).
            num_of_strata (int): Number of strata.
            random_seed (int, optional): Seed passed to :py:meth:`random.seed` before sampling.
                                Any value other than None is applied, including 0.
                                Note that this reseeds the module-level random generator.
            num_of_negatives (int, optional): Number of negatives to generate.
                                Default is -1, which uses the number of positive pairs collected
                                across all strata.
            range_in_gt (bool, optional): If True, only pairs whose first id is in ``self._gt_id1s``
                                and whose second id is in ``self._gt_id2s`` are considered;
                                otherwise every pair of the two datasets is considered.
                                Default is False.
            exclude_from (GroundTruth, optional): Pairs for which ``exclude_from.is_member`` is true are skipped.
                                Defaults to None (no exclusion).
                                Useful when generating negatives for a test set while the pairs
                                of the train set need to be excluded.
        """
        # Per stratum: 'p' collects member (positive) id pairs, 'n' the remaining candidates.
        strata = [{'p': [], 'n': []} for _ in range(num_of_strata)]

        for r1, r2 in get_record_pairs(dataset1, dataset2):
            if range_in_gt and not (r1.id in self._gt_id1s and r2.id in self._gt_id2s):
                continue
            # Explicit None test so that a supplied-but-falsy object is still consulted.
            if exclude_from is not None and exclude_from.is_member(r1.id, r2.id):
                continue
            stratum_idx = classify(r1, r2)
            label = 'p' if self.is_member(r1.id, r2.id) else 'n'
            strata[stratum_idx][label].append((r1.id, r2.id))

        # Weight = positives / negatives; a stratum missing either side has nothing to offer.
        weights = [
            len(s['p']) / len(s['n']) if s['p'] and s['n'] else 0.0
            for s in strata
        ]
        # Stable sort: strata with equal weight keep their index order.
        ranked = sorted(enumerate(weights), key=itemgetter(1), reverse=True)

        if num_of_negatives == -1:
            remaining = sum(len(s['p']) for s in strata)
        else:
            remaining = num_of_negatives

        quota = [0] * num_of_strata
        for pos, (idx, weight) in enumerate(ranked):
            if remaining <= 0:
                break
            # Normalise against the strata that have not been served yet.
            weight_left = sum(w for _, w in ranked[pos:])
            if weight_left <= 0:
                # Only zero-weight strata remain; they would receive nothing, and
                # dividing by their total weight would raise ZeroDivisionError.
                break
            quota[idx] = min(round(remaining * weight / weight_left), len(strata[idx]['n']))
            remaining -= quota[idx]

        # `is not None` rather than truthiness, so that a seed of 0 is honoured.
        if random_seed is not None:
            random.seed(random_seed)
        for idx, num in enumerate(quota):
            for id1, id2 in random.sample(strata[idx]['n'], num):
                self.add_negative(id1, id2)
Ejemplo n.º 7
0
    def generate_stratified_negatives(self,
                                      dataset1: 'Dataset',
                                      dataset2: 'Dataset',
                                      classify: Callable,
                                      num_of_strata: int,
                                      random_seed: int = None,
                                      num_of_negatives: int = -1,
                                      range_in_gt: bool = False):
        """
        Randomly sample negatives per stratum, favouring strata rich in positives.

        Every pair from ``get_record_pairs(dataset1, dataset2)`` is assigned to a stratum by
        ``classify`` and recorded as positive (``self.is_member`` is true) or negative.
        Each stratum is weighted by ``positives / negatives`` and the budget is distributed
        over the strata in descending weight order. The sampled negatives are passed to
        ``self.add_negative``.

        Args:
            dataset1 (Dataset): First argument handed to ``get_record_pairs``.
            dataset2 (Dataset): Second argument handed to ``get_record_pairs``.
            classify (Callable): Called as ``classify(r1, r2)``; the result is used as an index
                                into the list of strata, so it must be an integer in
                                [0, num_of_strata).
            num_of_strata (int): Number of strata.
            random_seed (int, optional): Seed passed to ``random.seed`` before sampling.
                                Any value other than None is applied, including 0.
            num_of_negatives (int, optional): Number of negatives to generate.
                                Default is -1, which uses the number of positive pairs collected
                                across all strata.
            range_in_gt (bool, optional): If True, only pairs whose first id is in ``self._gt_id1s``
                                and whose second id is in ``self._gt_id2s`` are considered.
                                Default is False.
        """

        # add positives and negatives to different clusters
        strata = [{'p': [], 'n': []} for _ in range(num_of_strata)]

        # build strata
        for r1, r2 in get_record_pairs(dataset1, dataset2):
            if range_in_gt and not (r1.id in self._gt_id1s
                                    and r2.id in self._gt_id2s):
                continue
            stratum_id = classify(r1, r2)
            p_n = 'p' if self.is_member(r1.id, r2.id) else 'n'
            strata[stratum_id][p_n].append((r1.id, r2.id))

        # compute weights: p / n
        strata_weights = {}
        for idx, s in enumerate(strata):
            stratum_id = str(idx)
            # Nothing to pick. Test the lists for emptiness: comparing a list with 0
            # is always False and would let an empty 'n' reach the division below.
            if not s['p'] or not s['n']:
                strata_weights[stratum_id] = 0.0
                continue
            strata_weights[stratum_id] = float(len(s['p'])) / len(s['n'])

        # sort strata by descending weight (stable, so ties keep index order)
        sorted_strata_weights = OrderedDict(
            sorted(strata_weights.items(), key=itemgetter(1), reverse=True))

        # find out the number of negatives to pick from each stratum
        if num_of_negatives == -1:
            total_num = sum(len(s['p']) for s in strata)
        else:
            total_num = num_of_negatives
        num_to_pick_from_each_stratum = [0] * num_of_strata
        # Working copy that shrinks from the front as strata are served; it always
        # holds the weights of the strata that have not been served yet.
        curr_strata_weights = copy.deepcopy(sorted_strata_weights)
        for stratum_id, weight in sorted_strata_weights.items():
            if total_num <= 0 or len(curr_strata_weights) == 0:
                break
            idx = int(stratum_id)
            # normalize weights against the strata still to be served
            denominator = sum(curr_strata_weights.values())
            if denominator <= 0:
                # Only zero-weight strata remain: they would receive nothing and the
                # division below would raise ZeroDivisionError.
                break
            num_to_pick_from_each_stratum[idx] = \
                min(round(total_num * weight / denominator), len(strata[idx]['n']))
            # prep for next round
            total_num -= num_to_pick_from_each_stratum[idx]
            curr_strata_weights.popitem(last=False)

        # pick negatives; `is not None` so that a seed of 0 is honoured
        if random_seed is not None:
            random.seed(random_seed)
        for idx, num in enumerate(num_to_pick_from_each_stratum):
            negs = random.sample(strata[idx]['n'], num)
            for n in negs:
                self.add_negative(n[0], n[1])