def test_correct_despite_outliers(self):
    """Run consensus over ten samples, two of which are extreme outliers.

    The outliers are the finest possible grouping (every row alone) and the
    coarsest possible grouping (all rows together); the other eight samples
    repeat ``self.grouping``.  The result is checked with
    ``self._assert_correct``.
    """
    SAMPLE_COUNT = 10
    OUTLIER_COUNT = 2

    singletons = [[row_id] for row_id in self.row_ids]
    everything = [list(self.row_ids)]

    samples = [singletons, everything]
    samples.extend([self.grouping] * (SAMPLE_COUNT - OUTLIER_COUNT))

    consensus = find_consensus_grouping(samples)
    self._assert_correct(consensus)
def test_simple(self):
    """Check the basic structure of a consensus grouping.

    Verifies that the result is a list of ``loom.group.Row`` objects, that
    every id in ``self.row_ids`` appears exactly once, and that the group
    ids in use are exactly ``0 .. k-1`` for some ``k``.
    """
    groupings = self.sample_groupings()
    grouping = find_consensus_grouping(groupings, debug=True)

    assert isinstance(grouping, list)
    for row in grouping:
        assert isinstance(row, loom.group.Row), row

    row_ids = set(row.row_id for row in grouping)
    assert len(row_ids) == len(grouping), 'grouping had duplicate rows'
    assert_set_equal(set(self.row_ids), row_ids)

    group_ids = sorted(set(row.group_id for row in grouping))
    # Materialize the range: a list never compares equal to a lazy range
    # object, so the bare range(...) only works where range() returns a list.
    assert_equal(
        group_ids,
        list(range(len(group_ids))),
        'group ids were not a contiguous range of integers')
def test_sorting(self):
    """Check the ordering of rows in a consensus grouping.

    Repeated for ten calls to ``self.sample_groupings()``.  Each result
    must already be sorted by ``(group_id, -confidence, row_id)``, and
    group sizes must be non-increasing as the group id increases.
    """
    for _ in range(10):
        groupings = self.sample_groupings()
        grouping = find_consensus_grouping(groupings, debug=True)

        assert_equal(
            grouping,
            sorted(
                grouping,
                key=lambda x: (x.group_id, -x.confidence, x.row_id)))

        # Tally group sizes in a single pass over the rows.
        sizes = {}
        for row in grouping:
            sizes[row.group_id] = sizes.get(row.group_id, 0) + 1
        counts = [sizes[gid] for gid in sorted(sizes)]
        assert_equal(counts, sorted(counts, reverse=True))
def test_correct_on_noisy_data(self):
    """Run consensus over ten samples that each misplace a slice of rows.

    Sample ``g`` moves the rows ``self.row_ids[g::SAMPLE_COUNT]`` from their
    group in ``self.grouping`` into the next group (wrapping around), so
    each row is misplaced in exactly one sample.  The result is checked
    with ``self._assert_correct``.
    """
    SAMPLE_COUNT = 10
    GROUP_COUNT = len(self.grouping)
    object_index = {
        o: g
        for g, group in enumerate(self.grouping)
        for o in group
    }

    # each object is in the wrong place in one grouping
    groupings = []
    for g in range(SAMPLE_COUNT):
        # Work on a fresh copy of every group.  Mutating self.grouping in
        # place would accumulate the moves across samples, make all samples
        # share the same inner lists, and corrupt the fixture.
        groups = [list(group) for group in self.grouping]
        for o in self.row_ids[g::SAMPLE_COUNT]:
            t = object_index[o]
            f = (t + 1) % GROUP_COUNT
            groups[t].remove(o)
            groups[f].append(o)
        # Drop groups emptied by the moves; build a real list rather than
        # relying on filter() returning one.
        groupings.append([group for group in groups if group])

    grouping = find_consensus_grouping(groupings)
    self._assert_correct(grouping)
def test_correct_on_perfect_data(self):
    """Run consensus over 1 to 10 identical copies of ``self.grouping``.

    Every sample agrees, so the result is checked with
    ``self._assert_correct`` using ``confidence=1.0``.
    """
    MAX_SAMPLE_COUNT = 10
    for copies in range(1, MAX_SAMPLE_COUNT + 1):
        unanimous = [self.grouping] * copies
        consensus = find_consensus_grouping(unanimous)
        self._assert_correct(consensus, confidence=1.0)