def test_mutability_1(self):
    """Check that RangeSetND content is independent of its source RangeSets."""
    first_a = RangeSet("2-5")
    first_b = RangeSet("0-1")
    target = RangeSetND([[first_a, first_b]])  # copy_rangeset=False not passed
    self.assertEqual(str(target), "2-5; 0-1\n")

    second_a = RangeSet("6-7")
    second_b = RangeSet("2-3")
    other = RangeSetND([[second_a, second_b]])  # copy_rangeset=False not passed
    target.update(other)
    merged = "2-5; 0-1\n6-7; 2-3\n"
    self.assertEqual(str(target), merged)

    # the update must not have modified any of the source RangeSets
    untouched = ((first_a, "2-5"), (first_b, "0-1"),
                 (second_a, "6-7"), (second_b, "2-3"))
    for rset, text in untouched:
        self.assertEqual(str(rset), text)

    # conversely, changing the sources must not affect the nD sets
    first_b.add(2)
    self.assertEqual(str(first_b), "0-2")
    second_b.add(4)
    self.assertEqual(str(second_b), "2-4")
    self.assertEqual(str(target), merged)

    self.assertEqual(str(other), "6-7; 2-3\n")
    other.update([[second_a, second_b]])
    self.assertEqual(str(other), "6-7; 2-4\n")

    self.assertEqual(str(target), merged)
def _prepare_token_array(self):
    """Fill ``self.token_array`` and ``self.witness_ranges`` from the witnesses.

    For every witness, in order:
      * a RangeSet built from ``self.counter`` and the witness token count
        is stored in ``self.witness_ranges`` under the witness sigil;
      * each token gets ``_sigil`` and ``_token_array_position`` entries in
        its ``token_data``;
      * the tokens are appended to ``self.token_array`` followed by a marker
        token named ``'$' + str(idx)``.
    The marker appended after the last witness is removed again at the end.
    Nothing is popped when there are no witnesses, so an empty witness list
    no longer raises IndexError nor drops an unrelated token.
    """
    # TODO: the lazy init should move to somewhere else
    # clear the suffix array and LCP array cache
    self.cached_suffix_array = None
    token_array_position = 0
    marker_appended = False
    for idx, witness in enumerate(self.witnesses):
        # fetch the token list once instead of on every use
        tokens = witness.tokens()
        sigil = witness.sigil
        witness_range = RangeSet()
        witness_range.add_range(self.counter, self.counter + len(tokens))
        # the extra one is for the marker token
        self.counter += len(tokens) + 1
        self.witness_ranges[sigil] = witness_range
        for token in tokens:
            token.token_data['_sigil'] = sigil
            token.token_data['_token_array_position'] = token_array_position
            token_array_position += 1
        self.token_array.extend(tokens)
        # add marker token
        self.token_array.append(Token({"n": '$' + str(idx), '_sigil': sigil}))
        token_array_position += 1
        marker_appended = True
    if marker_appended:
        # remove last marker
        self.token_array.pop()
def test_iand(self):
    """Check in-place intersection (``&=``) of two RangeSets."""
    left = RangeSet("1,3-9,14-21,30-39,42")
    right = RangeSet("2-5,10-32,35,40-41")
    left &= right
    expected_len, expected_str = 15, "3-5,14-21,30-32,35"
    self.assertEqual(len(left), expected_len)
    self.assertEqual(str(left), expected_str)
def test_ior(self):
    """Check in-place union (``|=``) of two RangeSets."""
    left = RangeSet("1,3-9,14-21,30-39,42")
    right = RangeSet("2-5,10-32,35,40-41")
    left |= right
    expected_len, expected_str = 42, "1-42"
    self.assertEqual(len(left), expected_len)
    self.assertEqual(str(left), expected_str)
def test_isub(self):
    """Check in-place difference (``-=``) of two RangeSets."""
    left = RangeSet("1,3-9,14-21,30-39,42")
    right = RangeSet("2-5,10-32,35,40-41")
    left -= right
    expected_len, expected_str = 12, "1,6-9,33-34,36-39,42"
    self.assertEqual(len(left), expected_len)
    self.assertEqual(str(left), expected_str)
def test_ixor(self):
    """Check in-place symmetric difference (``^=``) of two RangeSets."""
    left = RangeSet("1,3-9,14-21,30-39,42")
    right = RangeSet("2-5,10-32,35,40-41")
    left ^= right
    expected_len, expected_str = 27, "1-2,6-13,22-29,33-34,36-42"
    self.assertEqual(len(left), expected_len)
    self.assertEqual(str(left), expected_str)
def _extract_conf(self, cfg):
    """Extract cluster nodes configuration.

    Builds a dict with a ``"default"`` entry plus one entry per node index:
      * the ``'default'`` key of *cfg* is merged into ``conf['default']``;
      * a key whose value is a dict is read as a node index (int) or a
        range set string, and the value is stored under every index of
        that set (all indexes share the same dict object);
      * any other key/value pair is stored in ``conf['default']``.
    Keys that cannot be parsed as a range set are logged and skipped.

    The result is then passed through ``clustdock.format_dict`` together
    with this object's attributes; if that raises KeyError the error is
    logged and the unformatted configuration is returned.
    """
    conf = {"default": {}}
    # items() works on both Python 2 and Python 3 mappings
    for key, val in cfg.items():
        if key == 'default':
            conf['default'].update(val)
        elif isinstance(val, dict):
            if isinstance(key, int):
                rset = RangeSet.fromone(key)
            else:
                try:
                    rset = RangeSet(key)
                except RangeSetParseError as err:
                    _LOGGER.warning(
                        "Error in configuration file:"
                        " %s. Ignoring this part", err)
                    continue
            for idx in rset:
                conf[idx] = val
        else:
            conf['default'][key] = val
    try:
        conf = clustdock.format_dict(conf, **self.__dict__)
    except KeyError:
        _LOGGER.exception("Key not found:")
    return conf
def testClear(self):
    """Check that RangeSet.clear() empties the set."""
    spec = "1-100,102,105-242,800"
    rset = RangeSet(spec)
    self.assertEqual(len(rset), 240)
    self.assertEqual(str(rset), spec)
    rset.clear()
    # an emptied set has no elements and an empty string form
    self.assertEqual(len(rset), 0)
    self.assertEqual(str(rset), "")
def test_witness_ranges_hermans_case(self):
    """Check the token ranges the collation assigns to two witnesses."""
    collation = Collation()
    collation.add_plain_witness("W1", "a b c d F g h i ! K ! q r s t")
    collation.add_plain_witness("W2", "a b c d F g h i ! q r s t")
    # assertEqual replaces the deprecated assertEquals alias
    self.assertEqual(RangeSet("0-14"), collation.get_range_for_witness("W1"))
    self.assertEqual(RangeSet("17-29"), collation.get_range_for_witness("W2"))
def testClear(self):
    """Check that RangeSet.clear() empties the set."""
    spec = "1-100,102,105-242,800"
    rset = RangeSet(spec)
    self.assertEqual(len(rset), 240)
    self.assertEqual(str(rset), spec)
    rset.clear()
    # an emptied set has no elements and an empty string form
    self.assertEqual(len(rset), 0)
    self.assertEqual(str(rset), "")
def test_vectors(self):
    """Check RangeSetND.vectors() output, string form and length."""
    folded = RangeSetND([["0-10", "1-2"], ["5-60", "2"]])
    # vectors() should perform automatic folding
    expected_vectors = [
        [RangeSet("0-60"), RangeSet("2")],
        [RangeSet("0-10"), RangeSet("1")],
    ]
    actual_vectors = list(folded.vectors())
    self.assertEqual(expected_vectors, actual_vectors)
    self.assertEqual(str(folded), "0-60; 2\n0-10; 1\n")
    self.assertEqual(len(folded), 72)
def test_non_overlapping_blocks_Hermans(self):
    """Check the repeating blocks found for the two-witness Hermans case."""
    witnesses = (
        ("W1", "a b c d F g h i ! K ! q r s t"),
        ("W2", "a b c d F g h i ! q r s t"),
    )
    collation = Collation()
    for sigil, content in witnesses:
        collation.add_plain_witness(sigil, content)
    scorer = Scorer(collation)
    found = scorer._get_non_overlapping_repeating_blocks()
    expected_ranges = (
        "0-8, 17-25",    # a b c d F g h i !
        "11-14, 26-29",  # q r s t
    )
    for text in expected_ranges:
        self.assertIn(Block(RangeSet(text)), found)
def test_non_overlapping_blocks_Hermans(self):
    """Check the repeating blocks found for the two-witness Hermans case."""
    witnesses = (
        ("W1", "a b c d F g h i ! K ! q r s t"),
        ("W2", "a b c d F g h i ! q r s t"),
    )
    collation = Collation()
    for sigil, content in witnesses:
        collation.add_plain_witness(sigil, content)
    scorer = Scorer(TokenIndex.create_token_index(collation))
    found = scorer._get_non_overlapping_repeating_blocks()
    expected_ranges = (
        "0-8, 16-24",    # a b c d F g h i !
        "11-14, 25-28",  # q r s t
    )
    for text in expected_ranges:
        self.assertIn(Block(RangeSet(text)), found)
def get_compute_rangeset(self):
    """Return a RangeSet of the ids of children matching the compute regex."""
    compute_ids = RangeSet()
    for name in self.get_children():
        found = re.match(self._comp_regex, name)
        if not found:
            # child name does not match the compute pattern
            continue
        named_groups = found.groupdict()
        compute_ids.union_update(RangeSet(str(named_groups['id'])))
    return compute_ids
def test_blocks_failing_transposition_use_case_old_algorithm(self):
    """Check the exact block list for a transposed cat/dog pair."""
    collation = Collation()
    for sigil, content in (("W1", "the cat and the dog"),
                           ("W2", "the dog and the cat")):
        collation.add_plain_witness(sigil, content)
    scorer = Scorer(collation)
    found = scorer._get_non_overlapping_repeating_blocks()
    # both content and order of the blocks are compared
    expected = [
        Block(RangeSet("0-1, 9-10")),
        Block(RangeSet("3-4, 6-7")),
        Block(RangeSet("2, 8")),
    ]
    self.assertEqual(expected, found)
def get_compute_rangeset(self):
    """Return a RangeSet of the ids of children matching the compute regex."""
    compute_ids = RangeSet()
    for name in self.get_children():
        found = re.match(self._comp_regex, name)
        if not found:
            # child name does not match the compute pattern
            continue
        named_groups = found.groupdict()
        compute_ids.union_update(RangeSet(str(named_groups["id"])))
    return compute_ids
def test_2(self):
    """Check repeating blocks for three witnesses with repeated phrases."""
    witnesses = (
        ("W1", "in the in the bleach"),
        ("W2", "in the in the bleach in the"),
        ("W3", "in the in the bleach in the"),
    )
    collation = Collation()
    for sigil, content in witnesses:
        collation.add_plain_witness(sigil, content)
    scorer = Scorer(TokenIndex.create_token_index(collation))
    found = scorer._get_non_overlapping_repeating_blocks()
    expected_ranges = (
        "0-4, 6-10, 14-18",  # in the in the bleach
        "11-12, 19-20",      # in the
    )
    for text in expected_ranges:
        self.assertIn(Block(RangeSet(text)), found)
def test_witness_ranges_hermans_case(self):
    """Check the token ranges the token index assigns to two witnesses."""
    collation = Collation()
    collation.add_plain_witness("W1", "a b c d F g h i ! K ! q r s t")
    collation.add_plain_witness("W2", "a b c d F g h i ! q r s t")
    token_index = TokenIndex(collation.witnesses)
    token_index.prepare()
    # assertEqual replaces the deprecated assertEquals alias
    self.assertEqual(RangeSet("0-14"), token_index.get_range_for_witness("W1"))
    self.assertEqual(RangeSet("16-28"), token_index.get_range_for_witness("W2"))
def test_pickle_current(self):
    """Check that a RangeSet survives a pickle dump/load round trip."""
    serialized = pickle.dumps(RangeSet("1-100"))
    self.assertNotEqual(serialized, None)
    restored = pickle.loads(serialized)
    self.assertEqual(restored, RangeSet("1-100"))
    self.assertEqual(str(restored), "1-100")
    # indexing must still work on the restored object
    for index, value in ((0, 1), (1, 2), (-1, 100)):
        self.assertEqual(restored[index], value)
def calculate_non_overlapping_range_with(self, occupied):
    """Compute the part of this block's range not overlapping *occupied*.

    A potential range is built by adding, for every block occurrence, the
    range ``(occurrence, occurrence + self.minimum_block_length)``.

    Returns:
        * the complete potential range when it does not intersect
          *occupied*;
        * ``None`` when no non-overlapping part remains;
        * otherwise a RangeSet holding, for each contiguous part of the
          potential range, the span from its first element up to the first
          element of the next intersecting part (when the two differ).

    Raises:
        PartialOverlapException: when the first contiguous part of the
            result is longer than ``self.minimum_block_length``.
    """
    # convert block occurrences into ranges
    potential_block_range = RangeSet()
    for occurrence in self.block_occurrences():
        potential_block_range.add_range(
            occurrence, occurrence + self.minimum_block_length)
    # check the intersection with the already occupied ranges
    block_intersection = potential_block_range.intersection(occupied)
    if not block_intersection:
        # no overlap, return complete block_range
        return potential_block_range
    # There is overlap with occupied range
    # we need to deal with it
    real_block_range = RangeSet()
    for candidate in potential_block_range.contiguous():
        lower = candidate[0]
        # find-first over a generator: start of the first intersecting
        # part located at or after this candidate's start
        upper = next(
            (taken[0] for taken in block_intersection.contiguous()
             if taken[0] >= lower),
            None)
        # compare against None: a start position of 0 is a valid match
        if upper is not None and lower != upper:
            real_block_range.add_range(lower, upper)
    if not real_block_range:
        # There is complete overlap, so return None
        return None
    # Assert: check that the first slice is not larger than potential
    # block length!
    # next() builtin instead of the Python-2-only generator .next() method
    first_range = next(real_block_range.contiguous())
    if first_range[-1] - first_range[0] + 1 > self.minimum_block_length:
        raise PartialOverlapException()
    return real_block_range
def testFromListConstructor(self):
    """Check RangeSet.fromlist() with strings, RangeSets and a set."""
    def check(rgs):
        # every input below describes the same six elements
        self.assertEqual(str(rgs), "1,3,5-8")
        self.assertEqual(len(rgs), 6)

    check(RangeSet.fromlist(["3", "5-8", "1"]))
    check(RangeSet.fromlist([RangeSet("3"), RangeSet("5-8"), RangeSet("1")]))
    check(RangeSet.fromlist([set([3, 5, 6, 7, 8, 1])]))
def testDiscard(self):
    """Check RangeSet.discard() on present, absent and string arguments."""
    rset = RangeSet("1-100,102,105-242,800")
    self.assertEqual(len(rset), 240)
    rset.discard(100)
    self.assertEqual(len(rset), 239)
    self.assertEqual(str(rset), "1-99,102,105-242,800")
    # discarding a missing element should not raise KeyError
    rset.discard(101)
    # test remove integer-castable type (convenience)
    rset.remove("106")
    rset.discard("foo")
def test_blocks_Hermans_case_three_witnesses(self):
    """Check the repeating blocks found for the three-witness Hermans case."""
    witnesses = (
        ("W1", "a b c d F g h i ! K ! q r s t"),
        ("W2", "a b c d F g h i ! q r s t"),
        ("W3", "a b c d E g h i ! q r s t"),
    )
    collation = Collation()
    for sigil, content in witnesses:
        collation.add_plain_witness(sigil, content)
    scorer = Scorer(TokenIndex.create_token_index(collation))
    found = scorer._get_non_overlapping_repeating_blocks()
    expected_ranges = (
        "0-3, 16-19, 30-33",    # a b c d
        "5-7, 21-23, 35-37",    # g h i
        "10-14, 24-28, 38-42",  # ! q r s t
        "4, 20",                # F
    )
    for text in expected_ranges:
        self.assertIn(Block(RangeSet(text)), found)
def test_blocks_Hermans_case_three_witnesses(self):
    """Check the repeating blocks found for the three-witness Hermans case."""
    witnesses = (
        ("W1", "a b c d F g h i ! K ! q r s t"),
        ("W2", "a b c d F g h i ! q r s t"),
        ("W3", "a b c d E g h i ! q r s t"),
    )
    collation = Collation()
    for sigil, content in witnesses:
        collation.add_plain_witness(sigil, content)
    scorer = Scorer(collation)
    found = scorer._get_non_overlapping_repeating_blocks()
    expected_ranges = (
        "0-3, 17-20, 32-35",    # a b c d
        "5-7, 22-24, 37-39",    # g h i
        "10-14, 25-29, 40-44",  # ! q r s t
        "4, 21",                # F
    )
    for text in expected_ranges:
        self.assertIn(Block(RangeSet(text)), found)
def testRemove(self):
    """Check RangeSet.remove() on present, absent and string arguments."""
    rset = RangeSet("1-100,102,105-242,800")
    self.assertEqual(len(rset), 240)
    rset.remove(100)
    self.assertEqual(len(rset), 239)
    self.assertEqual(str(rset), "1-99,102,105-242,800")
    # unlike discard(), removing a missing element is an error
    self.assertRaises(KeyError, rset.remove, 101)
    # test remove integer-castable type (convenience)
    rset.remove("106")
    # non integer castable cases raise ValueError (documented since 1.6)
    self.assertRaises(ValueError, rset.remove, "foo")
def test_simple(self):
    """Check RangeSetND constructors through _testRS and copy_rangeset=False."""
    # Test constructors: (constructor argument, expected string, expected length)
    constructor_cases = (
        (None, "", 0),
        ([["0-10"], ["40-60"]], "0-10,40-60\n", 32),
        ([["0-2", "1-2"], ["10", "3-5"]], "0-2; 1-2\n10; 3-5\n", 9),
        ([[0, 1], [0, 2], [2, 2], [2, 1], [1, 1], [1, 2],
          [10, 4], [10, 5], [10, 3]], "0-2; 1-2\n10; 3-5\n", 9),
        ([(0, 4), (0, 5), (1, 4), (1, 5)], "0-1; 4-5\n", 4),
    )
    for args, text, length in constructor_cases:
        self._testRS(args, text, length)

    # construct with copy_rangeset=False
    axis0 = RangeSet("0-10,30-40,50")
    axis1 = RangeSet("200-202")
    nd_set = RangeSetND([[axis0, axis1]], copy_rangeset=False)
    self.assertEqual(str(nd_set), "0-10,30-40,50; 200-202\n")
    self.assertEqual(len(nd_set), 69)
def add_witness(self, witnessdata):
    """Register a new witness built from *witnessdata*.

    Records the witness token range, advances ``self.counter`` and appends
    the witness content to ``self.combined_string``, preceded by a
    ``" $<n> "`` separator when the combined string is not empty.
    """
    # clear the suffix array and LCP array cache
    self.cached_suffix_array = None
    new_witness = Witness(witnessdata)
    self.witnesses.append(new_witness)
    start = self.counter
    token_span = RangeSet()
    token_span.add_range(start, start + len(new_witness.tokens()))
    # the two extra positions are for the marker: $ + number
    self.counter = start + len(new_witness.tokens()) + 2
    self.witness_ranges[new_witness.sigil] = token_span
    if self.combined_string != "":
        separator = " $" + str(len(self.witnesses) - 1) + " "
        self.combined_string += separator
    self.combined_string += new_witness.content
def testIntersectionLength(self):
    """Check operand and result lengths for several RangeSet intersections."""
    # (left spec, left length, right spec, right length, intersection length)
    cases = (
        ("115-117,130,166-170,4780-4999", 229,
         "116-117,130,4781-4999", 222, 222),
        ("115-200", 86,
         "116-117,119,123-131,133,149,199", 15, 15),
        # StopIteration test
        ("115-117,130,166-170,4780-4999,5003", 230,
         "116-117,130,4781-4999", 222, 222),
        # StopIteration test2
        ("130,166-170,4780-4999", 226,
         "116-117", 2, 0),
    )
    for left_spec, left_len, right_spec, right_len, common_len in cases:
        left = RangeSet(left_spec)
        self.assertEqual(len(left), left_len)
        right = RangeSet(right_spec)
        self.assertEqual(len(right), right_len)
        common = left.intersection(right)
        self.assertEqual(len(common), common_len)
def test_mutability_2(self):
    """Check that update() from a list is unaffected by later source changes."""
    first_a = RangeSet("2-5")
    first_b = RangeSet("0-1")
    target = RangeSetND([[first_a, first_b]])  # copy_rangeset=False not passed
    self.assertEqual(str(target), "2-5; 0-1\n")

    second_a = RangeSet("6-7")
    second_b = RangeSet("2-3")
    target.update([[second_a, second_b]])
    merged = "2-5; 0-1\n6-7; 2-3\n"
    self.assertEqual(str(target), merged)

    # changing a source RangeSet afterwards must not change the nD set
    second_b.add(4)
    self.assertEqual(str(second_b), "2-4")
    self.assertEqual(str(target), merged)
def testDiscard(self):
    """Check RangeSet.discard() on present, absent and string arguments."""
    rset = RangeSet("1-100,102,105-242,800")
    self.assertEqual(len(rset), 240)
    rset.discard(100)
    self.assertEqual(len(rset), 239)
    self.assertEqual(str(rset), "1-99,102,105-242,800")
    # discarding a missing element should not raise KeyError
    rset.discard(101)
    # test remove integer-castable type (convenience)
    rset.remove("106")
    rset.discard("foo")
def _prepare_token_array(self):
    """Fill ``self.token_array`` and ``self.witness_ranges`` from the witnesses.

    Each witness gets a RangeSet built from ``self.counter`` and its token
    count; its tokens are appended to ``self.token_array``, preceded by a
    ``"$<idx-1>"`` marker token whenever the array is already non-empty.
    """
    # TODO: the lazy init should move to somewhere else
    # clear the suffix array and LCP array cache
    self.cached_suffix_array = None
    for position, witness in enumerate(self.witnesses):
        start = self.counter
        token_span = RangeSet()
        token_span.add_range(start, start + len(witness.tokens()))
        # the extra one is for the marker token
        self.counter = start + len(witness.tokens()) + 1
        self.witness_ranges[witness.sigil] = token_span
        if self.token_array:
            # add marker token
            marker_name = "$" + str(position - 1)
            self.token_array.append(Token({"n": marker_name}))
        # remember get tokens twice
        self.token_array.extend(witness.tokens())
def testIsSuperSet(self):
    """Check RangeSet.issuperset() and the ``>=`` / ``>`` operators."""
    bigger = RangeSet("1-100,102,105-242,800")
    self.assertEqual(len(bigger), 240)
    smaller = RangeSet("3-98,140-199,800")
    self.assertEqual(len(smaller), 157)
    self.assertTrue(bigger.issuperset(bigger))
    self.assertTrue(bigger.issuperset(smaller))
    self.assertTrue(bigger >= bigger)
    self.assertTrue(bigger > smaller)
    self.assertFalse(smaller > bigger)
    # 243 is not in the first set, so it is no longer a superset
    smaller = RangeSet("3-98,140-199,243,800")
    self.assertEqual(len(smaller), 158)
    self.assertFalse(bigger.issuperset(smaller))
    self.assertFalse(bigger > smaller)
def add_witness(self, witnessdata):
    """Register a new witness built from *witnessdata*.

    Records the witness token range, advances ``self.counter`` and appends
    the witness token strings to ``self.combined_tokens``, preceded by a
    ``'$'`` and a number entry for every witness after the first.
    """
    # clear the suffix array and LCP array cache
    self.cached_suffix_array = None
    new_witness = Witness(witnessdata)
    self.witnesses.append(new_witness)
    start = self.counter
    token_span = RangeSet()
    token_span.add_range(start, start + len(new_witness.tokens()))
    # the two extra positions are for the marker: $ + number
    self.counter = start + len(new_witness.tokens()) + 2
    self.witness_ranges[new_witness.sigil] = token_span
    if len(self.witnesses) > 1:
        for marker_part in ('$', str(len(self.witnesses) - 1)):
            self.combined_tokens.append(marker_part)
    for token in new_witness.tokens():
        self.combined_tokens.append(token.token_string)
def testIsSubSet(self):
    """Check RangeSet.issubset() and the ordering operators."""
    bigger = RangeSet("1-100,102,105-242,800-900/2")
    smaller = RangeSet("3,800,802,804,888")
    self.assertTrue(smaller.issubset(smaller))
    self.assertTrue(smaller.issubset(bigger))
    self.assertTrue(smaller <= bigger)
    self.assertTrue(smaller < bigger)
    self.assertTrue(bigger > smaller)
    self.assertFalse(bigger < smaller)
    self.assertFalse(bigger <= smaller)
    self.assertFalse(smaller >= bigger)
    # since v1.6, padding is ignored when computing set operations
    unpadded = RangeSet("1-100")
    padded = RangeSet("001-100")
    self.assertTrue(unpadded.issubset(padded))
def _prepare_token_array(self):
    """Fill ``self.token_array`` and ``self.witness_ranges`` from the witnesses.

    Each witness gets a RangeSet built from ``self.counter`` and its token
    count; its tokens are appended to ``self.token_array``, preceded by a
    ``"$<idx-1>"`` marker token whenever the array is already non-empty.
    """
    # TODO: the lazy init should move to somewhere else
    # clear the suffix array and LCP array cache
    self.cached_suffix_array = None
    for position, witness in enumerate(self.witnesses):
        start = self.counter
        token_span = RangeSet()
        token_span.add_range(start, start + len(witness.tokens()))
        # the extra one is for the marker token
        self.counter = start + len(witness.tokens()) + 1
        self.witness_ranges[witness.sigil] = token_span
        if self.token_array:
            # add marker token
            marker_name = "$" + str(position - 1)
            self.token_array.append(Token({"n": marker_name}))
        # remember get tokens twice
        self.token_array.extend(witness.tokens())
def testIterator(self):
    """Check that iterating a RangeSet yields sorted integers."""
    expected = [1, 3, 4, 5, 6, 7, 8, 11]
    rset = RangeSet.fromlist(["11", "3", "5-8", "1", "4"])
    seen = 0
    for seen, item in enumerate(rset, 1):
        self.assertEqual(item, expected[seen - 1])
    self.assertEqual(seen, len(expected))
    # with padding
    rset = RangeSet.fromlist(["011", "003", "005-008", "001", "004"])
    seen = 0
    for seen, item in enumerate(rset, 1):
        self.assertTrue(type(item) is int)
        self.assertEqual(item, expected[seen - 1])
    self.assertEqual(seen, len(expected))
def test_blocks_splitting_token_case(self):
    """Check that the "a c b" block is found in both witnesses."""
    collation = Collation()
    for sigil, content in (("W1", "a c b c"), ("W2", "a c b")):
        collation.add_plain_witness(sigil, content)
    scorer = Scorer(collation)
    found = scorer._get_non_overlapping_repeating_blocks()
    expected = Block(RangeSet("0-2, 5-7"))  # a c b
    self.assertIn(expected, found)
def testIterator(self):
    """Check that iterating a RangeSet yields sorted integers."""
    expected = [1, 3, 4, 5, 6, 7, 8, 11]
    rset = RangeSet.fromlist(["11", "3", "5-8", "1", "4"])
    seen = 0
    for seen, item in enumerate(rset, 1):
        self.assertEqual(item, expected[seen - 1])
    self.assertEqual(seen, len(expected))
    # with padding
    rset = RangeSet.fromlist(["011", "003", "005-008", "001", "004"])
    seen = 0
    for seen, item in enumerate(rset, 1):
        self.assertTrue(type(item) is int)
        self.assertEqual(item, expected[seen - 1])
    self.assertEqual(seen, len(expected))
def testStringIterator(self):
    """Check that striter() yields strings, zero-padded when applicable."""
    expected = [1, 3, 4, 5, 6, 7, 8, 11]
    rset = RangeSet.fromlist(["11", "3", "5-8", "1", "4"])
    seen = 0
    for seen, item in enumerate(rset.striter(), 1):
        self.assertEqual(item, str(expected[seen - 1]))
    self.assertEqual(seen, len(expected))
    # with padding
    rset = RangeSet.fromlist(["011", "003", "005-008", "001", "004"])
    seen = 0
    for seen, item in enumerate(rset.striter(), 1):
        self.assertTrue(type(item) is str)
        self.assertEqual(item, "%0*d" % (3, expected[seen - 1]))
    self.assertEqual(seen, len(expected))
def test_non_overlapping_blocks_black_cat(self):
    """Check that two identical witnesses yield exactly one block."""
    collation = Collation()
    for sigil in ("W1", "W2"):
        collation.add_plain_witness(sigil, "the black cat")
    scorer = Scorer(collation)
    found = scorer._get_non_overlapping_repeating_blocks()
    expected = [Block(RangeSet("0-2, 4-6"))]
    self.assertEqual(expected, found)
def testStringIterator(self):
    """Check that striter() yields strings, zero-padded when applicable."""
    expected = [1, 3, 4, 5, 6, 7, 8, 11]
    rset = RangeSet.fromlist(["11", "3", "5-8", "1", "4"])
    seen = 0
    for seen, item in enumerate(rset.striter(), 1):
        self.assertEqual(item, str(expected[seen - 1]))
    self.assertEqual(seen, len(expected))
    # with padding
    rset = RangeSet.fromlist(["011", "003", "005-008", "001", "004"])
    seen = 0
    for seen, item in enumerate(rset.striter(), 1):
        self.assertTrue(type(item) is str)
        self.assertEqual(item, "%0*d" % (3, expected[seen - 1]))
    self.assertEqual(seen, len(expected))
def testConstructorIterate(self):
    """Check RangeSet construction from a list, a set and another RangeSet."""
    def check_base(rgs):
        self.assertEqual(str(rgs), "1,3,5-8")
        self.assertEqual(len(rgs), 6)

    # from list
    from_list = RangeSet([3, 5, 6, 7, 8, 1])
    check_base(from_list)
    from_list.add(10)
    self.assertEqual(str(from_list), "1,3,5-8,10")
    self.assertEqual(len(from_list), 7)
    # from set
    check_base(RangeSet(set([3, 5, 6, 7, 8, 1])))
    # from RangeSet
    source = RangeSet("1,3,5-8")
    check_base(RangeSet(source))
def testUpdate(self):
    """Check RangeSet.update() and RangeSet.union_update()."""
    base_spec = "1-100,102,105-242,800"
    merged_spec = "1-100,102,105-800,1924-1984"
    target = RangeSet(base_spec)
    self.assertEqual(len(target), 240)
    extra = RangeSet("243-799,1924-1984")
    self.assertEqual(len(extra), 618)
    target.update(extra)
    self.assertEqual(type(target), RangeSet)
    self.assertEqual(target.padding, None)
    self.assertEqual(len(target), 240 + 618)
    self.assertEqual(str(target), merged_spec)
    # union_update() must give the same result on a fresh set
    target = RangeSet(base_spec)
    target.union_update(extra)
    self.assertEqual(len(target), 240 + 618)
    self.assertEqual(str(target), merged_spec)
def testCopy(self):
    """Check that RangeSet.copy() returns independent, equal copies."""
    source_spec = "115-117,130,166-170,4780-4999"
    removed_spec = "115-117,130,167-170,4780-4999"
    source = RangeSet(source_spec)
    self.assertEqual(len(source), 229)
    self.assertEqual(str(source), source_spec)
    shrunk = source.copy()
    grown = source.copy()
    self.assertEqual(source, shrunk)  # content equality
    shrunk.remove(166)
    self.assertEqual(len(source), len(shrunk) + 1)
    self.assertNotEqual(source, shrunk)
    self.assertEqual(str(source), source_spec)
    self.assertEqual(str(shrunk), removed_spec)
    # changing the second copy affects neither the source nor the first copy
    grown.update(RangeSet("118"))
    self.assertNotEqual(source, grown)
    self.assertNotEqual(shrunk, grown)
    self.assertEqual(len(source) + 1, len(grown))
    self.assertEqual(str(source), source_spec)
    self.assertEqual(str(shrunk), removed_spec)
    self.assertEqual(str(grown), "115-118,130,166-170,4780-4999")
def testCopy(self):
    """Check that RangeSet.copy() returns independent, equal copies."""
    source_spec = "115-117,130,166-170,4780-4999"
    removed_spec = "115-117,130,167-170,4780-4999"
    source = RangeSet(source_spec)
    self.assertEqual(len(source), 229)
    self.assertEqual(str(source), source_spec)
    shrunk = source.copy()
    grown = source.copy()
    self.assertEqual(source, shrunk)  # content equality
    shrunk.remove(166)
    self.assertEqual(len(source), len(shrunk) + 1)
    self.assertNotEqual(source, shrunk)
    self.assertEqual(str(source), source_spec)
    self.assertEqual(str(shrunk), removed_spec)
    # changing the second copy affects neither the source nor the first copy
    grown.update(RangeSet("118"))
    self.assertNotEqual(source, grown)
    self.assertNotEqual(shrunk, grown)
    self.assertEqual(len(source) + 1, len(grown))
    self.assertEqual(str(source), source_spec)
    self.assertEqual(str(shrunk), removed_spec)
    self.assertEqual(str(grown), "115-118,130,166-170,4780-4999")
def _iterbase(self):
    """Yield one single-item NodeSetBase per tuple produced by _iter()."""
    for pat, ivec, pad, autostep in self._iter():
        if ivec is None:
            # 'no node index' by default
            index_set = None
        else:
            assert len(ivec) > 0
            if len(ivec) == 1:
                # one axis: plain RangeSet, missing padding becomes 0
                index_set = RangeSet.fromone(ivec[0], pad[0] or 0, autostep)
            else:
                index_set = RangeSetND([ivec], pad, autostep)
        yield NodeSetBase(pat, index_set)
def testFromOneConstructor(self):
    """Check RangeSet.fromone() with an integer and with slice objects."""
    single = RangeSet.fromone(42)
    self.assertEqual(str(single), "42")
    self.assertEqual(len(single), 1)
    # also support slice object (v1.6+)
    from_stop = RangeSet.fromone(slice(42))
    self.assertEqual(str(from_stop), "0-41")
    self.assertEqual(len(from_stop), 42)
    # a slice without a stop value is rejected
    self.assertRaises(ValueError, RangeSet.fromone, slice(12, None))
    one_wide = RangeSet.fromone(slice(42, 43))
    self.assertEqual(str(one_wide), "42")
    self.assertEqual(len(one_wide), 1)
    bounded = RangeSet.fromone(slice(42, 48))
    self.assertEqual(str(bounded), "42-47")
    self.assertEqual(len(bounded), 6)
    stepped = RangeSet.fromone(slice(42, 57, 2))
    self.assertEqual(str(stepped), "42,44,46,48,50,52,54,56")
    stepped.autostep = 3
    self.assertEqual(str(stepped), "42-56/2")
    self.assertEqual(len(stepped), 8)
def test_mutability_1(self):
    """Check that RangeSetND content is independent of its source RangeSets."""
    first_a = RangeSet("2-5")
    first_b = RangeSet("0-1")
    target = RangeSetND([[first_a, first_b]])  # copy_rangeset=False not passed
    self.assertEqual(str(target), "2-5; 0-1\n")

    second_a = RangeSet("6-7")
    second_b = RangeSet("2-3")
    other = RangeSetND([[second_a, second_b]])  # copy_rangeset=False not passed
    target.update(other)
    merged = "2-5; 0-1\n6-7; 2-3\n"
    self.assertEqual(str(target), merged)

    # the update must not have modified any of the source RangeSets
    untouched = ((first_a, "2-5"), (first_b, "0-1"),
                 (second_a, "6-7"), (second_b, "2-3"))
    for rset, text in untouched:
        self.assertEqual(str(rset), text)

    # conversely, changing the sources must not affect the nD sets
    first_b.add(2)
    self.assertEqual(str(first_b), "0-2")
    second_b.add(4)
    self.assertEqual(str(second_b), "2-4")
    self.assertEqual(str(target), merged)

    self.assertEqual(str(other), "6-7; 2-3\n")
    other.update([[second_a, second_b]])
    self.assertEqual(str(other), "6-7; 2-4\n")

    self.assertEqual(str(target), merged)
def testUnion(self):
    """Check RangeSet.union() and the ``|`` operator, with and without overlap."""
    base = RangeSet("1-100,102,105-242,800")
    self.assertEqual(len(base), 240)
    extra = RangeSet("243-799,1924-1984")
    self.assertEqual(len(extra), 618)
    disjoint_spec = "1-100,102,105-800,1924-1984"
    via_method = base.union(extra)
    self.assertEqual(type(via_method), RangeSet)
    self.assertEqual(via_method.padding, None)
    self.assertEqual(len(via_method), 240 + 618)
    self.assertEqual(str(via_method), disjoint_spec)
    via_operator = base | extra
    self.assertEqual(len(via_operator), 240 + 618)
    self.assertEqual(str(via_operator), disjoint_spec)
    # test with overlap
    overlap_spec = "1-100,102,105-800"
    extra = RangeSet("200-799")
    via_method = base.union(extra)
    self.assertEqual(len(via_method), 797)
    self.assertEqual(str(via_method), overlap_spec)
    via_operator = base | extra
    self.assertEqual(len(via_operator), 797)
    self.assertEqual(str(via_operator), overlap_spec)
def nsiter(self):
    """Object-based NodeSet iterator on single nodes.

    For every ``(pattern, index vector, pads, autostep)`` tuple produced
    by ``self._iter()``, yields a new instance of this class holding that
    single pattern: with no range set when there is no index vector, a
    RangeSet for a one-axis index, or a RangeSetND otherwise.
    """
    for pat, ivec, pad, autostep in self._iter():
        nodeset = self.__class__()
        if ivec is None:
            # pattern without any node index
            nodeset._add_new(pat, None)
        elif len(ivec) == 1:
            # missing padding on the single axis becomes 0
            nodeset._add_new(pat, RangeSet.fromone(ivec[0], pad[0] or 0))
        else:
            # forward the per-axis pads (as _iterbase does) instead of
            # None, so multi-dimensional nodes keep their padding
            nodeset._add_new(pat, RangeSetND([ivec], pad, autostep))
        yield nodeset
def testIntersectStep(self):
    """Check intersection_update() between stepped RangeSets."""
    left = RangeSet("4-34/2")
    right = RangeSet("28-42/2")
    left.intersection_update(right)
    self.assertEqual(str(left), "28,30,32,34")
    self.assertEqual(len(left), 4)

    # same left operand against odd values: nothing in common
    left = RangeSet("4-34/2")
    right = RangeSet("27-42/2")
    left.intersection_update(right)
    self.assertEqual(str(left), "")
    self.assertEqual(len(left), 0)

    # with autostep set, the result is displayed in stepped form
    left = RangeSet("2-60/3", autostep=3)
    right = RangeSet("3-50/2", autostep=3)
    left.intersection_update(right)
    self.assertEqual(str(left), "5-47/6")
    self.assertEqual(len(left), 8)
def _prepare_token_array(self):
    """Fill ``self.token_array`` and ``self.witness_ranges`` from the witnesses.

    For every witness, in order:
      * a RangeSet built from ``self.counter`` and the witness token count
        is stored in ``self.witness_ranges`` under the witness sigil;
      * each token gets ``_sigil`` and ``_token_array_position`` entries in
        its ``token_data``;
      * the tokens are appended to ``self.token_array`` followed by a marker
        token named ``'$' + str(idx)``.
    The marker appended after the last witness is removed again at the end.
    Nothing is popped when there are no witnesses, so an empty witness list
    no longer raises IndexError nor drops an unrelated token.
    """
    # TODO: the lazy init should move to somewhere else
    # clear the suffix array and LCP array cache
    self.cached_suffix_array = None
    token_array_position = 0
    marker_appended = False
    for idx, witness in enumerate(self.witnesses):
        # fetch the token list once instead of on every use
        tokens = witness.tokens()
        sigil = witness.sigil
        witness_range = RangeSet()
        witness_range.add_range(self.counter, self.counter + len(tokens))
        # the extra one is for the marker token
        self.counter += len(tokens) + 1
        self.witness_ranges[sigil] = witness_range
        for token in tokens:
            token.token_data['_sigil'] = sigil
            token.token_data['_token_array_position'] = token_array_position
            token_array_position += 1
        self.token_array.extend(tokens)
        # add marker token
        self.token_array.append(Token({"n": '$' + str(idx), '_sigil': sigil}))
        token_array_position += 1
        marker_appended = True
    if marker_appended:
        # remove last marker
        self.token_array.pop()
def get_non_overlapping_repeating_blocks(self):
    """Select the definitive, mutually non-overlapping repeating blocks.

    Potential blocks come from the LCP intervals of the collation's
    extended suffix array, are filtered by ``filter_potential_blocks`` and
    then examined in descending priority order. A block is kept when
    ``calculate_non_overlapping_range_with`` returns a non-empty range for
    it; that range is then added to the occupied set. When that call raises
    PartialOverlapException the block is shortened by one and tried again
    while its ``minimum_block_length`` is still greater than 1.

    Returns:
        list of Block objects, in selection order.

    NOTE(review): the nesting of ``potential_block.length -= 1`` and of
    ``break`` was reconstructed from a whitespace-collapsed source --
    confirm against the original file.
    """
    extended_suffix_array = self.collation.to_extended_suffix_array()
    potential_blocks = extended_suffix_array.split_lcp_array_into_intervals()
    # filter_potential_blocks is called for its effect on the list; its
    # return value (if any) is not used
    self.filter_potential_blocks(potential_blocks)
    # step 3: sort the blocks based on depth (number of repetitions) first,
    # second length of LCP interval,
    # third sort on parent LCP interval occurrences.
    sorted_blocks_on_priority = sorted(potential_blocks, key=attrgetter("number_of_occurrences", "minimum_block_length", "number_of_siblings"), reverse=True)
    # step 4: select the definitive blocks
    occupied = RangeSet()
    real_blocks = []
    for potential_block in sorted_blocks_on_priority:
        # print(potential_block.info())
        try:
            non_overlapping_range = potential_block.calculate_non_overlapping_range_with(occupied)
            if non_overlapping_range:
                # print("Selecting: "+str(potential_block))
                occupied.union_update(non_overlapping_range)
                real_blocks.append(Block(non_overlapping_range))
        except PartialOverlapException:
            # print("Skip due to conflict: "+str(potential_block))
            while potential_block.minimum_block_length > 1:
                # retry with a different length: one less
                # (every LCP entry of the interval after its first position
                # is decremented, then the block length itself)
                for idx in range(potential_block.start+1, potential_block.end+1):
                    potential_block.LCP[idx] -= 1
                potential_block.length -= 1
                try:
                    non_overlapping_range = potential_block.calculate_non_overlapping_range_with(occupied)
                    if non_overlapping_range:
                        # print("Retried and selecting: "+str(potential_block))
                        occupied.union_update(non_overlapping_range)
                        real_blocks.append(Block(non_overlapping_range))
                    # stop retrying once the call no longer raises
                    break
                except PartialOverlapException:
                    # print("Retried and failed again")
                    pass
    return real_blocks
def testBinarySanityCheckNotImplementedSubtle(self):
    """test RangeSet binary sanity check (NotImplemented subtle)

    The binary special methods must return NotImplemented for a non
    RangeSet operand, and the matching operators must raise TypeError.
    """
    rg1 = RangeSet("1-5")
    rg2 = "4-6"
    self.assertEqual(rg1.__and__(rg2), NotImplemented)
    self.assertEqual(rg1.__or__(rg2), NotImplemented)
    self.assertEqual(rg1.__sub__(rg2), NotImplemented)
    self.assertEqual(rg1.__xor__(rg2), NotImplemented)
    # Should implicitly raise TypeError if the real operator version is
    # invoked. Each operator is wrapped in a lambda so that the four
    # checks share one loop and keep an operator-specific failure message.
    operations = (
        ("&", lambda: rg1 & rg2),
        ("|", lambda: rg1 | rg2),
        ("-", lambda: rg1 - rg2),
        ("^", lambda: rg1 ^ rg2),
    )
    for symbol, operation in operations:
        try:
            operation()
        except TypeError:
            continue
        # fail() replaces the deprecated assert_() alias
        self.fail("TypeError not raised for " + symbol)
def testSplit(self):
    """Check RangeSet.split() on empty, short, exact and limit cases."""
    # Empty rangeset
    empty = RangeSet()
    self.assertEqual(len(list(empty.split(2))), 0)
    # Not enough element
    single = RangeSet("1")
    self.assertEqual((RangeSet("1"),), tuple(single.split(2)))
    # Exact number of elements
    six = RangeSet("1-6")
    thirds = (RangeSet("1-2"), RangeSet("3-4"), RangeSet("5-6"))
    self.assertEqual(thirds, tuple(six.split(3)))
    # Check limit results
    four = RangeSet("0-3")
    for parts in (4, 5):
        singles = (RangeSet("0"), RangeSet("1"), RangeSet("2"), RangeSet("3"))
        self.assertEqual(singles, tuple(four.split(parts)))
def _scan_string_single(self, nsstr, autostep): """Single node scan, returns (pat, list of rangesets)""" # ignore whitespace(s) node = nsstr.strip() if len(node) == 0: raise NodeSetParseError(nsstr, "empty node name") # single node parsing pfx_nd = [mobj.groups() for mobj in self.base_node_re.finditer(node)] pfx_nd = pfx_nd[:-1] if not pfx_nd: raise NodeSetParseError(node, "parse error") # pfx+sfx cannot be empty if len(pfx_nd) == 1 and len(pfx_nd[0][0]) == 0: raise NodeSetParseError(node, "empty node name") pat = "" rangesets = [] for pfx, idx in pfx_nd: if idx: # optimization: process single index padding directly pad = 0 if int(idx) != 0: idxs = idx.lstrip("0") if len(idx) - len(idxs) > 0: pad = len(idx) idxint = int(idxs) else: if len(idx) > 1: pad = len(idx) idxint = 0 if idxint > 1e100: raise NodeSetParseRangeError( \ RangeSetParseError(idx, "invalid rangeset index")) # optimization: use numerical RangeSet constructor pat += "%s%%s" % pfx rangesets.append(RangeSet.fromone(idxint, pad, autostep)) else: # undefined pad means no node index pat += pfx return pat, rangesets
def testUpdate(self):
    """Check RangeSet.update() and RangeSet.union_update()."""
    base_spec = "1-100,102,105-242,800"
    merged_spec = "1-100,102,105-800,1924-1984"
    target = RangeSet(base_spec)
    self.assertEqual(len(target), 240)
    extra = RangeSet("243-799,1924-1984")
    self.assertEqual(len(extra), 618)
    target.update(extra)
    self.assertEqual(type(target), RangeSet)
    self.assertEqual(target.padding, None)
    self.assertEqual(len(target), 240 + 618)
    self.assertEqual(str(target), merged_spec)
    # union_update() must give the same result on a fresh set
    target = RangeSet(base_spec)
    target.union_update(extra)
    self.assertEqual(len(target), 240 + 618)
    self.assertEqual(str(target), merged_spec)
def testFolding(self):
    """Check how autostep changes the string form of a RangeSet."""
    folded = RangeSet(
        "112,114-117,119,121,130,132,134,136,138,139-141,144,147-148",
        autostep=6)
    self.assertEqual(
        str(folded),
        "112,114-117,119,121,130,132,134,136,138-141,144,147-148")
    # lowering autostep on the same object changes its display
    folded.autostep = 5
    self.assertEqual(
        str(folded),
        "112,114-117,119,121,130-138/2,139-141,144,147-148")

    spec = "1,3-4,6,8"
    plain = RangeSet(spec)
    self.assertEqual(str(plain), "1,3-4,6,8")
    # (autostep value, expected string) for the same input
    for step, text in ((4, "1,3-4,6,8"), (2, "1,3,4-8/2"), (3, "1,3,4-8/2")):
        stepped = RangeSet(spec, autostep=step)
        self.assertEqual(str(stepped), text)

    # empty set
    empty = RangeSet(autostep=3)
    self.assertEqual(str(empty), "")
def testIsSubSet(self):
    """Check RangeSet.issubset() and the ordering operators."""
    bigger = RangeSet("1-100,102,105-242,800-900/2")
    smaller = RangeSet("3,800,802,804,888")
    self.assertTrue(smaller.issubset(smaller))
    self.assertTrue(smaller.issubset(bigger))
    self.assertTrue(smaller <= bigger)
    self.assertTrue(smaller < bigger)
    self.assertTrue(bigger > smaller)
    self.assertFalse(bigger < smaller)
    self.assertFalse(bigger <= smaller)
    self.assertFalse(smaller >= bigger)
    # since v1.6, padding is ignored when computing set operations
    unpadded = RangeSet("1-100")
    padded = RangeSet("001-100")
    self.assertTrue(unpadded.issubset(padded))
def testSlices(self):
    """Check RangeSet.slices() with and without autostep."""
    empty = RangeSet()
    self.assertEqual(len(empty), 0)
    self.assertEqual(len(list(empty.slices())), 0)

    spec = "1-7/2,8-12,3000-3019"
    # Without autostep
    plain = RangeSet(spec)
    self.assertEqual(plain.autostep, None)
    self.assertEqual(len(plain), 29)
    expected_plain = [
        slice(1, 2, 1),
        slice(3, 4, 1),
        slice(5, 6, 1),
        slice(7, 13, 1),
        slice(3000, 3020, 1),
    ]
    self.assertEqual(list(plain.slices()), expected_plain)

    # With autostep
    stepped = RangeSet(spec, autostep=2)
    self.assertEqual(len(stepped), 29)
    self.assertEqual(stepped.autostep, 2)
    expected_stepped = [
        slice(1, 8, 2),
        slice(8, 13, 1),
        slice(3000, 3020, 1),
    ]
    self.assertEqual(list(stepped.slices()), expected_stepped)