コード例 #1
0
    def test_collision_same_words(self):
        """
        Check the collision matrix when all the SAX words are identical.

        The SAX string is the word 'abcd' repeated four times. The matrix is
        built with 6 iterations and 2 selected indexes, and the test expects
        exactly six cells holding the value 6.
        """
        spark_ctx = ScManager.get()

        identical_words = SaxResult(paa=spark_ctx.parallelize([]),
                                    breakpoints=[],
                                    sax_word='abcdabcdabcdabcd')
        # 4 is presumably the number of letters per word -- TODO confirm
        sax, _, _ = identical_words.start_sax(4, spark_ctx=spark_ctx)
        word_len = np.array(sax.collect()).shape[1]

        matrix, _ = final_collision_matrix(sax=sax,
                                           number_of_iterations=6,
                                           index_selected=2,
                                           word_len=word_len,
                                           spark_ctx=spark_ctx)
        cells = matrix.data

        # Identical words => six cells reaching the maximum number of
        # combinations (6)
        full_cells = sum(1 for cell in cells if cell[0] == 6)
        self.assertEqual(full_cells, 6)
コード例 #2
0
    def test_coll_various_words(self):
        """
        Test the collision matrix for a mix of identical and different words.

        Words 0 and 3 are identical ('ababa'), and so are words 1 and 2
        ('cdcdc'). The test expects the two first cells of the sorted result
        to hold 10 collisions: one for the pair (2, 1), one for the pair
        (3, 0).
        """

        nb_paa = 5
        nb_index = 2
        sc = ScManager.get()
        sax_result = SaxResult(paa=sc.parallelize([]),
                               breakpoints=[],
                               sax_word=''.join(
                                   ['ababa', 'cdcdc', 'cdcdc', 'ababa']))

        sax, _, _ = sax_result.start_sax(nb_paa, spark_ctx=sc)
        sequences_size = np.array(sax.collect()).shape[1]
        result, _ = final_collision_matrix(sax=sax,
                                           number_of_iterations=int(
                                               binom(nb_paa, nb_index)),
                                           index_selected=nb_index,
                                           word_len=sequences_size,
                                           spark_ctx=sc)
        result = result.data
        # Canonical ordering of the cells. NOTE: the key is a string, so the
        # ordering is lexicographic ("10-..." sorts before "2-...").
        result.sort(key=lambda x: "{}-{}-{}".format(int(x[0]), int(x[1][0]),
                                                    int(x[1][1])))
        print(result)

        # Fail with a clear message (instead of an IndexError) when fewer
        # than two cells are returned.
        self.assertGreaterEqual(len(result), 2)

        # The maximum of possible combinations without repetitions is
        # binom(5, 2) == 10.
        # Two cells of 10: one for the occurrences between the words 1 and 2,
        # and another for the words 0 and 3.
        # assertEqual (rather than assertTrue(a == b)) reports both values
        # when the check fails.
        for cell in result[:2]:
            self.assertEqual(cell[0], 10)
        self.assertEqual((int(result[0][1][0]), int(result[0][1][1])), (2, 1))
        self.assertEqual((int(result[1][1][0]), int(result[1][1][1])), (3, 0))
コード例 #3
0
    def _apply_motif_iter_zero_coll(self, activate_spark):
        """
        Test
         - with the iterative method to search the neighborhood motif,
         - with/without spark jobs
         - and where the words are all different => no collisions

        :param activate_spark: forwarded to ConfigRecognition(activate_spark=...)
        """
        # Five letters 'a'..'e', consistent with the four breakpoints below.
        alphabet_size = 5
        radius = 1000

        spark_context = ScManager.get()
        # Build the SAX result with different words, and small breakpoints
        sax_result = SaxResult(paa=spark_context.parallelize([]),
                               breakpoints=[-0.3, -0.1, 0.1, 0.3],
                               sax_word='abcdebcdeacdeabdeabceabcd')
        sax, _, nb_seq = sax_result.start_sax(5, spark_ctx=spark_context)
        # sax is an rdd -> to np.array
        sax = np.transpose(sax.collect())

        # The lookup table depends on the alphabet size, not on the number
        # of sequences: nb_seq was passed before and only worked because both
        # values happen to be 5 in this test.
        # Local is not named `breakpoint`, to avoid shadowing the builtin.
        mindist_lookup_table = sax_result.build_mindist_lookup_table(
            alphabet_size)

        # Different words => only zero cells in the collision matrix
        collision_matrix = SparseMatrix(np.zeros((nb_seq, nb_seq)))

        # Build the class for motif search
        search_info = NeighborhoodSearch(
            size_sequence=20,
            mindist_lookup_table=mindist_lookup_table,
            alphabet_size=alphabet_size,
            sax=np.transpose(sax),
            radius=radius,
            collision_matrix=collision_matrix)

        recognition_info = ConfigRecognition(
            is_stopped_by_eq9=True,
            iterations=100,
            min_value=1,
            is_algo_method_global=False,
            activate_spark=activate_spark,
            radius=radius,
            neighborhood_method=OPT_USING_BRUTE_FORCE)

        # NOTE(review): 30 is passed as first argument while the config
        # holds min_value=1 -- confirm this is intended.

        # neighborhood_method=OPT_USING_BRUTE_FORCE
        result = search_info.motif_neighborhood_iterative(30, recognition_info)

        # There are no similar sequences
        self.assertEqual(len(result), 0)

        # neighborhood_method=OPT_USING_COLLISIONS
        recognition_info.neighborhood_method = OPT_USING_COLLISIONS
        result = search_info.motif_neighborhood_iterative(30, recognition_info)

        # There are no similar sequences
        self.assertEqual(len(result), 0)
コード例 #4
0
    def test_coll_near_same_words(self):
        """
        Check the collision matrix for words that are close but never equal.

        The five-letter words share a common prefix of 1, 2, 3 or 4 letters,
        so no pair of words is exactly the same. The computed cells are
        compared, one by one, with a hard-coded expected list.
        """

        def cell_sort_key(cell):
            # Same textual key for the computed and the expected cells:
            # "<collisions>-<left index>-<right index>"
            return "{}-{}-{}".format(int(cell[0]), int(cell[1][0]),
                                     int(cell[1][1]))

        nb_paa = 5
        nb_index = 2
        words = ['aaaaa', 'abbbb', 'abccc', 'abcdd', 'abcde']

        sc = ScManager.get()
        sax_result = SaxResult(paa=sc.parallelize([]),
                               breakpoints=[],
                               sax_word=''.join(words))

        sax, _, _ = sax_result.start_sax(nb_paa, spark_ctx=sc)
        sequences_size = np.array(sax.collect()).shape[1]
        nb_iterations = int(binom(nb_paa, nb_index))
        result, _ = final_collision_matrix(sax=sax,
                                           number_of_iterations=nb_iterations,
                                           index_selected=nb_index,
                                           word_len=sequences_size,
                                           spark_ctx=sc)

        # Computed cells, sorted in place
        result = result.data
        result.sort(key=cell_sort_key)
        print(result)

        # Expected cells, sorted with the same key
        expected_result = sorted(
            [(1.0, (2, 1)), (1.0, (3, 1)), (3.0, (3, 2)),
             (1.0, (4, 1)), (3.0, (4, 2)), (6.0, (4, 3))],
            key=cell_sort_key)

        self.assertEqual(len(result), len(expected_result))
        for wanted, found in zip(expected_result, result):
            self.assertEqual(wanted[0], found[0], 'nb collisions')
            self.assertEqual(wanted[1][0], found[1][0], 'seq index left-side')
            self.assertEqual(wanted[1][1], found[1][1],
                             'seq index right-side')
コード例 #5
0
    def test_collision_different_words(self):
        """
        Check the collision matrix when all the words are different.

        The five words do not share any letter, so the test expects no
        cell at all in the data of the resulting matrix.
        """
        nb_paa = 5
        nb_index = 2
        sc = ScManager.get()
        sax_result = SaxResult(
            paa=sc.parallelize([]),
            breakpoints=[],
            sax_word=''.join(['abcde', 'fghij', 'klmno', 'pqrst', 'uvwxy']))

        sax, _, _ = sax_result.start_sax(nb_paa, spark_ctx=sc)
        sequences_size = np.array(sax.collect()).shape[1]
        result, _ = final_collision_matrix(sax=sax,
                                           number_of_iterations=int(
                                               binom(nb_paa, nb_index)),
                                           index_selected=nb_index,
                                           word_len=sequences_size,
                                           spark_ctx=sc)
        result = result.data

        # different words => only zero cells in the matrix
        # Compare by value: `len(result) is 0` relied on int identity, which
        # is implementation-dependent and raises a SyntaxWarning on
        # Python >= 3.8.
        self.assertEqual(len(result), 0)
コード例 #6
0
    def _apply_motif_global_same_words(self, activate_spark):
        """
        Test
        - with the global method to search the neighborhood motif,
        - with/without spark jobs according to activate_spark
        - and where the words are all the same

        The same scenario is run once per neighborhood method (brute force,
        then collisions); both runs expect the single group [0, 1, 2, 3].

        :param activate_spark: forwarded to ConfigRecognition(activate_spark=...)
        """
        search_radius = 0.01

        spark_context = ScManager.get()
        # SAX result with large breakpoints: 'abcde' repeated four times
        sax_result = SaxResult(paa=spark_context.parallelize([]),
                               breakpoints=[-300, -100, 100, 300],
                               sax_word='abcdeabcdeabcdeabcde')
        sax_rdd, _, _ = sax_result.start_sax(5, spark_ctx=spark_context)
        # the rdd content is collected and converted to a np.array
        transposed_sax = np.transpose(sax_rdd.collect())

        lookup_table = sax_result.build_mindist_lookup_table(alphabet_size=5)

        # Collision matrix: 100 in every cell below the diagonal
        collision_matrix = SparseMatrix(
            np.array([[0, 0, 0, 0],
                      [100, 0, 0, 0],
                      [100, 100, 0, 0],
                      [100, 100, 100, 0]]))

        # Two identical cases here: brute force / with collisions
        for method_opt in (OPT_USING_BRUTE_FORCE, OPT_USING_COLLISIONS):
            # Expected mindist distances (all the words are the same):
            #
            # [[ 0.  0.  0.  0.]
            #  [ 0.  0.  0.  0.]
            #  [ 0.  0.  0.  0.]
            #  [ 0.  0.  0.  0.]]

            # Motif search object
            search_info = NeighborhoodSearch(
                size_sequence=20,
                mindist_lookup_table=lookup_table,
                alphabet_size=5,
                sax=np.transpose(transposed_sax),
                radius=search_radius,
                collision_matrix=collision_matrix)

            recognition_info = ConfigRecognition(
                is_stopped_by_eq9=True,
                iterations=0,
                min_value=1,
                is_algo_method_global=True,
                activate_spark=activate_spark,
                radius=search_radius,
                neighborhood_method=method_opt)

            # Global search using the current neighborhood method
            result = search_info.motif_neighborhood_global(
                30, recognition_info)

            self._print_mindist_mat(search_info)

            # The words corresponding to the six largest cells have a
            # MINDIST < radius
            self.assertEqual(len(result), 1)
            # The six groups are all [0, 1, 2, 3]: they are reduced to one
            self.assertEqual(result, [[0, 1, 2, 3]])
コード例 #7
0
    def _apply_iter_coll_no_spark_ex1(self, activate_spark):
        """
         Tests motif_neighborhood_iterative()
         - the iterative method
         - using the heuristic based upon collisions
         - to search the neighborhood motif

         Note: test where the words only differ by one letter.

        :param activate_spark: forwarded to ConfigRecognition(activate_spark=...)
        """

        # SAX result where the five-letter words only differ by their last
        # letter
        words = ["abcde", "abcdd", "abcdc", "abcdb", "abcda"]
        spark_context = ScManager.get()
        sax_result = SaxResult(paa=spark_context.parallelize([]),
                               breakpoints=[-1.1, -1, 0, 1.501],
                               sax_word=''.join(words))
        sax_rdd, _, nb_seq = sax_result.start_sax(5, spark_ctx=spark_context)
        # the rdd content is collected and converted to a np.array
        transposed_sax = np.transpose(sax_rdd.collect())

        lookup_table = sax_result.build_mindist_lookup_table(5)

        # Collision matrix.
        # Note: this matrix is different from the one from
        #   test test_iterative__brute_no_spark_ex1:
        #    => see zeros are added: coll(3,2) == coll(4,2) == 0
        collision_matrix = SparseMatrix(
            np.array([[0, 0, 0, 0, 0],
                      [40, 0, 0, 0, 0],
                      [2, 40, 0, 0, 0],
                      [4, 8, 0, 0, 0],
                      [6, 10, 0, 50, 0]]))

        # NOTE(review): the label mentions "brute" although this test uses
        # the collisions heuristic -- confirm whether it should be renamed.
        self._print_matrix("test_iterative__brute_no_spark_ex1",
                           collision_matrix.data, nb_seq)

        # mindist distances:
        # [[ 0.     0.     3.002  5.002  5.202]
        #  [ 0.     0.     0.     2.     2.2  ]
        #  [ 3.002  0.     0.     0.     0.2  ]
        #  [ 5.002  2.     0.     0.     0.   ]
        #  [ 5.202  2.2    0.2    0.     0.   ]]

        # Using neighborhood_method=OPT_USING_COLLISIONS (see the
        # ConfigRecognition built below)
        #
        # iterative:  examining collisions (i,j) per iteration:
        #             (3,4) then (1,2) +(0,1)
        #
        #             (collisions greater than min_value==25)
        #
        # Test with fixed radius 1.9:
        #    - iter=1    => result is [[3, 4]] considering (S3,S4) neighborhood
        #    - iter=2    => result extended with [0,1,2] considering (S0,S1),
        #                   unchanged for (S1,S2)
        #    - iter=3    => same result as for iter=2: no more collision
        #                   available
        #    - iter=100  => same result as for iter=2: no more collision
        #                   available
        #
        cases = (
            (1.9, 1, [[3, 4]]),
            (1.9, 2, [[3, 4], [0, 1, 2]]),
            (1.9, 3, [[3, 4], [0, 1, 2]]),
            (1.9, 100, [[3, 4], [0, 1, 2]]),
        )
        for radius, nb_iter, expected_res in cases:

            # Motif search object for the current radius
            search_info = NeighborhoodSearch(
                size_sequence=20,
                mindist_lookup_table=lookup_table,
                alphabet_size=5,
                sax=np.transpose(transposed_sax),
                radius=radius,
                collision_matrix=collision_matrix)

            # for info: here is the mindist:
            #  (see _print_mindist_mat doc: in order to activate print)
            self._print_mindist_mat(search_info)

            # Configuration where the min_value is 25
            recognition_info = ConfigRecognition(
                is_stopped_by_eq9=True,
                iterations=nb_iter,
                min_value=25,
                is_algo_method_global=False,
                activate_spark=activate_spark,
                radius=radius,
                neighborhood_method=OPT_USING_COLLISIONS)

            result = search_info.motif_neighborhood_iterative(
                recognition_info.min_value, recognition_info)

            # Same number of groups, and every group found is expected
            self.assertEqual(len(result), len(expected_res))
            for group in result:
                self.assertTrue(group in expected_res)
コード例 #8
0
    def _apply_motif_global_coll_ex1(self, activate_spark):
        """
        Test
          - with the global method to search the neighborhood motif,
          - with/without spark according to activate_spark
          - exploring similarities with collisions heuristic
          - with input: the words only differ by one letter. And every
            sequence Si has collisions with Sj with that matrix.

         Note: results ought to be equal to test_global_brute_no_spark_ex1

        :param activate_spark: forwarded to ConfigRecognition(activate_spark=...)
        """

        # SAX result where the five-letter words only differ by their last
        # letter
        words = ["abcde", "abcdd", "abcdc", "abcdb", "abcda"]
        spark_context = ScManager.get()
        sax_result = SaxResult(paa=spark_context.parallelize([]),
                               breakpoints=[-1.1, -1, 0, 1.501],
                               sax_word=''.join(words))
        sax_rdd, _, nb_seq = sax_result.start_sax(5, spark_ctx=spark_context)
        # the rdd content is collected and converted to a np.array
        transposed_sax = np.transpose(sax_rdd.collect())

        lookup_table = sax_result.build_mindist_lookup_table(5)

        # Collision matrix (the real collision matrix is different, but this
        # one is taken for the test)
        collision_matrix = SparseMatrix(
            np.array([[0, 0, 0, 0, 0],
                      [30, 0, 0, 0, 0],
                      [2, 40, 0, 0, 0],
                      [4, 8, 50, 0, 0],
                      [6, 10, 20, 60, 0]]))

        self._print_matrix("test_global_coll_no_spark_ex1",
                           collision_matrix.data, nb_seq)

        # mindist distances:
        # [[ 0.     0.     3.002  5.002  5.202]
        #  [ 0.     0.     0.     2.     2.2  ]
        #  [ 3.002  0.     0.     0.     0.2  ]
        #  [ 5.002  2.     0.     0.     0.   ]
        #  [ 5.202  2.2    0.2    0.     0.   ]]

        # Using neighborhood_method=OPT_USING_COLLISIONS
        #
        #  for collisions (0,1) (1,2) (2,3) (3,4) greater than min_value==25
        #  and with the collisions heuristic: only sequences having
        #  collisions with Si or Sj are examined
        #
        # for radius 1.9  => global result is
        #       [[0, 1, 2], [0, 1, 2, 3, 4], [1, 2, 3, 4], [2, 3, 4]]
        #
        # for radius 2.5  => global result is
        #       [[0, 1, 2, 3, 4], [0, 1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4]]
        #       => reduced to [[0, 1, 2, 3, 4], [1, 2, 3, 4]]
        #
        # for radius 3.5  => global result is
        #       [[0, 1, 2, 3, 4], [0, 1, 2, 3, 4], [0, 1, 2, 3, 4], [1, 2, 3, 4]]
        #       => reduced to [[0, 1, 2, 3, 4], [1, 2, 3, 4]]
        #
        # for radius 6    => global result is
        #       [[0, 1, 2, 3, 4], [0, 1, 2, 3, 4], [0, 1, 2, 3, 4], [0, 1, 2, 3, 4]]
        #       => reduced to [[0, 1, 2, 3, 4]]
        #
        cases = (
            (2.5, [[0, 1, 2, 3, 4], [1, 2, 3, 4]]),
            (1.9, [[0, 1, 2], [0, 1, 2, 3, 4], [1, 2, 3, 4], [2, 3, 4]]),
            (3.5, [[0, 1, 2, 3, 4], [1, 2, 3, 4]]),
            (6, [[0, 1, 2, 3, 4]]),
        )
        for radius, expected_res in cases:

            # Motif search object for the current radius
            search_info = NeighborhoodSearch(
                size_sequence=20,
                mindist_lookup_table=lookup_table,
                alphabet_size=5,
                sax=np.transpose(transposed_sax),
                radius=radius,
                collision_matrix=collision_matrix)

            # for info: here is the mindist:
            #  (see _print_mindist_mat doc: in order to activate print)
            self._print_mindist_mat(search_info)

            # Configuration where the min_value is 25
            recognition_info = ConfigRecognition(
                is_stopped_by_eq9=True,
                iterations=0,
                min_value=25,
                is_algo_method_global=True,
                activate_spark=activate_spark,
                radius=radius,
                neighborhood_method=OPT_USING_COLLISIONS)

            expected_line = "radius {}:expected:                 {}".format(
                radius, expected_res)
            print(expected_line)

            result = search_info.motif_neighborhood_global(
                recognition_info.min_value, recognition_info)

            found_line = "radius {}:->global with collisions: {}".format(
                radius, result)
            print(found_line)

            # Same number of groups, and every group found is expected
            self.assertEqual(len(result), len(expected_res))
            for group in result:
                self.assertTrue(group in expected_res)