Example #1
    def multiple_search(self, query):
        if not isinstance(query, str):
            raise ValueError
        if not query or self.database is None:
            return {}

        tokenizer = Tokenizer()
        """
        tokenisation of query, create list of tokens
        """
        searchlist = []
        for token in tokenizer.tokenize_generator_type(query):
            if token.t == 'A' or token.t == 'D':
                searchlist.append(token.s)
        results_of_search = []  # search the index for each query token
        for token in searchlist:
            results_of_search.append(set(self.search(token)))
        if not results_of_search:  # query contained no searchable tokens
            return {}
        # Keep only the files that contain every word of the query.
        list_of_files = results_of_search[0]
        for f in results_of_search:
            list_of_files = list_of_files & f
        # Build a dictionary of the positions of all query tokens in those files.
        final_dict = {}
        for f in list_of_files:
            final_dict[f] = []
            for token in searchlist:
                final_dict[f].extend(self.database[token][f])
            final_dict[f].sort()
        return final_dict
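
The method above boils down to intersecting the per-token file sets and then merging positions. The following standalone sketch illustrates that idea with plain dicts and integer positions; the {token: {filename: [positions]}} layout is an assumption inferred from how self.database is accessed, and it stands in for the project's Position objects.

# Minimal, self-contained sketch of the set-intersection search (not the project's code).
index = {
    'to': {'a.txt': [1, 7], 'b.txt': [3]},
    'be': {'a.txt': [2], 'c.txt': [5]},
}

def multi_word_files(index, tokens):
    # Files containing every token: intersect the per-token file sets.
    file_sets = [set(index[t]) for t in tokens if t in index]
    if not file_sets:
        return {}
    common = set.intersection(*file_sets)
    # Collect and sort the positions of all tokens in each common file.
    return {f: sorted(p for t in tokens for p in index[t][f]) for f in common}

print(multi_word_files(index, ['to', 'be']))  # {'a.txt': [1, 2, 7]}
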
Example #2
    def find_window(self, findstr, window_len=3):
        """
        Search database and return files
        and positions for the searched word
        """

        if not isinstance(findstr, str):
            raise ValueError
        if not findstr:
            return {}

        windows = {}
        tokenizer = Tokenizer()
        result_dict = self.multiple_search(findstr)

        for file_key in result_dict:
            wins = []
            result_list = result_dict[file_key]

            for result_position in result_list:

                with open(file_key) as f:
                    for i, line in enumerate(f):
                        if i == result_position.string:
                            break
                line = line.strip("\n")

                right_context = line[result_position.start:]
                left_context = line[:result_position.end][::-1]

                for i, token in enumerate(
                        tokenizer.generate_type_AD(left_context)):
                    if i == window_len:
                        break
                start = result_position.end - token.position - len(token.s)

                for i, token in enumerate(
                        tokenizer.generate_type_AD(right_context)):
                    if i == window_len:
                        break
                end = result_position.start + token.position + len(token.s)

                win = TokenWindow(line, [result_position], start,
                                  end)  # create a new window
                # expand the window to the sentence boundaries
                win = self.supplemented_window(win)
                wins.append(win)  # add the window to the list
                # merge overlapping windows for this file
                wins = self.join_windows({file_key: wins})[file_key]

            if len(wins) > 0:
                windows[file_key] = wins

        return windows
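
The window-boundary computation above is easy to misread because the left context is reversed. This simplified, self-contained sketch mirrors the idea with a regex word scanner instead of the project's Tokenizer; the function and variable names are illustrative only, and plain integer offsets stand in for Position and TokenWindow.

import re

def window_bounds(line, start, end, window_len=3):
    # Reversed left context and plain right context, as in find_window.
    left = line[:end][::-1]
    right = line[start:]

    def span_after_n_words(text, n):
        # Offset just past the n-th word of text (or past the last word found).
        off = 0
        for i, m in enumerate(re.finditer(r'\w+', text)):
            off = m.end()
            if i + 1 == n:
                break
        return off

    new_start = end - span_after_n_words(left, window_len)
    new_end = start + span_after_n_words(right, window_len)
    return new_start, new_end

line = 'the quick brown fox jumps over the lazy dog'
start = line.index('fox')
print(window_bounds(line, start, start + 3, 2))  # (10, 25) -> 'brown fox jumps'
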
Example #3
    def prescribe_index(self, path):
        if not isinstance(path, str):
            raise ValueError(
                'Input has an inappropriate type, it should be str')

        tokenizer = Tokenizer()
        with open(path, 'r') as f:
            for i, string in enumerate(f):
                tokens = tokenizer.tokenize_generator_type(string)
                for token in tokens:
                    if token.t == 'A' or token.t == 'D':
                        self.db.setdefault(token.s, {}).setdefault(
                            path, []).append(Position.from_token(token, i))
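
prescribe_index builds a two-level inverted index, db[token][path] -> list of positions. A tiny self-contained illustration of that nested setdefault pattern follows; a (line, column) tuple stands in for the project's Position class, and the file name is hypothetical.

import re

db = {}
path = 'hamlet.txt'  # hypothetical file name
lines = ['to be or not to be', 'that is the question']

for line_no, line in enumerate(lines):
    for m in re.finditer(r'\w+', line):
        # token -> {path -> [(line, column), ...]}
        db.setdefault(m.group(), {}).setdefault(path, []).append((line_no, m.start()))

print(db['to'])  # {'hamlet.txt': [(0, 0), (0, 13)]}
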
Example #4
    def multiple_search_lim_gen(self, query, offset, limit):
        # offset and limit restrict which of the matching files are returned

        if offset < 0:
            offset = 0

        if not isinstance(query, str):
            raise ValueError
        if not query or self.database is None:
            return {}

        tokenizer = Tokenizer()
        """
        tokenisation of query, create list of tokens
        """
        searchlist = []
        for token in tokenizer.tokenize_generator_type(query):
            if token.t == 'A' or token.t == 'D':
                searchlist.append(token.s)

        results_of_search = []  # search the index for each query token
        for token in searchlist:
            results_of_search.append(set(self.search(token)))

        if not results_of_search:  # query contained no searchable tokens
            return {}

        # Keep only the files that contain every word of the query.
        list_of_files = results_of_search[0]
        for f in results_of_search:
            list_of_files = list_of_files & f

        # Build a dictionary of the positions of all query tokens in those files.
        final_dict = {}
        list_of_files = sorted(list_of_files)
        for i, f in enumerate(list_of_files):

            if i >= offset + limit:
                break

            if i < offset:
                continue

            lists = []
            for token in searchlist:
                lists.append(self.database[token][f])

            final_dict[f] = self.merge_and_sort_lists(lists)

        return final_dict
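
The offset/limit loop over the sorted file list is equivalent to slicing it. For reference, the same paging can be expressed with itertools.islice; this is just an alternative sketch with made-up file names, not the project's code.

from itertools import islice

files = sorted(['c.txt', 'a.txt', 'b.txt', 'd.txt'])
offset, limit = 1, 2
page = list(islice(files, offset, offset + limit))
print(page)  # ['b.txt', 'c.txt']
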
Example #5
    def context_window_generator(self, file_name, contexts, window_len=3):
        """
        Generator context window with window_len  
        """
        tokenizer = Tokenizer()
        for result_position in contexts:
            """
            Find line for position
            """
            with open(file_name) as f:
                for i, line in enumerate(f):
                    if i == result_position.string:
                        break
            line = line.strip("\n")

            right_context = line[result_position.start:]
            left_context = line[:result_position.end][::-1]
            """
            Expanding the boundaries of the window according to the specified parameter
            Calculating of the beginning
            """
            for i, token in enumerate(
                    tokenizer.generate_type_AD(left_context)):
                if i == window_len:
                    break
            start = result_position.end - token.position - len(token.s)
            """
            Expanding the boundaries of the window according to the specified parameter
            Calculating the end
            """
            for i, token in enumerate(
                    tokenizer.generate_type_AD(right_context)):
                if i == window_len:
                    break
            end = result_position.start + token.position + len(token.s)

            win = TokenWindow(line, [result_position], start,
                              end)  # create a new window
            # expand the window to the sentence boundaries
            win = self.supplemented_window(win)

            yield win
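
Because context_window_generator yields windows lazily, a caller can stop after the first few without the rest being built. A generic, self-contained demonstration of that property follows; the generator below is a stand-in, not the project's code.

from itertools import islice

def numbered_windows():  # stand-in generator, yields indefinitely
    n = 0
    while True:
        yield 'window %d' % n
        n += 1

print(list(islice(numbered_windows(), 3)))  # ['window 0', 'window 1', 'window 2']
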
Example #6
    def find_window_lim_v2(self,
                           findstr,
                           window_len=3,
                           offset=0,
                           limit=0,
                           winLimits=None):
        """
        Search database and return files
        and positions for the searched word
        witch limits and limits for file transfer in multiple_search 
        """

        if not isinstance(findstr, str):
            raise ValueError
        if not findstr:
            return {}

        windows = {}
        tokenizer = Tokenizer()

        # simply find
        # result_dict = self.multiple_search_lim(findstr, offset, limit)

        # find with generators
        result_dict = self.multiple_search_lim_gen(findstr, offset, limit)

        for file_index, file_key in enumerate(result_dict.keys()):
            wins = []

            result_list = result_dict[file_key]

            st = 0
            en = 5

            if winLimits is not None:
                st = winLimits[file_index][0]  # window offset for the current file
                en = st + winLimits[file_index][1]  # offset + limit for the current file

                if st < 0:
                    st = 0

            for result_position in result_list:

                with open(file_key) as f:
                    for i, line in enumerate(f):
                        if i == result_position.string:
                            break
                line = line.strip("\n")

                right_context = line[result_position.start:]
                left_context = line[:result_position.end][::-1]

                for i, token in enumerate(
                        tokenizer.generate_type_AD(left_context)):
                    if i == window_len:
                        break
                start = result_position.end - token.position - len(token.s)

                for i, token in enumerate(
                        tokenizer.generate_type_AD(right_context)):
                    if i == window_len:
                        break
                end = result_position.start + token.position + len(token.s)

                win = TokenWindow(line, [result_position], start,
                                  end)  # create a new window
                # expand the window to the sentence boundaries
                win = self.supplemented_window(win)
                wins.append(win)  # add the window to the list
                # merge overlapping windows for this file
                wins = self.join_windows({file_key: wins})[file_key]

                if len(wins) == en:
                    break  # stop once the required number of windows has been found

            if len(wins) > 0:
                # keep only the windows from the required offset onwards
                windows[file_key] = wins[st:]
            else:
                windows[file_key] = []

        return windows
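
winLimits appears to hold one (offset, limit) pair per returned file, and the per-file window paging above boils down to a list slice. A sketch with illustrative data only:

wins = ['w0', 'w1', 'w2', 'w3', 'w4']  # windows found for one file
win_offset, win_limit = 1, 2           # hypothetical winLimits entry for that file
st = max(win_offset, 0)
en = st + win_limit
print(wins[st:en])                     # ['w1', 'w2']
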
Example #7
    def find_window_lim(self,
                        findstr,
                        window_len=3,
                        offset=0,
                        limit=0,
                        winLimits=None):
        """
        Search database and return files
        and positions for the searched word
        witch limits and limits for file defined this function
        """

        if not isinstance(findstr, str):
            raise ValueError
        if not findstr:
            return {}

        windows = {}
        tokenizer = Tokenizer()
        result_dict = self.multiple_search(findstr)

        for file_index, file_key in enumerate(result_dict.keys()):
            wins = []
            if file_index >= offset + limit:
                break

            if file_index < offset:
                continue

            result_list = result_dict[file_key]

            if winLimits is not None:
                st = int(winLimits[file_index - offset][0])
                en = st + int(winLimits[file_index - offset][1])

                if len(result_list) < en:
                    en = len(result_list)

                result_list = result_list[st:en]

            for result_position in result_list:

                with open(file_key) as f:
                    for i, line in enumerate(f):
                        if i == result_position.string:
                            break
                line = line.strip("\n")

                right_context = line[result_position.start:]
                left_context = line[:result_position.end][::-1]

                for i, token in enumerate(
                        tokenizer.generate_type_AD(left_context)):
                    if i == window_len:
                        break
                start = result_position.end - token.position - len(token.s)

                for i, token in enumerate(
                        tokenizer.generate_type_AD(right_context)):
                    if i == window_len:
                        break
                end = result_position.start + token.position + len(token.s)

                win = TokenWindow(line, [result_position], start,
                                  end)  # create a new window
                # expand the window to the sentence boundaries
                win = self.supplemented_window(win)
                wins.append(win)  # add the window to the list
                # merge overlapping windows for this file
                wins = self.join_windows({file_key: wins})[file_key]

            if len(wins) > 0:
                windows[file_key] = wins
            else:
                windows[file_key] = []

        return windows
Example #8
    def setUp(self):
        self.x = Tokenizer()
Example #9
class TestMyCode(unittest.TestCase):
    # Unit tests for the Tokenizer class

    def setUp(self):
        self.x = Tokenizer()

    # the tests themselves

    def test_mygenerator_type(self):
        result = self.x.tokenize_generator_type(' h50 ht ? 20 h d sun')
        self.assertIsInstance(result, Generator)

    def test_type(self):
        result = list(self.x.tokenize_generator_type(' h50 ht ? 20 h d sun'))
        print(result)
        self.assertEqual(len(result), 15)
        self.assertEqual(result[0].s, ' ')
        self.assertEqual(result[0].t, 'S')
        self.assertEqual(result[0].position, 0)
        self.assertEqual(result[14].s, 'sun')
        self.assertEqual(result[14].t, 'A')
        self.assertEqual(result[14].position, 17)

    def test_MyError_type_number(self):
        with self.assertRaises(ValueError):
            list(self.x.tokenize_generator_type(12))

    def test_MyError_type_notList(self):
        s = [1, 2, 3, 'this is my string']
        with self.assertRaises(ValueError):
            list(self.x.tokenize_generator_type(s))

    def test_mygenerator(self):
        result = self.x.tokenize_generator(' h50 ht ? 20 h d sun')
        self.assertIsInstance(result, Generator)

    def test_my_gen_begins_with_no_alpha(self):
        result = list(self.x.tokenize_generator(' h50 ht ? 20 h d sun'))
        self.assertIsInstance(result, list)
        self.assertEqual(len(result), 6)
        self.assertEqual(result[0].s, 'h')
        self.assertEqual(result[0].position, 1)
        self.assertEqual(result[1].s, 'ht')
        self.assertEqual(result[1].position, 5)

    def test_my_gen_begins_with_alpha(self):
        result = list(self.x.tokenize_generator('h50 ht ? 20 h d sun'))
        self.assertIsInstance(result, list)
        self.assertEqual(len(result), 6)
        self.assertEqual(result[0].s, 'h')
        self.assertEqual(result[0].position, 0)
        self.assertEqual(result[1].s, 'ht')
        self.assertEqual(result[1].position, 4)

    def test_my_gen_ends_with_no_alpha(self):
        result = list(self.x.tokenize_generator('h50 ht ? 20 h d sun'))
        self.assertIsInstance(result, list)
        self.assertEqual(len(result), 6)
        self.assertEqual(result[0].s, 'h')
        self.assertEqual(result[0].position, 0)
        self.assertEqual(result[4].s, 'sun')
        self.assertEqual(result[4].position, 16)

    def test_my_gen_ends_with_alpha(self):
        result = list(self.x.tokenize_generator('h50 ht ? 20 h d sun'))
        self.assertIsInstance(result, list)
        self.assertEqual(len(result), 6)
        self.assertEqual(result[0].s, 'h')
        self.assertEqual(result[0].position, 0)
        self.assertEqual(result[4].s, 'sun')
        self.assertEqual(result[4].position, 16)

    def test_MyError_number_gen(self):
        with self.assertRaises(ValueError):
            list(self.x.tokenize_generator(12))

    def test_MyError_notList_gen(self):
        s = [1, 2, 3, 'my name is Anya']
        with self.assertRaises(ValueError):
            list(self.x.tokenize_generator(s))

    def test_MyError_emptyString(self):
        result = self.x.tokenize('')
        self.assertIsInstance(result, list)
        self.assertEqual(len(result), 0)

    def test_begins_with_no_alpha(self):
        result = self.x.tokenize(' h50 ht ? 20 h d sun')
        self.assertIsInstance(result, list)
        self.assertEqual(len(result), 5)
        self.assertEqual(result[0].s, 'h')
        self.assertEqual(result[0].position, 1)
        self.assertEqual(result[1].s, 'ht')
        self.assertEqual(result[1].position, 5)

    def test_begins_with_alpha(self):
        result = self.x.tokenize('h50 ht ? 20 h d sun')
        self.assertIsInstance(result, list)
        self.assertEqual(len(result), 5)
        self.assertEqual(result[0].s, 'h')
        self.assertEqual(result[0].position, 0)
        self.assertEqual(result[1].s, 'ht')
        self.assertEqual(result[1].position, 4)

    def test_ends_with_no_alpha(self):
        result = self.x.tokenize(' h50 ht ? 20 h d sun')
        self.assertIsInstance(result, list)
        self.assertEqual(len(result), 5)
        self.assertEqual(result[0].s, 'h')
        self.assertEqual(result[0].position, 1)
        self.assertEqual(result[4].s, 'sun')
        self.assertEqual(result[4].position, 17)

    def test_ends_with_alpha(self):
        result = self.x.tokenize(' h50 ht ? 20 h d sun')
        self.assertIsInstance(result, list)
        self.assertEqual(len(result), 5)
        self.assertEqual(result[0].s, 'h')
        self.assertEqual(result[0].position, 1)
        self.assertEqual(result[4].s, 'sun')
        self.assertEqual(result[4].position, 17)

    def test_MyError_number(self):
        with self.assertRaises(ValueError):
            self.x.tokenize(12)

    def test_MyError_notList(self):
        s = [1, 2, 3, 'my name is Anya']
        with self.assertRaises(ValueError):
            self.x.tokenize(s)

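
To run this test class directly as a script, the standard unittest entry point can be appended. This assumes the module also imports unittest, Generator, and the project's Tokenizer class, which are not shown in the snippet.

if __name__ == '__main__':
    unittest.main()
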