def multiple_search(self, query):
    """Search the database for all words of a multi-word query."""
    if not isinstance(query, str):
        raise ValueError
    if not query or self.database is None:
        return {}
    tokenizer = Tokenizer()
    # tokenise the query and keep alphabetic ('A') and digit ('D') tokens
    searchlist = []
    for token in tokenizer.tokenize_generator_type(query):
        if token.t == 'A' or token.t == 'D':
            searchlist.append(token.s)
    if not searchlist:
        return {}  # the query contained no word or number tokens
    # search each token from the query
    results_of_search = []
    for token in searchlist:
        results_of_search.append(set(self.search(token)))
    # keep only the files that contain every word of the query
    list_of_files = results_of_search[0]
    for f in results_of_search:
        list_of_files = list_of_files & f
    # build a dictionary of positions of all query tokens in those files
    final_dict = {}
    for f in list_of_files:
        final_dict[f] = []
        for token in searchlist:
            final_dict[f].extend(self.database[token][f])
        final_dict[f].sort()
    return final_dict

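# Usage sketch (hypothetical: `engine` stands for an instance of the class that
# owns this method; the file name is made up):
#
#     hits = engine.multiple_search('sun rises')
#     # hits maps every file containing *all* query words to a sorted list of
#     # Position objects, e.g. {'text.txt': [Position(...), Position(...)]}
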
def find_window(self, findstr, window_len=3):
    """
    Search the database and return, for each matching file, context windows
    around the positions of the searched words.
    """
    if not isinstance(findstr, str):
        raise ValueError
    if not findstr:
        return {}
    windows = {}
    tokenizer = Tokenizer()
    result_dict = self.multiple_search(findstr)
    for file_key in result_dict:
        wins = []
        result_list = result_dict[file_key]
        for result_position in result_list:
            # read the line of the file that contains this position
            with open(file_key) as f:
                for i, line in enumerate(f):
                    if i == result_position.string:
                        break
            line = line.strip("\n")
            right_context = line[result_position.start:]
            left_context = line[:result_position.end][::-1]
            # walk window_len tokens to the left of the match
            for i, token in enumerate(
                    tokenizer.generate_type_AD(left_context)):
                if i == window_len:
                    break
                start = result_position.end - token.position - len(token.s)
            # walk window_len tokens to the right of the match
            for i, token in enumerate(
                    tokenizer.generate_type_AD(right_context)):
                if i == window_len:
                    break
                end = result_position.start + token.position + len(token.s)
            # create a new window and extend it to sentence boundaries
            win = TokenWindow(line, [result_position], start, end)
            win = self.supplemented_window(win)
            wins.append(win)  # collect the window for this file
        # merge overlapping windows of the same file
        wins = self.join_windows({file_key: wins})[file_key]
        if len(wins) > 0:
            windows[file_key] = wins
    return windows

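# Usage sketch (hypothetical: `engine` stands for an instance of the class that
# owns this method; 'text.txt' is a made-up indexed file):
#
#     windows = engine.find_window('sun rises', window_len=2)
#     # windows maps each matching file to its merged context windows,
#     # e.g. {'text.txt': [TokenWindow(...), TokenWindow(...)]}
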
def prescribe_index(self, path):
    """Index the file at `path`: record the position of every word and number."""
    if not isinstance(path, str):
        raise ValueError('Input has an inappropriate type, it should be str')
    tokenizer = Tokenizer()
    with open(path, 'r') as f:
        for i, string in enumerate(f):
            tokens = tokenizer.tokenize_generator_type(string)
            for token in tokens:
                if token.t == 'A' or token.t == 'D':
                    self.db.setdefault(token.s, {}).setdefault(path, []).append(
                        Position.from_token(token, i))

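# Illustrative index layout (file name and contents are made up): after
# prescribe_index('sun.txt'), where the first line of the file is "sun rises",
# self.db maps token -> file path -> list of positions, roughly:
#
#     {'sun':   {'sun.txt': [<Position start=0 end=3 string=0>]},
#      'rises': {'sun.txt': [<Position start=4 end=9 string=0>]}}
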
def multiple_search_lim_gen(self, query, offset, limit):
    """Like multiple_search, but only for the files with indices in [offset, offset + limit)."""
    if offset < 0:
        offset = 0
    if not isinstance(query, str):
        raise ValueError
    if not query or self.database is None:
        return {}
    tokenizer = Tokenizer()
    # tokenise the query and keep alphabetic ('A') and digit ('D') tokens
    searchlist = []
    for token in tokenizer.tokenize_generator_type(query):
        if token.t == 'A' or token.t == 'D':
            searchlist.append(token.s)
    if not searchlist:
        return {}  # the query contained no word or number tokens
    # search each token from the query
    results_of_search = []
    for token in searchlist:
        results_of_search.append(set(self.search(token)))
    # keep only the files that contain every word of the query
    list_of_files = results_of_search[0]
    for f in results_of_search:
        list_of_files = list_of_files & f
    # build a dictionary of positions of all query tokens,
    # but only for the requested slice of files
    final_dict = {}
    list_of_files = sorted(list_of_files)
    for i, f in enumerate(list_of_files):
        if i >= offset + limit:
            break
        if i < offset:
            continue
        lists = []
        for token in searchlist:
            lists.append(self.database[token][f])
        final_dict[f] = self.merge_and_sort_lists(lists)
    return final_dict

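# Usage sketch (hypothetical `engine`): return positions only for the files
# with indices [offset, offset + limit) in sorted file order:
#
#     hits = engine.multiple_search_lim_gen('sun rises', offset=0, limit=10)
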
def context_window_generator(self, file_name, contexts, window_len=3):
    """Generate context windows of window_len tokens around each position in `contexts`."""
    tokenizer = Tokenizer()
    for result_position in contexts:
        # find the line of the file that contains this position
        with open(file_name) as f:
            for i, line in enumerate(f):
                if i == result_position.string:
                    break
        line = line.strip("\n")
        right_context = line[result_position.start:]
        left_context = line[:result_position.end][::-1]
        # extend the window window_len tokens to the left: calculate the beginning
        for i, token in enumerate(
                tokenizer.generate_type_AD(left_context)):
            if i == window_len:
                break
            start = result_position.end - token.position - len(token.s)
        # extend the window window_len tokens to the right: calculate the end
        for i, token in enumerate(
                tokenizer.generate_type_AD(right_context)):
            if i == window_len:
                break
            end = result_position.start + token.position + len(token.s)
        # create a new window and extend it to sentence boundaries
        win = TokenWindow(line, [result_position], start, end)
        win = self.supplemented_window(win)
        yield win

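# Usage sketch (hypothetical `engine` and file name; `positions` would be a list
# of Position objects for that file, e.g. one value of multiple_search; it is
# also assumed that TokenWindow exposes the line/start/end it was built from):
#
#     for win in engine.context_window_generator('text.txt', positions,
#                                                window_len=2):
#         print(win.line[win.start:win.end])
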
def find_window_lim_v2(self, findstr, window_len=3, offset=0, limit=0,
                       winLimits=None):
    """
    Search the database and return context windows for the searched words,
    with limits: the file offset/limit is applied inside multiple_search_lim_gen,
    and winLimits optionally gives an (offset, limit) pair of windows per file.
    """
    if not isinstance(findstr, str):
        raise ValueError
    if not findstr:
        return {}
    windows = {}
    tokenizer = Tokenizer()
    # plain search:
    # result_dict = self.multiple_search_lim(findstr, offset, limit)
    # search with generators:
    result_dict = self.multiple_search_lim_gen(findstr, offset, limit)
    for file_index, file_key in enumerate(result_dict.keys()):
        wins = []
        result_list = result_dict[file_key]
        st = 0
        en = 5  # default: collect at most 5 windows per file
        if winLimits is not None:
            st = winLimits[file_index][0]       # window offset for the current file
            en = st + winLimits[file_index][1]  # window offset + limit for the current file
        if st < 0:
            st = 0
        for result_position in result_list:
            # read the line of the file that contains this position
            with open(file_key) as f:
                for i, line in enumerate(f):
                    if i == result_position.string:
                        break
            line = line.strip("\n")
            right_context = line[result_position.start:]
            left_context = line[:result_position.end][::-1]
            # walk window_len tokens to the left of the match
            for i, token in enumerate(
                    tokenizer.generate_type_AD(left_context)):
                if i == window_len:
                    break
                start = result_position.end - token.position - len(token.s)
            # walk window_len tokens to the right of the match
            for i, token in enumerate(
                    tokenizer.generate_type_AD(right_context)):
                if i == window_len:
                    break
                end = result_position.start + token.position + len(token.s)
            # create a new window and extend it to sentence boundaries
            win = TokenWindow(line, [result_position], start, end)
            win = self.supplemented_window(win)
            wins.append(win)
            # merge overlapping windows of the same file
            wins = self.join_windows({file_key: wins})[file_key]
            if len(wins) == en:
                break  # stop once the required number of windows is found
        if len(wins) > 0:
            # return only the windows from the required position (offset)
            windows[file_key] = wins[st:]
        else:
            windows[file_key] = []
    return windows

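# Usage sketch (hypothetical `engine`; the winLimits format is inferred from the
# reads above: one (window_offset, window_limit) pair per returned file):
#
#     wins = engine.find_window_lim_v2('sun', window_len=2,
#                                      offset=0, limit=10,
#                                      winLimits=[(0, 5)] * 10)
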
def find_window_lim(self, findstr, window_len=3, offset=0, limit=0,
                    winLimits=None):
    """
    Search the database and return context windows for the searched words,
    with limits: both the file offset/limit and the per-file window limits
    (winLimits) are applied inside this function itself.
    """
    if not isinstance(findstr, str):
        raise ValueError
    if not findstr:
        return {}
    windows = {}
    tokenizer = Tokenizer()
    result_dict = self.multiple_search(findstr)
    for file_index, file_key in enumerate(result_dict.keys()):
        wins = []
        if file_index >= offset + limit:
            break
        if file_index < offset:
            continue
        result_list = result_dict[file_key]
        if winLimits is not None:
            st = int(winLimits[file_index - offset][0])
            en = st + int(winLimits[file_index - offset][1])
            if len(result_list) < en:
                en = len(result_list)
            result_list = result_list[st:en]
        for result_position in result_list:
            # read the line of the file that contains this position
            with open(file_key) as f:
                for i, line in enumerate(f):
                    if i == result_position.string:
                        break
            line = line.strip("\n")
            right_context = line[result_position.start:]
            left_context = line[:result_position.end][::-1]
            # walk window_len tokens to the left of the match
            for i, token in enumerate(
                    tokenizer.generate_type_AD(left_context)):
                if i == window_len:
                    break
                start = result_position.end - token.position - len(token.s)
            # walk window_len tokens to the right of the match
            for i, token in enumerate(
                    tokenizer.generate_type_AD(right_context)):
                if i == window_len:
                    break
                end = result_position.start + token.position + len(token.s)
            # create a new window and extend it to sentence boundaries
            win = TokenWindow(line, [result_position], start, end)
            win = self.supplemented_window(win)
            wins.append(win)
        # merge overlapping windows of the same file
        wins = self.join_windows({file_key: wins})[file_key]
        if len(wins) > 0:
            windows[file_key] = wins
        else:
            windows[file_key] = []
    return windows

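# Usage sketch (hypothetical `engine`): same call shape as find_window_lim_v2,
# but here the file offset/limit and the per-file (offset, limit) pairs in
# winLimits are applied to the full result of multiple_search:
#
#     wins = engine.find_window_lim('sun', window_len=2,
#                                   offset=0, limit=10,
#                                   winLimits=[(0, 5)] * 10)
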
class TestMyCode(unittest.TestCase):
    # unit tests for the Tokenizer class

    def setUp(self):
        self.x = Tokenizer()

    # the tests themselves
    def test_mygenerator_type(self):
        result = self.x.tokenize_generator_type(' h50 ht ? 20 h d sun')
        self.assertIsInstance(result, Generator)

    def test_type(self):
        result = list(self.x.tokenize_generator_type(' h50 ht ? 20 h d sun'))
        print(result)
        self.assertEqual(len(result), 15)
        self.assertEqual(result[0].s, ' ')
        self.assertEqual(result[0].t, 'S')
        self.assertEqual(result[0].position, 0)
        self.assertEqual(result[14].s, 'sun')
        self.assertEqual(result[14].t, 'A')
        self.assertEqual(result[14].position, 17)

    def test_MyError_type_number(self):
        with self.assertRaises(ValueError):
            list(self.x.tokenize_generator_type(12))

    def test_MyError_type_notList(self):
        s = [1, 2, 3, 'this is my string']
        with self.assertRaises(ValueError):
            list(self.x.tokenize_generator_type(s))

    def test_mygenerator(self):
        result = self.x.tokenize_generator(' h50 ht ? 20 h d sun')
        self.assertIsInstance(result, Generator)

    def test_my_gen_begins_with_no_alpha(self):
        result = list(self.x.tokenize_generator(' h50 ht ? 20 h d sun'))
        self.assertIsInstance(result, list)
        self.assertEqual(len(result), 6)
        self.assertEqual(result[0].s, 'h')
        self.assertEqual(result[0].position, 1)
        self.assertEqual(result[1].s, 'ht')
        self.assertEqual(result[1].position, 5)

    def test_my_gen_begins_with_alpha(self):
        result = list(self.x.tokenize_generator('h50 ht ? 20 h d sun'))
        self.assertIsInstance(result, list)
        self.assertEqual(len(result), 6)
        self.assertEqual(result[0].s, 'h')
        self.assertEqual(result[0].position, 0)
        self.assertEqual(result[1].s, 'ht')
        self.assertEqual(result[1].position, 4)

    def test_my_gen_ends_with_no_alpha(self):
        result = list(self.x.tokenize_generator('h50 ht ? 20 h d sun'))
        self.assertIsInstance(result, list)
        self.assertEqual(len(result), 6)
        self.assertEqual(result[0].s, 'h')
        self.assertEqual(result[0].position, 0)
        self.assertEqual(result[4].s, 'sun')
        self.assertEqual(result[4].position, 16)

    def test_my_gen_ends_with_alpha(self):
        result = list(self.x.tokenize_generator('h50 ht ? 20 h d sun'))
        self.assertIsInstance(result, list)
        self.assertEqual(len(result), 6)
        self.assertEqual(result[0].s, 'h')
        self.assertEqual(result[0].position, 0)
        self.assertEqual(result[4].s, 'sun')
        self.assertEqual(result[4].position, 16)

    def test_MyError_number_gen(self):
        with self.assertRaises(ValueError):
            list(self.x.tokenize_generator(12))

    def test_MyError_notList_gen(self):
        s = [1, 2, 3, 'my name is Anya']
        with self.assertRaises(ValueError):
            list(self.x.tokenize_generator(s))

    def test_MyError_emptyString(self):
        result = self.x.tokenize('')
        self.assertIsInstance(result, list)
        self.assertEqual(len(result), 0)

    def test_begins_with_no_alpha(self):
        result = self.x.tokenize(' h50 ht ? 20 h d sun')
        self.assertIsInstance(result, list)
        self.assertEqual(len(result), 5)
        self.assertEqual(result[0].s, 'h')
        self.assertEqual(result[0].position, 1)
        self.assertEqual(result[1].s, 'ht')
        self.assertEqual(result[1].position, 5)

    def test_begins_with_alpha(self):
        result = self.x.tokenize('h50 ht ? 20 h d sun')
        self.assertIsInstance(result, list)
        self.assertEqual(len(result), 5)
        self.assertEqual(result[0].s, 'h')
        self.assertEqual(result[0].position, 0)
        self.assertEqual(result[1].s, 'ht')
        self.assertEqual(result[1].position, 4)

    def test_ends_with_no_alpha(self):
        result = self.x.tokenize(' h50 ht ? 20 h d sun')
        self.assertIsInstance(result, list)
        self.assertEqual(len(result), 5)
        self.assertEqual(result[0].s, 'h')
        self.assertEqual(result[0].position, 1)
        self.assertEqual(result[4].s, 'sun')
        self.assertEqual(result[4].position, 17)

    def test_ends_with_alpha(self):
        result = self.x.tokenize(' h50 ht ? 20 h d sun')
        self.assertIsInstance(result, list)
        self.assertEqual(len(result), 5)
        self.assertEqual(result[0].s, 'h')
        self.assertEqual(result[0].position, 1)
        self.assertEqual(result[4].s, 'sun')
        self.assertEqual(result[4].position, 17)

    def test_MyError_number(self):
        with self.assertRaises(ValueError):
            self.x.tokenize(12)

    def test_MyError_notList(self):
        s = [1, 2, 3, 'my name is Anya']
        with self.assertRaises(ValueError):
            self.x.tokenize(s)