def search_mult(self, query, limit, offset):
    """Multiword search.

    :param query: input query string
    :param limit: maximum number of files to return
    :param offset: index of the first file in the sorted file list
    :return: a dictionary with the file names of the files that contain
        all words of the query as the keys and all Positions in that
        file of the words of the query (sorted) as the values.
        Returns {} when the query is empty or any query word is not
        in the index (no file can contain all words then).
    """
    self.query = query
    t = Tokenizer()
    res = []  # one {filename: positions} dict per unique query word
    dic = self.db
    for token in t.alph_tokenize(query):
        files = dic.get(token.tok)
        if files is None:
            # a query word is missing from the index -> no file matches;
            # the original code crashed here with AttributeError later
            return {}
        if files not in res:
            res.append(files)
    if not res:
        # empty query: set.intersection(*[]) would raise TypeError
        return {}
    # sets of filenames for each word; their intersection is the set of
    # files containing every word of the query
    fs = [set(word_files.keys()) for word_files in res]
    output = {}
    for fname in sorted(set.intersection(*fs))[offset:offset + limit]:
        for word_files in res:
            output.setdefault(fname, []).append(word_files[fname])
    # sort positions by line and start index
    for fname in output:
        output[fname] = our_sort(output[fname])
    return output
def test_num(self):
    """Mixed alphabetical, digit, punctuation and space tokens."""
    tokens = list(Tokenizer().i_tokenize("token123toj, "))
    expected = [
        ("token", 0, 4, "alph"),
        ("123", 5, 7, "digit"),
        ("toj", 8, 10, "alph"),
        (",", 11, 11, "punct"),
        (" ", 12, 12, "space"),
    ]
    self.assertEqual(len(tokens), len(expected))
    for got, (tok, f_ch, l_ch, typ) in zip(tokens, expected):
        self.assertIsInstance(got, Token)
        self.assertEqual(got.tok, tok)
        self.assertEqual(got.f_ch, f_ch)
        self.assertEqual(got.l_ch, l_ch)
        self.assertEqual(got.typ, typ)
def test_singletoken(self):
    """A string consisting of exactly one token."""
    tokens = list(Tokenizer().i_tokenize("мама"))
    self.assertEqual(len(tokens), 1)
    token = tokens[0]
    self.assertIsInstance(token, Token)
    self.assertEqual(token.tok, "мама")
    self.assertEqual(token.f_ch, 0)
    self.assertEqual(token.l_ch, 3)
    self.assertEqual(token.typ, "alph")
def indexate(self, s):
    """Indexate a string.

    :param s: the string to index
    :return: dictionary mapping each unique alphabetic/digit Token of
        the string to a list of its Positions
    """
    self.s = s
    result = {}
    for token in Tokenizer().i_tokenize(s):
        # only alphabetic and digit tokens go into the index
        if token.typ in ("alph", "digit"):
            result.setdefault(token.tok, []).append(
                Position(token.f_ch, token.l_ch))
    return result
def lfile_indexate(self, s):
    """Indexate a file, lowercasing its contents line by line.

    :param s: path of the file to index
    :return: dictionary mapping each unique lowercased token from the
        file to a dict {path: [Position_d, ...]} for that token
    """
    self.s = s
    result = {}
    tokenizer = Tokenizer()
    with open(s, 'r', encoding="utf-8") as f:
        for lineno, line in enumerate(f):
            for token in tokenizer.i_tokenize(line.lower()):
                # only alphabetic and digit tokens go into the index
                if token.typ in ("alph", "digit"):
                    result.setdefault(token.tok, {}).setdefault(s, []).append(
                        Position_d(token.f_ch, token.l_ch, lineno))
    return result
def test_lastalph(self):
    """A string that ends with an alphabetical symbol."""
    tokens = list(Tokenizer().i_tokenize("то есть"))
    expected = [
        ("то", 0, 1, "alph"),
        (" ", 2, 2, "space"),
        ("есть", 3, 6, "alph"),
    ]
    self.assertEqual(len(tokens), len(expected))
    for got, (tok, f_ch, l_ch, typ) in zip(tokens, expected):
        self.assertIsInstance(got, Token)
        self.assertEqual(got.tok, tok)
        self.assertEqual(got.f_ch, f_ch)
        self.assertEqual(got.l_ch, l_ch)
        self.assertEqual(got.typ, typ)
def search_mult_stem(self, query, limit, offset):
    """Multiword search with stemming.

    :param query: input query string
    :param limit: number of files to return
    :param offset: index of the first file in the sorted file list
    :return: a dictionary with the file names of the files that contain
        all stems/lemmas of the query words as the keys and the sorted
        Positions of those stems/lemmas in that file as the values.
        Returns {} for an empty query.
    """
    t = Tokenizer()
    stemmer = Stemmer_agent()
    res = []  # one {filename: positions} dict per query word
    dic = self.db
    for token in t.alph_tokenize(query):
        stems = {}
        for st in stemmer.stem(token.tok):
            # single lookup instead of the original's three dic.get(st) calls
            entry = dic.get(st)
            if entry is None:
                continue
            for fn, positions in entry.items():
                stems.setdefault(fn, []).extend(positions)
        # appended even when empty: an unmatched word must empty the
        # intersection below, exactly as in the original
        res.append(stems)
    if not res:
        # empty query: set.intersection(*[]) would raise TypeError
        return {}
    # files that contain all the (stemmed) words of the query
    fs = [set(word_files.keys()) for word_files in res]
    output = {}
    for fname in sorted(set.intersection(*fs))[offset:offset + limit]:
        for word_files in res:
            output.setdefault(fname, []).append(word_files[fname])
    # sort positions by line and start index
    for fname in output:
        output[fname] = our_sort(output[fname])
    return output
def db_file_indexate(self, name, s):
    """Indexate a file and store the index in a shelve database.

    :param name: name of the shelve database file
    :param s: path of the file to index
    :return: None; the database ends up mapping each Token to a dict
        {path: [Position_d, ...]} of files where the Token was found
    """
    self.name = name
    self.s = s
    tokenizer = Tokenizer()
    db = shelve.open(name, 'c', writeback=True)
    try:
        with open(s, 'r', encoding="utf-8") as f:
            for lineno, line in enumerate(f):
                for token in tokenizer.i_tokenize(line):
                    # include only alphabetic and digit tokens
                    if token.typ in ("alph", "digit"):
                        db.setdefault(token.tok, {}).setdefault(s, []).append(
                            Position_d(token.f_ch, token.l_ch, lineno))
    finally:
        # flush writeback cache and close even if indexing raised
        db.close()
def stem_indexate(self, name, s):
    """Indexate a file by stems/lemmas and store the index in a shelve.

    :param name: name of the shelve database file
    :param s: path of the file to index
    :return: None; the database ends up mapping each stem/lemma to a
        dict {path: [Position_d, ...]} of files where it was found
    """
    tokenizer = Tokenizer()
    stemmer = Stemmer_agent()
    db = shelve.open(name, 'c', writeback=True)
    try:
        with open(s, 'r', encoding='utf-8') as f:
            for lineno, line in enumerate(f):
                for token in tokenizer.i_tokenize(line):
                    if token.typ in ("alph", "digit"):
                        # index every stem or lemma of the token
                        for st in stemmer.stem(token.tok):
                            db.setdefault(st, {}).setdefault(s, []).append(
                                Position_d(token.f_ch, token.l_ch, lineno))
    finally:
        # flush writeback cache and close even if indexing raised
        db.close()
def __init__(self, pos, line, ext):
    self.pos = pos
    self.line = line
    self.ext = ext
    # tokenize the line into alphabetic words
    words = list(Tokenizer().alph_tokenize(line))
    for n, word in enumerate(words):
        # left border of the context window: `ext` words before the
        # word that starts the first matched position
        if word.f_ch == pos[0].start:
            self.left = words[n - ext].f_ch if n - ext >= 0 else 0
        # right border of the context window: `ext` words after the
        # word that ends the last matched position
        if word.l_ch == pos[-1].end:
            try:
                self.right = words[n + ext].l_ch
            except IndexError:
                self.right = len(line) - 1
def test_ex(self):
    """A non-string argument raises TypeError."""
    tokenizer = Tokenizer()
    with self.assertRaises(TypeError):
        list(tokenizer.i_tokenize(["j"]))
def test_empty(self):
    """An empty string yields no tokens."""
    tokens = list(Tokenizer().i_tokenize(""))
    self.assertEqual(len(tokens), 0)
def test_symbol(self):
    """A single alphabetic character forms exactly one token."""
    result = list(Tokenizer().alph_tokenize('b'))
    expected = [Token('b', 0, 0, "alph")]
    self.assertEqual(result, expected)
def do_POST(self):
    """Handle a POST from the stemming search form.

    Reads the query and paging parameters, runs a stemmed (multi)word
    search, and writes back an HTML page with per-document citation
    lists and prev/next navigation buttons.
    """
    form = cgi.FieldStorage(fp=self.rfile,
                            headers=self.headers,
                            environ={
                                'REQUEST_METHOD': 'POST',
                                'CONTENT_TYPE': 'text/html; charset=utf-8',
                            })
    self.send_response(200)
    self.send_header('Content-type', 'text/html; charset=utf-8')
    self.end_headers()
    query = form.getfirst("QUERY", "")
    query = html.escape(query)
    limit = form.getfirst("LIMIT", "2")  # document limit
    limit = int(html.escape(limit))
    offset = form.getfirst("OFFSET", "1")  # document offset (1-based)
    offset = int(html.escape(offset))
    mass0 = form.getfirst("OFF", "1")  # initial citation offset
    mass0 = int(html.escape(mass0))
    mass1 = form.getfirst("LIM", "10")  # initial citation limit
    mass1 = int(html.escape(mass1))
    cit_lim = form.getlist("NEW_LIM")  # per-document citation limits
    cit_off = form.getlist("NEW_OFF")  # per-document citation offsets
    # navigation button values
    button_prev = form.getlist("PREV_PAGE")
    button_next = form.getlist("NEXT_PAGE")
    doc_prev = form.getlist("PREV_DOCS")
    doc_next = form.getlist("NEXT_DOCS")
    reset = form.getlist("RESET")
    print(doc_prev, doc_next)
    # mass holds one (citation offset, citation limit) pair per document
    mass = []
    if not button_prev:
        if not button_next:
            if not cit_lim:
                for i in range(limit):
                    mass.append(
                        (mass0 - 1,
                         mass1 + 1))  # default array of (offset, limit) pairs
            else:
                for i in range(limit):
                    mass.append((int(cit_off[i]) - 1, int(cit_lim[i]) + 1))
        else:
            # "next citations" pressed for document button_next[0]:
            # advance only that document's citation offset
            for i in range(limit):
                #print(i, button_next[0])
                if not i == int(button_next[0]):
                    mass.append((int(cit_off[i]) - 1, int(cit_lim[i]) + 1))
                else:
                    #print(mass0, mass1)
                    mass.append((int(cit_off[i]) - 1 + int(cit_lim[i]),
                                 int(cit_lim[i]) + 1))
    else:
        # "previous citations" pressed: rewind that document's offset
        for i in range(limit):
            if not i == int(button_prev[0]):
                mass.append((int(cit_off[i]) - 1, int(cit_lim[i]) + 1))
            else:
                mass.append((int(cit_off[i]) - 1 - int(cit_lim[i]),
                             int(cit_lim[i]) + 1))
    mass.append((0, 0))
    print(mass)
    # document-level paging
    if doc_prev:
        offset = offset - limit
    if doc_next:
        offset = offset + limit
    se = self.server.se
    width = int(self.server.config["Default"]["context w width"])
    # choose multiword vs. single-word stemmed search by token count;
    # limit + 1 so we can detect whether a further page exists
    t = Tokenizer().alph_tokenize(query)
    if len(list(t)) > 1:
        r = se.search_mult_stem(query, limit + 1, offset - 1)
    else:
        r = se.search_stem(query, limit + 1, offset - 1)
    cws = se.get_context_w(r, width)
    comb = se.combine_cw(cws)
    res = se.ultimate_out(comb, mass)
    print(res)
    if reset:
        # plain search form, no prefilled values
        out = ""
        out += "<html><body>"
        out += "<form method=\"POST\" action=\"\">"
        out += "<p> What do you want to search? "
        out += "<input type=\"text\" name=\"QUERY\"><input type=\"submit\"></p>"
        out += "<p> How many documents do you want to see? "
        out += "<input type=\"text\" name=\"LIMIT\"></p>"
        out += "<p> Starting with? "
        out += "<input type=\"text\" name=\"OFFSET\"></p>"
        out += "<p> How many citations do you want to see? "
        out += "<input type=\"text\" name=\"LIM\"></p>"
        out += "<p> Starting with? "
        out += "<input type=\"text\" name=\"OFF\"></p>"
        out += "</form></body></html>"
    else:
        # results page: current paging state travels in hidden fields
        out = ""
        out += "<!DOCTYPE HTML><html><body>"
        out += "<form method=\"POST\" action=\"\">"
        out += "<p> What do you want to search? "
        out += "<input type=\"text\" name=\"QUERY\" value=\"{}\"><input type=\"submit\"></p>".format(
            query)
        #out += "<p> How many documents do you want to see? "
        out += "<input type=\"text\" name=\"LIMIT\" hidden=\"true\" value=\"{}\"></p>".format(
            limit)
        #out += "<p> Starting with? "
        out += "<p><input type=\"text\" name=\"OFFSET\" hidden=\"true\" value=\"{}\"></p>".format(
            offset)
        #out += "<p> How many citations do you want to see? "
        out += "<input type=\"hidden\" name=\"LIM\" hidden=\"true\" value=\"{}\"></p>".format(
            mass1)
        #out += "<p> Starting with? "
        #out += "<input type=\"text\" name=\"OFF\" hidden=\"true\" value=\"{}\"></p>".format(mass0)
        if not offset == 1:  # if it's not the first document
            out += "<p><button type=\"submit\" name=\"PREV_DOCS\" value=\"prev\">Previous documents</button>"
        if not len(
                res) < limit + 1:  # we got limit+1 docs, so a next page exists
            filenames = sorted(res.keys())[:-1]
            out += "<button type=\"submit\" name=\"NEXT_DOCS\" value=\"next\">Next documents</button></p>"
        else:
            filenames = sorted(res.keys())
        out += "<ol>"
        for ind, r in enumerate(filenames):
            quotes = list(res[r])
            out += "<li><p><b>{}</b></p><ul>".format(r)
            for qi, q in enumerate(quotes):
                # the extra (limit-th) quote only signals a next page
                if not qi == mass[ind][1] - 1:
                    out += "<li>{}</li>".format(q)
            out += "</ul></li>"
            out += "<p> Сколько цитат показать? "
            out += "<input type=\"text\" name=\"NEW_LIM\" value=\"{}\"></p>".format(
                mass[ind][1] - 1)
            #print(mass[ind])
            if not mass[ind][
                    0] == 0:  # if it's not the first page of citations
                out += "<p><button type=\"submit\" name =\"PREV_PAGE\" value=\"{}\">Previous citations</button>".format(
                    ind)
            #print(len(quotes))
            if not len(quotes) < mass[ind][
                    1]:  # we got limit citations, so a next page exists
                out += "<button type=\"submit\" name=\"NEXT_PAGE\" value=\"{}\">Next citations</button></p>".format(
                    ind)
            #out += "<p> Starting with "
            out += "<input type=\"text\" name=\"NEW_OFF\" hidden=\"true\" value=\"{}\"></p>".format(
                mass[ind][0] + 1)
        out += "</ol>"
        out += "<p><button type=\"submit\" name=\"RESET\" value=\"reset\">Back to search page</button></p></body></html>"
    self.wfile.write(bytes(out, 'utf-8'))
def do_POST(self):
    """Handle a POST from the (non-stemming) search form.

    Reads the query and paging parameters, runs a plain (multi)word
    search, and writes back an HTML page (Russian UI strings) with
    per-document citation lists and prev/next navigation buttons.
    """
    form = cgi.FieldStorage(fp=self.rfile,
                            headers=self.headers,
                            environ={
                                'REQUEST_METHOD': 'POST',
                                'CONTENT_TYPE': 'text/html; charset=utf-8',
                            })
    self.send_response(200)
    self.send_header('Content-type', 'text/html; charset=utf-8')
    self.end_headers()
    query = form.getfirst("QUERY", "")
    query = html.escape(query)
    limit = form.getfirst("LIMIT", "2")  # document limit
    limit = int(html.escape(limit))
    offset = form.getfirst("OFFSET", "1")  # document offset (1-based)
    offset = int(html.escape(offset))
    mass0 = form.getfirst("OFF", "1")  # initial citation offset
    mass0 = int(html.escape(mass0))
    mass1 = form.getfirst("LIM", "10")  # initial citation limit
    mass1 = int(html.escape(mass1))
    cit_lim = form.getlist("NEW_LIM")  # per-document citation limits
    cit_off = form.getlist("NEW_OFF")  # per-document citation offsets
    # navigation button values
    button_prev = form.getlist("PREV_PAGE")
    button_next = form.getlist("NEXT_PAGE")
    doc_prev = form.getlist("PREV_DOCS")
    doc_next = form.getlist("NEXT_DOCS")
    reset = form.getlist("RESET")
    print(doc_prev, doc_next)
    # mass holds one (citation offset, citation limit) pair per document
    mass = []
    if not button_prev:
        if not button_next:
            if not cit_lim:
                for i in range(limit):
                    mass.append(
                        (mass0 - 1,
                         mass1 + 1))  # default array of (offset, limit) pairs
            else:
                for i in range(limit):
                    mass.append((int(cit_off[i]) - 1, int(cit_lim[i]) + 1))
        else:
            # "next citations" pressed for document button_next[0]:
            # advance only that document's citation offset
            for i in range(limit):
                #print(i, button_next[0])
                if not i == int(button_next[0]):
                    mass.append((int(cit_off[i]) - 1, int(cit_lim[i]) + 1))
                else:
                    #print(mass0, mass1)
                    mass.append((int(cit_off[i]) - 1 + int(cit_lim[i]),
                                 int(cit_lim[i]) + 1))
    else:
        # "previous citations" pressed: rewind that document's offset
        for i in range(limit):
            if not i == int(button_prev[0]):
                mass.append((int(cit_off[i]) - 1, int(cit_lim[i]) + 1))
            else:
                mass.append((int(cit_off[i]) - 1 - int(cit_lim[i]),
                             int(cit_lim[i]) + 1))
    mass.append((0, 0))
    print(mass)
    # document-level paging
    if doc_prev:
        offset = offset - limit
    if doc_next:
        offset = offset + limit
    #print(query)
    se = self.server.se
    width = int(self.server.config["Default"]["context w width"])
    # choose multiword vs. single-word search by token count;
    # limit + 1 so we can detect whether a further page exists
    t = Tokenizer().alph_tokenize(query)
    if len(list(t)) > 1:
        r = se.search_mult(query, limit + 1, offset - 1)
    else:
        r = se.search(query, limit + 1, offset - 1)
    cws = se.get_context_w(r, width)
    comb = se.combine_cw(cws)
    res = se.ultimate_out(comb, mass)
    print(res)
    if reset:
        # plain search form, no prefilled values
        out = ""
        out += "<html><body>"
        out += "<form method=\"POST\" action=\"\">"
        out += "<p> Введите поисковой запрос: "
        out += "<input type=\"text\" name=\"QUERY\"><input type=\"submit\"></p>"
        out += "<p> Сколько выдать документов: "
        out += "<input type=\"text\" name=\"LIMIT\"></p>"
        out += "<p> Начиная с какого: "
        out += "<input type=\"text\" name=\"OFFSET\"></p>"
        out += "<p> Сколько выдать цитат: "
        out += "<input type=\"text\" name=\"LIM\"></p>"
        out += "<p> Начиная с какой: "
        out += "<input type=\"text\" name=\"OFF\"></p>"
        out += "</form></body></html>"
    else:
        # results page: current paging state travels in hidden fields
        out = ""
        out += "<!DOCTYPE HTML><html><body>"
        out += "<form method=\"POST\" action=\"\">"
        out += "<p> Введите поисковой запрос: "
        out += "<input type=\"text\" name=\"QUERY\" value=\"{}\"><input type=\"submit\"></p>".format(
            query)
        out += "<p> Сколько выдать документов: "
        out += "<input type=\"text\" name=\"LIMIT\" value=\"{}\"></p>".format(
            limit)
        #out += "<p> Начиная с какого: "
        out += "<p><input type=\"text\" name=\"OFFSET\" hidden=\"true\" value=\"{}\"></p>".format(
            offset)
        #out += "<p> Сколько выдавать цитат: "
        out += "<input type=\"hidden\" name=\"LIM\" hidden=\"true\" value=\"{}\"></p>".format(
            mass1)
        #out += "<p> Начиная с какой: "
        #out += "<input type=\"text\" name=\"OFF\" hidden=\"true\" value=\"{}\"></p>".format(mass0)
        if not offset == 1:  # if it's not the first document
            out += "<p><button type=\"submit\" name=\"PREV_DOCS\" value=\"prev\">Предыдущие документы</button>"
        if not len(res) < limit + 1:  # limit+1 docs found: a next page exists
            filenames = sorted(res.keys())[:-1]
            print(filenames)
            out += "<button type=\"submit\" name=\"NEXT_DOCS\" value=\"next\">Следующие документы</button></p>"
        else:
            filenames = sorted(res.keys())
        out += "<ol>"
        for ind, r in enumerate(filenames):
            quotes = list(res[r])
            out += "<li><p><b>{}</b></p><ul>".format(r)
            for qi, q in enumerate(quotes):
                # the extra (limit-th) quote only signals a next page
                if not qi == mass[ind][1] - 1:
                    out += "<li>{}</li>".format(q)
            out += "</ul></li>"
            out += "<p> Сколько цитат показать? "
            out += "<input type=\"text\" name=\"NEW_LIM\" value=\"{}\"></p>".format(
                mass[ind][1] - 1)
            print(mass[ind])
            if not mass[ind][0] == 0:  # not the first page of citations
                out += "<p><button type=\"submit\" name =\"PREV_PAGE\" value=\"{}\">Предыдущая страница</button>".format(
                    ind)
            print(len(quotes))
            if not len(quotes) < mass[ind][1]:  # a next citation page exists
                out += "<button type=\"submit\" name=\"NEXT_PAGE\" value=\"{}\">Следующая страница</button></p>".format(
                    ind)
            #out += "<p> Начиная с какой? "
            out += "<input type=\"text\" name=\"NEW_OFF\" hidden=\"true\" value=\"{}\"></p>".format(
                mass[ind][0] + 1)
        out += "</ol>"
        out += "<p><button type=\"submit\" name=\"RESET\" value=\"reset\">Назад к странице поиска</button></p></body></html>"
    self.wfile.write(bytes(out, 'utf-8'))