def search_mult(self, query, limit, offset):
    """Multiword search.

    :param query: input query string
    :param limit: maximum number of files to return
    :param offset: index of the first file in the sorted file list
    :return: a dictionary with the file names of the files that contain
        all words of the query as the keys and all Positions in that
        file of the words of the query (sorted) as the values.
        Returns {} when the query is empty or any query word is not
        in the index (no file can contain all words then).
    """
    self.query = query
    t = Tokenizer()
    res = []  # one {filename: positions} dict per unique query word
    dic = self.db
    for token in t.alph_tokenize(query):
        files = dic.get(token.tok)
        if files is None:
            # a query word is missing from the index -> no file matches;
            # the original code crashed here with AttributeError later
            return {}
        if files not in res:
            res.append(files)
    if not res:
        # empty query: set.intersection(*[]) would raise TypeError
        return {}
    # sets of filenames for each word; their intersection is the set of
    # files containing every word of the query
    fs = [set(word_files.keys()) for word_files in res]
    output = {}
    for fname in sorted(set.intersection(*fs))[offset:offset + limit]:
        for word_files in res:
            output.setdefault(fname, []).append(word_files[fname])
    # sort positions by line and start index
    for fname in output:
        output[fname] = our_sort(output[fname])
    return output
def test_num(self):
    """Mixed alphabetical, digit, punctuation and space tokens."""
    tokens = list(Tokenizer().i_tokenize("token123toj, "))
    expected = [
        ("token", 0, 4, "alph"),
        ("123", 5, 7, "digit"),
        ("toj", 8, 10, "alph"),
        (",", 11, 11, "punct"),
        (" ", 12, 12, "space"),
    ]
    self.assertEqual(len(tokens), len(expected))
    for got, (tok, f_ch, l_ch, typ) in zip(tokens, expected):
        self.assertIsInstance(got, Token)
        self.assertEqual(got.tok, tok)
        self.assertEqual(got.f_ch, f_ch)
        self.assertEqual(got.l_ch, l_ch)
        self.assertEqual(got.typ, typ)
def test_singletoken(self):
    """A string consisting of exactly one token."""
    tokens = list(Tokenizer().i_tokenize("мама"))
    self.assertEqual(len(tokens), 1)
    token = tokens[0]
    self.assertIsInstance(token, Token)
    self.assertEqual(token.tok, "мама")
    self.assertEqual(token.f_ch, 0)
    self.assertEqual(token.l_ch, 3)
    self.assertEqual(token.typ, "alph")
def indexate(self, s):
    """Indexate a string.

    :param s: the string to index
    :return: dictionary mapping each unique alphabetic/digit Token of
        the string to a list of its Positions
    """
    self.s = s
    result = {}
    for token in Tokenizer().i_tokenize(s):
        # only alphabetic and digit tokens go into the index
        if token.typ in ("alph", "digit"):
            result.setdefault(token.tok, []).append(
                Position(token.f_ch, token.l_ch))
    return result
def lfile_indexate(self, s):
    """Indexate a file, lowercasing its contents line by line.

    :param s: path of the file to index
    :return: dictionary mapping each unique lowercased token from the
        file to a dict {path: [Position_d, ...]} for that token
    """
    self.s = s
    result = {}
    tokenizer = Tokenizer()
    with open(s, 'r', encoding="utf-8") as f:
        for lineno, line in enumerate(f):
            for token in tokenizer.i_tokenize(line.lower()):
                # only alphabetic and digit tokens go into the index
                if token.typ in ("alph", "digit"):
                    result.setdefault(token.tok, {}).setdefault(s, []).append(
                        Position_d(token.f_ch, token.l_ch, lineno))
    return result
def test_lastalph(self):
    """A string that ends with an alphabetical symbol."""
    tokens = list(Tokenizer().i_tokenize("то есть"))
    expected = [
        ("то", 0, 1, "alph"),
        (" ", 2, 2, "space"),
        ("есть", 3, 6, "alph"),
    ]
    self.assertEqual(len(tokens), len(expected))
    for got, (tok, f_ch, l_ch, typ) in zip(tokens, expected):
        self.assertIsInstance(got, Token)
        self.assertEqual(got.tok, tok)
        self.assertEqual(got.f_ch, f_ch)
        self.assertEqual(got.l_ch, l_ch)
        self.assertEqual(got.typ, typ)
def search_mult_stem(self, query, limit, offset):
    """Multiword search with stemming.

    :param query: input query string
    :param limit: number of files to return
    :param offset: index of the first file in the sorted file list
    :return: a dictionary with the file names of the files that contain
        all stems/lemmas of the query words as the keys and the sorted
        Positions of those stems/lemmas in that file as the values.
        Returns {} for an empty query.
    """
    t = Tokenizer()
    stemmer = Stemmer_agent()
    res = []  # one {filename: positions} dict per query word
    dic = self.db
    for token in t.alph_tokenize(query):
        stems = {}
        for st in stemmer.stem(token.tok):
            # single lookup instead of the original's three dic.get(st) calls
            entry = dic.get(st)
            if entry is None:
                continue
            for fn, positions in entry.items():
                stems.setdefault(fn, []).extend(positions)
        # appended even when empty: an unmatched word must empty the
        # intersection below, exactly as in the original
        res.append(stems)
    if not res:
        # empty query: set.intersection(*[]) would raise TypeError
        return {}
    # files that contain all the (stemmed) words of the query
    fs = [set(word_files.keys()) for word_files in res]
    output = {}
    for fname in sorted(set.intersection(*fs))[offset:offset + limit]:
        for word_files in res:
            output.setdefault(fname, []).append(word_files[fname])
    # sort positions by line and start index
    for fname in output:
        output[fname] = our_sort(output[fname])
    return output
def db_file_indexate(self, name, s):
    """Indexate a file and store the index in a shelve database.

    :param name: name of the shelve database file
    :param s: path of the file to index
    :return: None; the database ends up mapping each Token to a dict
        {path: [Position_d, ...]} of files where the Token was found
    """
    self.name = name
    self.s = s
    tokenizer = Tokenizer()
    db = shelve.open(name, 'c', writeback=True)
    try:
        with open(s, 'r', encoding="utf-8") as f:
            for lineno, line in enumerate(f):
                for token in tokenizer.i_tokenize(line):
                    # include only alphabetic and digit tokens
                    if token.typ in ("alph", "digit"):
                        db.setdefault(token.tok, {}).setdefault(s, []).append(
                            Position_d(token.f_ch, token.l_ch, lineno))
    finally:
        # flush writeback cache and close even if indexing raised
        db.close()
def stem_indexate(self, name, s):
    """Indexate a file by stems/lemmas and store the index in a shelve.

    :param name: name of the shelve database file
    :param s: path of the file to index
    :return: None; the database ends up mapping each stem/lemma to a
        dict {path: [Position_d, ...]} of files where it was found
    """
    tokenizer = Tokenizer()
    stemmer = Stemmer_agent()
    db = shelve.open(name, 'c', writeback=True)
    try:
        with open(s, 'r', encoding='utf-8') as f:
            for lineno, line in enumerate(f):
                for token in tokenizer.i_tokenize(line):
                    if token.typ in ("alph", "digit"):
                        # index every stem or lemma of the token
                        for st in stemmer.stem(token.tok):
                            db.setdefault(st, {}).setdefault(s, []).append(
                                Position_d(token.f_ch, token.l_ch, lineno))
    finally:
        # flush writeback cache and close even if indexing raised
        db.close()
def __init__(self, pos, line, ext):
    self.pos = pos
    self.line = line
    self.ext = ext
    # tokenize the line into alphabetic words
    words = list(Tokenizer().alph_tokenize(line))
    for n, word in enumerate(words):
        # left border of the context window: `ext` words before the
        # word that starts the first matched position
        if word.f_ch == pos[0].start:
            self.left = words[n - ext].f_ch if n - ext >= 0 else 0
        # right border of the context window: `ext` words after the
        # word that ends the last matched position
        if word.l_ch == pos[-1].end:
            try:
                self.right = words[n + ext].l_ch
            except IndexError:
                self.right = len(line) - 1
def test_ex(self):
    """A non-string argument raises TypeError."""
    tokenizer = Tokenizer()
    with self.assertRaises(TypeError):
        list(tokenizer.i_tokenize(["j"]))
def test_empty(self):
    """An empty string yields no tokens."""
    tokens = list(Tokenizer().i_tokenize(""))
    self.assertEqual(len(tokens), 0)
def test_symbol(self):
    """A single alphabetic character forms exactly one token."""
    result = list(Tokenizer().alph_tokenize('b'))
    expected = [Token('b', 0, 0, "alph")]
    self.assertEqual(result, expected)
def do_POST(self):
    """Handle a POST from the stemming search form.

    Reads the query and paging parameters, runs a stemmed (multi)word
    search, and writes back an HTML page with per-document citation
    lists and prev/next navigation buttons.
    """
    form = cgi.FieldStorage(fp=self.rfile,
                            headers=self.headers,
                            environ={
                                'REQUEST_METHOD': 'POST',
                                'CONTENT_TYPE': 'text/html; charset=utf-8',
                            })
    self.send_response(200)
    self.send_header('Content-type', 'text/html; charset=utf-8')
    self.end_headers()
    query = form.getfirst("QUERY", "")
    query = html.escape(query)
    limit = form.getfirst("LIMIT", "2")  # document limit
    limit = int(html.escape(limit))
    offset = form.getfirst("OFFSET", "1")  # document offset (1-based)
    offset = int(html.escape(offset))
    mass0 = form.getfirst("OFF", "1")  # initial citation offset
    mass0 = int(html.escape(mass0))
    mass1 = form.getfirst("LIM", "10")  # initial citation limit
    mass1 = int(html.escape(mass1))
    cit_lim = form.getlist("NEW_LIM")  # per-document citation limits
    cit_off = form.getlist("NEW_OFF")  # per-document citation offsets
    # navigation button values
    button_prev = form.getlist("PREV_PAGE")
    button_next = form.getlist("NEXT_PAGE")
    doc_prev = form.getlist("PREV_DOCS")
    doc_next = form.getlist("NEXT_DOCS")
    reset = form.getlist("RESET")
    print(doc_prev, doc_next)
    # mass holds one (citation offset, citation limit) pair per document
    mass = []
    if not button_prev:
        if not button_next:
            if not cit_lim:
                for i in range(limit):
                    mass.append(
                        (mass0 - 1,
                         mass1 + 1))  # default array of (offset, limit) pairs
            else:
                for i in range(limit):
                    mass.append((int(cit_off[i]) - 1, int(cit_lim[i]) + 1))
        else:
            # "next citations" pressed for document button_next[0]:
            # advance only that document's citation offset
            for i in range(limit):
                #print(i, button_next[0])
                if not i == int(button_next[0]):
                    mass.append((int(cit_off[i]) - 1, int(cit_lim[i]) + 1))
                else:
                    #print(mass0, mass1)
                    mass.append((int(cit_off[i]) - 1 + int(cit_lim[i]),
                                 int(cit_lim[i]) + 1))
    else:
        # "previous citations" pressed: rewind that document's offset
        for i in range(limit):
            if not i == int(button_prev[0]):
                mass.append((int(cit_off[i]) - 1, int(cit_lim[i]) + 1))
            else:
                mass.append((int(cit_off[i]) - 1 - int(cit_lim[i]),
                             int(cit_lim[i]) + 1))
    mass.append((0, 0))
    print(mass)
    # document-level paging
    if doc_prev:
        offset = offset - limit
    if doc_next:
        offset = offset + limit
    se = self.server.se
    width = int(self.server.config["Default"]["context w width"])
    # choose multiword vs. single-word stemmed search by token count;
    # limit + 1 so we can detect whether a further page exists
    t = Tokenizer().alph_tokenize(query)
    if len(list(t)) > 1:
        r = se.search_mult_stem(query, limit + 1, offset - 1)
    else:
        r = se.search_stem(query, limit + 1, offset - 1)
    cws = se.get_context_w(r, width)
    comb = se.combine_cw(cws)
    res = se.ultimate_out(comb, mass)
    print(res)
    if reset:
        # plain search form, no prefilled values
        out = ""
        out += "<html><body>"
        out += "<form method=\"POST\" action=\"\">"
        out += "<p> What do you want to search? "
        out += "<input type=\"text\" name=\"QUERY\"><input type=\"submit\"></p>"
        out += "<p> How many documents do you want to see? "
        out += "<input type=\"text\" name=\"LIMIT\"></p>"
        out += "<p> Starting with? "
        out += "<input type=\"text\" name=\"OFFSET\"></p>"
        out += "<p> How many citations do you want to see? "
        out += "<input type=\"text\" name=\"LIM\"></p>"
        out += "<p> Starting with? "
        out += "<input type=\"text\" name=\"OFF\"></p>"
        out += "</form></body></html>"
    else:
        # results page: current paging state travels in hidden fields
        out = ""
        out += "<!DOCTYPE HTML><html><body>"
        out += "<form method=\"POST\" action=\"\">"
        out += "<p> What do you want to search? "
        out += "<input type=\"text\" name=\"QUERY\" value=\"{}\"><input type=\"submit\"></p>".format(
            query)
        #out += "<p> How many documents do you want to see? "
        out += "<input type=\"text\" name=\"LIMIT\" hidden=\"true\" value=\"{}\"></p>".format(
            limit)
        #out += "<p> Starting with? "
        out += "<p><input type=\"text\" name=\"OFFSET\" hidden=\"true\" value=\"{}\"></p>".format(
            offset)
        #out += "<p> How many citations do you want to see? "
        out += "<input type=\"hidden\" name=\"LIM\" hidden=\"true\" value=\"{}\"></p>".format(
            mass1)
        #out += "<p> Starting with? "
        #out += "<input type=\"text\" name=\"OFF\" hidden=\"true\" value=\"{}\"></p>".format(mass0)
        if not offset == 1:  # if it's not the first document
            out += "<p><button type=\"submit\" name=\"PREV_DOCS\" value=\"prev\">Previous documents</button>"
        if not len(
                res) < limit + 1:  # we got limit+1 docs, so a next page exists
            filenames = sorted(res.keys())[:-1]
            out += "<button type=\"submit\" name=\"NEXT_DOCS\" value=\"next\">Next documents</button></p>"
        else:
            filenames = sorted(res.keys())
        out += "<ol>"
        for ind, r in enumerate(filenames):
            quotes = list(res[r])
            out += "<li><p><b>{}</b></p><ul>".format(r)
            for qi, q in enumerate(quotes):
                # the extra (limit-th) quote only signals a next page
                if not qi == mass[ind][1] - 1:
                    out += "<li>{}</li>".format(q)
            out += "</ul></li>"
            out += "<p> Сколько цитат показать? "
            out += "<input type=\"text\" name=\"NEW_LIM\" value=\"{}\"></p>".format(
                mass[ind][1] - 1)
            #print(mass[ind])
            if not mass[ind][
                    0] == 0:  # if it's not the first page of citations
                out += "<p><button type=\"submit\" name =\"PREV_PAGE\" value=\"{}\">Previous citations</button>".format(
                    ind)
            #print(len(quotes))
            if not len(quotes) < mass[ind][
                    1]:  # we got limit citations, so a next page exists
                out += "<button type=\"submit\" name=\"NEXT_PAGE\" value=\"{}\">Next citations</button></p>".format(
                    ind)
            #out += "<p> Starting with "
            out += "<input type=\"text\" name=\"NEW_OFF\" hidden=\"true\" value=\"{}\"></p>".format(
                mass[ind][0] + 1)
        out += "</ol>"
        out += "<p><button type=\"submit\" name=\"RESET\" value=\"reset\">Back to search page</button></p></body></html>"
    self.wfile.write(bytes(out, 'utf-8'))
def do_POST(self):
    """Handle a POST from the (non-stemming) search form.

    Reads the query and paging parameters, runs a plain (multi)word
    search, and writes back an HTML page (Russian UI strings) with
    per-document citation lists and prev/next navigation buttons.
    """
    form = cgi.FieldStorage(fp=self.rfile,
                            headers=self.headers,
                            environ={
                                'REQUEST_METHOD': 'POST',
                                'CONTENT_TYPE': 'text/html; charset=utf-8',
                            })
    self.send_response(200)
    self.send_header('Content-type', 'text/html; charset=utf-8')
    self.end_headers()
    query = form.getfirst("QUERY", "")
    query = html.escape(query)
    limit = form.getfirst("LIMIT", "2")  # document limit
    limit = int(html.escape(limit))
    offset = form.getfirst("OFFSET", "1")  # document offset (1-based)
    offset = int(html.escape(offset))
    mass0 = form.getfirst("OFF", "1")  # initial citation offset
    mass0 = int(html.escape(mass0))
    mass1 = form.getfirst("LIM", "10")  # initial citation limit
    mass1 = int(html.escape(mass1))
    cit_lim = form.getlist("NEW_LIM")  # per-document citation limits
    cit_off = form.getlist("NEW_OFF")  # per-document citation offsets
    # navigation button values
    button_prev = form.getlist("PREV_PAGE")
    button_next = form.getlist("NEXT_PAGE")
    doc_prev = form.getlist("PREV_DOCS")
    doc_next = form.getlist("NEXT_DOCS")
    reset = form.getlist("RESET")
    print(doc_prev, doc_next)
    # mass holds one (citation offset, citation limit) pair per document
    mass = []
    if not button_prev:
        if not button_next:
            if not cit_lim:
                for i in range(limit):
                    mass.append(
                        (mass0 - 1,
                         mass1 + 1))  # default array of (offset, limit) pairs
            else:
                for i in range(limit):
                    mass.append((int(cit_off[i]) - 1, int(cit_lim[i]) + 1))
        else:
            # "next citations" pressed for document button_next[0]:
            # advance only that document's citation offset
            for i in range(limit):
                #print(i, button_next[0])
                if not i == int(button_next[0]):
                    mass.append((int(cit_off[i]) - 1, int(cit_lim[i]) + 1))
                else:
                    #print(mass0, mass1)
                    mass.append((int(cit_off[i]) - 1 + int(cit_lim[i]),
                                 int(cit_lim[i]) + 1))
    else:
        # "previous citations" pressed: rewind that document's offset
        for i in range(limit):
            if not i == int(button_prev[0]):
                mass.append((int(cit_off[i]) - 1, int(cit_lim[i]) + 1))
            else:
                mass.append((int(cit_off[i]) - 1 - int(cit_lim[i]),
                             int(cit_lim[i]) + 1))
    mass.append((0, 0))
    print(mass)
    # document-level paging
    if doc_prev:
        offset = offset - limit
    if doc_next:
        offset = offset + limit
    #print(query)
    se = self.server.se
    width = int(self.server.config["Default"]["context w width"])
    # choose multiword vs. single-word search by token count;
    # limit + 1 so we can detect whether a further page exists
    t = Tokenizer().alph_tokenize(query)
    if len(list(t)) > 1:
        r = se.search_mult(query, limit + 1, offset - 1)
    else:
        r = se.search(query, limit + 1, offset - 1)
    cws = se.get_context_w(r, width)
    comb = se.combine_cw(cws)
    res = se.ultimate_out(comb, mass)
    print(res)
    if reset:
        # plain search form, no prefilled values
        out = ""
        out += "<html><body>"
        out += "<form method=\"POST\" action=\"\">"
        out += "<p> Введите поисковой запрос: "
        out += "<input type=\"text\" name=\"QUERY\"><input type=\"submit\"></p>"
        out += "<p> Сколько выдать документов: "
        out += "<input type=\"text\" name=\"LIMIT\"></p>"
        out += "<p> Начиная с какого: "
        out += "<input type=\"text\" name=\"OFFSET\"></p>"
        out += "<p> Сколько выдать цитат: "
        out += "<input type=\"text\" name=\"LIM\"></p>"
        out += "<p> Начиная с какой: "
        out += "<input type=\"text\" name=\"OFF\"></p>"
        out += "</form></body></html>"
    else:
        # results page: current paging state travels in hidden fields
        out = ""
        out += "<!DOCTYPE HTML><html><body>"
        out += "<form method=\"POST\" action=\"\">"
        out += "<p> Введите поисковой запрос: "
        out += "<input type=\"text\" name=\"QUERY\" value=\"{}\"><input type=\"submit\"></p>".format(
            query)
        out += "<p> Сколько выдать документов: "
        out += "<input type=\"text\" name=\"LIMIT\" value=\"{}\"></p>".format(
            limit)
        #out += "<p> Начиная с какого: "
        out += "<p><input type=\"text\" name=\"OFFSET\" hidden=\"true\" value=\"{}\"></p>".format(
            offset)
        #out += "<p> Сколько выдавать цитат: "
        out += "<input type=\"hidden\" name=\"LIM\" hidden=\"true\" value=\"{}\"></p>".format(
            mass1)
        #out += "<p> Начиная с какой: "
        #out += "<input type=\"text\" name=\"OFF\" hidden=\"true\" value=\"{}\"></p>".format(mass0)
        if not offset == 1:  # if it's not the first document
            out += "<p><button type=\"submit\" name=\"PREV_DOCS\" value=\"prev\">Предыдущие документы</button>"
        if not len(res) < limit + 1:  # limit+1 docs found: a next page exists
            filenames = sorted(res.keys())[:-1]
            print(filenames)
            out += "<button type=\"submit\" name=\"NEXT_DOCS\" value=\"next\">Следующие документы</button></p>"
        else:
            filenames = sorted(res.keys())
        out += "<ol>"
        for ind, r in enumerate(filenames):
            quotes = list(res[r])
            out += "<li><p><b>{}</b></p><ul>".format(r)
            for qi, q in enumerate(quotes):
                # the extra (limit-th) quote only signals a next page
                if not qi == mass[ind][1] - 1:
                    out += "<li>{}</li>".format(q)
            out += "</ul></li>"
            out += "<p> Сколько цитат показать? "
            out += "<input type=\"text\" name=\"NEW_LIM\" value=\"{}\"></p>".format(
                mass[ind][1] - 1)
            print(mass[ind])
            if not mass[ind][0] == 0:  # not the first page of citations
                out += "<p><button type=\"submit\" name =\"PREV_PAGE\" value=\"{}\">Предыдущая страница</button>".format(
                    ind)
            print(len(quotes))
            if not len(quotes) < mass[ind][1]:  # a next citation page exists
                out += "<button type=\"submit\" name=\"NEXT_PAGE\" value=\"{}\">Следующая страница</button></p>".format(
                    ind)
            #out += "<p> Начиная с какой? "
            out += "<input type=\"text\" name=\"NEW_OFF\" hidden=\"true\" value=\"{}\"></p>".format(
                mass[ind][0] + 1)
        out += "</ol>"
        out += "<p><button type=\"submit\" name=\"RESET\" value=\"reset\">Назад к странице поиска</button></p></body></html>"
    self.wfile.write(bytes(out, 'utf-8'))