Esempio n. 1
0
 def test_remove_doc(self):
     """Check that a removed string no longer shows up in lookups.

     A word set is built from ``self.excerpt1``, ``'overcoat'`` is removed
     from it, and a lookup for ``'overcoat'`` with a third argument of 6
     must return exactly the entries listed below.
     """
     wordset = Levenshtein_search.populate_wordset(-1, self.excerpt1)
     Levenshtein_search.remove_string(wordset, 'overcoat')

     # 'overcoat' itself is deliberately absent from the expected matches.
     expected = [
         ['went', 6, 0.024390243902439025],
         ['cold', 6, 0.024390243902439025],
         ['Versh', 6, 0.04878048780487805],
         ['overshoes', 4, 0.04878048780487805],
         ['not', 6, 0.024390243902439025],
     ]
     found = Levenshtein_search.lookup(wordset, 'overcoat', 6)
     assert found == expected
Esempio n. 2
0
 def search(self, doc, threshold=0):
     """Return the ids stored for every lookup match of *doc*.

     The word set behind ``self.index_key`` is queried with *doc* and
     *threshold*; the first element of each result is translated through
     ``self._doc_to_id``.  A falsy lookup result yields an empty list.
     """
     hits = Levenshtein_search.lookup(self.index_key, doc, threshold)
     if not hits:
         return []

     ids = []
     for text, _, _ in hits:
         ids.append(self._doc_to_id[text])
     return ids
Esempio n. 3
0
    def __init__(self):
        """Set up an empty word set, the document-to-id map and ``docs``.

        ``_doc_to_id`` is a ``defaultdict`` whose factory draws consecutive
        integers starting at 1 from an ``itertools.count``.
        """
        self.index_key = Levenshtein_search.populate_wordset(-1, [])

        id_source = itertools.count(1)
        try:  # py 2 iterators expose ``next``
            next_id = id_source.next
        except AttributeError:  # py 3 only has ``__next__``
            next_id = id_source.__next__
        self._doc_to_id = collections.defaultdict(next_id)

        self.docs = []
Esempio n. 4
0
    def test_query_overcoat(self):
        """Look up ``'overcoat'`` in word sets built from both excerpts.

        Each excerpt gets its own word set; the lookup (third argument 6)
        must return exactly the rows listed for that excerpt.
        """
        expected_first = [
            ['overcoat', 0, 0.023809523809523808],
            ['went', 6, 0.023809523809523808],
            ['cold', 6, 0.023809523809523808],
            ['Versh', 6, 0.047619047619047616],
            ['overshoes', 4, 0.047619047619047616],
            ['not', 6, 0.023809523809523808],
        ]
        first_set = Levenshtein_search.populate_wordset(-1, self.excerpt1)
        first_hits = Levenshtein_search.lookup(first_set, 'overcoat', 6)
        assert first_hits == expected_first

        expected_second = [
            ['Versh', 6, 0.044444444444444446],
            ['overshoes', 4, 0.022222222222222223],
            ['coat', 4, 0.022222222222222223],
            ['out', 6, 0.044444444444444446],
            ['here', 6, 0.022222222222222223],
        ]
        second_set = Levenshtein_search.populate_wordset(-1, self.excerpt2)
        second_hits = Levenshtein_search.lookup(second_set, 'overcoat', 6)
        assert second_hits == expected_second
Esempio n. 5
0
    def __init__(self):
        """Create an empty word set plus the bookkeeping containers.

        ``_doc_to_id`` is a ``defaultdict`` that assigns the next integer
        from a counter starting at 1 whenever a missing key is looked up.
        """
        self.index_key = Levenshtein_search.populate_wordset(-1, [])

        counter = itertools.count(1)
        # py 2 names the advance method ``next``; py 3 names it ``__next__``.
        advance = getattr(counter, 'next', None)
        if advance is None:
            advance = counter.__next__
        self._doc_to_id = collections.defaultdict(advance)

        self.docs = []
Esempio n. 6
0
 def index(self, doc):
     """Look *doc* up in ``_doc_to_id`` and add it to the word set."""
     # The lookup result is discarded; presumably the access itself
     # registers an id for a new document -- confirm against the mapping
     # type set up in ``__init__``.
     _ = self._doc_to_id[doc]

     wordset = self.index_key
     Levenshtein_search.add_string(wordset, doc)
Esempio n. 7
0
 def __del__(self):
     """Clear the word set referenced by ``self.index_key`` on finalization."""
     wordset = self.index_key
     Levenshtein_search.clear_wordset(wordset)
Esempio n. 8
0
    def search(self, doc, threshold=0):
        """Return the first element of every lookup result for *doc*.

        The word set behind ``self.index_key`` is queried with *doc* and
        *threshold*; each result must unpack into exactly three items, of
        which only the first is kept.
        """
        matches = []
        for text, _, _ in Levenshtein_search.lookup(self.index_key, doc,
                                                    threshold):
            matches.append(text)
        return matches
Esempio n. 9
0
 def unindex(self, doc):
     """Delete *doc* from ``_doc_to_id`` and remove it from the word set."""
     # The mapping entry goes first, so a failing ``del`` leaves the word
     # set untouched.
     del self._doc_to_id[doc]

     wordset = self.index_key
     Levenshtein_search.remove_string(wordset, doc)
Esempio n. 10
0
 def index(self, doc):
     """Touch *doc* in ``_doc_to_id`` and add it to the word set."""
     # Only the lookup itself matters here; its value is never used.
     self._doc_to_id[doc]
     Levenshtein_search.add_string(self.index_key, doc)
import Levenshtein_search

# Benchmark: time a PostgreSQL levenshtein_less_equal() query against a
# Levenshtein_search lookup over the same list of names.

# time.clock() was removed in Python 3.8; use perf_counter() where it exists
# and fall back to clock() only on interpreters that lack it (Python 2).
try:
    timer = time.perf_counter
except AttributeError:
    timer = time.clock

conn = psycopg2.connect("host='127.0.0.1' port='5432' dbname='benchmark' user='******' password=''")
try:
    cur = conn.cursor()
    cur.execute("set schema 'public';")
    query_word = "\"philippe the original\""
    max_dist = 2

    # NOTE(review): the statement is assembled by string concatenation. That
    # is only acceptable because query_word is a literal defined above; use
    # bound parameters if it ever comes from outside this script.
    sqlquery = "select name from restaurant_nophone_training where levenshtein_less_equal(name, '" + query_word + "', " + str(max_dist) + ") <= " + str(max_dist) + ";"
    print(sqlquery)
    starttime = timer()
    cur.execute(sqlquery)
    results = cur.fetchall()
    print(str(timer() - starttime) + " sec")
    print(results)
    print(" ")

    print("Levenshtein_search algorithm:")
    cur.execute("select name from restaurant_nophone_training")
    names = cur.fetchall()
    # Each fetched row is a sequence; only its first column is needed.
    namelist = [row[0] for row in names]

    idx = Levenshtein_search.populate_wordset(-1, namelist)
    try:
        starttime = timer()
        results = Levenshtein_search.lookup(idx, query_word, max_dist)
        print(str(timer() - starttime) + " sec")
        print(results)
    finally:
        # Clear the word set even if the lookup raised.
        Levenshtein_search.clear_wordset(idx)
finally:
    # Close the connection even if a query failed.
    conn.close()
Esempio n. 12
0
 def __del__(self):
     """Finalizer: clear the word set identified by ``self.index_key``."""
     key = self.index_key
     Levenshtein_search.clear_wordset(key)
Esempio n. 13
0
 def unindex(self, doc):
     """Drop *doc* from the id mapping, then from the word set."""
     del self._doc_to_id[doc]

     key = self.index_key
     Levenshtein_search.remove_string(key, doc)
Esempio n. 14
0
 def __init__(self):
     """Create an empty word set and an ``Enumerator`` starting at 1."""
     empty_wordset = Levenshtein_search.populate_wordset(-1, [])
     self.index_key = empty_wordset

     self._doc_to_id = Enumerator(start=1)
Esempio n. 15
0
 def unindex(self, doc):
     """Forget *doc* and rebuild the word set from what is left.

     Instead of removing a single string, the current word set is cleared
     and a new one is populated from the items obtained by iterating
     ``self._doc_to_id`` (presumably the remaining documents -- confirm
     against the mapping type).
     """
     del self._doc_to_id[doc]

     Levenshtein_search.clear_wordset(self.index_key)
     remaining = list(self._doc_to_id)
     self.index_key = Levenshtein_search.populate_wordset(-1, remaining)
Esempio n. 16
0
 def search(self, doc, threshold=0):
     """Map every lookup match for *doc* to its entry in ``_doc_to_id``.

     Returns an empty list when the lookup result is falsy.
     """
     found = Levenshtein_search.lookup(self.index_key, doc, threshold)
     if not found:
         return []

     to_id = self._doc_to_id
     return [to_id[text] for text, _, _ in found]
Esempio n. 17
0
 def __init__(self):
     """Start with an empty word set and a fresh ``Enumerator(start=1)``."""
     # -1 and an empty list are passed to populate_wordset; the returned
     # value is kept as the handle for every later call.
     self.index_key = Levenshtein_search.populate_wordset(-1, [])

     id_map = Enumerator(start=1)
     self._doc_to_id = id_map
Esempio n. 18
0
 def search(self, doc, threshold=0):
     """Return ``_doc_to_id`` values for the lookup matches of *doc*.

     The first item of each lookup result is used as the mapping key; a
     falsy lookup result produces an empty list.
     """
     lookup_result = Levenshtein_search.lookup(self.index_key, doc,
                                               threshold)
     if not lookup_result:
         return []

     ids = []
     for text, _, _ in lookup_result:
         ids.append(self._doc_to_id[text])
     return ids
Esempio n. 19
0
 def search(self, doc, threshold=0):
     """Return the first item of each lookup result for *doc*."""
     hits = Levenshtein_search.lookup(self.index_key, doc, threshold)

     # Every hit must unpack into three items; only the first is returned.
     texts = []
     for text, _, _ in hits:
         texts.append(text)
     return texts
Esempio n. 20
0
 def test_clear(self):
     """Populate a word set from ``self.excerpt1`` and clear it again.

     There is no assertion; the test only checks that both calls complete
     without raising.
     """
     wordset = Levenshtein_search.populate_wordset(-1, self.excerpt1)
     Levenshtein_search.clear_wordset(wordset)
Esempio n. 21
0
 def test_index_increment(self):
     """Two separately populated word sets must get different handles."""
     handle_a = Levenshtein_search.populate_wordset(-1, self.excerpt1)
     handle_b = Levenshtein_search.populate_wordset(-1, self.excerpt2)

     # Printed so both values appear in the captured output.
     print(handle_a, handle_b)

     assert handle_a != handle_b
Esempio n. 22
0
 def search(self, doc, threshold=0):
     """Look *doc* up in the word set and return the mapped ids.

     Each lookup result contributes ``self._doc_to_id[first_item]``; an
     empty list is returned when the lookup result is falsy.
     """
     outcome = Levenshtein_search.lookup(self.index_key, doc, threshold)
     if outcome:
         mapped = []
         for text, _, _ in outcome:
             mapped.append(self._doc_to_id[text])
         return mapped
     return []