def __init__(self, test_item_table=None):
    """Build the item-name trie, from a test table or from the database.

    Args:
        test_item_table: Optional mapping used in place of the database
            contents.  It is only used when truthy; an empty mapping or
            ``None`` falls back to ``db.get_all_items()``.
            (Presumably item name -> item id, mirroring the database
            branch -- confirm against callers.)
    """
    if test_item_table:
        item_table = test_item_table
    else:
        # The database yields (item_id, item_name) pairs; the trie is keyed
        # by the lower-cased name.
        item_table = {
            item_name.lower(): item_id
            for item_id, item_name in db.get_all_items()
        }
    # Pass the mapping positionally rather than as **kwargs: keyword
    # unpacking raises TypeError for non-string keys and for a key that
    # collides with the constructor's own parameter names (e.g. "self").
    self.items = pytrie.StringTrie(item_table)
def setUp(self):
    """Fill self._trie and self._store with the same 111 tokens.

    Ten top-level tokens are created, each followed by ten child tokens
    named underneath it, and finally the blessed version itself is stored.
    """
    self._trie = pytrie.StringTrie()
    self._store = EphemeralStore()
    blessed_version = BlessedVersion(MasterHandler._BLESSED_VERSION,
                                     MasterHandler._MASTER_OWNER)

    def record(key, token):
        # Keep the in-memory trie and the store in lock step.
        self._trie[key] = token
        self._store.commit_tokens(updates=[token])

    for parent in range(10):
        top_token = Token(blessed_version.advance_version(),
                          '/some_dir/some_token_%d' % parent,
                          priority=parent,
                          data='some_data_%d' % parent)
        record(top_token.name, top_token)
        for child in range(10):
            nested_token = Token(
                blessed_version.advance_version(),
                '/some_dir/some_token_%d/some_other_token_%d' % (parent,
                                                                 child),
                priority=child,
                data='some_data_%d_%d' % (parent, child))
            record(nested_token.name, nested_token)

    # One more advance so the blessed version differs from every token above.
    blessed_version.advance_version()
    record(MasterHandler._BLESSED_VERSION, blessed_version)
    self._check_version_uniqueness()
def __init__(self):
    """Create empty lookup tables, then load both dictionary files.

    The files are read from the ``../dictionaries`` directory resolved
    relative to this module's location.
    """
    self.buckets = {}
    self.words = {}
    self.wilds = pytrie.StringTrie()
    dictionary_root = abspath(join(dirname(__file__), "../dictionaries"))
    # Load order matters only insofar as _load_dict is order-sensitive;
    # it is kept as LIWC first, tiptap second.
    for path_template in ("%s/LIWC2007_English100131.dic", "%s/tiptap.dic"):
        self._load_dict(path_template % dictionary_root)
def getStopWords(path="finalstoplist"):
    """Load the stop-word file into the module-level ``stop_list`` trie.

    The file is split on runs of whitespace and every resulting word is
    stored as a key with the value 0.

    The previous pattern ``'[\\s+]'`` was a character class, so it split on
    each *single* whitespace character (and on literal ``+``).  Blank lines
    or repeated spaces therefore inserted the empty string as a stop word.
    ``str.split()`` splits on whitespace runs and never yields empty
    strings; words containing ``+`` are now kept whole.

    Args:
        path: Stop-word file to read.  Defaults to ``"finalstoplist"`` in
            the current working directory, as before.
    """
    global stop_list
    # The context manager closes the file even if reading fails.
    with open(path, "r") as f:
        words = f.read().split()
    stop_list = pytrie.StringTrie()
    for word in words:
        stop_list[word] = 0
def init_trie(self):
    """Initialize trie with current data in geodata table.

    Every row of the ``geodata`` table is turned into a whitespace-free
    lookup key derived from its ``type``:

    * ``hotel``  -> name (with the word "hotel" removed) + city, and also
      a second, reversed key city + name
    * ``zip``    -> name + city
    * ``city``   -> name + country
    * ``street`` -> name

    Rows of any other type are skipped.  The first row to claim a key
    wins.  The resulting mapping is loaded into ``self.trie`` and the
    temporary dictionary ``self.d`` is cleared and deleted.
    """
    rethink_conn = rethinkdb.connect(db='hotel_cosmos',
                                     host=os.environ['RETHINK_IP'],
                                     port=28015,
                                     user="******",
                                     password=os.environ['RETHINK_PASS'])
    try:
        geodata = list(rethinkdb.table('geodata').run(rethink_conn))

        for row in geodata:
            row_type = row['type']
            reverse_hotel_key = ''
            if row_type == 'hotel':
                # The name is already lower-cased, so a plain replace
                # removes every "hotel".  The old call passed
                # re.IGNORECASE as the *count* argument of re.sub, which
                # limited it to two replacements.
                name = row['name'].lower().replace('hotel', '')
                city = row['city'].lower()
                key = translate(name + ' ' + city)
                reverse_hotel_key = translate(city + ' ' + name)
            elif row_type == 'zip':
                key = translate(row['name'].lower() + ' ' + row['city'].lower())
            elif row_type == 'city':
                key = translate(row['name'].lower() + ' ' + row['country'].lower())
            elif row_type == 'street':
                key = translate(row['name'].lower())
            else:
                # Unknown type: previously this reused the key of the
                # preceding row (or raised NameError on the first row).
                continue

            # Normalise *before* the duplicate check so the membership
            # test looks at the same key that is actually stored.
            key = re.sub(r'\s+', '', key)
            if key not in self.d:
                # remove unneeded data
                row.pop('id', None)
                row.pop('timeStampAdded', None)
                if row_type != 'zip':
                    row.pop('country', None)
                    row.pop('index_country', None)
                self.d[key] = row

            if reverse_hotel_key != '':
                reverse_hotel_key = re.sub(r'\s+', '', reverse_hotel_key)
                if reverse_hotel_key not in self.d:
                    # remove unneeded data
                    row.pop('id', None)
                    row.pop('timeStampAdded', None)
                    row.pop('country', None)
                    row.pop('index_country', None)
                    # Both keys intentionally share the same row object.
                    self.d[reverse_hotel_key] = row

        self.trie = pytrie.StringTrie(self.d)

        # delete dictionary
        self.d.clear()
        del self.d
    finally:
        # close connection to rethinkdb, also when loading fails
        rethink_conn.close()
def native_load_data(path_to_data):
    """
    Build a trie of bigram phrases from an n-gram corpus file.

    The corpus is read as a table with the columns freq, first and second.
    Each "first second" phrase becomes a key whose value is the 1-tuple
    ``(freq,)``.  The finished trie is pickled to ``pytrie.pkl`` in the
    current working directory.

    :param path_to_data: path to the n-gram corpus
    :return: the trie, which also gets stored on the drive
    """
    # Undecodable bytes are dropped instead of aborting the load.
    with codecs.open(path_to_data, "r", encoding='utf-8',
                     errors='ignore') as corpus:
        table = pd.read_table(corpus, names=["freq", "first", "second"])

    # Wrap every frequency in a single-element tuple.
    table['freq'] = table['freq'].apply(lambda count: (count, ))
    bigrams = table['first'] + " " + table['second']
    phrase_to_freq = {
        phrase: freq
        for phrase, freq in zip(bigrams, table['freq'].values)
    }

    ngram_trie = pytrie.StringTrie(phrase_to_freq)
    with open('pytrie.pkl', 'wb') as sink:
        pickle.dump(ngram_trie, sink, pickle.HIGHEST_PROTOCOL)
    return ngram_trie
def __init__(self):
    """Start empty: no points by id and an empty string trie as storage."""
    self.points_by_id = dict()
    self.storage = pytrie.StringTrie()
def __init__(self, store):
    """Remember the backing store, create an empty trie and load tokens.

    Args:
        store: Object kept as ``self._store``; ``_load_tokens`` is invoked
            once all attributes are in place.
    """
    self._lock = threading.Lock()
    self._trie = pytrie.StringTrie()
    self._store = store
    # Must run last: it is called with every attribute above already set.
    self._load_tokens()
def trie(self):
    """Build a StringTrie mapping each entry of self.list to its index.

    Progress and the elapsed wall-clock time are printed to stdout.
    """
    began = time.time()
    print('pytrie ST start')
    indexed_pairs = ((entry, position)
                     for position, entry in enumerate(self.list))
    built = pytrie.StringTrie(indexed_pairs)
    elapsed = time.time() - began
    print(f'pytrie ST time elapsed: {elapsed:.2f}s')
    return built
class SearchBox:
    """Keyword search over ``Employee_View`` driven by a phrase trie.

    On construction the trie is filled with searchable phrases read from
    the ``Employee``, ``profile`` and ``skillset`` tables.  Each phrase maps
    to a two-element list ``[field_name, value]``; ``search`` turns the
    phrases found in a free-text string into a WHERE clause.
    """

    # Class-level on purpose (kept for compatibility): the trie is shared
    # by every SearchBox instance.
    tr = pytrie.StringTrie()

    def __init__(self):
        """Connect to the database and populate the shared trie.

        Raises:
            pyodbc.Error: if the connection cannot be established.  The
                original code printed the message and then failed later
                with an unrelated AttributeError on ``self.conn``.
        """
        server = 'fopo2ibguo.database.windows.net'
        database = 'testingdacpac'
        username = '******'
        password = '******'
        driver = 'ODBC Driver 13 for SQL Server'
        # NOTE(review): SQL Server normally listens on 1433, not 1443 --
        # confirm whether this PORT value is intentional.
        try:
            self.conn = pyodbc.connect('DRIVER=' + driver + ';SERVER=' +
                                       server + ';PORT=1443;DATABASE=' +
                                       database + ';UID=' + username +
                                       ';PWD=' + password)
        except pyodbc.Error:
            print("I am unable to connect to the database")
            raise
        self.cursor = self.conn.cursor()
        self._load_index()

    def _load_index(self):
        """Fill the trie from the Employee, profile and skillset tables."""
        self.cursor.execute("SELECT distinct Location from Employee")
        for row in self.cursor.fetchall():
            self._index(row[0], ['Location', row[0]])

        # profile rows are read positionally: row[0] is stored as the
        # value, row[1] is the phrase and row[2] a comma-separated list of
        # further phrases (presumably id, name, aliases -- confirm schema).
        self.cursor.execute("SELECT * from profile")
        for row in self.cursor.fetchall():
            self._index(row[1], ['ProfileId', row[0]])
            for alias in (row[2] or '').split(','):
                self._index(alias, ['ProfileId', row[0]])

        self.cursor.execute("SELECT * from skillset")
        for row in self.cursor.fetchall():
            self._index(row[1], ['SkillId', row[0]])

    def _index(self, phrase, entry):
        """Store ``entry`` under ``phrase``; skip NULL/empty/non-text keys."""
        # An empty key would match the empty tokens produced by repeated
        # spaces in a search string, and a non-string key cannot be stored
        # in a StringTrie at all.
        if isinstance(phrase, str) and phrase:
            self.tr[phrase] = entry

    def contains(self, string):
        """Return the ``[field, value]`` entry for ``string`` or ``[]``."""
        try:
            return self.tr[string]
        except (KeyError, TypeError):
            return []

    def listToString(self, fieldName, listOfString):
        """Render ``" ( f = a or f = b )"`` for the given field and values.

        Values are inserted with ``str()`` and are NOT quoted or escaped,
        so this must not be fed untrusted or textual data; ``search`` uses
        bound parameters instead.  An empty list yields an empty
        parenthesised group.
        """
        print("LENGTH OF ", len(listOfString))
        comparisons = [fieldName + " = " + str(value)
                       for value in listOfString]
        return " ( " + " or ".join(comparisons) + " )"

    def rowToList(self, rowList):
        """Render the first column of every row as ``" in (a,b,c)"``."""
        # join() keeps the parentheses balanced for an empty list; the old
        # trailing-comma trim removed the opening parenthesis in that case.
        return " in (" + ",".join(str(row[0]) for row in rowList) + ")"

    def _match_phrase(self, tokens, start):
        """Look up a 1-, 2- or 3-token phrase beginning at ``start``.

        Shorter phrases are tried first, as before.

        Returns:
            ``(entry, width)`` where ``width`` is the number of tokens the
            match covers, or ``([], 1)`` when nothing matches.
        """
        for width in (1, 2, 3):
            if start + width > len(tokens):
                break
            entry = self.contains(" ".join(tokens[start:start + width]))
            if len(entry) > 0:
                return entry, width
        return [], 1

    @staticmethod
    def _nearest_int(tokens, index):
        """Return the integer token closest to ``index`` or ``None``.

        At each distance the token to the right is tried before the one to
        the left.  Positions outside the list are skipped instead of
        wrapping around through negative indices.
        """
        for distance in range(1, len(tokens)):
            for position in (index + distance, index - distance):
                if 0 <= position < len(tokens):
                    try:
                        return int(tokens[position])
                    except ValueError:
                        continue
        return None

    def search(self, searchString):
        """Run a keyword search and print the matching rows.

        The string is split on single spaces.  Every recognised phrase
        contributes a value to its field; values of the same field are
        OR-ed, different fields are AND-ed.  Rows are printed to stdout and
        the method waits for Enter after every 40 rows.  Nothing is
        returned.

        Differences from the previous implementation:
        * values are sent as bound parameters, so textual values such as
          locations are no longer emitted unquoted into the SQL text;
        * tokens consumed by a multi-word match are not looked up again;
        * no query is executed when no phrase was recognised.
        """
        tokens = searchString.split(' ')
        values_by_field = {}
        expr = 0  # last number found for an "experience" phrase
        i = 0
        while i < len(tokens):
            entry, width = self._match_phrase(tokens, i)
            if len(entry) > 0:
                field, value = entry[0], entry[1]
                print(tokens[i], field, "'" + field + "'", value)
                if field == "experience":
                    number = self._nearest_int(tokens, i)
                    if number is not None:
                        expr = number
                    print(expr)
                    # NOTE(review): the quoted key ends up as the left-hand
                    # side of the comparison, i.e. a string literal rather
                    # than a column -- kept as in the original; confirm
                    # the intended column name.
                    values_by_field.setdefault("'" + field + "'",
                                               []).append(expr)
                else:
                    values_by_field.setdefault(field, []).append(value)
            i += width

        if not values_by_field:
            print("No searchable terms found")
            return

        # Field names come from the trie entries built in _load_index, not
        # from the search string; only the values are parameterised.
        clauses = []
        parameters = []
        for field, values in values_by_field.items():
            placeholders = " or ".join(field + " = ?" for _ in values)
            clauses.append(" ( " + placeholders + " )")
            parameters.extend(values)
        queryString = (" select * from Employee_View where " +
                       " and ".join(clauses))
        print(queryString, parameters)

        self.cursor.execute(queryString, parameters)
        for shown, row in enumerate(self.cursor.fetchall(), start=1):
            print(row)
            if shown % 40 == 0:
                # Pause so the user can read the current page of results.
                input()
#!/usr/bin/env python3 import re import dateutil.parser import pytrie import db IS_SELLING_TRIE = pytrie.StringTrie(wts=True, selling=True, wtb=False, buying=False) # TODO: support things like WTS CoS 10 k (space between number and k) PRICE_REGEX = re.compile(r'^(\d*\.?\d*)(k|p|pp)?$') USELESS_PUNCTUATION_REGEX = re.compile(r'^[^\d\w]*(.*?)$') SPLIT_REGEX = re.compile(r"^\[[^ ]+ ([^]]+)] ([^ ]+) auctions, '(.+)'$") DIGIT_REGEX = re.compile(r'\d') DEBUG = False def debug_print(message): if DEBUG: print(message) def split_line(line): """Parses text and returns a timestamp, character, and message.""" # Lines like: [Sun Jan 01 13:45:35 2017] Toon auctions, 'WTS Ale' match = SPLIT_REGEX.match(line)