def test_when_file_does_not_exist(self):
    """Opening with 'n' creates a fresh db and blanks any existing one."""
    path = os.path.join(self.tempdir, 'foo.db')
    first = semidbm.open(path, 'n')
    first['foo'] = 'bar'
    self.assertEqual(first['foo'], b'bar')
    first.close()
    # Re-opening in 'n' mode should wipe the previous contents.
    reopened = semidbm.open(path, 'n')
    self.assertEqual(list(reopened.keys()), [])
    reopened.close()
def test_checksum_failure(self):
    """A corrupted checksum raises DBMChecksumError when verification is
    on and is ignored when it is off."""
    db = semidbm.open(self.dbdir, 'c')
    db['key'] = 'value'
    db.close()
    # Data layout: 3:key15:<checksum>value -- the first checksum digit
    # is 9 bytes into the file.
    data_file = self.open_data_file(mode='r')
    beginning = data_file.read()
    data_file.close()
    # Wrap with % 10 so the replacement stays a single character; a bare
    # +1 would turn '9' into '10' and shift the rest of the record,
    # breaking the framing instead of just the checksum.
    new_digit = (int(beginning[8]) + 1) % 10
    data_file = self.open_data_file(mode='w')
    data_file.write(beginning[:8])
    data_file.write(str(new_digit))
    data_file.write(beginning[9:])
    data_file.close()
    db = self.open_db_file(verify_checksums=True)
    with self.assertRaises(semidbm.DBMChecksumError):
        db['key']
    db.close()
    # If checksums are not enabled, an exception is not raised.
    db = self.open_db_file(verify_checksums=False)
    try:
        db['key']
    except semidbm.DBMChecksumError:
        self.fail("Checksums were suppose to be disabled.")
    finally:
        db.close()
def __init__(self, csm):
    """Open (or create) the account-bridge dbm backing store."""
    self.csm = csm
    # Location is configurable; 'account-bridge' is the default name.
    bridge_file = simbase.config.GetString(
        'account-bridge-filename', 'account-bridge')
    self.dbm = semidbm.open(bridge_file, 'c')
def test_load_empty_db(self):
    """A freshly created, empty database reports no keys."""
    created = semidbm.open(self.dbdir, 'c')
    created.close()
    empty_db = self.open_db_file()
    stored_keys = empty_db.keys()
    empty_db.close()
    self.assertEqual(stored_keys, [])
def test_key_does_not_exist(self):
    """Looking up a missing key raises KeyError."""
    writer = semidbm.open(self.dbdir, 'c')
    writer['foo'] = 'bar'
    writer.close()
    read_only = self.open_db_file()
    # 'bar' is a value in the db, not a key.
    self.assertRaises(KeyError, read_only.__getitem__, 'bar')
    read_only.close()
def test_when_files_exist(self):
    """'w' mode opens an existing database and sees prior writes."""
    creator = self.open_db_file()
    creator['foo'] = 'bar'
    creator.close()
    writable = semidbm.open(self.dbdir, 'w')
    self.assertEqual(writable['foo'], b'bar')
    writable.close()
def test_open_read_multiple_times(self):
    """Repeated read-only opens must not disturb the stored data."""
    writer = semidbm.open(self.dbdir, 'c')
    writer['foo'] = 'bar'
    writer.close()
    # Open then close db immediately.
    transient = self.open_db_file()
    transient.close()
    read_only = self.open_db_file()
    self.assertEqual(read_only['foo'], b'bar')
    read_only.close()
def create(self):
    """Create a new on-disk database.

    @raise anydbm.error: If there's a problem creating the database.
    """
    if not self.filename:
        # No filename given: fall back to a purely in-memory dict.
        self.db = {}
        return
    self.db = anydbm.open(self.filename, "n")  # raises anydbm.error
    # Tag the database so open() can verify its type later.
    self.db["--Reserved--type"] = self.type
    self.db.sync()
def test_can_read_items(self):
    """All stored pairs are readable through a read-only handle."""
    writer = semidbm.open(self.dbdir, 'c')
    writer['foo'] = 'bar'
    writer['bar'] = 'baz'
    writer['baz'] = 'foo'
    writer.close()
    read_only = self.open_db_file()
    expected = {b'foo': b'bar', b'bar': b'baz', b'baz': b'foo'}
    for key, value in expected.items():
        self.assertEqual(read_only[key], value)
    read_only.close()
def __init__(self, dbdir, check_frequency=20, max_filesize=MAX_DISK_USAGE):
    """Disk-backed cache with a best-effort size cap.

    check_frequency: number of writes between file-size checks.  Checking
    every 20 writes means we can overshoot max_filesize by at most
    MAX_BODY_SIZE * check_frequency, or about 20MB with the defaults.
    """
    import semidbm
    self._db = semidbm.open(dbdir, 'c')
    self._max_filesize = max_filesize
    self._check_frequency = check_frequency
    self._counter = 0
    # Once the max disk size is reached, writes are disabled.
    self._writes_enabled = True
def open(self):
    """Open a pre-existing on-disk database.

    @raise anydbm.error: If there's a problem opening the database.
    @raise ValueError: If the database is not of the right type.
    """
    if not self.filename:
        raise ValueError("Can only open on-disk databases")
    self.db = anydbm.open(self.filename, "w")  # raises anydbm.error
    try:
        stored_type = self.db["--Reserved--type"]
    except KeyError:
        # The type tag written by create() is missing entirely.
        raise ValueError("Not a recognized database")
    if stored_type != self.type:
        raise ValueError("Not a %s database" % self.type)
def test_checksum_failure(self):
    """Flipping one byte of a stored value triggers a checksum error,
    but only when verify_checksums is enabled."""
    writer = semidbm.open(self.dbdir, 'c')
    writer[b'key'] = b'value'
    writer.close()
    data_file = self.open_data_file(mode='rb')
    contents = data_file.read()
    data_file.close()
    # 'value' -> 'Value' corrupts the record without changing its length.
    data_file = self.open_data_file(mode='wb')
    data_file.write(contents.replace(b'value', b'Value'))
    data_file.close()
    checked = self.open_db_file(verify_checksums=True)
    with self.assertRaises(semidbm.DBMChecksumError):
        checked['key']
    checked.close()
    # If checksums are not enabled, an exception is not raised.
    unchecked = self.open_db_file(verify_checksums=False)
    try:
        unchecked['key']
    except semidbm.DBMChecksumError:
        self.fail("Checksums were suppose to be disabled.")
    finally:
        unchecked.close()
def __init__(self):
    """Open the kanwa dictionary lazily, on first instantiation only."""
    if self._kanwadict is None:
        # FIXME: no hardcoded filename
        dict_path = resource_filename(__name__, 'kanwadict3.db')
        self._kanwadict = dbm.open(dict_path, 'r')
def open_db_file(self, **kwargs):
    """Open the test db; checksum verification defaults to on unless the
    caller explicitly sets verify_checksums."""
    kwargs.setdefault('verify_checksums', True)
    return semidbm.open(self.dbdir, 'c', **kwargs)
def reader(self):
    """Close any open handle, then reopen read-only.  Returns self."""
    self.close()
    handle = dbm.open(self._path, 'r')
    self._file = handle
    return self
def writer(self):
    """Return a new writer. Will always create a new file"""
    self.close()
    # 'n' truncates: any existing database at this path is discarded.
    handle = dbm.open(self._path, 'n')
    self._file = handle
    return self
def __init__(self, name):
    """Open (creating if needed) the named db inside the app directory."""
    super(Database, self).__init__()
    db_path = os.path.join(directory(), name)
    self.db = semidbm.open(db_path, 'c')
def open_db_file(self, **kwargs):
    """Return a read-only handle to the test database."""
    read_mode = 'r'
    return semidbm.open(self.dbdir, read_mode, **kwargs)
def appender(self):
    """Return a new writer, preserving the file if it already exists"""
    self.close()
    # 'c' opens for writing but keeps existing contents, unlike writer().
    handle = dbm.open(self._path, 'c')
    self._file = handle
    return self
def __init__(self, dbm_file):
    """Open ``dbm_file`` read-only via semidbm."""
    self._dbm = semidbm.open(dbm_file, 'r')
def open(self):
    """Open both category indexes, creating them if missing.  Returns
    self so calls can be chained."""
    subcat = semidbm.open(self._subcat_index_file, 'c')
    supercat = semidbm.open(self._supercat_index_file, 'c')
    self._subcat_index = subcat
    self._supercat_index = supercat
    return self
def test_unicode_chars(self):
    """Non-ASCII byte keys and values round-trip unchanged."""
    db = semidbm.open(self.dbdir, 'c')
    # 'cafe' with the e-acute, UTF-8 encoded.
    key = b'caf\xc3\xa9'
    db[key] = key
    self.assertEqual(db[key], b'caf\xc3\xa9')
    db.close()
def __init__(self):
    """Open the shared kanwa dictionary if no instance has done so yet."""
    if self._kanwadict is not None:
        return
    # FIXME: no hardcoded filename
    path = resource_filename(__name__, 'kanwadict3.db')
    self._kanwadict = dbm.open(path, 'r')
def shelve_open_semidbm(filename, flag='c', protocol=None, writeback=False):
    """shelve.open work-alike whose backing store is a semidbm database."""
    import semidbm  # pylint: disable=import-error
    backing = semidbm.open(filename, flag)
    return shelve.Shelf(backing, protocol, writeback)
# This file is the starting file for a rhyming program using NLTK.
import semidbm
import random
import time
from nltk.tokenize import RegexpTokenizer

# Open the rhyming databases (read-only by default).
syllablesDB = semidbm.open('words.db')
rhymesDB = semidbm.open('rhymes.db')


def rhyme(word, count):
    """Returns a list of all the words that rhyme with 'word' with
    'count' number of syllables."""
    try:
        wordSyllables = syllablesDB[word.upper()].decode().split()[0]
        wordRhymes = [w.decode() for w in rhymesDB[wordSyllables].split()]
        # The word itself is in its own rhyme class; drop it.
        wordRhymes.remove(word.upper())
        # BUG FIX: db values are bytes, so the old comparison
        # `syllablesDB[x].split()[1] == count` matched bytes against an
        # int and was always False; convert to int before comparing.
        backlist = [x.lower() for x in wordRhymes
                    if count == 0 or int(syllablesDB[x].split()[1]) == count]
    except (KeyError, ValueError, IndexError):
        # Unknown word, word missing from its rhyme class, or a
        # malformed record: report no rhymes (was a bare `except:`).
        backlist = []
    return backlist


def rhymesWith(word1, word2):
    """Determines if two words rhyme."""
def __init__(self, filename, flag='c', protocol=pickle.HIGHEST_PROTOCOL):
    """Open the backing semidbm file and remember the pickle protocol."""
    self._my_file = semidbm.open(filename, flag=flag)
    # Used when serializing values into the store.
    self._protocol = protocol
def __init__(self, db=None):
    """Accept either a path (opened via semidbm) or an open db object."""
    if isinstance(db, str):
        db = semidbm.open(db, 'c')
    self.db = db
def __init__(self, conf=None):
    """Open (creating if needed) the semidbm store at path ``conf``.

    @raise Exception: if no path is given.
    """
    if conf is None:
        raise Exception('Path required.')
    parent = os.path.dirname(conf)
    # exist_ok avoids the check-then-create race of the previous
    # os.path.exists()/os.makedirs() pair.  Skip entirely when conf has
    # no directory component: os.makedirs('') raises FileNotFoundError.
    if parent:
        os.makedirs(parent, exist_ok=True)
    self._db = semidbm.open(conf, 'c')
# Build the baseline shelf for a city: per-term review postings, raw
# reviews, businesses, and a unit base vector per term.
# NOTE: this code predates Python 3 string semantics ("c=" + encoded key
# concatenation); it is preserved exactly as written.
categories = collections.defaultdict(list)
review_dict = {}
terms_to_collect = set()
businesses_to_collect = set()

with open('Business.json') as f:
    businesses = {
        b['business_id'].encode('utf-8'): (
            b['name'].encode('utf-8'),
            [cat.encode('utf-8') for cat in b['categories']
             if cat != 'Restaurants'],
            b['stars'],
        )
        for b in json.load(f)
    }

with open(city + '.json') as reviews:
    reviews_list = json.load(reviews)

for review in reviews_list:
    review_id = review['review_id'].encode('utf-8')
    business_id = review['business_id'].encode('utf-8')
    review_dict[review_id] = (
        review['text'].encode('utf-8'),
        business_id,
        review['stars'],
        review['date'].encode('utf-8'),
    )
    terms = tokenize_regex.findall(review['text'].lower())
    if terms:
        # Each review contributes a total weight of 1, split evenly
        # across its terms.
        review_weight = 1.0 / len(terms)
        for term in terms:
            encoded_term = term.encode('utf-8')
            categories[encoded_term].append((review_id, review_weight))
            terms_to_collect.add(encoded_term)
        businesses_to_collect.add(business_id)

s = shelve.Shelf(semidbm.open(city + '-Baseline.db', flag='n'),
                 protocol=pickle.HIGHEST_PROTOCOL)
for k, v in categories.items():
    s["c=" + k] = v
for k, v in review_dict.items():
    s["r=" + k] = v
for b in businesses_to_collect:
    s["b=" + b] = businesses[b]
for t in terms_to_collect:
    s["t=" + t] = [(t, 1.0)]
s.close()
def __init__(self, csm):
    """Open the on-disk account bridge database."""
    self.csm = csm
    path = simbase.config.GetString(
        'account-bridge-filename', 'account-bridge')
    # 'c' creates the file on first run and reuses it afterwards.
    self.dbm = semidbm.open(path, 'c')
def __init__(self, filename, flag='c', protocol=None, writeback=False):
    """Initialize this Shelf with a semidbm database as its backend."""
    import semidbm
    backing = semidbm.open(filename, flag)
    shelve.Shelf.__init__(self, backing, protocol, writeback)
def __init__(self, name):
    """Open (or create) the named semidbm database in the app directory."""
    super(Database, self).__init__()
    # BUG FIX: previously self.db was left unassigned on the error path,
    # so later access raised AttributeError; pre-set it to None so the
    # attribute always exists.
    self.db = None
    try:
        self.db = semidbm.open(os.path.join(directory(), name), 'c')
    except NotADirectoryError:
        # semidbm stores a directory; an old release left a flat file
        # at this path.  Log and continue with db unavailable.
        logger.error("Old database type encountered!")
def shelve_open_semidbm(filename, flag='c', protocol=None, writeback=False):
    """Drop-in replacement for shelve.open using semidbm as the backend."""
    import semidbm
    db = semidbm.open(filename, flag)
    return shelve.Shelf(db, protocol, writeback)
def __enter__(self):
    """Open the cache db on context entry; the db is the as-target."""
    cache = semidbm.open(self._cache_dir, flag='c')
    self._cache = cache
    return cache
def open(self):
    """Open (creating if necessary) the database at self.path."""
    self.db = semidbm.open(self.path, 'c')
def open_db_file(self):
    """Create/open the scratch database inside the temp directory."""
    db_path = os.path.join(self.tempdir, 'myfile.db')
    return semidbm.open(db_path, 'c')
def kanwaout(self, out):
    """Dump every record to ``out`` as a dbm of pickled, compressed blobs."""
    dic = dbm.open(out, 'c')
    for key, record in self.records.items():
        dic[key] = compress(dumps(record))
    dic.close()
def __init__(self, city):
    """Open the read-only shelf for ``city`` built by the indexer."""
    self.city = city
    # Keep the raw semidbm handle so it can be closed independently.
    self.f = semidbm.open(city + '.db', flag='r')
    self.db = shelve.Shelf(self.f, protocol=pickle.HIGHEST_PROTOCOL)
def run_bench(N, db_tpl) -> Dict[str, Dict[str, float]]:
    """Benchmark write and read throughput of several key-value stores.

    N is the number of records to write/read; db_tpl is a filename
    template with one ``{}`` slot for the backend name.  Returns
    ``ret[backend][op]`` -> elapsed seconds.
    """
    batchsize = 1000

    # One database file per backend (plus batched-write variants).
    LMDBM_FILE = db_tpl.format("lmdbm")
    LMDBM_BATCH_FILE = db_tpl.format("lmdbm-batch")
    PYSOS_FILE = db_tpl.format("pysos")
    SQLITEDICT_FILE = db_tpl.format("sqlitedict")
    SQLITEDICT_BATCH_FILE = db_tpl.format("sqlitedict-batch")
    DBM_DUMB_FILE = db_tpl.format("dbm.dumb")
    DBM_GNU_FILE = db_tpl.format("dbm.gnu")
    SEMIDBM_FILE = db_tpl.format("semidbm")
    VEDIS_FILE = db_tpl.format("vedis")
    VEDIS_BATCH_FILE = db_tpl.format("vedis-batch")
    UNQLITE_FILE = db_tpl.format("unqlite")
    UNQLITE_BATCH_FILE = db_tpl.format("unqlite-batch")

    # Start from a clean slate: remove leftovers from earlier runs.
    remove_lmdbm(LMDBM_FILE)
    remove_lmdbm(LMDBM_BATCH_FILE)
    with suppress(FileNotFoundError):
        os.unlink(PYSOS_FILE)
    with suppress(FileNotFoundError):
        os.unlink(SQLITEDICT_FILE)
    with suppress(FileNotFoundError):
        os.unlink(SQLITEDICT_BATCH_FILE)
    remove_dbm(DBM_DUMB_FILE)
    remove_semidbm(SEMIDBM_FILE)
    with suppress(FileNotFoundError):
        os.unlink(VEDIS_FILE)
    with suppress(FileNotFoundError):
        os.unlink(VEDIS_BATCH_FILE)
    with suppress(FileNotFoundError):
        os.unlink(UNQLITE_FILE)
    with suppress(FileNotFoundError):
        os.unlink(UNQLITE_BATCH_FILE)

    ret: DefaultDict[str, Dict[str, float]] = defaultdict(dict)

    # writes
    with MeasureTime() as t:
        with JsonLmdb.open(LMDBM_FILE, "c") as db:
            for k, v in data(N):
                db[k] = v
    ret["lmdbm"]["write"] = t.get()
    print("lmdbm write", N, t.get())

    with MeasureTime() as t:
        with JsonLmdb.open(LMDBM_BATCH_FILE, "c") as db:
            for pairs in batch(data(N), batchsize):
                db.update(pairs)
    ret["lmdbm-batch"]["write"] = t.get()
    print("lmdbm-batch write", N, t.get())

    with open(os.devnull, "w") as devnull:  # mute annoying "free lines" output
        with redirect_stdout(devnull):
            with MeasureTime() as t:
                db = pysos.Dict(PYSOS_FILE)
                for k, v in data(N):
                    db[k] = v
                db.close()
    ret["pysos"]["write"] = t.get()
    print("pysos write", N, t.get())

    with MeasureTime() as t:
        with SqliteDict(SQLITEDICT_FILE, autocommit=True) as db:
            for k, v in data(N):
                db[k] = v
    ret["sqlitedict"]["write"] = t.get()
    print("sqlitedict write", N, t.get())

    with MeasureTime() as t:
        with SqliteDict(SQLITEDICT_BATCH_FILE, autocommit=False) as db:
            for pairs in batch(data(N), batchsize):
                db.update(pairs)
            db.commit()
    ret["sqlitedict-batch"]["write"] = t.get()
    print("sqlitedict-batch write", N, t.get())

    # The dbm backends store JSON text, so values are serialized by hand.
    with MeasureTime() as t:
        with dbm.dumb.open(DBM_DUMB_FILE, "c") as db:
            for k, v in data(N):
                db[k] = json.dumps(v)
    ret["dbm.dumb"]["write"] = t.get()
    print("dbm.dumb write", N, t.get())

    # gdbm is optional; skip when the module is unavailable.
    if gdbm:
        with MeasureTime() as t:
            with dbm.gnu.open(DBM_GNU_FILE, "c") as db:
                for k, v in data(N):
                    db[k] = json.dumps(v)
        ret["dbm.gnu"]["write"] = t.get()
        print("dbm.gnu write", N, t.get())

    with MeasureTime() as t:
        db = semidbm.open(SEMIDBM_FILE, "c")
        for k, v in data(N):
            db[k] = json.dumps(v)
        db.close()
    ret["semidbm"]["write"] = t.get()
    print("semidbm write", N, t.get())

    with MeasureTime() as t:
        with Vedis(VEDIS_FILE) as db:
            for k, v in data(N):
                db[k] = json.dumps(v)
    ret["vedis"]["write"] = t.get()
    print("vedis write", N, t.get())

    with MeasureTime() as t:
        with Vedis(VEDIS_BATCH_FILE) as db:
            for pairs in batch(data(N), batchsize):
                db.update({k: json.dumps(v) for k, v in pairs})
    ret["vedis-batch"]["write"] = t.get()
    print("vedis-batch write", N, t.get())

    with MeasureTime() as t:
        with UnQLite(UNQLITE_FILE) as db:
            for k, v in data(N):
                db[k] = json.dumps(v)
    ret["unqlite"]["write"] = t.get()
    print("unqlite write", N, t.get())

    with MeasureTime() as t:
        with UnQLite(UNQLITE_BATCH_FILE) as db:
            for pairs in batch(data(N), batchsize):
                db.update({k: json.dumps(v) for k, v in pairs})
    ret["unqlite-batch"]["write"] = t.get()
    print("unqlite-batch write", N, t.get())

    # reads
    # Sequential (contiguous-key) read is printed but deliberately not
    # recorded -- only the random-access read goes into ret.
    with MeasureTime() as t:
        with JsonLmdb.open(LMDBM_FILE, "r") as db:
            for k in allkeys(N):
                db[k]
    # ret["lmdbm"]["read"] = t.get()
    print("lmdbm cont read", N, t.get())

    with MeasureTime() as t:
        with JsonLmdb.open(LMDBM_FILE, "r") as db:
            for k in randkeys(N, N):
                db[k]
    ret["lmdbm"]["read"] = t.get()
    print("lmdbm rand read", N, t.get())

    with open(os.devnull, "w") as devnull:  # mute annoying "free lines" output
        with redirect_stdout(devnull):
            with MeasureTime() as t:
                db = pysos.Dict(PYSOS_FILE)
                for k in randkeys(N, N):
                    db[k]
                db.close()
    ret["pysos"]["read"] = t.get()
    print("pysos read", N, t.get())

    with MeasureTime() as t:
        with SqliteDict(SQLITEDICT_FILE) as db:
            for k in randkeys(N, N):
                db[k]
    ret["sqlitedict"]["read"] = t.get()
    print("sqlitedict read", N, t.get())

    with MeasureTime() as t:
        with dbm.dumb.open(DBM_DUMB_FILE, "r") as db:
            for k in randkeys(N, N):
                json.loads(db[k])
    ret["dbm.dumb"]["read"] = t.get()
    print("dbm.dumb read", N, t.get())

    if gdbm:
        with MeasureTime() as t:
            with dbm.gnu.open(DBM_GNU_FILE, "r") as db:
                for k in randkeys(N, N):
                    json.loads(db[k])
        ret["dbm.gnu"]["read"] = t.get()
        print("dbm.gnu read", N, t.get())

    with MeasureTime() as t:
        db = semidbm.open(SEMIDBM_FILE, "r")
        for k in randkeys(N, N):
            json.loads(db[k])
        db.close()
    ret["semidbm"]["read"] = t.get()
    print("semidbm read", N, t.get())

    with MeasureTime() as t:
        with Vedis(VEDIS_FILE) as db:
            for k in randkeys(N, N):
                json.loads(db[k])
    ret["vedis"]["read"] = t.get()
    print("vedis read", N, t.get())

    with MeasureTime() as t:
        with UnQLite(UNQLITE_FILE) as db:
            for k in randkeys(N, N):
                json.loads(db[k])
    ret["unqlite"]["read"] = t.get()
    print("unqlite read", N, t.get())

    return ret
# Build the per-city shelf: business records plus term -> review postings.
import re
import pickle
import shelve
import semidbm
import pandas
from os.path import basename
from collections import defaultdict
from operator import itemgetter
from math import sqrt

NUM_TOPICS = 40
# A token is a run of ASCII letters; everything else separates tokens.
tokenize_regex = re.compile(r'[A-Za-z]+')

# NOTE(review): `city` is not defined in this chunk -- presumably set
# earlier in the full script; confirm before running standalone.
s = shelve.Shelf(semidbm.open(city + '.db', flag='n'),
                 protocol=pickle.HIGHEST_PROTOCOL)

# Store each business under "b=<id>", dropping the generic
# 'Restaurants' category.
for b in pandas.read_json('Business.json').itertuples():
    s["b=" + str(b.business_id)] = (b.name, [
        cat for cat in b.categories if cat != 'Restaurants'
    ], b.stars)

# Invert reviews into term -> [(review_id, 1), ...] postings and count
# raw term frequencies.
reviews_body = pandas.read_json('Urbana.json')
reviews_by_term = defaultdict(list)
term_frequencies = defaultdict(int)
for r in reviews_body.itertuples():
    for term in tokenize_regex.findall(r.text):
        reviews_by_term[term].append((r.review_id, 1))
        term_frequencies[term] += 1