Exemple #1
0
    def test_when_file_does_not_exist(self):
        """Opening with flag 'n' yields an empty database every time."""
        db_path = os.path.join(self.tempdir, 'foo.db')

        # First pass: populate a brand-new database and read the value back.
        original = semidbm.open(db_path, 'n')
        original['foo'] = 'bar'
        self.assertEqual(original['foo'], b'bar')
        original.close()

        # Second pass: reopening with 'n' should discard whatever was
        # stored previously, leaving no keys behind.
        recreated = semidbm.open(db_path, 'n')
        remaining_keys = list(recreated.keys())
        self.assertEqual(remaining_keys, [])
        recreated.close()
Exemple #2
0
 def test_checksum_failure(self):
     """Corrupt one checksum digit and verify it is detected only when
     checksum verification is enabled."""
     db = semidbm.open(self.dbdir, 'c')
     db['key'] = 'value'
     db.close()
     # Change the first digit of the checksum data.
     data_file = self.open_data_file(mode='r')
     # 3:key15:<checksum>value
     # The code below treats index 8 (the ninth byte) as the first
     # checksum digit.
     beginning = data_file.read()
     data_file.close()
     # Wrap around so that a '9' becomes '0'; writing '10' would insert an
     # extra byte and shift the record instead of flipping a single digit.
     new_digit = (int(beginning[8]) + 1) % 10
     data_file = self.open_data_file(mode='w')
     data_file.write(beginning[:8])
     data_file.write(str(new_digit))
     data_file.write(beginning[9:])
     data_file.close()
     db = self.open_db_file(verify_checksums=True)
     try:
         with self.assertRaises(semidbm.DBMChecksumError):
             db['key']
     finally:
         # Release the handle before the database is opened again.
         db.close()
     # If checksums are not enabled, an exception is not raised.
     db = self.open_db_file(verify_checksums=False)
     try:
         db['key']
     except semidbm.DBMChecksumError:
         self.fail("Checksums were suppose to be disabled.")
     finally:
         db.close()
    def __init__(self, csm):
        """Keep a reference to ``csm`` and open the account bridge store.

        The database path is read from the ``account-bridge-filename``
        config variable, with ``'account-bridge'`` passed as the default.
        """
        self.csm = csm

        # The store is dbm-backed; it is opened with flag 'c'.
        bridge_path = simbase.config.GetString(
            'account-bridge-filename', 'account-bridge')
        self.dbm = semidbm.open(bridge_path, 'c')
Exemple #4
0
 def test_load_empty_db(self):
     """A freshly created database exposes no keys when reopened."""
     db = semidbm.open(self.dbdir, 'c')
     db.close()
     empty_db = self.open_db_file()
     # Materialise the keys before closing: keys() may hand back a view
     # rather than a list, and a view never compares equal to [].
     keys = list(empty_db.keys())
     empty_db.close()
     self.assertEqual(keys, [])
Exemple #5
0
    def test_key_does_not_exist(self):
        """Looking up a key that was never stored raises KeyError."""
        writer = semidbm.open(self.dbdir, 'c')
        writer['foo'] = 'bar'
        writer.close()

        read_only = self.open_db_file()
        # 'bar' was only ever written as a value, never as a key.
        with self.assertRaises(KeyError):
            read_only['bar']
        read_only.close()
Exemple #6
0
    def test_when_files_exist(self):
        """A database opened with flag 'w' returns previously stored data."""
        seeded = self.open_db_file()
        seeded['foo'] = 'bar'
        seeded.close()

        db_write_mode = semidbm.open(self.dbdir, 'w')
        stored = db_write_mode['foo']
        self.assertEqual(stored, b'bar')
        db_write_mode.close()
Exemple #7
0
 def test_open_read_multiple_times(self):
     """Data stays readable after an intermediate open/close cycle."""
     writer = semidbm.open(self.dbdir, 'c')
     writer['foo'] = 'bar'
     writer.close()
     # An open immediately followed by a close must not disturb the data.
     self.open_db_file().close()
     read_only = self.open_db_file()
     stored = read_only['foo']
     self.assertEqual(stored, b'bar')
     read_only.close()
Exemple #8
0
 def test_open_read_multiple_times(self):
     """Data stays readable after an intermediate open/close cycle."""
     writer = semidbm.open(self.dbdir, 'c')
     writer['foo'] = 'bar'
     writer.close()
     # An open immediately followed by a close must not disturb the data.
     self.open_db_file().close()
     read_only = self.open_db_file()
     stored = read_only['foo']
     self.assertEqual(stored, b'bar')
     read_only.close()
Exemple #9
0
    def create(self):
        """Create a new database, on disk or in memory.

        With a filename set, a fresh dbm file is created, tagged with this
        object's type under a reserved key, and synced. Without one, a
        plain dict is used instead.

        @raise anydbm.error: If there's a problem creating the database.
        """
        if not self.filename:
            # No backing file: keep everything in an in-memory dict.
            self.db = {}
            return

        self.db = anydbm.open(self.filename, "n")  # may raise anydbm.error
        self.db["--Reserved--type"] = self.type
        self.db.sync()
Exemple #10
0
    def test_can_read_items(self):
        """Values written with str keys can be read back with bytes keys."""
        db = semidbm.open(self.dbdir, 'c')
        for key, value in (('foo', 'bar'), ('bar', 'baz'), ('baz', 'foo')):
            db[key] = value
        db.close()

        read_only = self.open_db_file()
        expected = ((b'foo', b'bar'), (b'bar', b'baz'), (b'baz', b'foo'))
        for key, value in expected:
            self.assertEqual(read_only[key], value)
        read_only.close()
Exemple #11
0
    def test_can_read_items(self):
        """Values written with str keys can be read back with bytes keys."""
        db = semidbm.open(self.dbdir, 'c')
        for key, value in (('foo', 'bar'), ('bar', 'baz'), ('baz', 'foo')):
            db[key] = value
        db.close()

        read_only = self.open_db_file()
        expected = ((b'foo', b'bar'), (b'bar', b'baz'), (b'baz', b'foo'))
        for key, value in expected:
            self.assertEqual(read_only[key], value)
        read_only.close()
Exemple #12
0
 def __init__(self, dbdir, check_frequency=20, max_filesize=MAX_DISK_USAGE):
     """Open the semidbm cache at ``dbdir`` (flag 'c') and reset counters.

     ``check_frequency`` and ``max_filesize`` are stored as given.
     """
     import semidbm

     cache_db = semidbm.open(dbdir, 'c')
     self._db = cache_db
     self._max_filesize = max_filesize

     # The cache file size is only checked periodically. With a check
     # every 20 writes the limit can be overshot by at most
     # MAX_BODY_SIZE * check_frequency, i.e. about 20MB with the defaults.
     self._check_frequency = check_frequency
     self._counter = 0

     # Writing to the cache is disabled once the max disk size is reached.
     self._writes_enabled = True
Exemple #13
0
    def open(self):
        """Open an existing on-disk database and validate its type tag.

        @raise anydbm.error: If there's a problem opening the database.
        @raise ValueError: If no filename is set, the reserved type key is
            missing, or the stored type differs from this object's type.
        """
        if not self.filename:
            raise ValueError("Can only open on-disk databases")
        self.db = anydbm.open(self.filename, "w")  # may raise anydbm.error
        try:
            stored_type = self.db["--Reserved--type"]
        except KeyError:
            raise ValueError("Not a recognized database")
        if stored_type != self.type:
            raise ValueError("Not a %s database" % self.type)
Exemple #14
0
 def test_checksum_failure(self):
     """Tampering with a stored value is detected only when checksum
     verification is enabled."""
     db = semidbm.open(self.dbdir, 'c')
     db[b'key'] = b'value'
     db.close()

     # Read the raw data file, turn 'value' into 'Value' and write it
     # back; this should cause a checksum failure.
     reader = self.open_data_file(mode='rb')
     original = reader.read()
     reader.close()
     tampered = original.replace(b'value', b'Value')
     writer = self.open_data_file(mode='wb')
     writer.write(tampered)
     writer.close()

     checked = self.open_db_file(verify_checksums=True)
     with self.assertRaises(semidbm.DBMChecksumError):
         checked['key']
     checked.close()

     # If checksums are not enabled, an exception is not raised.
     unchecked = self.open_db_file(verify_checksums=False)
     try:
         unchecked['key']
     except semidbm.DBMChecksumError:
         self.fail("Checksums were suppose to be disabled.")
     finally:
         unchecked.close()
Exemple #15
0
 def test_checksum_failure(self):
     """Tampering with a stored value is detected only when checksum
     verification is enabled."""
     db = semidbm.open(self.dbdir, 'c')
     db[b'key'] = b'value'
     db.close()

     # Read the raw data file, turn 'value' into 'Value' and write it
     # back; this should cause a checksum failure.
     reader = self.open_data_file(mode='rb')
     original = reader.read()
     reader.close()
     tampered = original.replace(b'value', b'Value')
     writer = self.open_data_file(mode='wb')
     writer.write(tampered)
     writer.close()

     checked = self.open_db_file(verify_checksums=True)
     with self.assertRaises(semidbm.DBMChecksumError):
         checked['key']
     checked.close()

     # If checksums are not enabled, an exception is not raised.
     unchecked = self.open_db_file(verify_checksums=False)
     try:
         unchecked['key']
     except semidbm.DBMChecksumError:
         self.fail("Checksums were suppose to be disabled.")
     finally:
         unchecked.close()
Exemple #16
0
 def __init__(self):
     """Open the kanwa dictionary read-only unless one is already set."""
     if self._kanwadict is not None:
         return
     # FIXME: no hardcoded filename
     dictpath = resource_filename(__name__, 'kanwadict3.db')
     self._kanwadict = dbm.open(dictpath, 'r')
Exemple #17
0
 def open_db_file(self, **kwargs):
     """Open ``self.dbdir`` with flag 'c', verifying checksums by default.

     Keyword arguments are forwarded to ``semidbm.open``; an explicit
     ``verify_checksums`` value takes precedence over the default of True.
     """
     kwargs.setdefault('verify_checksums', True)
     return semidbm.open(self.dbdir, 'c', **kwargs)
Exemple #18
0
 def reader(self):
     """Reopen the file with flag 'r' and return self.

     Any currently open handle is closed first.
     """
     self.close()
     handle = dbm.open(self._path, 'r')
     self._file = handle
     return self
Exemple #19
0
    def writer(self):
        """Reopen the file with flag 'n' and return self.

        Any currently open handle is closed first. Will always create a
        new file.
        """
        self.close()
        fresh = dbm.open(self._path, 'n')
        self._file = fresh
        return self
Exemple #20
0
 def __init__(self, name):
     """Open the database ``name`` under ``directory()`` with flag 'c'."""
     super(Database, self).__init__()
     db_path = os.path.join(directory(), name)
     self.db = semidbm.open(db_path, 'c')
Exemple #21
0
 def open_db_file(self, **kwargs):
     """Open ``self.dbdir`` with flag 'r', forwarding keyword arguments."""
     read_only = semidbm.open(self.dbdir, 'r', **kwargs)
     return read_only
Exemple #22
0
 def reader(self):
     """Reopen the file with flag 'r' and return self.

     Any currently open handle is closed first.
     """
     self.close()
     handle = dbm.open(self._path, 'r')
     self._file = handle
     return self
Exemple #23
0
 def appender(self):
     """Reopen the file with flag 'c' and return self.

     Acts as a writer that preserves the file if it already exists; any
     currently open handle is closed first.
     """
     self.close()
     handle = dbm.open(self._path, 'c')
     self._file = handle
     return self
Exemple #24
0
 def open_db_file(self, **kwargs):
     """Open ``self.dbdir`` with flag 'r', forwarding keyword arguments."""
     read_only = semidbm.open(self.dbdir, 'r', **kwargs)
     return read_only
Exemple #25
0
 def __init__(self, dbm_file):
     """Open ``dbm_file`` with flag 'r' and keep the handle."""
     handle = semidbm.open(dbm_file, 'r')
     self._dbm = handle
Exemple #26
0
	def __init__(self, dbm_file):
		"""Open ``dbm_file`` with flag 'r' and keep the handle."""
		handle = semidbm.open(dbm_file, 'r')
		self._dbm = handle
Exemple #27
0
 def open(self):
     """Open both category indexes with flag 'c' and return self."""
     subcat = semidbm.open(self._subcat_index_file, 'c')
     self._subcat_index = subcat
     supercat = semidbm.open(self._supercat_index_file, 'c')
     self._supercat_index = supercat
     return self
Exemple #28
0
 def test_unicode_chars(self):
     """UTF-8 encoded bytes work as both key and value."""
     # "cafe" with an acute accent on the e, encoded as UTF-8.
     cafe = b'caf\xc3\xa9'
     db = semidbm.open(self.dbdir, 'c')
     db[cafe] = cafe
     self.assertEqual(db[cafe], cafe)
     db.close()
Exemple #29
0
 def __init__(self):
     """Open the kanwa dictionary read-only unless one is already set."""
     if self._kanwadict is not None:
         return
     # FIXME: no hardcoded filename
     dictpath = resource_filename(__name__, 'kanwadict3.db')
     self._kanwadict = dbm.open(dictpath, 'r')
Exemple #30
0
 def open_db_file(self, **kwargs):
     """Open ``self.dbdir`` with flag 'c', verifying checksums by default.

     Keyword arguments are forwarded to ``semidbm.open``; an explicit
     ``verify_checksums`` value takes precedence over the default of True.
     """
     kwargs.setdefault('verify_checksums', True)
     return semidbm.open(self.dbdir, 'c', **kwargs)
Exemple #31
0
 def appender(self):
     """Reopen the file with flag 'c' and return self.

     Acts as a writer that preserves the file if it already exists; any
     currently open handle is closed first.
     """
     self.close()
     handle = dbm.open(self._path, 'c')
     self._file = handle
     return self
Exemple #32
0
 def shelve_open_semidbm(filename,
                         flag='c',
                         protocol=None,
                         writeback=False):
     """Return a ``shelve.Shelf`` stored in a semidbm database.

     ``filename`` and ``flag`` are handed to ``semidbm.open``;
     ``protocol`` and ``writeback`` are handed to ``shelve.Shelf``.
     """
     import semidbm  # pylint: disable=import-error
     backend = semidbm.open(filename, flag)
     return shelve.Shelf(backend, protocol, writeback)
Exemple #33
0
    def writer(self):
        """Reopen the file with flag 'n' and return self.

        Any currently open handle is closed first. Will always create a
        new file.
        """
        self.close()
        fresh = dbm.open(self._path, 'n')
        self._file = fresh
        return self
Exemple #34
0
# This file is the starting file for a rhyming program using NLTK.

import semidbm
import random
import time

from nltk.tokenize import RegexpTokenizer

# Open the syllable and rhyme databases at import time. No flag is passed,
# so semidbm's default open mode applies.
syllablesDB = semidbm.open('words.db')
rhymesDB = semidbm.open('rhymes.db')


def rhyme(word, count):
    """Return the lowercased words that rhyme with ``word``.

    ``count`` filters the result by syllable count; 0 disables the filter.
    An empty list is returned when a lookup fails (for example when
    ``word`` is not in the syllable database).
    """
    try:
        wordSyllables = syllablesDB[word.upper()].decode().split()[0]
        # The loop variable is deliberately not called ``word`` so it can
        # never shadow the parameter used on the next line.
        wordRhymes = [w.decode() for w in rhymesDB[wordSyllables].split()]
        wordRhymes.remove(word.upper())
        # NOTE(review): the second field of the record is compared to
        # ``count`` as-is; confirm both sides have the same type.
        backlist = [x.lower() for x in wordRhymes
                    if count == 0 or syllablesDB[x].split()[1] == count]
    except (KeyError, IndexError, ValueError):
        # Missing keys, malformed records and a word absent from its own
        # rhyme list all mean "no rhymes". Anything else is a real error
        # and is no longer hidden by a bare ``except``.
        backlist = []
    return backlist


def rhymesWith(word1, word2):
    """Determines if two words rhyme."""
Exemple #35
0
 def test_unicode_chars(self):
     """UTF-8 encoded bytes work as both key and value."""
     # "cafe" with an acute accent on the e, encoded as UTF-8.
     cafe = b'caf\xc3\xa9'
     db = semidbm.open(self.dbdir, 'c')
     db[cafe] = cafe
     self.assertEqual(db[cafe], cafe)
     db.close()
Exemple #36
0
 def __init__(self, filename, flag='c', protocol=pickle.HIGHEST_PROTOCOL):
     """Open ``filename`` via semidbm with ``flag`` and record ``protocol``."""
     backing = semidbm.open(filename, flag=flag)
     self._my_file = backing
     self._protocol = protocol
Exemple #37
0
 def __init__(self, db=None):
     """Use ``db`` as the store, opening it via semidbm if it is a path.

     A ``str`` is treated as a database path and opened with flag 'c';
     any other value (including None) is stored unchanged.
     """
     self.db = semidbm.open(db, 'c') if isinstance(db, str) else db
Exemple #38
0
 def __init__(self, conf=None):
     """Open the semidbm database at ``conf`` with flag 'c'.

     The parent directory of ``conf`` is created first when it is missing.

     Raises:
         Exception: If ``conf`` is None.
     """
     if conf is None:
         raise Exception('Path required.')
     parent = os.path.dirname(conf)
     # dirname() is '' for a bare name such as 'cache.db', and
     # os.makedirs('') raises, so only create a parent when there is one.
     if parent and not os.path.exists(parent):
         os.makedirs(parent)
     self._db = semidbm.open(conf, 'c')
Exemple #39
0
categories = collections.defaultdict(list)
review_dict = {}

terms_to_collect = set()
businesses_to_collect = set()

# Map each encoded business id to (name, categories, stars), leaving the
# 'Restaurants' category out.
with open('Business.json') as f:
    businesses = {
        b['business_id'].encode('utf-8'): (
            b['name'].encode('utf-8'),
            [cat.encode('utf-8')
             for cat in b['categories'] if cat != 'Restaurants'],
            b['stars'],
        )
        for b in json.load(f)
    }

# Index the reviews: keep each review's data and record, for every term
# matched by tokenize_regex, which reviews contain it and with what weight.
with open(city + '.json') as reviews:
    reviews_list = json.load(reviews)
    for review in reviews_list:
        review_id = review['review_id'].encode('utf-8')
        business_id = review['business_id'].encode('utf-8')
        review_dict[review_id] = (
            review['text'].encode('utf-8'),
            business_id,
            review['stars'],
            review['date'].encode('utf-8'),
        )
        terms = tokenize_regex.findall(review['text'].lower())
        if terms:
            # Every term occurrence in a review carries the same weight.
            review_weight = 1.0/len(terms)
            for term in terms:
                encoded_term = term.encode('utf-8')
                categories[encoded_term].append((review_id, review_weight))
                terms_to_collect.add(encoded_term)
            # Only businesses with at least one tokenised review are kept.
            businesses_to_collect.add(business_id)

# Write everything to a new shelf; the key prefix identifies the kind of
# record ("c=", "r=", "b=" or "t=").
s = shelve.Shelf(semidbm.open(city + '-Baseline.db', flag='n'),
                 protocol=pickle.HIGHEST_PROTOCOL)
for k, v in categories.items():
    s["c=" + k] = v
for k, v in review_dict.items():
    s["r=" + k] = v
for b in businesses_to_collect:
    s["b=" + b] = businesses[b]
for t in terms_to_collect:
    s["t=" + t] = [(t, 1.0)]
s.close()
    def __init__(self, csm):
        """Keep a reference to ``csm`` and open the account bridge store."""
        self.csm = csm

        # The path comes from config, with 'account-bridge' passed as the
        # default value.
        config = simbase.config
        filename = config.GetString('account-bridge-filename',
                                    'account-bridge')
        self.dbm = semidbm.open(filename, 'c')
 def __init__(self, filename, flag='c', protocol=None, writeback=False):
     """Initialise the shelf on top of a semidbm database at ``filename``."""
     import semidbm
     backend = semidbm.open(filename, flag)
     shelve.Shelf.__init__(self, backend, protocol, writeback)
Exemple #42
0
 def __init__(self, name):
     """Open the database ``name`` under ``directory()`` with flag 'c'.

     A NotADirectoryError is logged as an old-format database and not
     re-raised; ``self.db`` is left unset in that case.
     """
     super(Database, self).__init__()
     try:
         db_path = os.path.join(directory(), name)
         self.db = semidbm.open(db_path, 'c')
     except NotADirectoryError:
         logger.error("Old database type encountered!")
Exemple #43
0
 def shelve_open_semidbm(filename,
                         flag='c',
                         protocol=None,
                         writeback=False):
     """Return a ``shelve.Shelf`` stored in a semidbm database.

     ``filename`` and ``flag`` are handed to ``semidbm.open``;
     ``protocol`` and ``writeback`` are handed to ``shelve.Shelf``.
     """
     import semidbm
     backend = semidbm.open(filename, flag)
     return shelve.Shelf(backend, protocol, writeback)
Exemple #44
0
 def __enter__(self):
     """Open the cache directory with flag 'c' and return the handle."""
     cache = semidbm.open(self._cache_dir, flag='c')
     self._cache = cache
     return self._cache
Exemple #45
0
 def open(self):
     """Open ``self.path`` with flag 'c' and keep the handle on ``self.db``."""
     handle = semidbm.open(self.path, 'c')
     self.db = handle
Exemple #46
0
 def open_db_file(self):
     """Open ``myfile.db`` inside ``self.tempdir`` with flag 'c'."""
     db_path = os.path.join(self.tempdir, 'myfile.db')
     return semidbm.open(db_path, 'c')
Exemple #47
0
 def kanwaout(self, out):
     """Write every record to the dbm file ``out`` (opened with flag 'c').

     Each value is stored as ``compress(dumps(value))`` under its key.
     The file is closed even when a write fails.
     """
     dic = dbm.open(out, 'c')
     try:
         for (k, v) in self.records.items():
             dic[k] = compress(dumps(v))
     finally:
         # Always release the handle so a failed write does not leak it.
         dic.close()
Exemple #48
0
 def __init__(self, city):
     """Open ``<city>.db`` with flag 'r' and wrap it in a ``shelve.Shelf``."""
     self.city = city
     db_path = city + '.db'
     self.f = semidbm.open(db_path, flag='r')
     self.db = shelve.Shelf(self.f, protocol=pickle.HIGHEST_PROTOCOL)
Exemple #49
0
 def kanwaout(self, out):
     """Write every record to the dbm file ``out`` (opened with flag 'c').

     Each value is stored as ``compress(dumps(value))`` under its key.
     The file is closed even when a write fails.
     """
     dic = dbm.open(out, 'c')
     try:
         for (k, v) in self.records.items():
             dic[k] = compress(dumps(v))
     finally:
         # Always release the handle so a failed write does not leak it.
         dic.close()
Exemple #50
0
def run_bench(N, db_tpl) -> Dict[str, Dict[str, float]]:
    """Benchmark writes and reads of ``N`` items across several stores.

    ``db_tpl`` is a format string; it is formatted with each backend's name
    to obtain that backend's database path. Files left over from a previous
    run are removed before anything is measured.

    Returns a mapping of backend name to timings keyed by ``"write"`` and,
    where a read pass is measured, ``"read"``. Values are whatever
    ``MeasureTime.get()`` reports. The ``dbm.gnu`` entries are only present
    when ``gdbm`` is truthy.
    """

    batchsize = 1000

    LMDBM_FILE = db_tpl.format("lmdbm")
    LMDBM_BATCH_FILE = db_tpl.format("lmdbm-batch")
    PYSOS_FILE = db_tpl.format("pysos")
    SQLITEDICT_FILE = db_tpl.format("sqlitedict")
    SQLITEDICT_BATCH_FILE = db_tpl.format("sqlitedict-batch")
    DBM_DUMB_FILE = db_tpl.format("dbm.dumb")
    DBM_GNU_FILE = db_tpl.format("dbm.gnu")
    SEMIDBM_FILE = db_tpl.format("semidbm")
    VEDIS_FILE = db_tpl.format("vedis")
    VEDIS_BATCH_FILE = db_tpl.format("vedis-batch")
    UNQLITE_FILE = db_tpl.format("unqlite")
    UNQLITE_BATCH_FILE = db_tpl.format("unqlite-batch")

    # Start from a clean slate so every backend is measured on an empty
    # store. Backends with their own layout use a dedicated remover.
    remove_lmdbm(LMDBM_FILE)
    remove_lmdbm(LMDBM_BATCH_FILE)
    remove_dbm(DBM_DUMB_FILE)
    remove_semidbm(SEMIDBM_FILE)
    # The remaining stores are unlinked as plain files. DBM_GNU_FILE is
    # included so the "c" open below does not reuse data from an earlier
    # run, which would skew the dbm.gnu numbers relative to the others.
    for stale_file in (
        PYSOS_FILE,
        SQLITEDICT_FILE,
        SQLITEDICT_BATCH_FILE,
        DBM_GNU_FILE,
        VEDIS_FILE,
        VEDIS_BATCH_FILE,
        UNQLITE_FILE,
        UNQLITE_BATCH_FILE,
    ):
        with suppress(FileNotFoundError):
            os.unlink(stale_file)

    ret: DefaultDict[str, Dict[str, float]] = defaultdict(dict)

    # writes

    with MeasureTime() as t:
        with JsonLmdb.open(LMDBM_FILE, "c") as db:
            for k, v in data(N):
                db[k] = v
    ret["lmdbm"]["write"] = t.get()
    print("lmdbm write", N, t.get())

    with MeasureTime() as t:
        with JsonLmdb.open(LMDBM_BATCH_FILE, "c") as db:
            for pairs in batch(data(N), batchsize):
                db.update(pairs)
    ret["lmdbm-batch"]["write"] = t.get()
    print("lmdbm-batch write", N, t.get())

    with open(os.devnull, "w") as devnull:  # mute annoying "free lines" output
        with redirect_stdout(devnull):
            with MeasureTime() as t:
                db = pysos.Dict(PYSOS_FILE)
                for k, v in data(N):
                    db[k] = v
                db.close()
    ret["pysos"]["write"] = t.get()
    print("pysos write", N, t.get())

    with MeasureTime() as t:
        with SqliteDict(SQLITEDICT_FILE, autocommit=True) as db:
            for k, v in data(N):
                db[k] = v
    ret["sqlitedict"]["write"] = t.get()
    print("sqlitedict write", N, t.get())

    with MeasureTime() as t:
        with SqliteDict(SQLITEDICT_BATCH_FILE, autocommit=False) as db:
            for pairs in batch(data(N), batchsize):
                db.update(pairs)
                db.commit()
    ret["sqlitedict-batch"]["write"] = t.get()
    print("sqlitedict-batch write", N, t.get())

    # From here on the values are serialised with json.dumps before being
    # stored.
    with MeasureTime() as t:
        with dbm.dumb.open(DBM_DUMB_FILE, "c") as db:
            for k, v in data(N):
                db[k] = json.dumps(v)
    ret["dbm.dumb"]["write"] = t.get()
    print("dbm.dumb write", N, t.get())

    if gdbm:
        with MeasureTime() as t:
            with dbm.gnu.open(DBM_GNU_FILE, "c") as db:
                for k, v in data(N):
                    db[k] = json.dumps(v)
        ret["dbm.gnu"]["write"] = t.get()
        print("dbm.gnu write", N, t.get())

    with MeasureTime() as t:
        db = semidbm.open(SEMIDBM_FILE, "c")
        for k, v in data(N):
            db[k] = json.dumps(v)
        db.close()
    ret["semidbm"]["write"] = t.get()
    print("semidbm write", N, t.get())

    with MeasureTime() as t:
        with Vedis(VEDIS_FILE) as db:
            for k, v in data(N):
                db[k] = json.dumps(v)
    ret["vedis"]["write"] = t.get()
    print("vedis write", N, t.get())

    with MeasureTime() as t:
        with Vedis(VEDIS_BATCH_FILE) as db:
            for pairs in batch(data(N), batchsize):
                db.update({k: json.dumps(v) for k, v in pairs})
    ret["vedis-batch"]["write"] = t.get()
    print("vedis-batch write", N, t.get())

    with MeasureTime() as t:
        with UnQLite(UNQLITE_FILE) as db:
            for k, v in data(N):
                db[k] = json.dumps(v)
    ret["unqlite"]["write"] = t.get()
    print("unqlite write", N, t.get())

    with MeasureTime() as t:
        with UnQLite(UNQLITE_BATCH_FILE) as db:
            for pairs in batch(data(N), batchsize):
                db.update({k: json.dumps(v) for k, v in pairs})
    ret["unqlite-batch"]["write"] = t.get()
    print("unqlite-batch write", N, t.get())

    # reads

    # The pass over allkeys(N) is printed only; the recorded "read" figure
    # for lmdbm comes from the randkeys pass below.
    with MeasureTime() as t:
        with JsonLmdb.open(LMDBM_FILE, "r") as db:
            for k in allkeys(N):
                db[k]
    print("lmdbm cont read", N, t.get())

    with MeasureTime() as t:
        with JsonLmdb.open(LMDBM_FILE, "r") as db:
            for k in randkeys(N, N):
                db[k]
    ret["lmdbm"]["read"] = t.get()
    print("lmdbm rand read", N, t.get())

    with open(os.devnull, "w") as devnull:  # mute annoying "free lines" output
        with redirect_stdout(devnull):
            with MeasureTime() as t:
                db = pysos.Dict(PYSOS_FILE)
                for k in randkeys(N, N):
                    db[k]
                db.close()
    ret["pysos"]["read"] = t.get()
    print("pysos read", N, t.get())

    with MeasureTime() as t:
        with SqliteDict(SQLITEDICT_FILE) as db:
            for k in randkeys(N, N):
                db[k]
    ret["sqlitedict"]["read"] = t.get()
    print("sqlitedict read", N, t.get())

    # The stores written with json.dumps are read back through json.loads.
    with MeasureTime() as t:
        with dbm.dumb.open(DBM_DUMB_FILE, "r") as db:
            for k in randkeys(N, N):
                json.loads(db[k])
    ret["dbm.dumb"]["read"] = t.get()
    print("dbm.dumb read", N, t.get())

    if gdbm:
        with MeasureTime() as t:
            with dbm.gnu.open(DBM_GNU_FILE, "r") as db:
                for k in randkeys(N, N):
                    json.loads(db[k])
        ret["dbm.gnu"]["read"] = t.get()
        print("dbm.gnu read", N, t.get())

    with MeasureTime() as t:
        db = semidbm.open(SEMIDBM_FILE, "r")
        for k in randkeys(N, N):
            json.loads(db[k])
        db.close()
    ret["semidbm"]["read"] = t.get()
    print("semidbm read", N, t.get())

    with MeasureTime() as t:
        with Vedis(VEDIS_FILE) as db:
            for k in randkeys(N, N):
                json.loads(db[k])
    ret["vedis"]["read"] = t.get()
    print("vedis read", N, t.get())

    with MeasureTime() as t:
        with UnQLite(UNQLITE_FILE) as db:
            for k in randkeys(N, N):
                json.loads(db[k])
    ret["unqlite"]["read"] = t.get()
    print("unqlite read", N, t.get())

    return ret
    def __init__(self, csm):
        """Keep a reference to ``csm`` and open the account bridge store."""
        self.csm = csm

        # The path comes from config, with 'account-bridge' passed as the
        # default value.
        config = simbase.config
        filename = config.GetString('account-bridge-filename',
                                    'account-bridge')
        self.dbm = semidbm.open(filename, 'c')
Exemple #52
0
import re
import pickle
import shelve
import semidbm
import pandas
from os.path import basename
from collections import defaultdict
from operator import itemgetter
from math import sqrt

# Topic count; not referenced in the visible part of this script.
NUM_TOPICS = 40

# A term is any run of ASCII letters.
tokenize_regex = re.compile(r'[A-Za-z]+')

# Shelf backed by a semidbm database at "<city>.db", opened with flag 'n'.
# NOTE(review): `city` is not defined in the visible part of this script;
# confirm where it comes from.
s = shelve.Shelf(semidbm.open(city + '.db', flag='n'),
                 protocol=pickle.HIGHEST_PROTOCOL)
# Store each business under "b=<business_id>" as
# (name, categories without 'Restaurants', stars).
for b in pandas.read_json('Business.json').itertuples():
    s["b=" + str(b.business_id)] = (b.name, [
        cat for cat in b.categories if cat != 'Restaurants'
    ], b.stars)

# NOTE(review): the review file name is hard-coded here while the shelf
# name above is derived from `city`; confirm this is intended.
reviews_body = pandas.read_json('Urbana.json')

# For every term: the (review_id, 1) pairs of its occurrences, and how
# many times it occurs in total across all reviews.
reviews_by_term = defaultdict(list)
term_frequencies = defaultdict(int)
for r in reviews_body.itertuples():
    for term in tokenize_regex.findall(r.text):
        reviews_by_term[term].append((r.review_id, 1))
        term_frequencies[term] += 1