def test_error_conditions(self):
    """gdbm.error must be raised for missing files, closed handles, bad flags."""
    # Try to open a non-existent database.
    unlink(filename)
    self.assertRaises(gdbm.error, gdbm.open, filename, 'r')
    # Try to access a closed database.
    self.g = gdbm.open(filename, 'c')
    self.g.close()
    self.assertRaises(gdbm.error, lambda: self.g['a'])
    # Try to pass an invalid open flag ('x' is not a gdbm flag).
    self.assertRaises(gdbm.error, lambda: gdbm.open(filename, 'rx').close())
def test_flags(self):
    """Open the database with every supported flag/modifier combination."""
    supported = set(gdbm.open_flags)
    # Base open modes (presumably "crwn") are everything but the modifiers.
    base_modes = supported - set('fsu')
    for mode in base_modes:
        self.g = gdbm.open(filename, mode)
        self.g.close()
    # Modifier flags (presumably "fsu") are tried with each base mode.
    modifiers = supported - set('crwn')
    for mode in base_modes:
        for extra in modifiers:
            self.g = gdbm.open(filename, mode + extra)
            self.g.close()
def _setup(self):
    """Load metadata and open every dbm chunk under ``self._path``.

    Creates the directory tree, archive subdir and initial metadata when
    the path does not exist yet; guarantees at least one open db chunk.
    """
    if os.path.exists(self._path):
        try:
            # BUG FIX: the metadata file handle was opened inline and
            # never closed; a context manager closes it even on error.
            with open('%s/%s' % (self._path, _METADATA_FILE), 'rb') as fh:
                self._metadata = pickle.load(fh)
            print('loaded metadata: %s' % repr(self._metadata))
            logging.debug('loaded metadata %s' % repr(self._metadata))
        except IOError:
            print("IO error loading metadata?")
            self._setup_metadata()
        dbses = _load_paths(self._path)
        for db in dbses:
            try:
                self._data.append(gdbm.open(db, 'c'))
            except Exception as err:
                # Best effort: keep loading the remaining chunks.
                print('error appending dbfile: %s' % db, err)
        print('loaded %i dbm files' % len(self._data))
    else:
        print('path not found, creating')
        os.makedirs(self._path)
        os.makedirs('%s/archive' % self._path)
        self._setup_metadata()
    # Always leave at least one chunk open for writing.
    if not len(self._data):
        self._add_db()
def find_dups(directory='.', files='*.jpg', callbacks=()):
    '''Given a ``directory``, goes through all files that pass through the
    filter ``files``, and for each one that is a duplicate, calls a number
    of ``callbacks``. Returns a dictionary containing the duplicates found.

    Example usage::

        d = find_dups('some/directory',
                      callbacks=[print_dups, KeepLarger()])

    The signature for writing callbacks is (existing, dup, m), where
    ``existing`` and ``dup`` are paths and ``m`` is the
    FileExistenceManager instance.
    '''
    from pathlib import Path
    store = GdbmStorageStrategy()
    m = FileExistenceManager(store)
    dups = {}
    # BUG FIX: the default was a mutable list (shared across calls);
    # an empty tuple is equivalent for iteration and safe.
    for p in Path(directory).glob(files):
        with open(str(p), 'rb') as stream:
            existing = m.try_add_file(stream, str(p))
        if existing:
            existing = existing.decode('utf-8')
            dups[str(p)] = existing
            for function in callbacks:
                function(Path(existing), p, m)
    m.close()
    return dups
def create_db(self):
    """Create the database file if missing and restrict it to owner-only."""
    self.lock_db()
    try:
        # 'c' creates the file when absent; chmod while the handle is open.
        with gdbm.open(self.filename, 'c'):
            os.chmod(self.filename, 0o600)
    finally:
        self.unlock_db()
def main():
    """Merge rows from the CSV files in argv[2:] into the shelf at argv[1]."""
    db_file = sys.argv[1]
    done_db = shelve.Shelf(db.open(db_file, "c"))
    log.info('db has %d records', len(done_db))
    added = 0
    try:
        for csv_file in sys.argv[2:]:
            log.info("processing file %s", csv_file)
            reader = csv.DictReader(open(csv_file))
            # Keep only rows that name a file, keyed by their id.
            loaded_files = {row['id']: row
                            for row in reader if row['file'].strip()}
            log.info('csv file has %d valid records ', len(loaded_files))
            for idx, row in loaded_files.items():
                if idx not in done_db:
                    done_db[idx] = row
                    added += 1
        log.info('Done - %d added - db now has %d records',
                 added, len(done_db))
    finally:
        done_db.close()
def __init__(self, dbm_file='./warcprox-playback-index.db'):
    """Open (or create) the playback index database at *dbm_file*."""
    if os.path.exists(dbm_file):
        msg = 'opening existing playback index database {}'
    else:
        msg = 'creating new playback index database {}'
    self.logger.info(msg.format(dbm_file))
    # 'c' opens read/write, creating the file when needed.
    self.db = dbm_gnu.open(dbm_file, 'c')
def __init__(self, filename, size=None, cached=True):
    """Open the backing shelf and prepare indexes.

    ``cached`` enables shelve write-back caching; ``size`` caps the store,
    defaulting to effectively unbounded (MAXINT).
    """
    self.db = shelve.Shelf(db.open(filename, "c"), writeback=cached)
    self._init_indexes()
    self.limit = size if size else MAXINT
def __init__(self, filename):
    """Open *filename* read-only when it looks like a gdbm database file."""
    self.db = None
    if not filename.endswith(".db"):
        return
    try:
        self.db = gdbm.open(filename, "r")
    except gdbm.error as err:
        # Leave self.db as None; report the failure on stderr.
        print("Unable to open binary database %s: %s" % (filename, err),
              file=sys.stderr)
def main():
    """Load CSV records (argv[2:]) into the shelve database at argv[1]."""
    done_db = shelve.Shelf(db.open(sys.argv[1], "c"))
    log.info('db has %d records', len(done_db))
    added = 0
    try:
        for csv_file in sys.argv[2:]:
            log.info("processing file %s", csv_file)
            loaded_files = {}
            for row in csv.DictReader(open(csv_file)):
                # Skip rows whose 'file' column is blank.
                if row['file'].strip():
                    loaded_files[row['id']] = row
            log.info('csv file has %d valid records ', len(loaded_files))
            for idx, row in loaded_files.items():
                if idx not in done_db:
                    done_db[idx] = row
                    added += 1
        log.info('Done - %d added - db now has %d records',
                 added, len(done_db))
    finally:
        done_db.close()
def Create(self, infile, outfile):
    """Build the index database *outfile* from the flat data file *infile*.

    For each record, stores the "start stop" byte offsets keyed by both
    the record's ID and its accession number.  The path of the data file
    itself is stored under the "datafile" key.
    """
    db = gdbm.open(outfile, "n")
    with open(infile) as fid:
        db["datafile"] = os.path.abspath(infile)
        rec_id = acc = start = stop = None
        while True:
            line = fid.readline()
            if not line:
                break
            if line[:3] == "ID ":
                # BUG FIX: string.split(line) is Python 2 only; the
                # str method works on both 2 and 3.
                rec_id = line.split()[1]
                start = fid.tell() - len(line)
            elif line[:3] == "AC ":
                acc = line.split()[1]
                if acc[-1] == ";":
                    acc = acc[:-1]
            elif line[:2] == "//":
                stop = fid.tell()
                try:
                    value = "%d %d" % (start, stop)
                    db[rec_id] = value
                    db[acc] = value
                    rec_id, acc, start, stop = None, None, None, None
                except Exception:
                    print("AARRGGGG %d %d %s %s"
                          % (start, stop, type(start), type(stop)))
                    print("%s %s" % (rec_id, acc))
    db.close()
def __init__(self, dbm_file='./warcprox-dedup.db'):
    """Open (or create) the deduplication database at *dbm_file*."""
    if os.path.exists(dbm_file):
        msg = 'opening existing deduplication database {}'
    else:
        msg = 'creating new deduplication database {}'
    self.logger.info(msg.format(dbm_file))
    self.db = dbm_gnu.open(dbm_file, 'c')
def find_dups(path='./file_hashes.gdbm', directory='.',
              callbacks=[print_dup], filter=lambda path: True):
    """Like ``check_dups()``, but also updates the database as it goes.

    Given a ``directory``, goes through all files that pass through the
    predicate ``filter``, and for each one that is a duplicate, calls the
    ``callbacks``. Returns a dictionary containing the duplicates found.

    Example usage::

        d = find_dups(directory='some/directory',
                      callbacks=[print_dup, KeepLarger()])

    The signature for writing callbacks is ``(original, dup, m)``, where
    ``original`` and ``dup`` are Path instances and ``m`` is the
    FileExistenceManager instance.
    """
    store = GdbmStorageStrategy(path=path)
    m = FileExistenceManager(store)
    dups = {}
    for p in Path(directory).walk():
        if not p.is_file():
            continue
        # BUG FIX: the ``filter`` predicate was documented and accepted
        # but never applied.
        if not filter(p):
            continue
        with open(str(p), 'rb') as stream:
            original = m.try_add_file(stream, str(p))
        if original:
            original = original.decode('utf-8')
            dups[str(p)] = original
            for function in callbacks:
                function(Path(original), p, m)
    m.close()
    return dups
def __init__(self, dbname, hashfact):
    """Open the directory of level databases, seeding one when empty."""
    super().__init__()
    self.basepath = dbname    # directory holding the level db files
    self.hashfact = hashfact  # hash factory
    path = pathlib.Path(self.basepath)
    try:
        # exist_ok=True needs Python 3.5+, so tolerate the race by hand.
        path.mkdir(parents=True)
    except FileExistsError:
        pass
    # No databases yet? Create the first one, spanning the full hash range.
    if not any(path.iterdir()):
        entry = self._create_db(0, hashfact.min(), hashfact.max())
        entry["db"].close()  # will reopen in the loop below
    # Open all databases, highest level first.
    self.dbs = []
    for d in sorted(path.iterdir(), reverse=True):
        print(d)
        db = dbm.open(str(d), "cuf")
        self.dbs.append({
            "db": db,
            "filename": db["filename"],
            "level": db["level"],
            "minhash": db["minhash"],
            "maxhash": db["maxhash"],
        })
def Open(self, indexfile=None):
    """Open the index database and the data file it points at.

    Falls back to $PYPHY/nr.dat.indexed when no *indexfile* is given.
    """
    if not indexfile:
        indexfile = os.path.join(os.environ["PYPHY"], "nr.dat.indexed")
    self.db = gdbm.open(indexfile)
    # The index records the absolute path of its data file.
    self.datafile = self.db["datafile"]
    self.fid = open(self.datafile)
def __init__(self, dbm_file='./warcprox-playback-index.db'):
    """Open *dbm_file* with dbm.gnu, creating it when it does not exist."""
    exists = os.path.exists(dbm_file)
    if exists:
        self.logger.info(
            'opening existing playback index database {}'.format(dbm_file))
    else:
        self.logger.info(
            'creating new playback index database {}'.format(dbm_file))
    self.db = dbm_gnu.open(dbm_file, 'c')
def _add_db(self):
    """Create a fresh, timestamp-named dbm chunk and make it the active one."""
    chunk_name = 'mdbm%s.db' % time.strftime("%b%d%H%M%Y")
    chunk_path = self._path + '/%s' % chunk_name
    chunk = gdbm.open(chunk_path, 'c')
    # Record the chunk's own filename inside it for later discovery.
    chunk[_PATHKEY] = chunk_name
    self._data.append(chunk)
    # A brand-new chunk starts out empty.
    self._metadata['cursize'] = 0
    logging.debug('mdbm added new dbm file: %s' % chunk_name)
def rand_dbm_iter(dbmfile, seed=None):
    """Yield the values of *dbmfile* in random order, deleting each as it goes.

    NOTE(review): this consumes the database — it is empty afterwards.
    """
    random.seed(seed)
    with gdbm.open(dbmfile, 'w') as db:
        print('Loaded dbmfile ({}) with {} keys'.format(dbmfile, len(db)))
        while len(db) > 0:
            # Pick a random surviving key, emit its value, then drop it.
            picked = random.choice(db.keys())
            value = db[picked].decode()
            del db[picked]
            yield value
def keys(self):
    """Yield every key in the database, holding the db lock throughout."""
    self.lock_db()
    try:
        with gdbm.open(self.filename, 'r') as db:
            # gdbm exposes traversal only via firstkey()/nextkey().
            cursor = db.firstkey()
            while cursor is not None:
                yield cursor
                cursor = db.nextkey(cursor)
    finally:
        self.unlock_db()
def __init__(self, name):
    """Open (creating if needed) the on-disk cover-art store for *name*."""
    super(CoverArtExtDB._impl, self).__init__()
    self.cachedir = RB.user_cache_dir() + "/" + name
    if not os.path.exists(self.cachedir):
        os.makedirs(self.cachedir)
    filename = self.cachedir + "/store.db"
    self.db = gdbm.open(filename, 'c')
    self.queue = Queue()
    # Guards against issuing overlapping store requests.
    self._store_request_in_progress = False
def __call__(self, existing, dup, m):
    """Keep whichever of two duplicates is larger; park the other in dups_dir.

    NOTE(review): assumes ``self.dups_dir`` exists on disk once set — no
    mkdir is performed here; confirm the caller creates it.
    """
    if self.dups_dir is None:
        self.dups_dir = dup.parent / 'dups'
    if dup.stat().st_size > existing.stat().st_size:
        # *dup* is larger: move the old file aside and re-point the
        # database at *dup*.
        existing.rename(self.dups_dir / dup.name)
        with open(dup, 'rb') as stream:
            m.add_or_replace_file(stream, str(dup))
    else:
        # *existing* wins: park the newcomer instead.
        dup.rename(self.dups_dir / dup.name)
def getArticles(verbose):
    """Return the man-page names recorded in the system man index database."""
    with gdbm.open("/var/cache/man/index.db", "r") as IndexDb:
        IndexDb = IndexDb.keys()
    if verbose:
        print("contents:", IndexDb)
    # Each key carries a trailing byte; decode and strip it.
    Articles = [bytes(entry).decode()[:-1] for entry in IndexDb]
    if verbose:
        print(Articles)
    return Articles
def rand_fits_iter(topdir, dbmfile='kwhistos.dbm', seed=None):
    """Yield FITS filenames under *topdir* in random order.

    First (re)builds the filename db via save_dblist(), then consumes it
    entry by entry — the db is empty once iteration completes.
    """
    save_dblist(topdir, dbmfile)
    random.seed(seed)
    with gdbm.open(dbmfile, 'w') as db:
        print('Loaded dbmfile ({}) with {} filenames'.format(dbmfile, len(db)))
        while len(db) > 0:
            picked = random.choice(db.keys())
            fname = db[picked].decode()
            del db[picked]
            yield fname
def _init(self, maxsize):
    """Reset the on-disk state (unless resuming) and open the queue files."""
    # TODO: Change - this will not work with dummy dbm
    if not self.resume:
        for ext in ('', '.bak', '.dat', '.dir'):
            try:
                # BUG FIX: was ``self.finame`` — an undefined attribute
                # (``self.filename`` is what the rest of this method uses).
                os.remove(self.filename + ext)
            except OSError:
                # Best effort: the companion file may simply not exist.
                pass
    self.queue = PersistentFIFO(self.filename)
    self.unfinished = shelve.Shelf(
        db.open(self.filename + '.unfinished', "c"))
    self.stop = False
def __init__(self, name="", dir=None, loglevel=logging.ERROR):
    """Open (or create) the index and queue gdbm files for this queue.

    When *dir* is given it is created if missing and used as the data
    directory; otherwise a fresh ``queue_data`` directory is created.
    """
    self.name = name
    self.name_write = name + "__write"
    self.name_read = name + "__read"
    self.namefile = self.name + "_gunQueue.db"
    self.log = logging
    self.log.basicConfig(
        level=loglevel,
        format=self.__class__.__name__
        + " %(asctime)s - %(levelname)s - - %(message)s",
        datefmt="%Y-%m/%d %H:%M:%S %p")
    # Idiom fix: compare to None with "is"; also deduplicate the two
    # near-identical open blocks by resolving the base directory first.
    if dir is not None:
        self.dir = dir
        if not os.path.exists(self.dir):
            os.mkdir(self.dir)
        base = self.dir
    else:
        os.mkdir("queue_data")
        base = "queue_data"
    self.index = gnu.open(base + "/" + name + "_index.db", "c")
    self.queue = gnu.open(base + "/" + self.namefile, "c")
    # Read/write cursors default to "1" when the index is brand new.
    self.write_id = self.index.get(self.name_write, b"1").decode()
    self.read_id = self.index.get(self.name_read, b"1").decode()
def delete_transferrable_key(self, key):
    """Remove *key* (by fingerprint) from the database.

    Raises TypeError when the store is read-only and KeyError when the
    fingerprint is not present.
    """
    if self.read_only:
        raise TypeError
    self.lock_db()
    try:
        with gdbm.open(self.filename, 'w') as db:
            if key.fingerprint not in db:
                raise KeyError(key.fingerprint)
            del db[key.fingerprint]
            # BUG FIX: reorganize() is a method of the open database
            # object, not of the gdbm module (the original raised
            # AttributeError).  Run it after the delete so the freed
            # space is actually reclaimed.
            db.reorganize()
    finally:
        self.unlock_db()
def _shell_lookup(args):
    """This function is called when the script is used from command line:

    [jakni@nissen scripts]$ python unifetch.py -a A6XGL2 -ncis
    Name: A6XGL2_HUMAN
    Data class: Unreviewed
    TaxID: 9606
    Sequence: MALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRRE [ ... ]
    """
    # Records are stored gzip-compressed, keyed by accession.
    with _gnu.open(args.database) as database:
        data = database.get(args.accession, None)
    # If no accession is found, return "Not found."
    if data is None:
        return 'Not found.'
    # Maps output title -> [requested-flag]; a second element (the value)
    # is appended below only when specific fields were requested.
    fields = {'Name': [args.name],
              'Date': [args.date],
              'Data class': [args.dataclass],
              'Organism': [args.organism],
              'Taxonomy': [args.taxonomy],
              'TaxID': [args.taxid],
              'Sequence': [args.sequence]
              }
    # If nothing particular is specified, return the entire accession
    if not any(arr[0] for arr in fields.values()):
        text = _gzip.decompress(data).decode()
        return text
    else:
        # If output specified, return the relevant parts.
        fileobject = _io.BytesIO(_gzip.decompress(data))
        record = _SwissProt.read(fileobject)
        fields['Name'].append(record.entry_name)
        fields['Date'].append(record.created[0])
        fields['Data class'].append(record.data_class)
        fields['Organism'].append(record.organism)
        species = get_species(record)
        fields['Taxonomy'].append(
            ';'.join(record.organism_classification
                     + ([species] if species else [])))
        fields['TaxID'].append(';'.join(record.taxonomy_id))
        fields['Sequence'].append(record.sequence)
        output = list()
        # Emit only the fields whose request flag was set.
        for title, (state, information) in fields.items():
            if state:
                output.append('{}: {}'.format(title, information))
        return '\n'.join(output)
def add_transferrable_key(self, key):
    """Serialize *key*'s packets under its fingerprint; refuse duplicates.

    Raises TypeError when the store is read-only and KeyError when the
    fingerprint already exists.
    """
    if self.read_only:
        raise TypeError
    self.lock_db()
    try:
        with gdbm.open(self.filename, 'w') as db:
            if key.fingerprint in db:
                raise KeyError(key.fingerprint)
            packets = key.to_packets(self._preferred_header_format)
            db[key.fingerprint] = b''.join(map(bytes, packets))
    finally:
        self.unlock_db()
def gdbm_test_db(request):
    """Pytest fixture: create a throwaway gdbm db seeded with two entries.

    Returns the database path; a finalizer removes the file(s) afterwards.
    """
    # BUG FIX: with the default delete=True the temp file is removed when
    # closed, while gdbm may still have companion files around and the
    # 'n' open re-creates the path — clean up explicitly instead.
    temp_file = tempfile.NamedTemporaryFile(delete=False)
    print("creating test gdbm file {}".format(temp_file.name))
    test_db = gdbm.open(temp_file.name, "n")
    test_db[key1] = val1
    test_db[key2] = val2
    test_db.close()

    def delete_gdbm_test_db():
        import glob
        import os
        temp_file.close()
        # Some dbm backends create companion files next to the main one.
        for f in glob.glob("{}*".format(temp_file.name)):
            print("deleting test gdbm file {}".format(f))
            os.remove(f)

    request.addfinalizer(delete_gdbm_test_db)
    return temp_file.name
def __init__(self, dbm_file='./warcprox-playback-index.db'):
    """Open *dbm_file* using the best dbm implementation available."""
    # Prefer dbm.gnu (py3); fall back to gdbm (py2), then anydbm.
    try:
        import dbm.gnu as dbm_gnu
    except ImportError:
        try:
            import gdbm as dbm_gnu
        except ImportError:
            import anydbm as dbm_gnu
    if os.path.exists(dbm_file):
        msg = 'opening existing playback index database {}'
    else:
        msg = 'creating new playback index database {}'
    self.logger.info(msg.format(dbm_file))
    self.db = dbm_gnu.open(dbm_file, 'c')
def test_reorganize(self):
    """reorganize() must release the space freed by a large delete."""
    self.g = gdbm.open(filename, 'c')
    size0 = os.path.getsize(filename)
    # Grow the file with a large value.
    self.g['x'] = 'x' * 10000
    size1 = os.path.getsize(filename)
    self.assert_(size0 < size1)
    del self.g['x']
    # 'size' is supposed to be the same even after deleting an entry.
    self.assertEqual(os.path.getsize(filename), size1)
    # Only reorganize() actually shrinks the file on disk.
    self.g.reorganize()
    size2 = os.path.getsize(filename)
    self.assert_(size1 > size2 >= size0)
def populate_db(path='./file_hashes.gdbm', directory=".",
                callbacks=[print_dup], filter=lambda path: True):
    """Create/update database at ``path`` by hashing files in ``directory``.

    Only files accepted by the ``filter`` predicate are considered; each
    detected duplicate triggers every callback as ``(original, dup, m)``.
    """
    store = GdbmStorageStrategy(path=path)
    m = FileExistenceManager(store)
    for p in Path(directory).walk():
        if not p.is_file():
            continue
        # BUG FIX: honour the ``filter`` predicate, which was accepted
        # but never applied.
        if not filter(p):
            continue
        with open(str(p), 'rb') as stream:
            original = m.try_add_file(stream, str(p))
        if original:
            original = original.decode('utf-8')
            for function in callbacks:
                function(Path(original), p, m)
    m.close()
def open_dbm(self):
    """Open the parameter store, choosing the mode from state on disk.

    Read-only stores open with "r"; writable ones use "w" when the file
    exists, "c" otherwise, plus synchronous writes where supported.
    """
    if self.read_only:
        open_mode = "r"
    elif os.path.exists(self.dbm_path):
        open_mode = "w"
    else:
        open_mode = "c"
    # "s" (sync every write) is not supported on Windows.
    if not self.read_only and os.name != "nt":
        open_mode += "s"
    self.logger.info("open dbm file {} with mode: {}".format(
        self.dbm_path, open_mode))
    self.saved_file_params = gdbm.open(self.dbm_path, open_mode)
    # An existing store already carries serialized stats.
    if open_mode[0] in ("w", "r"):
        self.stats = json.loads(
            self.saved_file_params.get(TFileStorage.stats_key))
def __init__(self, dbm_file='./warcprox-stats.db', options=warcprox.Options()):
    """Open the stats database with the best available dbm backend.

    NOTE(review): the default ``warcprox.Options()`` instance is created
    once and shared across calls — confirm that is intended.
    """
    try:
        import dbm.gnu as dbm_gnu
    except ImportError:
        try:
            import gdbm as dbm_gnu
        except ImportError:
            import anydbm as dbm_gnu
    if os.path.exists(dbm_file):
        msg = 'opening existing stats database {}'
    else:
        msg = 'creating new stats database {}'
    self.logger.info(msg.format(dbm_file))
    self.db = dbm_gnu.open(dbm_file, 'c')
    self.options = options
def gdbm_test_db(request):
    """Pytest fixture: path of a seeded gdbm database that cleans itself up."""
    temp_file = tempfile.NamedTemporaryFile(delete=False)
    print("creating test gdbm file {}".format(temp_file.name))
    seeded = gdbm.open(temp_file.name, "n")
    seeded[key1] = val1
    seeded[key2] = val2
    seeded.close()

    def delete_gdbm_test_db():
        temp_file.close()
        # Some dbm backends create companion files — remove them all.
        for f in glob.glob("{}*".format(temp_file.name)):
            print("deleting test gdbm file {}".format(f))
            os.remove(f)

    request.addfinalizer(delete_gdbm_test_db)
    return temp_file.name
def test_key_methods(self):
    """keys(), membership, firstkey()/nextkey() traversal, missing-key error."""
    self.g = gdbm.open(filename, 'c')
    # A fresh database has no keys.
    self.assertEqual(self.g.keys(), [])
    self.g['a'] = 'b'
    self.g['12345678910'] = '019237410982340912840198242'
    self.g[b'bytes'] = b'data'
    # Keys come back as bytes regardless of how they were stored.
    key_set = set(self.g.keys())
    self.assertEqual(key_set, set([b'a', b'bytes', b'12345678910']))
    self.assert_(b'a' in self.g)
    self.assertEqual(self.g[b'bytes'], b'data')
    # Walk the hash via firstkey/nextkey; each key must appear exactly once.
    key = self.g.firstkey()
    while key:
        self.assert_(key in key_set)
        key_set.remove(key)
        key = self.g.nextkey(key)
    self.assertRaises(KeyError, lambda: self.g['xxx'])
def save_dblist(topdir, dbmfile, progcnt=1E4, expectedcnt=84E5):
    """Store every FITS filename under *topdir* into *dbmfile*.

    Keys are sequential integers rendered as strings.  A progress line
    with a crude remaining-time estimate (scaled by *expectedcnt*) is
    printed every *progcnt* records; pass ``progcnt=None`` to silence it.
    Returns the number of filenames saved (also recorded on the function
    attribute ``save_dblist.count``).
    """
    idx = 0
    tic()
    # 'n' = always create a new db, 'f' = fast mode (defer disk syncs).
    with gdbm.open(dbmfile, 'nf') as db:
        for fname in fits_iter(topdir):
            db[str(idx)] = fname
            idx += 1
            # Idiom fix: None comparisons use "is not", not "!=".
            if progcnt is not None and idx % progcnt == 0:
                secs = toc()
                remhrs = ((secs * expectedcnt / idx) - secs) / 60 / 60
                print('# Saved {:,} to dbm in {:,.0f} secs. Remain hrs: {}'
                      .format(idx, secs, remhrs))
    save_dblist.count = idx
    return idx
def getReportXML(encounter):
    """Return the inspection-report XML for *encounter*, caching fetches.

    Serves from the local gdbm cache when possible; otherwise downloads
    and stores the report.  Returns None when the download fails.
    """
    url = "http://appsrv.alleghenycounty.us/reports/rwservlet?food_rep&report=FoodINSP/insp_summary_COVID.jsp&desformat=XML&P_ENCOUNTER=%s" % encounter
    dbm = gdbm.open("reportDB.gdbm", 'c')
    try:
        try:
            xmlString = dbm[encounter]
            print("found %s" % encounter)
        except KeyError:
            # Cache miss: fetch and store; narrow the original bare except.
            try:
                print("getting %s" % encounter)
                xmlString = wget(url)
                dbm[encounter] = xmlString
            except Exception:
                return None
        return xmlString
    finally:
        # BUG FIX: the database handle was leaked on the failed-fetch
        # path (the early ``return None`` skipped dbm.close()).
        dbm.close()
def _create_db(self, level, minhash, maxhash):
    """Create one level-database file and return its descriptor dict.

    The level and hash range are persisted inside the db itself so the
    descriptor can be rebuilt when the file is reopened later.
    """
    level = str(level)
    filename = (self.basepath + "/" + level + "-"
                + self.hashfact.hexify(minhash)[0:4] + "-"
                + self.hashfact.hexify(maxhash)[0:4])
    print("filename =", filename)
    db = dbm.open(filename, "cuf")
    db["filename"] = filename
    db["level"] = level
    db["minhash"] = minhash
    db["maxhash"] = maxhash
    return {
        "db": db,
        "filename": db["filename"],
        "level": db["level"],
        "minhash": db["minhash"],
        "maxhash": db["maxhash"],
    }
def createdb(outfilepath, infilepath):
    """Creates a new database from a SwissProt/UniProt text file, gzipped or not.
    For speed, database is built in memory, then moved to disk. Takes ~11 hrs."""
    import shutil as _shutil
    if _os.path.exists(outfilepath):
        raise FileExistsError('Database already exists.')
    # Check whether the database is gzipped or not by searching for the two
    # signature bytes 1F8B and use gzip.open if it is.
    with open(infilepath, 'rb') as infile:
        signature = infile.read(2)
    if signature == b'\x1f\x8b':
        opener = _gzip.open
    else:
        opener = open
    # Read the content of the text file. At accession identifier, extract
    # accession; at end of record, save the current record under the
    # extracted accession ID. Create a database in memory
    # (/dev/shm is a RAM-backed filesystem — Linux only).
    accession = None
    buffer = list()
    tempfilename = '/dev/shm/temp.gdbm'
    with opener(infilepath, 'rt') as infile, \
            _gnu.open(tempfilename, 'cf') as db:
        for line in infile:
            buffer.append(line)
            if line.startswith('//'):
                # End of record: store it gzip-compressed by accession.
                assert accession is not None
                db[accession] = _gzip.compress(bytes(''.join(buffer), 'ASCII'))
                buffer.clear()
                accession = None
            elif line.startswith('AC') and accession is None:
                # First AC line: accession is the first token, minus ';'.
                accession = line.split()[1][:-1]
        # Because I openened the database in fast mode, I need to sync
        # before closing.
        db.sync()
    # Move file from memory to actual file location
    _shutil.move(tempfilename, outfilepath)
def __init__(self, *args, **config):
    """Open (creating on demand) the anydbm/gdbm file backing this cache."""
    super(database, self).__init__(*args, **config)
    # The dbtype config value becomes the file extension.
    default_db = config.get("dbtype", "anydbm")
    if not default_db.startswith("."):
        default_db = '.' + default_db
    self._db_path = os.path.join(
        self.location,
        fs_template.gen_label(self.location, self.label) + default_db)
    self.__db = None
    mode = "w"
    if whichdb(self._db_path) in ("dbm.gnu", "gdbm"):
        # Allow multiple concurrent writers (see bug #53607).
        mode += "u"
    try:
        # dbm.open() will not work with bytes in python-3.1:
        # TypeError: can't concat bytes to str
        self.__db = anydbm_module.open(self._db_path, mode, self._perms)
    except anydbm_module.error:
        # XXX handle this at some point
        # Opening failed — most likely the file does not exist yet, so
        # make sure the parent directories are in place before retrying.
        try:
            self._ensure_dirs()
            self._ensure_dirs(self._db_path)
        except (OSError, IOError) as e:
            raise cache_errors.InitializationError(self.__class__, e)
        # try again if failed
        try:
            if self.__db == None:
                # dbm.open() will not work with bytes in python-3.1:
                # TypeError: can't concat bytes to str
                if gdbm is None:
                    self.__db = anydbm_module.open(
                        self._db_path, "c", self._perms)
                else:
                    # Prefer gdbm type if available, since it allows
                    # multiple concurrent writers (see bug #53607).
                    self.__db = gdbm.open(self._db_path, "cu", self._perms)
        except anydbm_module.error as e:
            raise cache_errors.InitializationError(self.__class__, e)
    self._ensure_access(self._db_path)
def verify_database(dbpath):
    """Run an integrity check over every mdbm chunk file under *dbpath*."""
    db_files = _load_paths(dbpath)
    print("verifying %i mdbm chunks" % len(db_files))
    for db in db_files:
        dbchunk = gdbm.open(db, 'w')
        print("checking %s" % db)
        try:
            # Reading the first key exercises basic traversal before the
            # full integrity check runs.
            k = dbchunk.firstkey()
            print("first key: %s" % k)
            check_integrity_for_chunk(dbchunk)
        except Exception as err:
            print("integrity check failed: error %s" % err)
        finally:
            dbchunk.close()
def check_dups(path='./file_hashes.gdbm', directory=".",
               callbacks=[print_dup], filter=lambda path: True):
    """Check files in ``directory`` against the database ``path``.

    Only files accepted by the ``filter`` predicate are checked; each
    duplicate found triggers every callback as ``(original, dup, m)``.

    Example usage::

        check_dups(directory='some/directory',
                   callbacks=[print_dup, trash_dup])
    """
    store = GdbmStorageStrategy(path=path)
    m = FileExistenceManager(store)
    for p in Path(directory).walk():
        if not p.is_file():
            continue
        # BUG FIX: honour the ``filter`` predicate, which was accepted
        # but never applied.
        if not filter(p):
            continue
        with open(str(p), 'rb') as stream:
            original = m.file_exists(stream)
        if original:
            original = original.decode('utf-8')
            for function in callbacks:
                function(Path(original), p, m)
    m.close()
def get_transferrable_key(self, fingerprint):
    """Load a transferable key by full fingerprint or trailing key ID.

    Returns a TransferablePublicKey or TransferableSecretKey, or None
    when no matching key exists.
    """
    if len(fingerprint) != 40:
        # A short key ID was given: resolve it to a full fingerprint.
        matches = [k for k in self.keys() if k.endswith(fingerprint)]
        fingerprint = matches[0] if matches else None
        if fingerprint is None:
            return None
    self.lock_db()
    try:
        with gdbm.open(self.filename, 'r') as db:
            packet_data = db[fingerprint]
            packets = list(parse_binary_packet_data(packet_data))
            if packets:
                leading = packets[0]
                if leading.type == constants.PUBLIC_KEY_PACKET_TYPE:
                    return TransferablePublicKey.from_packets(packets)
                if leading.type == constants.SECRET_KEY_PACKET_TYPE:
                    return TransferableSecretKey.from_packets(packets)
    finally:
        self.unlock_db()
def __init__(self, path='./file_hashes.gdbm', mode='c', sync='s'):
    """Open the gdbm file at *path*; the flags are *mode* plus *sync*."""
    # Alias the import so the builtin open() is not shadowed.
    from dbm.gnu import open as gdbm_open
    self.d = gdbm_open(path, mode + sync)
print ("%d : %s :%s"% (i,k, q.get(k)[0] if hasattr(q.get(k), '__len__') else q.get(k))) def print_items2(q): for i, k in enumerate(q): print ("%d : %s :%s"% (i,k, k[0])) if __name__ == '__main__': if len(sys.argv)<3: print("must provide data dir and queue size") sys.exit(1) base_dir=sys.argv[1] size= int(sys.argv[2]) filename=os.path.join(base_dir, 'pool_items') queue=PersistentFIFO(filename, size) unfinished=shelve.Shelf(db.open(filename+'.unfinished', "c")) print("Queue size %d" %len(queue)) print("Unfinished size %d" %len(unfinished)) if len(queue): print("Queue items:") print_items(queue.db) if len(unfinished): print("Unfinished items:") print_items(unfinished) queue.close()