def write(self, g2wpath, w2gpath):
    print >>sys.stderr, 'Sorting...'
    grp2words = {}
    for (w, (n, poss)) in self._words.iteritems():
        grp = '%s:%d' % ('+'.join(poss), n)
        if grp not in grp2words:
            grp2words[grp] = []
        grp2words[grp].append(w)
    word2grp = {}
    r = sorted(grp2words.iteritems(), key=lambda (k, v): len(v), reverse=True)
    for (grp, words) in r:
        words.sort()
        for (n, w) in enumerate(words):
            word2grp[w] = (grp, n)
        print >>sys.stderr, ' Group: %r (%d)' % (grp, len(words))
    print >>sys.stderr, 'Writing: %r' % g2wpath
    g2w = cdb.cdbmake(g2wpath, g2wpath + '.tmp')
    for (grp, words) in grp2words.iteritems():
        g2w.add(grp, ' '.join(words))
    g2w.finish()
    print >>sys.stderr, 'Writing: %r' % w2gpath
    w2g = cdb.cdbmake(w2gpath, w2gpath + '.tmp')
    for (word, (grp, n)) in word2grp.iteritems():
        w2g.add(word, '%s,%d' % (grp, n))
    for w in self.skip:
        w2g.add(w, ',0')
    w2g.finish()
    return
def cdbmake_true(f, a):
    import cdb
    c = cdb.cdbmake(f, f + ".tmp")
    for (k, v) in a.iteritems():
        c.add(k, v)
    c.finish()
    return
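# For orientation, a minimal round-trip sketch of the python-cdb API the
# snippets in this collection share. This is an illustrative assumption-laden
# sketch, not code from any of the source projects: it assumes the classic
# `cdb` extension module, where records only become readable after finish()
# and cdb.init() opens the finished file for reading.
def roundtrip_demo(path):
    import cdb
    maker = cdb.cdbmake(path, path + '.tmp')  # records accumulate in the .tmp file
    maker.add('color', 'red')
    maker.add('color', 'blue')                # duplicate keys are allowed in cdb
    maker.finish()                            # finish() moves .tmp into place

    reader = cdb.init(path)
    print reader.get('color')                 # first value for the key: 'red'
    print reader.getall('color')              # every value: ['red', 'blue']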
def realSync(self):
    if self.modified:
        self.modified = False
        newDB = cdb.cdbmake(self.cdbName, self.cdbName + ".tmp")
        for key, value in iter(self.cdbObject.each, None):
            if key in self.delList:
                if key in self.addList:
                    newDB.add(key, cPickle.dumps(self.addList[key],
                                                 cPickle.HIGHEST_PROTOCOL))
                    del self.addList[key]
            elif key in self.addList:
                newDB.add(key, cPickle.dumps(self.addList[key],
                                             cPickle.HIGHEST_PROTOCOL))
                del self.addList[key]
            else:
                newDB.add(key, value)
        self.closeCDB()
        for key, value in self.addList.iteritems():
            newDB.add(key, cPickle.dumps(value, cPickle.HIGHEST_PROTOCOL))
        newDB.finish()
        del newDB
        self.addList = {}
        self.delList = []
        self.openCDB()
def sync(self, force=False):
    if not self.db:
        return
    tmp = cdb.cdbmake(self.filename, self.tempfile)
    # Copy original
    r = self.cdb.each()
    while r:
        k, v = r
        dk = decode(k)
        if k not in self.db:
            tmp.add(*r)
        r = self.cdb.each()
    # Add new stuff
    for k, l in self.db.iteritems():
        for v in l:
            try:
                tmp.add(k, v)
            except:
                print(k, v)
                raise
    tmp.finish()
    self.cdb = cdb.init(self.filename)
    self.db = {}
def writeCdbPages(self, filename, pageFromId):
    maker = cdb.cdbmake(filename, filename + ".tmp")
    s = struct.Struct("<l")
    for i in pageFromId:
        name = pageFromId[i]['name']
        linkIds = pageFromId[i]['links']
        projects = pageFromId[i]['projects']
        buf = create_string_buffer(8 + (4 + len(linkIds) * 4) +
                                   (len(projects) * 4 * 2) + len(name))
        # pack in the lengths of the links and projects sets
        offset = 0
        struct.pack_into("<l", buf, offset, len(linkIds))
        offset += 4
        struct.pack_into("<l", buf, offset, len(projects))
        offset += 4
        # pack in the page class and importance
        struct.pack_into("<l", buf, offset,
                         (pageFromId[i]['class'] << 8) | pageFromId[i]['importance'])
        offset += 4
        # pack in the links
        for j in linkIds:
            struct.pack_into("<l", buf, offset, j)
            offset += 4
        # pack in the projects
        for j in projects:
            struct.pack_into("<l", buf, offset, j)
            offset += 4
            struct.pack_into("<l", buf, offset,
                             (projects[j]['class'] << 8) | projects[j]['importance'])
            offset += 4
        # pack in the name
        buf[offset:] = name
        maker.add(s.pack(i), buf)
    print "Added %d records to CDB %s (fd %d)" % (maker.numentries, maker.fn, maker.fd)
    maker.finish()
    del maker
def __setitem__(self, key, value):
    try:
        self.db.add(key, value)
    except:
        # cdb has two modes and if we're in the wrong mode, switch
        self.db = cdb.cdbmake(self.fn, self.fn + ".tmp")
        self.db.add(key, value)
def _create_new_cdb(self, arg):
    """
    Create a new name-mapping if it doesn't exist yet; call this
    while holding the name-mapping lock.
    """
    if not os.path.exists(self._name_db):
        maker = cdb.cdbmake(self._name_db, self._name_db + '.tmp')
        maker.finish()
def writeCdbNameFromId(self, filename, dictionary):
    maker = cdb.cdbmake(filename, filename + ".tmp")
    s = struct.Struct("<l")
    for i in dictionary:
        maker.add(s.pack(i), dictionary[i]['name'])
    print "Added %d records to CDB %s (fd %d)" % (maker.numentries, maker.fn, maker.fd)
    maker.finish()
    del maker
def test_reuse_cdb_make(self):
    cm = cdb.cdbmake('data', 'tmp')
    cm.add('foo', 'bar')
    cm.finish()
    self.assertRaises(cdb.error, cm.add, 'spam', 'eggs')
    self.assertRaises(cdb.error, cm.addmany, [('spam', 'eggs')])
    self.assertRaises(cdb.error, cm.finish)
def clear(self):
    """Remove all entries from the dictionary."""
    os.remove(self.filename)
    open(self.filename, "w").close()
    maker = cdb.cdbmake(self.filename, self.filename + ".tmp")
    maker.finish()
    del maker
    self.cdb = cdb.init(self.filename)
def writeCdbIdFromName(self, filename, dictionary):
    maker = cdb.cdbmake(filename, filename + ".tmp")
    s = struct.Struct("<l")
    for i in dictionary:
        # add key, value
        #print "added:", i, dictionary[i]['id']
        maker.add(i, s.pack(dictionary[i]['id']))
    print "Added %d records to CDB %s (fd %d)" % (maker.numentries, maker.fn, maker.fd)
    maker.finish()
    del maker
def generate_cdb_file(self, dstdir, filename, key, hosts=None, groups=None,
                      users=None, host=None, guard=True):
    if guard:
        fn = os.path.join(dstdir, filename).encode('ascii', 'ignore')
        maker = cdb.cdbmake(fn, fn + '.tmp')
        # TODO latest version of python-cdb can do bulk add
        for user in users:
            if user.is_not_retired():  # FIXME really?
                val = getattr(user, key)
                if val:
                    maker.add(user.uid, val)
        maker.finish()
def cdb_write_proc(file_cdb, dict_aa):
    maker = cdb.cdbmake(file_cdb, file_cdb + ".tmp")
    for key in dict_aa.keys():
        unit = dict_aa[key]
        json_str = json.dumps(unit)
        maker.add(key, json_str)
    maker.finish()
    del maker
    os.chmod(file_cdb, 0777)
def setUp(self):
    self.temp_dir = mkdtemp()
    self.cdb_path = join(self.temp_dir, 'database.cdb')
    self.tmp_path = join(self.temp_dir, 'database.tmp')
    self.db = cdb.cdbmake(self.cdb_path.encode('utf-8'),
                          self.tmp_path.encode('utf-8'))
    self.db.add('a', '1')
    self.db.add('a', '2')
    self.db.addmany([('b', '1'), ('c', '1')])
    self.db.add('a', b'\x80')
def create_db(f, db_fname):
    '''Write out db of headers'''
    fh = open(f, 'r')
    fh_headers = (x.strip()[1:-2] for i, x in enumerate(fh) if not (i % 4))
    db = cdb.cdbmake(db_fname, db_fname + '.tmp')
    for h in fh_headers:
        db.add(h, 'T')
    db.finish()
    del db
def update(self, values):
    """Add values to the dictionary."""
    maker = cdb.cdbmake(self.filename, self.filename + ".tmp")
    for i in values:
        # add key, value
        maker.add(self._pack_key(i), self._pack_value(values[i]))
    print "Added %d records to CDB %s (fd %d)" \
        % (maker.numentries, maker.fn, maker.fd)
    maker.finish()
    del maker
    self.cdb = cdb.init(self.filename)
def update_database():
    fn = "mac_address_db"
    db = cdb.cdbmake("../lib/" + fn, "../lib/" + fn + ".tmp")
    with open("../lib/mac.txt", "r") as file:
        for line in file:
            line = line.split()
            mac = line[0]
            vendor = line[1]
            db.add(mac, vendor)
    db.finish()
def dumpcdb(cmap, cdbfile, verbose=1):
    m = cdb.cdbmake(cdbfile, cdbfile + '.tmp')
    if verbose:
        print >>stderr, 'Writing: %r...' % cdbfile
    for (k, v) in cmap.getall_attrs():
        m.add('/' + k, repr(v))
    for (code, cid) in cmap.getall_code2cid():
        m.add('c' + code, pack('>L', cid))
    for (cid, code) in cmap.getall_cid2code():
        m.add('i' + pack('>L', cid), code)
    m.finish()
    return
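# dumpcdb above tags each key with a one-byte namespace: '/' for CMap
# attributes, 'c' for code->CID, 'i' for CID->code. A hedged read-back sketch
# of that scheme (illustrative only; assumes python-cdb's get(), which
# returns None when the key is absent):
def lookup(cdbfile, code, cid):
    from struct import pack, unpack
    import cdb
    r = cdb.init(cdbfile)
    v = r.get('c' + code)                        # code -> CID, big-endian uint32
    cid_for_code = unpack('>L', v)[0] if v is not None else None
    code_for_cid = r.get('i' + pack('>L', cid))  # CID -> code
    return cid_for_code, code_for_cid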
def dodb(ds, dst):
    # note: temp file name fixed to the ".tmp" convention used elsewhere
    db = cdb.cdbmake(dst, dst + ".tmp")
    for d in ds:
        k = d["id"]
        del d["id"]
        j = json.dumps(d)
        d["id"] = k
        e = RC6.encrypt(bytearray(j, "utf-8"), bytearray(seckey, "ascii"))
        v = array.array("B")
        v.fromlist(e)
        db.add(k, v)
    db.finish()
def make_indices(path):
    f = bz2.BZ2File(path.replace('.xml.bz2', '-index.txt.bz2'))
    id_path = '%s.ids' % path
    title_path = '%s.titles' % path
    offset_path = '%s.offsets' % path
    id_db = cdb.cdbmake(id_path, id_path + '.tmp')
    title_db = cdb.cdbmake(title_path, title_path + '.tmp')
    offset_db = cdb.cdbmake(offset_path, offset_path + '.tmp')

    def build():
        for line in f:
            (bytes, id, title) = line[:-1].split(':', 2)
            id_db.add(id, title)
            title_db.add(title, id)
            offset_db.add(id, bytes)
            yield

    progress(build())
    id_db.finish()
    title_db.finish()
    offset_db.finish()
def __init__(self, filename):
    self.filename = filename
    self.tempfile = "%s.tmp" % filename
    self.db = {}
    try:
        self.cdb = cdb.init(self.filename)
    except cdb.error:
        d = cdb.cdbmake(self.filename, self.tempfile)
        d.finish()
        del d
        self.cdb = cdb.init(self.filename)
def open(self):
    self.word_db = tc.BDB()
    self.word_db.open(self._get_db_filepath('word'), tc.BDBOREADER)
    try:
        os.system('rm -rf ' + self.cdb_dir)
        os.system('mkdir ' + self.cdb_dir)
    except OSError:
        pass
    word_cdb_name = self._get_cdb_filepath('word')
    self.word_cdb = cdb.cdbmake(word_cdb_name, word_cdb_name + ".tmp")
    self.index_file = open(word_cdb_name.replace('word.cdb', 'word.index'), 'w')
def generate_cdb_file(data, filename):
    """Generate a CDB file"""
    cache_dir = config.get('cache_dir', '/var/lib/baruwa/data')
    dest = os.path.join(cache_dir, 'db', filename)
    maker = cdbmake(dest, dest + ".tmp")
    for line in data:
        maker.add(line.key, line.value)
    maker.finish()
    del maker
    os.chmod(dest, 0640)
    uid = pwd.getpwnam("baruwa").pw_uid
    gid = grp.getgrnam("exim").gr_gid
    os.chown(dest, uid, gid)
def make_cdb_db(self):
    lib = []
    for bl in self.blacklist_files:
        bl_cdb_file = "%s/%s.cdb" % (self.base_dir, bl[0])
        bl_cdb_file_tmp = "%s/%s.tmp" % (self.base_dir, bl[0])
        if bl[0] in self.categories:
            if not os.path.isfile(bl_cdb_file):
                cdb_file = cdb.cdbmake(bl_cdb_file, bl_cdb_file_tmp)
                f = open(bl[1], "r")
                for line in f:
                    cdb_file.add(line.strip("\n"), "True")
                cdb_file.finish()
            lib.append(bl_cdb_file)
    self.cache = lib
def build_cdb(filename):
    """Build a cdb file from a text file."""
    import cdb
    try:
        cdbname = filename + '.cdb'
        tempfile.tempdir = os.path.dirname(filename)
        tmpname = os.path.split(tempfile.mktemp())[1]
        cdb = cdb.cdbmake(cdbname, cdbname + '.' + tmpname)
        for line in file_to_list(filename):
            # pad so that value defaults to '' when the line has no second field
            key, value = (line.split() + [''])[:2]
            cdb.add(key.lower(), value)
        cdb.finish()
    except:
        return False
    return True
def writeCdbPageProjects(self, filename, pageFromId):
    maker = cdb.cdbmake(filename, filename + ".tmp")
    s = struct.Struct("<l")
    for i in pageFromId:
        projects = pageFromId[i]['projects']
        buf = create_string_buffer(len(projects) * 4 * 2)
        offset = 0
        for j in projects:
            struct.pack_into("<l", buf, offset, j)
            offset += 4
            struct.pack_into("<l", buf, offset,
                             (projects[j]['class'] << 8) | projects[j]['importance'])
            offset += 4
        maker.add(s.pack(i), buf)
    print "Added %d records to CDB %s (fd %d)" % (maker.numentries, maker.fn, maker.fd)
    maker.finish()
    del maker
def writeCdbPageLinks(self, filename, pageFromId):
    maker = cdb.cdbmake(filename, filename + ".tmp")
    s = struct.Struct("<l")
    for i in pageFromId:
        linkIds = pageFromId[i]['links']
        buf = create_string_buffer(4 + len(linkIds) * 4)
        offset = 0
        struct.pack_into("<l", buf, offset,
                         (pageFromId[i]['class'] << 8) | pageFromId[i]['importance'])
        offset += 4
        for j in linkIds:
            struct.pack_into("<l", buf, offset, j)
            offset += 4
        maker.add(s.pack(i), buf)
    print "Added %d records to CDB %s (fd %d)" % (maker.numentries, maker.fn, maker.fd)
    maker.finish()
    del maker
def phase_1(pkt):
    if pkt.haslayer(Dot11):
        if pkt.type == 0 and pkt.subtype in (0, 2, 4):
            if pkt.addr2 not in clients:
                vendor_id = pkt.addr2[0:8]
                upper_case = str(vendor_id).upper()
                db_name = "mac_address_db"
                db = cdb.cdbmake("../lib/" + db_name, "../lib/" + db_name + ".tmp")
                del db
                db = cdb.init("../lib/" + db_name)
                match = db.get(upper_case)
                print("{:<6s}{:>13}{:>12s}".format(str(len(clients) + 1),
                                                   pkt.addr2, match))
                clients.append(pkt.addr2)
                vendors.append(match)
def openCDB(self):
    prevmask = os.umask(0)
    if not os.path.exists(self.path):
        os.makedirs(self.path, 02775)
        os.chown(self.path, self.uid, self.gid)
    if not os.path.isfile(self.cdbName):
        maker = cdb.cdbmake(self.cdbName, self.cdbName + ".tmp")
        maker.finish()
        del maker
        os.chown(self.cdbName, self.uid, self.gid)
        os.chmod(self.cdbName, 0664)
    os.umask(prevmask)
    self.cdbObject = cdb.init(self.cdbName)
def dumpcdb(cmap, cdbfile, verbose=1):
    from struct import pack, unpack
    try:
        import cdb
    except ImportError:
        import pycdb as cdb
    m = cdb.cdbmake(cdbfile, cdbfile + '.tmp')
    if verbose:
        print >>stderr, 'Writing: %r...' % cdbfile
    for (k, v) in cmap.getall_attrs():
        m.add('/' + k, repr(v))
    for (code, cid) in cmap.getall_code2cid():
        m.add('c' + code, pack('>L', cid))
    for (cid, code) in cmap.getall_cid2code():
        m.add('i' + pack('>L', cid), code)
    m.finish()
    return
def _destroy_item_locked(self, item):
    c = cdb.init(self._name_db)
    maker = cdb.cdbmake(self._name_db + '.ndb', self._name_db + '.tmp')
    r = c.each()
    while r:
        i, v = r
        if v != item._fs_item_id:
            maker.add(i, v)
        r = c.each()
    maker.finish()
    filesys.rename(self._name_db + '.ndb', self._name_db)
    path = os.path.join(self._path, item._fs_item_id)
    try:
        shutil.rmtree(path)
    except OSError, err:
        raise CouldNotDestroyError("Could not destroy item '%r' [errno: %d]" % (
            item.name, err.errno))
def write(self, outfile=""): if outfile.endswith(".cdb"): self.msg("Writing to CDB: %s..." % (outfile)) out = cdb.cdbmake(outfile, outfile+".tmp") for (w, poss) in self.dict.iteritems(): s = map(lambda pf:"%s:%s" % pf, poss.iteritems()) out.add(w, ",".join(s)) out.finish() else: self.msg("Writing to plaintext: %s..." % (outfile)) if outfile: fp = file(outfile, "w") else: fp = sys.stdout for (w, poss) in self.dict.iteritems(): s = map(lambda pf:"%s:%s" % pf, poss.iteritems()) fp.write(w+"\t"+",".join(s)+"\n") fp.close() return
def add(self, key, value):
    if self.record_counter % self.fetch == 0:
        proc = subprocess.Popen(['wc', '-c', self.tmpfile],
                                stdout=subprocess.PIPE)
        size = proc.stdout.read().strip().split(' ')[0]
        if int(size) > self.limit_file_size:
            self.cdb.finish()
            del self.cdb
            self.num_of_cdbs += 1
            dbnamei = "{}.{}".format(self.dbname, self.num_of_cdbs)
            print "processing {}".format(dbnamei)
            dbnamei_tmp = dbnamei + ".tmp"
            self.tmpfile = dbnamei_tmp
            self.cdb = cdb.cdbmake(dbnamei, dbnamei_tmp)
            self.record_counter = 0
            # record the first key of each split cdb
            filebase = os.path.basename(dbnamei)
            self.keymap.write(u"{} {}\n".format(key, filebase))
    self.record_counter += 1
    self.cdb.add(key.encode(self.encoding), value)
def build_cdb(filename):
    """Build a cdb file from a text file."""
    import cdb
    try:
        cdbname = filename + '.cdb'
        tempfile.tempdir = os.path.dirname(filename)
        tmpname = os.path.split(tempfile.mktemp())[1]
        cdb = cdb.cdbmake(cdbname, cdbname + '.' + tmpname)
        for line in file_to_list(filename):
            linef = line.split()
            key = linef[0].lower()
            try:
                value = linef[1]
            except IndexError:
                value = ''
            cdb.add(key, value)
        cdb.finish()
    except:
        return 0
    else:
        return 1
def create_db(f, db_fname):
    '''Write out db of headers'''
    if f.endswith('.gz'):
        fh = gzip.open(f, 'rb')
    else:
        fh = open(f, 'r')
    if self.fqtype[0] == 'Illumina1.4':
        fh_headers = (x.strip()[1:-2] for i, x in enumerate(fh) if not (i % 4))
    elif self.fqtype[0] == 'Illumina1.8':
        fh_headers = (x.split(' ')[0][1:] for i, x in enumerate(fh) if not (i % 4))
    elif self.fqtype[0] == 'IlluminaSRA':
        fh_headers = (x.split(' ')[1][:-3] for i, x in enumerate(fh) if not (i % 4))
    else:
        sys.stderr.write('Header encoding not determined: %s\n' % self.fqtype[0])
        # bail out rather than fall through with fh_headers undefined
        raise ValueError('unknown fastq header encoding: %s' % self.fqtype[0])
    db = cdb.cdbmake(db_fname, db_fname + '.tmp')
    for h in fh_headers:
        db.add(h, 'T')
    db.finish()
    del db
def __init__(self, dbname, keyMapFile, limit_file_size=LFS_DEFAULT,
             fetch=1000000, encoding='utf-8'):
    # the options.
    self.dbname = dbname
    # used by CDB_Reader to decide which cdb includes the query key
    self.keyMapFile = keyMapFile
    self.limit_file_size = limit_file_size
    # determines how often to check if current cdb size exceeds the limit
    self.fetch = fetch
    self.record_counter = 0
    self.num_of_cdbs = 0
    self.encoding = encoding
    dbname = "{}.{}".format(self.dbname, self.num_of_cdbs)
    print "processing {}".format(dbname)
    dbname_tmp = dbname + ".tmp"
    self.tmpfile = dbname_tmp
    self.cdb = cdb.cdbmake(dbname, dbname_tmp)
    dbdir = os.path.dirname(self.dbname)
    keyMapPath = "{}/{}".format(dbdir, keyMapFile)
    self.keymap = codecs.open(keyMapPath, 'w', self.encoding)
def _rename_item_locked(self, arg):
    item, newname = arg
    nn = newname.encode('utf-8')
    npath = os.path.join(self._path, item._fs_item_id, 'name')
    c = cdb.init(self._name_db)
    maker = cdb.cdbmake(self._name_db + '.ndb', self._name_db + '.tmp')
    r = c.each()
    while r:
        i, v = r
        if i == nn:
            raise ItemAlreadyExistsError("Target item '%r' already exists!" % newname)
        elif v == item._fs_item_id:
            maker.add(nn, v)
        else:
            maker.add(i, v)
        r = c.each()
    maker.finish()
    filesys.rename(self._name_db + '.ndb', self._name_db)
    nf = open(npath, mode='wb')
    nf.write(nn)
    nf.close()
def pack_tree(cdb_file, base_path):
    exclude_list = ['.svn']
    version_map = {}
    cdb_maker = cdb.cdbmake(cdb_file, cdb_file + '.tmp')
    base_path = os.path.abspath(base_path)
    for (path, dir_list, file_list) in os.walk(base_path):
        for dirname in exclude_list:
            if dirname in dir_list:
                dir_list.remove(dirname)
        for filename in file_list:
            relative_dir = path[len(base_path):]
            if not relative_dir:
                relative_dir = '/'
                # print 'no relative_dir', path, filename
            absolute_path = os.path.join(path, filename)
            relative_path = os.path.join(relative_dir, filename)
            f = open(absolute_path)
            data = f.read()
            f.close()
            cdb_maker.add(relative_path, data)
    cdb_maker.finish()
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--val-type', default='int')
    parser.add_argument('in_file')
    parser.add_argument('out_file')
    args = parser.parse_args()
    str_to_val, val_to_str = {
        'int': (int, struct.Struct('<Q').pack),
        'float': (float, struct.Struct('<f').pack),
        'prych_hex': (hexstr_to_list, compress_hex_list),
    }[args.val_type]
    if args.out_file.endswith('.mcdb'):
        db = mcdb.make(args.out_file)
    else:
        db = cdb.cdbmake(args.out_file, args.out_file + '.tmp')
    with open(args.in_file, 'r') as f:
        f.seek(0, os.SEEK_END)
        pb = progressbar.ProgressBar(maxval=f.tell())
        pb.start()
        f.seek(0)
        for l in f:
            k, v = l.strip().split(None, 1)
            v = val_to_str(str_to_val(v))
            db.add(k, v)
            pb.update(f.tell())
        pb.finish()
    db.finish()
def write_pairs(self, f1, f2):
    '''Parse through two paired files and only write if both pairs are present'''

    def intersect(a, b):
        '''Intersection between lists'''
        return list(set(a) & set(b))

    def rm_files(patterns):
        '''Remove files using glob given as list of patterns'''
        import glob
        import os
        for p in patterns:
            files = glob.glob(p)
            if len(files) == 0:
                pass
            else:
                map(os.remove, files)

    def write_out(db_common, f, o):
        '''Write out reads'''
        if self.gz:
            fh = open(f, 'r')
            out = gzip.open(o + '.gz', 'wb')
        else:
            fh = open(f, 'r')
            out = open(o, 'w')
        written_count = 0
        total_count = 0
        for (title, sequence, quality) in FastqGeneralIterator(fh):
            total_count += 1
            if db_common.has_key(title[:-2]):
                out.write('@%s\n%s\n+\n%s\n' % (title, sequence, quality))
                written_count += 1
        # use float arithmetic so the percentage is not truncated to zero
        sys.stderr.write('%s: Total %i, Written %i (%.1f%%)\n' %
                         (f, total_count, written_count,
                          written_count * 100.0 / total_count))
        fh.close()
        out.close()

    def create_db(f, db_fname):
        '''Write out db of headers'''
        fh = open(f, 'r')
        fh_headers = (x.strip()[1:-2] for i, x in enumerate(fh) if not (i % 4))
        db = cdb.cdbmake(db_fname, db_fname + '.tmp')
        for h in fh_headers:
            db.add(h, 'T')
        db.finish()
        del db

    ## get headers from both trimmed files ##
    # strip the /2 or /1 and grab only the headers
    # write in dbm to minimize memory usage
    # create db's (parallel)
    rand = ''.join(random.choice(string.ascii_uppercase + string.digits)
                   for x in range(36))
    db1_fname = 'db1_%s' % rand
    db2_fname = 'db2_%s' % rand
    jobs = []
    p = multiprocessing.Process(target=create_db, args=(f1, db1_fname))
    p.start()
    jobs.append(p)
    p = multiprocessing.Process(target=create_db, args=(f2, db2_fname))
    p.start()
    jobs.append(p)
    # wait for jobs to finish
    for job in jobs:
        job.join()

    ## get headers that are in both trimmed files ##
    db1 = cdb.init(db1_fname)
    db2 = cdb.init(db2_fname)
    common = intersect(db1.keys(), db2.keys())
    dbcommon_fname = 'dbcommon_%s' % rand
    db_common = cdb.cdbmake(dbcommon_fname, dbcommon_fname + '.tmp')
    for h in common:
        db_common.add(h, 'T')
    db_common.finish()
    del db_common

    ## get headers that are in only one trimmed file ##
    symdiff = set(db1.keys()).symmetric_difference(set(db2.keys()))
    dbdiff_fname = 'dbdiff_%s' % rand
    db_diff = cdb.cdbmake(dbdiff_fname, dbdiff_fname + '.tmp')
    for h in symdiff:
        db_diff.add(h, 'T')
    db_diff.finish()
    del db_diff

    ## open common db ##
    db_common = cdb.init(dbcommon_fname)
    jobs = []
    p = multiprocessing.Process(target=write_out, args=(db_common, f1, self.o[0]))
    p.start()
    jobs.append(p)
    p = multiprocessing.Process(target=write_out, args=(db_common, f2, self.o[1]))
    p.start()
    jobs.append(p)

    ## open single db ##
    self.single = [self.o[0] + '.single', self.o[1] + '.single']
    db_diff = cdb.init(dbdiff_fname)
    p = multiprocessing.Process(target=write_out, args=(db_diff, f1, self.single[0]))
    p.start()
    jobs.append(p)
    p = multiprocessing.Process(target=write_out, args=(db_diff, f2, self.single[1]))
    p.start()
    jobs.append(p)
    # wait for jobs to finish
    for job in jobs:
        job.join()
    rm_files([db1_fname, db2_fname, dbcommon_fname, dbdiff_fname, f1, f2])
def main():
    reader_format = 'pb'
    delim = '\t'
    fields = []
    key = None
    typename = ""
    pb2file = None
    pb2codec = None
    indextype = None
    indexer = None
    outfile = None
    tempfile = None
    fin = sys.stdin
    infile = '-'
    verbose = 0
    opts, args = getopt.getopt(sys.argv[1:], 'R:F:d:p:k:i:o:t:m:v')
    for o, a in opts:
        if o == '-R':
            reader_format = a
        elif o == '-F':
            fields = a.split(',')
        elif o == '-d':
            delim = a
        elif o == '-p':
            pb2file = a
        elif o == '-k':
            key = a
        elif o == '-m':
            typename = a
        elif o == '-o':
            outfile = a
        elif o == '-t':
            tempfile = a
        elif o == '-i':
            indextype = a
        elif o == '-v':
            verbose += 1
    if len(args):
        infile = args.pop(0)
        fin = file(infile)
    if key is None:
        raise Exception("missing key parameter, specify with -k")

    # create the indexer object
    if indextype == 'cdb':
        import cdb
        if not outfile:
            outfile = "%s-%s-%s.idx" % (infile, key, indextype)
        if not tempfile:
            tempfile = "%s.tmp" % outfile
        indexer = cdb.cdbmake(outfile, tempfile)
    elif indextype is None:
        raise Exception("missing index type parameter, specify with -i")

    # create the stream reader
    if reader_format == 'pb':
        import lwpb.stream
        import lwpb.codec
        pb2codec = lwpb.codec.MessageCodec(pb2file=pb2file, typename=typename)
        reader = lwpb.stream.StreamReader(fin, codec=pb2codec)
    elif reader_format == 'txt':
        import percent.stream
        import percent.codec
        txtcodec = percent.codec.PercentCodec(fields, delim)
        reader = percent.stream.PercentCodecReader(fin, txtcodec)
    else:
        raise Exception("bad reader format")

    # index all the records
    for record in reader:
        indexkey = str(record[key])
        indexval = str(reader.current_offset)
        if verbose:
            print >>sys.stderr, indexkey
        indexer.add(indexkey, indexval)
    indexer.finish()
    return 0
def __init__(self):
    fd, self.fn = mkstemp('.cdb', dir=os.getcwd())
    os.close(fd)
    self.maker = cdb.cdbmake(self.fn, self.fn + '.tmp')
import cdb

TRIPLES_FILE_PATH = 'top_100000_triples.txt'
OUTPUT_PROPERTIES_DB = 'top_100000_properties.cdb'
OUTPUT_VALUES_DB = 'top_100000_values.cdb'

properties_db = cdb.cdbmake(OUTPUT_PROPERTIES_DB, OUTPUT_PROPERTIES_DB + '.tmp')
values_db = cdb.cdbmake(OUTPUT_VALUES_DB, OUTPUT_VALUES_DB + '.tmp')

print "Loading DBPedia triples..."
for triple in open(TRIPLES_FILE_PATH, 'r'):
    subject, prop, value = triple.split('|$|')
    properties_db.add(subject, prop)
    values_db.add(subject + '|$|' + prop, value.rstrip('\n'))
print "Done"

properties_db.finish()
print "Properties cdb created"
values_db.finish()
print "Values cdb created"
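# The loader above stores one record per (subject, property) pair under a
# repeated subject key, so a read-back would use getall() to recover the
# property list. A hedged sketch, not part of the original script
# ('Berlin' is a hypothetical subject for illustration):
import cdb

props = cdb.init('top_100000_properties.cdb')
vals = cdb.init('top_100000_values.cdb')
for prop in props.getall('Berlin'):            # all properties of this subject
    print prop, '=>', vals.get('Berlin|$|' + prop)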
#!/usr/bin/env python
import zipfile
import csv
import cdb
import sys

csv.field_size_limit(sys.maxsize)

if __name__ == '__main__':
    latlon = cdb.cdbmake('latlon.cdb', 'latlon.cdb.tmp')
    geonames = cdb.cdbmake('geonames.cdb', 'geonames.cdb.tmp')
    for fn in sys.argv[1:]:
        zf = zipfile.ZipFile(fn, 'r')
        for name in zf.namelist():
            if name == 'readme.txt':
                continue
            info = zf.getinfo(name)
            print "Reading ", name, " ... ", info.file_size
            f = zf.open(name, 'r')
            try:
                reader = csv.reader(f, delimiter="\t")
                for row in reader:
                    geonameid = row[0]
                    name = row[1]
                    lat = row[4]
                    lon = row[5]
                    cl = row[6]
                    cc = row[8]
            os.mkdir(ipath)
            done = True
        except OSError, err:
            if err.errno != errno.EEXIST:
                raise
        if cntr > 2 and not done and self._itemspace <= 2 ** 31:
            self._itemspace *= 2
            cntr = 0
        elif cntr > 20:
            # XXX: UnexpectedBackendError() that propagates to user?
            raise Exception('Item space full!')
    nn = item.name.encode('utf-8')
    c = cdb.init(self._name_db)
    maker = cdb.cdbmake(self._name_db + '.ndb', self._name_db + '.tmp')
    r = c.each()
    while r:
        i, v = r
        if i == nn:
            # Oops. This item already exists! Clean up and error out.
            maker.finish()
            os.unlink(self._name_db + '.ndb')
            os.rmdir(ipath)
            if newrev is not None:
                os.unlink(newrev)
            raise ItemAlreadyExistsError("Item '%r' already exists!" % item.name)
        else:
            maker.add(i, v)
        r = c.each()
    maker.add(nn, itemid)
def __init__(self, filename):
    self.db = cdb.cdbmake(filename, filename + '.tmp')