def kyotocabinet_fetch(limit):
    """Benchmark random reads against the Kyoto Cabinet file for *limit*.

    Opens /tmp/test_py_benchmark_<limit>.kch (previously populated by
    kyotocabinet_store) and performs *limit* random point lookups.

    Returns True on success, False if the DB cannot be opened or a
    lookup comes back empty/missing.
    """
    path = "/tmp/test_py_benchmark_%s.kch" % limit
    db = kyotocabinet.DB()
    flags = kyotocabinet.DB.OWRITER
    if not db.open(path, flags):
        return False
    try:
        for i in range(0, limit):
            # NOTE(review): randrange(0, limit - 1) yields 0..limit-2, so
            # key limit-1 is never fetched — kept as-is to preserve the
            # original benchmark distribution; confirm if intentional.
            k = str(random.randrange(0, limit - 1))
            v = db.get(k)
            # get() returns None for a missing key; the original len(v)
            # raised TypeError here instead of reporting failure.
            if v is None or len(v) < 1:
                return False
    finally:
        # Close on every exit path; the original leaked the handle on
        # the early `return False`.
        db.close()
    return True
def kyotocabinet_store(limit):
    """Benchmark sequential writes into the Kyoto Cabinet file for *limit*.

    Creates (truncating any previous run) /tmp/test_py_benchmark_<limit>.kch
    and stores *limit* keys "0".."limit-1", each mapped to a random value.

    Returns True on success, False if the DB cannot be opened or a set()
    call fails.
    """
    path = "/tmp/test_py_benchmark_%s.kch" % limit
    db = kyotocabinet.DB()
    flags = (kyotocabinet.DB.OWRITER
             | kyotocabinet.DB.OCREATE
             | kyotocabinet.DB.OTRUNCATE
             | kyotocabinet.DB.OTRYLOCK)
    if not db.open(path, flags):
        return False
    try:
        for i in range(0, limit):
            k = str(i)
            v = str(random.randrange(0, 65535))
            if not db.set(k, v):
                return False
    finally:
        # Close on every exit path; the original leaked the handle when
        # db.set() failed mid-loop.
        db.close()
    return True
def _create_index(self, surface_map_file, entity_list_file,
                  surface_index_name, mid_offset_index_name):
    """Build the on-disk surface-form and mid-offset indexes.

    Pass 1 reads entity_list_file (one mid per tab-separated line) via
    mmap and records the byte offset of each mid's line. Pass 2 streams
    surface_map_file (surface_form \\t score \\t mid) and, for every
    surface form, stores a packed array of (offset, score) double pairs
    into a Kyoto Cabinet DB at surface_index_name. Finally the
    mid -> offset map is written to a second DB at mid_offset_index_name.

    NOTE(review): this block mixes the module-level `logging` functions
    with a `logger` object; presumably intentional but worth unifying.
    NOTE(review): line.decode('utf-8') on a text-mode file and
    dict.iteritems() imply this is Python 2 code.
    """
    logging.info("Generating entities and surface index.")
    num_lines = 0
    logger.info("Reading entity offsets.")
    mid_offsets = dict()
    # Remember the offset for each entity.
    with open(entity_list_file, 'r') as f:
        # mmap lets us ask for the byte offset (tell) before each
        # readline, which a buffered text file would not report reliably.
        mm = mmap.mmap(f.fileno(), 0, prot=mmap.PROT_READ)
        offset = mm.tell()
        line = mm.readline()
        while line:
            num_lines += 1
            if num_lines % 1000000 == 0:
                logger.info('Read %s lines' % num_lines)
            line = line.decode('utf-8')
            cols = line.split('\t')
            # First column is the mid; its line-start offset is the value
            # later stored (as a double) alongside each surface form.
            mid = cols[0]
            mid_offsets[mid] = offset
            offset = mm.tell()
            line = mm.readline()
    s_index_db = kyotocabinet.DB()
    # Tuning suffix: ~20GB memory map, 200M buckets, linked-list opts.
    s_index_db.open(
        surface_index_name + '#msiz=20000000000#bnum=200000000#opts=l')
    logging.info("Creating surface map on disk.")
    num_lines = 0
    # We now write a list of (offset, score)... floats for
    # each surface form.
    num_not_found = 0
    with open(surface_map_file, 'r') as f:
        last_surface_form = None
        surface_form_entries = array.array('d')
        for line in f:
            num_lines += 1
            try:
                cols = line.decode('utf-8').split('\t')
                surface_form = cols[0]
                score = float(cols[1])
                mid = cols[2].strip()
                # Raises KeyError if the mid was absent from the entity
                # list — handled below as a counted warning, not a crash.
                offset = float(mid_offsets[mid])
                # Input is grouped by surface form: when the form changes,
                # flush the accumulated (offset, score) pairs for the
                # previous form and start a fresh array.
                if surface_form != last_surface_form:
                    if surface_form_entries:
                        # NOTE(review): array.tostring() is deprecated in
                        # Python 3 (use tobytes()); fine under Python 2.
                        s_index_db.set(last_surface_form,
                                       surface_form_entries.tostring())
                    last_surface_form = surface_form
                    surface_form_entries = array.array('d')
                surface_form_entries.append(offset)
                surface_form_entries.append(score)
            except KeyError:
                num_not_found += 1
                # Cap the per-mid warnings at 100 to keep logs readable.
                if num_not_found < 100:
                    logger.warn(
                        "Mid %s appears in surface map but not "
                        "in entity list." % mid)
                elif num_not_found == 100:
                    logger.warn(
                        "Suppressing further warnings about unfound mids.")
            if num_lines % 1000000 == 0:
                logger.info(
                    'Stored %s surface-form->entity pairs.' % num_lines)
        # Flush the trailing surface form after the loop ends.
        if surface_form_entries:
            s_index_db.set(last_surface_form,
                           surface_form_entries.tostring())
    if num_not_found > 0:
        logger.warn(
            "%s entries of an mid in surface map but mid not "
            "in entity list."
            % num_not_found)
    # store an additional index from mid -> offset
    s_index_db.close()
    mid_offset_db = kyotocabinet.DB()
    mid_offset_db.open(
        mid_offset_index_name + '#msiz=20000000000#bnum=200000000#opts=l')
    logging.info("Creating entity offset index on disk.")
    for mid, offset in mid_offsets.iteritems():
        mid_offset_db.set(mid, offset)
    logging.info("Done.")
    mid_offset_db.close()
def read_db(DB):
    """Open (creating if necessary) the kyotocabinet database at path DB.

    If the open fails, an error message is written to stderr; the DB
    handle is returned either way, exactly as before.
    """
    handle = kyotocabinet.DB()
    mode = kyotocabinet.DB.OWRITER | kyotocabinet.DB.OCREATE
    opened = handle.open(DB, mode)
    if not opened:
        sys.stderr.write('ERROR: failed to open: %s\n' % handle.error())
    return handle
def __init__(self, path, parse=lambda v: v, unparse=lambda v: v):
    """Wrap the Kyoto Cabinet database at *path*.

    parse/unparse are applied to values on the way out/in; the defaults
    are identity functions. Raises the DB's error object if the open
    fails.
    """
    self.fk = kyoto.DB()
    opened = self.fk.open(path, _OPEN_MODE)
    if not opened:
        raise self.fk.error()
    self.parse = parse
    self.unparse = unparse
def __init__(self):
    """Open the poll database named by Settings.database_name.

    Raises InvalidInputException with the class-level db_error message
    when the database cannot be opened.
    """
    database = kyotocabinet.DB()
    self.db = database
    opened = database.open(Settings.database_name)
    if not opened:
        raise InvalidInputException(PollContainer.db_error)
def __init__(self, dbdir, mode=Mode.READONLY):
    """dbdir -> opens a descriptor storage

    >>> store = DescriptaStore(db)
    >>> len(store)

    # access the options used to create this store
    # (this is optional and may not exist)
    >>> store.options
    ...

    Iterate through molecule data ([moldata, <optional name>], descriptors)
    >>> for moldata, descriptors in store:
    >>>     pass

    Iterate through only the descriptors
    >>> for i, prop in enumerate(store.descriptors()):
    >>>     pass

    If name indexed:
    >>> row = store.lookupName("ZWIMER-03065")

    If inchi key indexed (inchi keys may collide, so this can return
    multiple indices):
    >>> rows = store.lookupInchiKey("BCWYEXBNOWJQJV-UHFFFAOYSA-N")
    """
    # NOTE(review): "desctiporDB" is a typo for "descriptorDB" — kept
    # because external code may reference the attribute by this name.
    self.desctiporDB = dbdir
    self.db = raw.RawStore(dbdir, mode=mode)
    self.index = MolFileIndex.MolFileIndex(
        os.path.join(dbdir, "__molindex__"))

    # Optional inchikey -> row lookup (kyotocabinet-backed).
    inchi = os.path.join(dbdir, "inchikey.kch")
    if os.path.exists(inchi):
        if not kyotocabinet:
            print(
                "Inchi lookup exists, but kyotocabinet is not installed.",
                file=sys.stderr)
            # BUGFIX: the attribute was previously left unset on this
            # path, causing AttributeError on any later access; mirror
            # the name-db branch below and default it to None.
            self.inchikey = None
        else:
            self.inchikey = kyotocabinet.DB()
            if mode == Mode.READONLY:
                self.inchikey.open(inchi, kyotocabinet.DB.OREADER)
            else:
                self.inchikey.open(inchi, kyotocabinet.DB.OWRITER)
    else:
        self.inchikey = None

    # Optional name -> row lookup (kyotocabinet-backed).
    name = os.path.join(dbdir, "name.kch")
    if os.path.exists(name):
        if not kyotocabinet:
            logging.warning(
                "Name lookup exists, but kyotocabinet is not installed.")
            self.name = None
        else:
            self.name = kyotocabinet.DB()
            if mode == Mode.READONLY:
                self.name.open(name, kyotocabinet.DB.OREADER)
            else:
                self.name.open(name, kyotocabinet.DB.OWRITER)
    else:
        print("Couldn't open name db", name, file=sys.stderr)
        self.name = None

    # Creation-time options are optional and may not exist on disk.
    self.options = None
    optionsfile = os.path.join(dbdir, "__options__")
    if os.path.exists(optionsfile):
        with open(optionsfile, 'rb') as f:
            self.options = pickle.load(f)

    # index the calculated flags: expose only the non-"_calculated"
    # columns as data names/indices.
    datacols = [(i, name) for i, name in enumerate(self.db.colnames)
                if "_calculated" not in name]
    self.datanames = [name for i, name in datacols]
    self.dataindices = [i for i, name in datacols]
def __init__(self):
    """Initialize an empty, unopened state.

    The counter starts at zero, the cabinet handle is created but not
    opened, and no filename is associated yet.
    """
    self.__db = kyotocabinet.DB()
    self.__fname = None
    self.__counter = 0
def __init__(self, kyoto_db): self.db = kyotocabinet.DB() if not self.db.open(kyoto_db, kyotocabinet.DB.OWRITER | kyotocabinet.DB.OCREATE): print "cannot open db" sys.exit(2)
def make_store(options):
    """Build a descriptastorus store from options.smilesfile.

    Creates the storage directory, indexes the molfile, computes
    descriptors in a multiprocessing pool, and optionally builds
    inchikey and name lookup cabinets.

    Returns False (after logging) when inchikey indexing is requested
    but kyotocabinet is unavailable; raises IOError if the storage
    directory already exists.
    """
    # Reset the module-level generator list and install the requested one.
    while props:
        props.pop()
    props.append(MakeGenerator(options.descriptors.split(",")))
    properties = props[0]
    # to test molecule
    inchiKey = options.index_inchikey
    if inchiKey and not kyotocabinet:
        logging.warning(
            "Indexing inchikeys requires kyotocabinet, please install kyotocabinet"
        )
        return False

    # make the storage directory
    if os.path.exists(options.storage):
        raise IOError("Directory for descriptastorus already exists: %s" %
                      options.storage)

    # prepare the Pool
    if options.numprocs == -1:
        num_cpus = multiprocessing.cpu_count()
    else:
        # never use more than the maximum number
        num_cpus = min(int(options.numprocs), multiprocessing.cpu_count())
    pool = multiprocessing.Pool(num_cpus)

    os.mkdir(options.storage)
    with open(os.path.join(options.storage, "__options__"), 'wb') as f:
        pickle.dump(vars(options), f)

    # index the molfile
    indexdir = os.path.join(options.storage, "__molindex__")
    sm = MolFileIndex.MakeSmilesIndex(options.smilesfile, indexdir,
                                      sep=options.seperator,
                                      hasHeader=options.hasHeader,
                                      smilesColumn=options.smilesColumn,
                                      nameColumn=options.nameColumn)
    logging.info("Creating descriptors for %s molecules...", sm.N)
    numstructs = sm.N
    s = raw.MakeStore(properties.GetColumns(), sm.N, options.storage,
                      checkDirectoryExists=False)
    try:
        if options.index_inchikey:
            logging.info("Creating inchi store")
            cabinet = kyotocabinet.DB()
            inchi = os.path.join(options.storage, "inchikey.kch")
            cabinet.open(inchi,
                         kyotocabinet.DB.OWRITER | kyotocabinet.DB.OCREATE)
        else:
            logging.warning("Not logging inchi (see --index-inchkey)")

        if options.nameColumn is not None:
            logging.info("Creating name store")
            name_cabinet = kyotocabinet.DB()
            name = os.path.join(options.storage, "name.kch")
            name_cabinet.open(
                name, kyotocabinet.DB.OWRITER | kyotocabinet.DB.OCREATE)
        else:
            logging.warning("Not storing name lookup (see --nameColumn)")

        logging.info("Number of molecules to process: %s", numstructs)

        count = 0
        numOutput = 0
        batchsize = options.batchsize
        badColumnWarning = False
        inchies = {}
        names = {}
        while 1:
            # Fetch the next batch of jobs (with names when indexed).
            if options.nameColumn is not None:
                joblist, count = getJobsAndNames(sm, options, count,
                                                 numstructs, batchsize,
                                                 num_cpus, names)
            else:
                joblist, count = getJobs(sm, options, count, numstructs,
                                         batchsize, num_cpus)

            if not joblist:
                break

            t1 = time.time()
            if options.index_inchikey:
                results = pool.map(processInchi, joblist)
            else:
                results = pool.map(process, joblist)

            procTime = time.time() - t1

            # Warn once if the very first batch produced nothing —
            # usually a mis-set smilesColumn.
            for result in results:
                numOutput += len(result)
                if numOutput == 0 and not badColumnWarning and len(
                        result) == 0:
                    badColumnWarning = True
                    logging.warning(
                        "no molecules processed in batch, check the smilesColumn"
                    )
                    logging.warning("First 10 smiles:\n")
                    logging.warning("\n".join([
                        "%i: %s" % (i, sm.get(i))
                        for i in range(0, min(sm.N, 10))
                    ]))

            flattened = [val for sublist in results for val in sublist]
            flattened.sort()

            t1 = time.time()
            # flatten the results so that we store them in index order
            for result in flattened:
                if options.index_inchikey:
                    i, v, inchi, key = result
                    if v:
                        try:
                            s.putRow(i, v)
                        except ValueError:
                            logging.exception("Columns: %s\nData: %r",
                                              properties.GetColumns(), v)
                            raise
                    # BUGFIX: membership must be tested on `key` — the
                    # dict is keyed by inchikey.  The original tested
                    # `inchi in inchies`, so a repeated key went down the
                    # append branch only by accident of the inchi string,
                    # losing/confusing colliding rows.
                    if key in inchies:
                        inchies[key].append(i)
                    else:
                        inchies[key] = [i]
                elif options.nameColumn is not None:
                    i, v = result
                    if v:
                        s.putRow(i, v)
            storeTime = time.time() - t1
            logging.info(
                "Done with %s out of %s. Processing time %0.2f store time %0.2f",
                count, sm.N, procTime, storeTime)

        if options.index_inchikey:
            logging.info("Indexing inchies")
            t1 = time.time()
            for k in sorted(inchies):
                cabinet[k] = repr(inchies[k])
            logging.info("... indexed in %2.2f seconds", (time.time() - t1))

        if names:
            t1 = time.time()
            logging.info("Indexing names")
            for name in sorted(names):
                name_cabinet[name] = names[name]
            logging.info("... indexed in %2.2f seconds", (time.time() - t1))
    finally:
        sm.close()
        s.close()
        pool.close()
print 'Specify a test: redis-normal, redis-hashes, kyoto, tokyo' sys.exit(2) if sys.argv[1] == 'redis-normal': REDIS_SETGET = True elif sys.argv[1] == 'redis-hashes': REDIS_HSET = True elif sys.argv[1] == 'kyoto': KYOTO = True elif sys.argv[1] == 'tokyo': TOKYO = True if REDIS_SETGET or REDIS_HSET: p = r.pipeline() elif KYOTO: k = kyotocabinet.DB() if not k.open("/tmp/casket.kch", kyotocabinet.DB.OWRITER | kyotocabinet.DB.OCREATE): print "cannot open db" sys.exit(2) for i in range(0, NUM_ENTRIES): value = random.randint(0, MAX_VAL) if REDIS_SETGET: r.set(str(i), value) elif REDIS_HSET: bucket = int(i / 513) p.hset(bucket, i, value) elif KYOTO: k.set(str(i), value) if i % (NUM_ENTRIES/10) == 0:
def open(self):
    """Create/open the cabinet at self.__dbpath for writing.

    Raises PAWSError (wrapping the cabinet's own error) when the
    underlying open fails.
    """
    db = kyotocabinet.DB()
    self.__db = db
    mode = kyotocabinet.DB.OWRITER | kyotocabinet.DB.OCREATE
    if db.open(self.__dbpath, mode):
        return
    raise PAWSError('open error: ' + str(self.__db.error()))