def search(self, obj, exact=False, db=None):
    """
    Search the database for matches of [obj].

    Returns a list of matches in the tuple form:

    ("obj", {
        "filename_hash": string,
        "cryptographer": string,
        "key": string,
        "storage_provider": string,
        "bucket": string,
        "file_hash": string
    })

    Matching rules:

    * [exact] is True: only keys equal to [obj] are returned.
    * [obj] is None (and [exact] is not True): every entry is returned.
    * otherwise: every key that contains [obj] is returned.

    [db] overrides the database path; it defaults to ``self.db``.

    Since there should only ever be a single exact match for a path in the
    DB, a CstashCriticalException is raised if an exact search yields more
    than one element. This shouldn't be possible anyway, since the DB is a
    key/value store, but it's a safety measure.
    """
    db = db or self.db
    db_connection = SqliteDict(db, autocommit=True, flag='r')

    # Close the connection on every path, including the error path below.
    try:
        if exact is True:
            keys = [(k, db_connection[k]) for k in db_connection.keys() if obj == k]
        elif obj is None:
            keys = [(k, db_connection[k]) for k in db_connection.keys()]
        else:
            keys = [(k, db_connection[k]) for k in db_connection.keys() if obj in k]
    finally:
        db_connection.close()

    if exact is True and len(keys) > 1:
        # The interpolated half of the message must itself be an f-string,
        # otherwise "{obj}" and "{keys}" are emitted literally.
        raise exceptions.CstashCriticalException(message=(
            "Found more than a single match "
            f"for {obj} in the database:\n\n{keys}"))

    return keys
def adjust_evernote_font():
    """
    Re-apply the configured font settings to every Evernote note that needs it.

    A note is adjusted when it is not yet recorded in the local DB, or when
    the recorded font size / line height differ from the current ``conf``
    values. DB records for notes that no longer exist in Evernote are removed.
    """
    note_info = SqliteDict(conf.db.db_file, autocommit=True)
    try:
        # A set keeps the "is this guid still in Evernote?" check O(1).
        notes_in_evernote = set()
        for note in get_notes(get_notebooks()):
            guid = note.guid
            notes_in_evernote.add(guid)

            # Single keyed lookup instead of scanning all keys and then
            # reading the record twice.
            try:
                recorded = note_info[guid]
            except KeyError:
                recorded = None

            if recorded is None \
                    or recorded[FONT_SIZE] != conf.font_size \
                    or recorded[LINE_HEIGHT] != conf.line_height:
                adjust_note(note)
                note_info[guid] = {FONT_SIZE: conf.font_size,
                                   LINE_HEIGHT: conf.line_height}

        # Materialize the list first so that deletion does not run while the
        # key listing is still being consumed.
        guids_to_forget = [guid for guid in note_info.keys()
                           if guid not in notes_in_evernote]
        for guid in guids_to_forget:
            logging.debug("Delete guid from DB: {}".format(guid))
            del note_info[guid]
    finally:
        note_info.close()
class ImageCache:
    """
    Persistent cache of generated images, stored in ``<directory>/.mikula``.

    Each source filename maps to the tuple
    ``(timestamp, scaled, scaled_time, thumbnail, thumbnail_time)``; the
    special key ``"config"`` holds the configuration the cache was built with.
    """

    def __init__(self, directory):
        """Open (creating if necessary) the cache database under `directory`."""
        db_dirname = os.path.join(directory, ".mikula")
        if not os.path.isdir(db_dirname):
            os.mkdir(db_dirname)
        db_filename = os.path.join(db_dirname, "images.cache")
        self.cache = SqliteDict(db_filename)
        self.recent_lookup_ = None

    def reset(self):
        """Drop every cached entry and forget the last lookup."""
        self.cache.clear()
        self.recent_lookup_ = None

    def config_changed(self, config):
        """Return True when no config is stored or it differs from `config`."""
        # Keyed membership test; avoids listing every key of the database.
        if "config" not in self.cache:
            return True
        return self.cache["config"] != config

    def update_config(self, config):
        """Store `config` and commit."""
        self.cache["config"] = config
        self.cache.commit()

    def require_update(self, filename):
        """
        Return True when the derived images of `filename` must be regenerated.

        On a cache hit (both derived files exist and all three modification
        times match the stored ones) the derived paths are remembered for
        `get_filenames` and False is returned. In every other case the
        remembered paths are cleared and True is returned.
        """
        if filename not in self.cache:
            self.recent_lookup_ = None
            return True

        timestamp, scaled, scaled_time, thumbnail, thumbnail_time = \
            self.cache[filename]
        if os.path.exists(scaled) and os.path.exists(thumbnail):
            if os.path.getmtime(filename) == timestamp and \
                    os.path.getmtime(scaled) == scaled_time and \
                    os.path.getmtime(thumbnail) == thumbnail_time:
                self.recent_lookup_ = (scaled, thumbnail)
                return False

        self.recent_lookup_ = None
        return True

    def get_filenames(self):
        """Return ``(scaled, thumbnail)`` of the last cache hit, or None."""
        return self.recent_lookup_

    def update(self, filename, scaled, thumbnail):
        """Record the current modification times of the three files and commit."""
        timestamp = os.path.getmtime(filename)
        scaled_time = os.path.getmtime(scaled)
        thumbnail_time = os.path.getmtime(thumbnail)
        self.cache[filename] = (timestamp, scaled, scaled_time,
                                thumbnail, thumbnail_time)
        self.cache.commit()
def get_statistics_true_url(positives_negatives, urls_db):
    """
    Measure how often the true URL appears among a sample's candidates.

    Each element of `positives_negatives` must unpack into
    ``(entity, beg, end, true_url, context, negative_samples)``.

    For every element:

    * if `true_url` is among the (whitespace-stripped) negative samples it is
      counted as existing;
    * otherwise, if `true_url` is a key of the `urls_db` database, a
      ``entity<TAB>true_url`` line is appended to
      ``candidates_without_true_name1_.tsv``;
    * otherwise it is counted as not included.

    The three counters are printed. Returns the fraction of elements whose
    true URL was among the candidates, or 0.0 when the input is empty.
    """
    db = SqliteDict(urls_db, autocommit=False)
    try:
        # A set makes the per-sample membership test O(1).
        urls = set(db.keys())
    finally:
        db.close()

    count_exist = 0
    count_all = 0
    count_not_included = 0

    # The context manager guarantees the report file is flushed and closed.
    with codecs.open('candidates_without_true_name1_.tsv', 'a') as file:
        for positive_negative in positives_negatives:
            entity, _beg, _end, true_url, _context, negative_samples = positive_negative
            samples = {negative_sample.strip() for negative_sample in negative_samples}
            if true_url in samples:
                count_exist += 1
            elif true_url in urls:
                file.write(str(entity) + '\t' + str(true_url) + '\n')
            else:
                count_not_included += 1
            count_all += 1

    print(count_exist)
    print(count_all)
    print(count_not_included)

    if count_all == 0:
        # No samples at all: report a zero rate instead of dividing by zero.
        return 0.0
    return float(count_exist) / count_all
class CDataBase(object):
    """Thin key/value wrapper around the SqliteDict at ``./DB/my_db.sqlite``."""

    def __init__(self):
        """Open the database (auto-committing) and print its contents."""
        try:
            # Only meaningful when __init__ is re-run on a live instance; on
            # first construction there is nothing to close yet.
            self.close()
        except Exception:
            # Deliberately best-effort: failing to close an old handle must
            # not prevent opening a new one.
            pass
        self.mydict = SqliteDict('./DB/my_db.sqlite', autocommit=True)
        self.show()

    def set(self, key, value):
        """Store `value` under `key`."""
        self.mydict[key] = value

    def get(self, key):
        """Return the value stored under `key`, or None when it is absent."""
        try:
            return self.mydict[key]
        except KeyError:
            return None

    def show(self, start_with=''):
        """
        Print every entry whose key starts with `start_with`.

        With the default empty prefix all entries are printed.
        """
        for key, value in self.mydict.items():
            # str.find() returns 0 for a prefix match and -1 for no match, so
            # using it as a boolean inverts the filter; startswith() is the
            # test that is actually meant.
            if key.startswith(start_with):
                print(key, '\t', value, '\n')

    def clear(self):
        """Remove every entry."""
        self.mydict.clear()

    def close(self):
        """Close the underlying database."""
        self.mydict.close()
def create_completely_random(urls_db, contexts, phrases, n=10):
    """
    Build one sample per phrase, padded with `n` randomly chosen URLs.

    `phrases[key]` must unpack into ``(entity, beg, end, true_url)`` and
    `contexts` must provide an entry for the same key. Each returned sample is
    a list whose first element is
    ``(entity, beg, end, true_url, contexts[key])`` followed by up to `n`
    URLs taken from the keys of the `urls_db` database after a shuffle.

    Progress (the number of phrases, then a counter every 100 phrases) is
    printed to stdout.
    """
    db = SqliteDict(urls_db, autocommit=False)
    try:
        # Only the key list is needed; release the DB as soon as it is read.
        urls = list(db.keys())
    finally:
        db.close()

    keys = phrases.keys()
    print(len(keys))

    samples = []
    for count, key in enumerate(keys):
        if count % 100 == 0:
            print(count)
        entity, beg, end, true_url = phrases[key]
        negatives = [(entity, beg, end, true_url, contexts[key])]
        # Re-shuffle per phrase so every sample gets an independent draw.
        shuffle(urls)
        negatives.extend(urls[:n])
        samples.append(negatives)
    return samples
def create_keywords_from_url(url_db):
    """
    Map every URL key of the `url_db` database to a set of keywords.

    The keywords come from the last path segment of the URL: it is split on
    underscores and, when the segment contains an en dash (checked first) or a
    hyphen, each piece is further split on that character. Every word is then
    stripped of surrounding ',', '.', '(' and ')' (in that order), lower-cased
    and lemmatized.
    """
    url_keywords = {}
    db = SqliteDict(url_db, autocommit=False)
    lemmatizer = WordNetLemmatizer()

    for url in db.keys():
        entity = url.split('/')[-1]

        # Pick the secondary separator, if any; the en dash has priority.
        if "–" in entity:
            separator = '–'
        elif "-" in entity:
            separator = '-'
        else:
            separator = None

        words = entity.split('_')
        if separator is not None:
            words = [piece for chunk in words for piece in chunk.split(separator)]

        keywords = set()
        for word in words:
            cleaned = word.strip(',')
            cleaned = cleaned.strip('.')
            cleaned = cleaned.strip('(')
            cleaned = cleaned.strip(')')
            keywords.add(lemmatizer.lemmatize(cleaned.lower()))
        url_keywords[url] = keywords

    db.close()
    return url_keywords
def clear_db(db_path_shadow: str) -> None:
    """
    Delete every entry of the database at `db_path_shadow`, one key at a time.

    A progress bar is shown while deleting; the deletions are committed once
    at the end and the database is closed.
    """
    store = SqliteDict(db_path_shadow)
    banner = "Clearing db {}".format(db_path_shadow)
    print(banner)

    progress = tqdm(store.keys())
    for stale_key in progress:
        del store[stale_key]

    store.commit()
    store.close()
class DiskQueue():
    """
    Disk-backed work queue made of three SqliteDict stores under DIR_PATH.

    A key lives in exactly one of: ``todo`` (waiting), ``inProgress`` (handed
    out by `Next`) or ``seen`` (finished via `Done`). Keys already known to
    any of the three stores are ignored by `Push`.
    """

    DIR_PATH = './diskqueue'
    IN_PROGRESS_DB_NAME = 'inprogress.sqlite'
    TODO_DB_NAME = 'todo.sqlite'
    SEEN_DB_NAME = 'seen.sqlite'

    def __init__(self, load: bool = False):
        """
        Open the three stores.

        With ``load=False`` any existing store files are deleted first, so the
        queue starts empty. With ``load=True`` the previous state is kept and
        everything that was in progress is moved back to the todo store.
        """
        self.iterLock = threading.Lock()
        os.makedirs(self.DIR_PATH, exist_ok=True)

        if not load:
            for name in (self.IN_PROGRESS_DB_NAME, self.TODO_DB_NAME,
                         self.SEEN_DB_NAME):
                try:
                    os.remove(self._db_path(name))
                except OSError:
                    # Typically the file simply does not exist yet.
                    continue

        self.inProgress = SqliteDict(self._db_path(self.IN_PROGRESS_DB_NAME),
                                     autocommit=True)
        self.todo = SqliteDict(self._db_path(self.TODO_DB_NAME),
                               autocommit=True)
        self.seen = SqliteDict(self._db_path(self.SEEN_DB_NAME),
                               autocommit=True)

        # If we need to load state, add everything that was in progress to
        # the todo queue. The keys are snapshotted first because the loop
        # deletes from the store it iterates.
        if load:
            for key in list(self.inProgress.iterkeys()):
                self.todo[key] = True
                del self.inProgress[key]

    def _db_path(self, name):
        """Return the path of the store file `name` inside DIR_PATH."""
        return '{}/{}'.format(self.DIR_PATH, name)

    def Push(self, key):
        """Queue `key` unless it is already waiting, in progress or done."""
        if (key not in self.todo) and (key not in self.inProgress) \
                and (key not in self.seen):
            self.todo[key] = True

    def Next(self):
        """
        Move one waiting key to the in-progress store and return it.

        Returns None when nothing is waiting.
        """
        with self.iterLock:
            # iter() makes this work whether keys() yields an iterator or a
            # plain sequence/view.
            toReturn = next(iter(self.todo.keys()), None)
            # Compare against None explicitly: a falsy key such as '' or 0 is
            # still a real key and must not be left stuck in the todo store.
            if toReturn is not None:
                self.inProgress[toReturn] = True
                del self.todo[toReturn]
        return toReturn

    def Done(self, key):
        """Mark `key` as finished; it must currently be in progress."""
        self.seen[key] = True
        del self.inProgress[key]

    def Close(self):
        """Close all three stores."""
        self.inProgress.close()
        self.todo.close()
        self.seen.close()

    def IsDone(self):
        """Return True when nothing is waiting and nothing is in progress."""
        with self.iterLock:
            return len(self.todo) == 0 and len(self.inProgress) == 0
def tag(self, text, label):
    """
    Append ``text<TAB>label`` as a new line to the file ``self.f``.

    `label` must be a key of the 'labels' table of the database at
    ``self.db``; otherwise an Exception is raised and nothing is written.
    After writing, ``self.done()`` is called. Returns True on success.
    """
    db = SqliteDict(self.db, tablename='labels')
    try:
        # Keyed membership test; no need to list every label.
        known_label = label in db
    finally:
        # The handle is only needed for the check above.
        db.close()

    if not known_label:
        raise Exception('Label not found check your labels')

    with open(self.f, 'a+') as file:
        file.write(text + '\t' + label + '\n')
    self.done()
    return True
def test_default_reuse_existing_flag_c(self):
    """Re-opening of a database does not destroy it."""
    # given,
    fname = norm_file('tests/db/sqlitedict-override-test.sqlite')
    orig_db = SqliteDict(filename=fname)
    orig_db['key'] = 'value'
    orig_db.commit()
    orig_db.close()

    # verify, a second handle opened with the default flag still sees the data.
    next_db = SqliteDict(filename=fname)
    try:
        self.assertIn('key', next_db.keys())
        self.assertEqual(next_db['key'], 'value')
    finally:
        # Do not leak the handle into the following tests, even on failure.
        next_db.close()
def test_overwrite_using_flag_n(self):
    """Re-opening of a database with flag='n' destroys it all."""
    # given,
    fname = norm_file('tests/db/sqlitedict-override-test.sqlite')
    orig_db = SqliteDict(filename=fname, tablename='sometable')
    orig_db['key'] = 'value'
    orig_db.commit()
    orig_db.close()

    # verify, re-opening with flag='n' leaves none of the previous content.
    next_db = SqliteDict(filename=fname, tablename='sometable', flag='n')
    try:
        self.assertNotIn('key', next_db.keys())
    finally:
        # Do not leak the handle into the following tests, even on failure.
        next_db.close()
def test_overwrite_using_flag_w(self):
    """Re-opening of a database with flag='w' destroys only the target table."""
    # given, two tables in the same file, each holding one entry.
    fname = norm_file('tests/db/sqlitedict-override-test.sqlite')
    orig_db_1 = SqliteDict(filename=fname, tablename='one')
    orig_db_1['key'] = 'value'
    orig_db_1.commit()
    orig_db_1.close()

    orig_db_2 = SqliteDict(filename=fname, tablename='two')
    orig_db_2['key'] = 'value'
    orig_db_2.commit()
    orig_db_2.close()

    # verify, when re-opening table space 'one' with flag='w', we destroy
    # its contents. However, when re-opening table space 'two' with the
    # default flag, its contents remain.
    next_db_1 = SqliteDict(filename=fname, tablename='one', flag='w')
    try:
        self.assertNotIn('key', next_db_1.keys())
    finally:
        next_db_1.close()

    next_db_2 = SqliteDict(filename=fname, tablename='two')
    try:
        self.assertIn('key', next_db_2.keys())
    finally:
        next_db_2.close()
def load_embeddings(embeddings_path):
    """Opens the pre-trained word embeddings database.

    NOTE(review): `embeddings_path` is accepted for interface compatibility
    but is not used; the database is always opened from
    RESOURCE_PATH['WORD_EMBEDDINGS']. Confirm whether callers expect their
    argument to be honored before changing this.

    Args:
      embeddings_path - currently ignored (see note above).

    Returns:
      embeddings - open SqliteDict mapping words to vectors (the caller is
        responsible for closing it);
      embeddings_dim - dimension of the vectors, taken from the first entry.

    Raises:
      ValueError - if the database contains no entries.
    """
    embeddings = SqliteDict(RESOURCE_PATH['WORD_EMBEDDINGS'])

    # iter() makes this work whether keys() yields an iterator or a list; the
    # sentinel distinguishes "no entries" from any legitimate key.
    missing = object()
    first_word = next(iter(embeddings.keys()), missing)
    if first_word is missing:
        embeddings.close()
        raise ValueError("word embeddings database is empty")

    embeddings_dim = embeddings[first_word].shape[0]
    return embeddings, embeddings_dim
def search_for_query(query, sim_thresh=0.8) -> str:
    """
    Return the stored query most similar to `query`.

    Every key of the database at `query_map_path` is scored with `sim_query`;
    the first key with the highest score above zero wins. The winner is
    returned only when its score is strictly greater than `sim_thresh`,
    otherwise the empty string is returned.
    """
    store = SqliteDict(query_map_path)

    best_key, best_score = "", 0
    for candidate in store.keys():
        score = sim_query(candidate, query)
        if score > best_score:
            best_key, best_score = candidate, score

    store.close()
    return best_key if best_score > sim_thresh else ""
def predict(config):
    """
    Run the network over the train and test splits and pickle its outputs.

    The model weights are loaded from the snapshot database
    ``<config.output_dir>/dcl_snap.db`` when it is non-empty; an epoch of -1
    in ``config.predict.load_epoach`` selects the last key of that database
    (and is overwritten with it). The values named in
    ``config.predict.values_to_save`` (default: ``['softmax']``) are collected
    for every batch, concatenated, and written to
    ``<config.output_dir>/predict.pkl``.
    """
    device = torch.device('cuda')
    loader = DataLoader(
        TrainEvalDataset(
            config.dataset(split='train', **config.dataset_parameter), config),
        1000, False, num_workers=num_processor)
    test_loader = DataLoader(
        TrainEvalDataset(
            config.dataset(split='test', **config.dataset_parameter), config),
        1000, False, num_workers=num_processor)

    net = NetModel(config.net)
    net = net.to(device)

    storage_dict = SqliteDict(f'{config.output_dir}/dcl_snap.db')
    try:
        if len(storage_dict) > 0:
            kk = list(storage_dict.keys())
            if config.predict.load_epoach == -1:
                config.predict.load_epoach = kk[-1]
            net.load_state_dict(
                torch.load(BytesIO(storage_dict[config.predict.load_epoach]),
                           map_location=device),
                strict=True)
            logger.info(f'loading from epoach {config.predict.load_epoach}')
    finally:
        # The snapshot store is only needed while loading the weights.
        storage_dict.close()

    net.eval()

    keys = ['softmax']
    if config.predict.values_to_save is not None and len(
            config.predict.values_to_save) > 0:
        keys = config.predict.values_to_save
    outputs = {k: [] for k in keys}

    # Inference only: skip autograd bookkeeping to keep GPU memory flat.
    with torch.no_grad():
        for batch in tqdm(chain(loader, test_loader),
                          total=len(loader) + len(test_loader)):
            image, label = batch
            image = image.to(device)
            net_out = net(image)
            # Labels are merged in so they can be requested via `keys` too.
            net_out.update(label)
            for k in keys:
                outputs[k].append(net_out[k].detach().cpu())

    for k in keys:
        outputs[k] = torch.cat(outputs[k])

    # The context manager guarantees the pickle is flushed and closed.
    with open(f'{config.output_dir}/predict.pkl', 'wb') as out_file:
        pickle.dump(outputs, out_file)
def test_special_keys(self):
    """integer, float and/or tuple keys"""
    fixtures = [
        ('1', 1),
        (1, 'ONE'),
        (('a', 1), 'testtuple'),
        (frozenset([1, 2, '2']), 'testfrozenset'),
    ]

    db = SqliteDict()
    for special_key, payload in fixtures:
        db[special_key] = payload

    # Every key must read back its own value ('1' and 1 stay distinct).
    for special_key, payload in fixtures:
        assert db[special_key] == payload

    # This tests the reverse conversion
    stored_keys = list(db.keys())
    assert len(stored_keys) == 4
    for special_key, _ in fixtures:
        assert special_key in stored_keys
class Database:
    """Persistent list of echo messages stored under the key ``"echoes"``."""

    def __init__(self):
        """Open ``database.sqlite`` and make sure the echo list exists."""
        self.db = SqliteDict("database.sqlite")
        if "echoes" not in self.db:
            self.db["echoes"] = []
            self.db.commit()

    def add_to_echoes(self, echo_text: str):
        """Append `echo_text` unless it is already stored."""
        echoes = self.db["echoes"]
        if echo_text not in echoes:
            echoes.append(echo_text)
            # Reading a value yields a deserialized copy, so the modified
            # list has to be assigned back to be stored.
            self.db["echoes"] = echoes
            self.db.commit()

    def get_random_echo(self) -> str:
        """Return a random stored echo, or a greeting when there is none."""
        try:
            return choice(self.db["echoes"])
        except IndexError:
            return "Congrats, that's the first message in my database!"

    def get_stats(self) -> str:
        """Return a one-line summary with the number of stored echoes."""
        return f"Message count: {len(self.db['echoes'])}"

    def get_dump(self) -> BytesIO:
        """Return all echoes, newline-joined and UTF-8 encoded, as ``dump.txt``."""
        dump_bytes = BytesIO(
            bytes("\n".join(self.db['echoes']), encoding="utf-8"))
        dump_bytes.name = "dump.txt"
        return dump_bytes

    def overwrite(self, ow_bytes: BytesIO):
        """Replace the echo list with the newline-separated UTF-8 `ow_bytes`."""
        ow_bytes.seek(0)
        ow_list = ow_bytes.read().decode("utf-8").split("\n")
        self.db["echoes"] = ow_list
        self.db.commit()

    def remove_echo(self, echo_text: str) -> str:
        """Remove `echo_text` from the stored list and report the outcome."""
        echoes = self.db["echoes"]
        try:
            echoes.remove(echo_text)
        except ValueError:
            return "That text isn't in my database!"
        # Removing from the list returned by self.db[...] only changes a
        # deserialized copy; write it back so the removal is persisted.
        self.db["echoes"] = echoes
        self.db.commit()
        return "Successfully removed from my database!"
class SQLiteStore(KeyStore):
    '''
    A KeyStore backed by a table in a SQLite file.

    Typical usage::

        >>> store = SQLiteStore('file.db', table='store')
        >>> value = store.load(key)
        >>> store.dump(key, value)

    Values are serialized to compact, ASCII-only JSON with
    gramex.config.CustomJSONEncoder (thus handling datetime) and are read
    back as AttrDict objects. Keys are JSON encoded.
    '''

    def __init__(self, path, table='store', *args, **kwargs):
        super(SQLiteStore, self).__init__(*args, **kwargs)
        self.path = _create_path(path)
        from sqlitedict import SqliteDict

        def _to_json(value):
            # Compact separators, ASCII-only output.
            return json.dumps(value, separators=(',', ':'), ensure_ascii=True,
                              cls=CustomJSONEncoder)

        def _from_json(text):
            return json.loads(text, object_pairs_hook=AttrDict,
                              cls=CustomJSONDecoder)

        self.store = SqliteDict(
            self.path,
            tablename=table,
            autocommit=True,
            encode=_to_json,
            decode=_from_json,
        )

    def close(self):
        '''Close the underlying database.'''
        self.store.close()

    def flush(self):
        '''Run the base class flush, then commit the database.'''
        super(SQLiteStore, self).flush()
        self.store.commit()

    def keys(self):
        '''Return a generator over the escaped keys of the store.'''
        # Keys need to be escaped
        raw_keys = self.store.keys()
        return (self._escape(raw_key) for raw_key in raw_keys)

    def purge(self):
        '''Log the purge, then delegate to the base class.'''
        app_log.debug('Purging %s', self.path)
        super(SQLiteStore, self).purge()
def stat(dbname):
    """
    Print statistics about the database at `dbname`.

    Prints the number of records, then a histogram of the ``count`` component
    found in every record: each record is indexed by hole, and each hole maps
    to a ``(diff_total, count)`` pair. The histogram lines are
    ``" <occurrences> <count>"``, sorted by ascending occurrences.
    """
    database = SqliteDict(dbname)
    try:
        print(f"{len(database)} records")

        # count value -> number of (record, hole) pairs having that count
        counts = {}
        # The keys were previously decoded with eval() and the result thrown
        # away; they are not needed here, so they are no longer evaluated.
        for val in database.values():
            for hole_idx in val:
                _diff_total, count = val[hole_idx]
                counts[count] = counts.get(count, 0) + 1
    finally:
        database.close()

    print("distribution of moves ever played")
    for a, b in sorted(counts.items(), key=lambda v: v[1]):
        print(f" {b} {a}")
def create_db_from_dictdb(lookup_db_path, longabs_db_path, labels_db_path, db_name):
    """
    Merge three SqliteDict stores into a single ``graph`` table.

    A new table ``graph(node_id, long_abstracts, labels)`` is created in the
    SQLite file `db_name`. For every key of the lookup store one row is
    inserted: the lookup value becomes ``node_id`` and the values stored under
    the same key in the long-abstracts and labels stores fill the other two
    columns. Progress is printed, and the inserts committed, every 100000
    rows; a final commit follows the loop.

    All database handles are closed even when an error interrupts the import.
    """
    connection = sqlite3.connect(db_name)
    opened = []
    try:
        cursor = connection.cursor()
        cursor.execute(
            '''CREATE TABLE graph (node_id INTEGER PRIMARY KEY NOT NULL, long_abstracts TEXT, labels TEXT)'''
        )
        connection.commit()

        for path in (lookup_db_path, longabs_db_path, labels_db_path):
            opened.append(SqliteDict(path, autocommit=False))
        lookup_db, longabs_db, labels_db = opened

        for count, node in enumerate(lookup_db.keys()):
            longab = longabs_db[node]
            label = labels_db[node]
            node_id = lookup_db[node]
            cursor.execute('''INSERT INTO graph VALUES (?,?,?)''',
                           (node_id, longab, label))
            if count % 100000 == 0:
                print(count)
                connection.commit()
        connection.commit()
    finally:
        for handle in opened:
            handle.close()
        connection.close()
def filter_negative_samples_randomly(positives_negatives, url_db, n=10):
    """
    Reduce (or pad) the negative samples of every example to at most `n`.

    Each element of `positives_negatives` is indexed as
    ``[entity, beg, end, true_url, context, negative_samples]``; `beg` and
    `end` are converted to int. Examples whose `true_url` is not a key of the
    `url_db` database are skipped.

    For the remaining examples:

    * no negative samples: `n` random URLs from the database are used;
    * fewer than `n`: the shuffled negatives are followed by random URLs from
      the database to fill up to `n`;
    * otherwise: `n` of the shuffled negatives are used.

    Returns a list of samples, each being
    ``[(entity, beg, end, true_url, context), negative_1, ...]``. The input
    lists are left untouched.
    """
    filtered_samples = []
    db = SqliteDict(url_db, autocommit=False)
    try:
        urls = list(db.keys())
        for positive_negative in positives_negatives:
            entity = positive_negative[0]
            beg = int(positive_negative[1])
            end = int(positive_negative[2])
            true_url = positive_negative[3]
            context = positive_negative[4]
            negative_samples = positive_negative[5]

            # Skip examples whose true URL is unknown to the database.
            if true_url not in db:
                continue

            if len(negative_samples) == 0:
                # No negatives at all: draw completely random ones.
                shuffle(urls)
                candidates = urls
            else:
                # Work on a copy so the caller's list is neither shuffled
                # nor extended.
                candidates = list(negative_samples)
                padding = []
                if len(candidates) < n:
                    shuffle(urls)
                    padding = urls[:n - len(candidates)]
                shuffle(candidates)
                candidates.extend(padding)

            filtered_sample = [(entity, beg, end, true_url, context)]
            filtered_sample.extend(candidates[:n])
            filtered_samples.append(filtered_sample)
    finally:
        db.close()
    return filtered_samples
def create_nodes_from_db(self, longabsdb_path, labelsdb_path, lookupdb_path, subnodes=False):
    """
    Add one graph node per URL, with attributes read from three stores.

    For every URL the node id is the integer value of the lookup store, the
    long abstract comes from the long-abstracts store and the title from the
    labels store. The node is added to ``self._G`` with the attributes
    ``id``, ``url``, ``long_abstract`` and ``title``.

    `subnodes`, when truthy, is the iterable of URLs to process; otherwise
    every key of the lookup store is processed. Progress is logged every
    100000 nodes. All three stores are closed even if an error occurs.
    """
    opened = []
    try:
        for path in (longabsdb_path, labelsdb_path, lookupdb_path):
            opened.append(SqliteDict(path, autocommit=False))
        longabsdb, labelsdb, lookupdb = opened

        urls = subnodes if subnodes else lookupdb.keys()
        for count, url in enumerate(urls):
            # long abstract is string.
            long_abstract = longabsdb[url]
            # title is string.
            title = labelsdb[url]
            # node id is the integer value.
            node_id = int(lookupdb[url])
            # id, url, long abstract (text), and title are attributes.
            self._G.add_node(node_id, id=node_id, url=url,
                             long_abstract=long_abstract, title=title)
            if count % 100000 == 0:
                self._logger.info(str(count) + ' nodes are processed..')
    finally:
        for handle in opened:
            handle.close()
class LORELEIKBLoader:
    """
    Class for loading the LORELEI knowledge base (KB).

    After construction ``self.kb`` maps entity id --> entity dict and
    ``self.name2ent`` maps entity name --> list of entity dicts; both are
    read-only SqliteDict stores located next to the KB file.
    """

    def __init__(self, kbfile):
        self.kb = {}  # eid --> item
        self.name2ent = {}
        self.load_kb(kbfile)

    def load_kb(self, kbfile):
        """
        Primary function of the class -- loads the LORELEI kb which is to be
        found at the provided path.

        When the cache file ``<kbfile>e2e.pkl`` exists, the two stores are
        simply opened read-only. Otherwise they are built from the
        tab-separated `kbfile`, committed, and re-opened read-only. A
        KeyboardInterrupt while reading keeps whatever was read so far.
        """
        e2e_path = kbfile + "e2e.pkl"
        n2e_path = kbfile + "n2e.pkl"

        if os.path.exists(e2e_path):
            logging.info("pkl found! loading map %s", e2e_path)
            self._open_readonly(e2e_path, n2e_path)
            return

        logging.info("pkl not found ...")
        self.kb = SqliteDict(e2e_path, tablename='lorelei', autocommit=False)
        self.name2ent = SqliteDict(n2e_path, tablename='name2ent', autocommit=False)
        try:
            self._read_kb_file(kbfile)
        except KeyboardInterrupt:
            logging.info("ending prematurely.")

        logging.info("Writing KB dictionary to disk.")
        self.kb.commit()
        self.kb.close()
        self.name2ent.commit()
        self.name2ent.close()
        # reopen the kb now; the same tables that were just written.
        self._open_readonly(e2e_path, n2e_path)

    def _open_readonly(self, e2e_path, n2e_path):
        """Open both stores in read-only mode."""
        self.kb = SqliteDict(e2e_path, tablename='lorelei', flag='r')
        self.name2ent = SqliteDict(n2e_path, tablename='name2ent', flag='r')

    def _read_kb_file(self, kbfile):
        """Parse the tab-separated `kbfile` into the two (writable) stores."""
        with open(kbfile) as kb_lines:
            for idx, line in enumerate(kb_lines):
                if idx > 0 and idx % 1000000 == 0:
                    logging.info("read %d lines", idx)
                parts = line.rstrip('\n').split('\t')
                if len(parts) != len(fields):
                    logging.info("bad line %d", idx)
                    continue

                # Keep only the non-empty columns.
                endict = {}
                for field, v in zip(fields, parts):
                    if len(v) != 0:
                        endict[field] = v
                self.kb[endict['entityid']] = endict

                # Values are stored serialized, so the list has to be read,
                # extended and written back.
                name = endict['name']
                if name not in self.name2ent:
                    self.name2ent[name] = []
                lst = self.name2ent[name]
                lst.append(endict)
                self.name2ent[name] = lst

    def __getitem__(self, item):
        return self.kb[item]

    def keys(self):
        return self.kb.keys()
def OptimizationHistory(self):
    """
    Reads in database history file and stores contents.
    Function information is stored as a dict in func_data,
    variable information is stored as a dict in var_data,
    and bounds information is stored as a dict in bounds.

    Every file named in ``self.histList`` is processed in turn. Each file is
    first opened with ``shelve``; if that raises, it is opened as a
    SqliteDict instead and treated as an OpenMDAO database when its
    'iterations' table is non-empty, or as a pyOptSparse database otherwise.

    Attributes assigned here: ``func_data_all``, ``func_data_major``,
    ``var_data_all``, ``var_data_major``, ``func_data``, ``var_data``,
    ``num_iter``, ``histIndex``, ``iter_type`` and, depending on the database
    flavor, ``solver_name`` / ``deriv_keys`` (OpenMDAO) or ``nkey`` /
    ``storedIters`` (otherwise). ``self.bounds`` and ``self.scaling`` are
    updated but not created here, so they must already exist.
    """
    # Initialize dictionaries for design variables and unknowns.
    # The data is saved redundantly in dicts for all iterations and then
    # for major iterations as well.
    self.func_data_all = {}
    self.func_data_major = {}
    self.var_data_all = {}
    self.var_data_major = {}
    db = {}
    self.num_iter = 0

    # Loop over each history file name provided by the user.
    for histIndex, histFileName in enumerate(self.histList):

        # If they only have one history file, we don't change the keys' names
        if len(self.histList) == 1:
            histIndex = ''
        else:
            # If multiple history files, append letters to the keys,
            # such that 'key' becomes 'key_A', 'key_B', etc
            histIndex = '_' + chr(histIndex + ord('A'))
        self.histIndex = histIndex

        try:
            # This is the classic method of storing history files
            db = shelve.open(histFileName, 'r')
            OpenMDAO = False
        except:
            # Bare except because error is not in standard Python.
            # If the db has the 'iterations' tag, it's an OpenMDAO db.
            db = SqliteDict(histFileName, 'iterations')
            OpenMDAO = True

            # Need to do this since in py3 db.keys() is a generator object
            keys = [i for i in db.keys()]

            # If it has no 'iterations' tag, it's a pyOptSparse db.
            if keys == []:
                OpenMDAO = False
                db = SqliteDict(histFileName)

        # Specific instructions for OpenMDAO databases
        if OpenMDAO:

            # Get the number of iterations by looking at the largest number
            # in the split string names for each entry in the db.
            # In the py3 branch the loop walks over every key only to keep
            # the split form of the last one in `string`.
            if major_python_version == 3:
                for string in db.keys():
                    string = string.split('|')
            else:
                string = db.keys()[-1].split('|')

            # Keys are '|'-separated: the first field is stored as the
            # solver name, the last field is read as the iteration number.
            nkey = int(string[-1])
            self.solver_name = string[0]

            # Initialize a list detailing if the iterations are major or minor
            self.iter_type = np.zeros(nkey)

            # Get the keys of the database where derivatives were evaluated.
            # These correspond to major iterations, while no derivative
            # info is calculated for gradient-free linesearches.
            deriv_keys = SqliteDict(histFileName, 'derivs').keys()
            self.deriv_keys = [int(key.split('|')[-1]) for key in deriv_keys]

            # Save information from the history file for the funcs.
            self.DetermineMajorIterations(db, OpenMDAO=OpenMDAO)

            # Save information from the history file for the unknowns.
            self.SaveDBData(db,
                            self.func_data_all,
                            self.func_data_major,
                            OpenMDAO=OpenMDAO,
                            data_str='Unknowns')

            # Save information from the history file for the design variables.
            self.SaveDBData(db,
                            self.var_data_all,
                            self.var_data_major,
                            OpenMDAO=OpenMDAO,
                            data_str='Parameters')

            # Add labels to OpenMDAO variables.
            # Corresponds to constraints, design variables, and objective.
            # NOTE: `db` is rebound to the 'metadata' table from here on.
            try:
                db = SqliteDict(histFileName, 'metadata')
                self.SaveOpenMDAOData(db)

            except KeyError:
                # Skip metadata info if not included in OpenMDAO hist file
                pass

        else:

            # Get the number of iterations
            nkey = int(db['last']) + 1
            self.nkey = nkey

            # Initialize a list detailing if the iterations are major or minor
            self.iter_type = np.zeros(nkey)

            # Check to see if there is bounds information in the db file.
            # If so, add them to self.bounds to plot later.
            try:
                try:
                    info_dict = db['varInfo'].copy()
                    info_dict.update(db['conInfo'])
                    scale_info = True
                except KeyError:
                    # Fall back to the bounds-only entries.
                    self.warning_display(
                        'This is an older optimization history file.\n' +
                        'Only bounds information has been stored, not scalar info.'
                    )
                    info_dict = db['varBounds'].copy()
                    info_dict.update(db['conBounds'])
                    scale_info = False

                # Got to be a little tricky here since we're modifying
                # info_dict; if we simply loop over it with the generator
                # from Python3, it will contain the new keys and then the
                # names will be mangled incorrectly.
                bounds_dict = {}
                scaling_dict = {}
                for key in info_dict.keys():
                    bounds_dict[key + histIndex] = {
                        'lower': info_dict[key]['lower'],
                        'upper': info_dict[key]['upper']
                    }
                    if scale_info:
                        scaling_dict[key + histIndex] = info_dict[key]['scale']

                self.bounds.update(bounds_dict)
                if scale_info:
                    self.scaling.update(scaling_dict)
            except KeyError:
                # No usable bounds information in this file; carry on.
                pass

            # Check to see if there is proper saved info about iter type
            if 'isMajor' in db['0'].keys():
                self.storedIters = True
            else:
                self.storedIters = False

            # Save information from the history file for the funcs.
            self.DetermineMajorIterations(db, OpenMDAO=OpenMDAO)

            # Save information from the history file for the funcs.
            self.SaveDBData(db,
                            self.func_data_all,
                            self.func_data_major,
                            OpenMDAO=OpenMDAO,
                            data_str='funcs')

            # Save information from the history file for the design variables.
            self.SaveDBData(db,
                            self.var_data_all,
                            self.var_data_major,
                            OpenMDAO=OpenMDAO,
                            data_str='xuser')

    # Set the initial dictionaries to reference all iterations.
    # Later this can be set to reference only the major iterations.
    self.func_data = self.func_data_all
    self.var_data = self.var_data_all

    # Find the maximum length of any variable in the dictionaries and
    # save this as the number of iterations.
    for data_dict in [self.func_data, self.var_data]:
        for key in data_dict.keys():
            length = len(data_dict[key])
            if length > self.num_iter:
                self.num_iter = length
class SimServer(object):
    """
    Top-level functionality for similarity services. A similarity server takes
    care of::

    1. creating semantic models
    2. indexing documents using these models
    3. finding the most similar documents in an index.

    An object of this class can be shared across network via Pyro, to answer
    remote client requests. It is thread safe. Using a server concurrently from
    multiple processes is safe for reading = answering similarity queries.
    Modifying (training/indexing) is realized via locking = serialized
    internally.
    """

    def __init__(self, basename, use_locks=False):
        """
        All data will be stored under directory `basename`. If there is a server
        there already, it will be loaded (resumed).

        The server object is stateless in RAM -- its state is defined entirely
        by its location. There is therefore no need to store the server object.
        """
        if not os.path.isdir(basename):
            raise ValueError("%r must be a writable directory" % basename)
        self.basename = basename
        self.use_locks = use_locks
        self.lock_update = threading.RLock() if use_locks else gensim.utils.nocm

        # Each component is resumed from disk when possible; a failed load
        # simply means there is nothing to resume yet.
        try:
            self.fresh_index = SimIndex.load(self.location('index_fresh'))
        except Exception:
            logger.debug("starting a new fresh index")
            self.fresh_index = None
        try:
            self.opt_index = SimIndex.load(self.location('index_opt'))
        except Exception:
            logger.debug("starting a new optimized index")
            self.opt_index = None
        try:
            self.model = SimModel.load(self.location('model'))
        except Exception:
            self.model = None

        self.payload = SqliteDict(self.location('payload'), autocommit=True,
                                  journal_mode=JOURNAL_MODE)
        self.flush(save_index=False, save_model=False, clear_buffer=True)
        logger.info("loaded %s" % self)

    def location(self, name):
        """Return the path of the file `name` inside the server directory."""
        return os.path.join(self.basename, name)

    def _remove_file(self, fname):
        """Best-effort removal of `fname`; failures are logged, never raised."""
        try:
            if os.path.exists(fname):
                os.remove(fname)
                logger.info("deleted %s" % fname)
        except Exception as e:
            logger.warning("failed to delete %s: %s" % (fname, e))

    @gensim.utils.synchronous('lock_update')
    def flush(self, save_index=False, save_model=False, clear_buffer=False):
        """Commit all changes, clear all caches."""
        if save_index:
            if self.fresh_index is not None:
                self.fresh_index.save(self.location('index_fresh'))
            if self.opt_index is not None:
                self.opt_index.save(self.location('index_opt'))
        if save_model:
            if self.model is not None:
                self.model.save(self.location('model'))
        self.payload.commit()
        if clear_buffer:
            if hasattr(self, 'fresh_docs'):
                try:
                    # erase all buffered documents + file on disk
                    self.fresh_docs.terminate()
                except Exception:
                    # Best-effort: the buffer is replaced right below anyway.
                    pass
            # buffer defaults to a random location in temp
            self.fresh_docs = SqliteDict(journal_mode=JOURNAL_MODE)
        self.fresh_docs.sync()

    def close(self):
        """Explicitly close open file handles, databases etc."""
        # (attribute, method) pairs; each one is released independently so a
        # missing or broken component never prevents releasing the others.
        releasers = (
            ('payload', 'close'),
            ('model', 'close'),
            ('fresh_index', 'close'),
            ('opt_index', 'close'),
            ('fresh_docs', 'terminate'),
        )
        for attribute, method in releasers:
            try:
                getattr(getattr(self, attribute), method)()
            except Exception:
                pass

    def __del__(self):
        """When the server went out of scope, make an effort to close its DBs."""
        self.close()

    @gensim.utils.synchronous('lock_update')
    def buffer(self, documents):
        """
        Add a sequence of documents to be processed (indexed or trained on).

        Here, the documents are simply collected; real processing is done later,
        during the `self.index` or `self.train` calls.

        `buffer` can be called repeatedly; the result is the same as if it was
        called once, with a concatenation of all the partial document batches.
        The point is to save memory when sending large corpora over network: the
        entire `documents` must be serialized into RAM. See `utils.upload_chunked()`.

        A call to `flush()` clears this documents-to-be-processed buffer (`flush`
        is also implicitly called when you call `index()` and `train()`).
        """
        logger.info("adding documents to temporary buffer of %s" % (self))
        for doc in documents:
            docid = doc['id']
            if docid in self.fresh_docs:
                logger.warning("asked to re-add id %r; rewriting old value" % docid)
            self.fresh_docs[docid] = doc
        self.fresh_docs.sync()

    @gensim.utils.synchronous('lock_update')
    def train(self, corpus=None, method='auto', clear_buffer=True, params=None):
        """
        Create an indexing model. Will overwrite the model if it already exists.
        All indexes become invalid, because documents in them use a now-obsolete
        representation.

        The model is trained on documents previously entered via `buffer`,
        or directly on `corpus`, if specified.
        """
        if corpus is not None:
            # use the supplied corpus only (erase existing buffer, if any)
            self.flush(clear_buffer=True)
            self.buffer(corpus)
        if not self.fresh_docs:
            msg = "train called but no training corpus specified for %s" % self
            logger.error(msg)
            raise ValueError(msg)
        if method == 'auto':
            numdocs = len(self.fresh_docs)
            if numdocs < 1000:
                # Use the module logger like every other message of this class.
                logger.warning(
                    "too few training documents; using simple log-entropy model instead of latent semantic indexing"
                )
                method = 'logentropy'
            else:
                method = 'lsi'
        if params is None:
            params = {}
        self.model = SimModel(self.fresh_docs, method=method, params=params)
        self.flush(save_model=True, clear_buffer=clear_buffer)

    @gensim.utils.synchronous('lock_update')
    def index(self, corpus=None, clear_buffer=True):
        """
        Permanently index all documents previously added via `buffer`, or
        directly index documents from `corpus`, if specified.

        The indexing model must already exist (see `train`) before this function
        is called.
        """
        if not self.model:
            msg = 'must initialize model for %s before indexing documents' % self.basename
            logger.error(msg)
            raise AttributeError(msg)

        if corpus is not None:
            # use the supplied corpus only (erase existing buffer, if any)
            self.flush(clear_buffer=True)
            self.buffer(corpus)

        if not self.fresh_docs:
            msg = "index called but no indexing corpus specified for %s" % self
            logger.error(msg)
            raise ValueError(msg)

        if not self.fresh_index:
            logger.info("starting a new fresh index for %s" % self)
            self.fresh_index = SimIndex(self.location('index_fresh'),
                                        self.model.num_features)
        self.fresh_index.index_documents(self.fresh_docs, self.model)
        if self.opt_index is not None:
            # Re-indexed documents must not linger in the optimized index.
            self.opt_index.delete(self.fresh_docs.keys())

        logger.info("storing document payloads")
        for docid in self.fresh_docs:
            payload = self.fresh_docs[docid].get('payload', None)
            if payload is None:
                # HACK: exit on first doc without a payload (=assume all docs
                # have payload, or none does)
                break
            self.payload[docid] = payload
        self.flush(save_index=True, clear_buffer=clear_buffer)

    @gensim.utils.synchronous('lock_update')
    def optimize(self):
        """
        Precompute top similarities for all indexed documents. This speeds up
        `find_similar` queries by id (but not queries by fulltext).

        Internally, documents are moved from a fresh index (=no precomputed similarities)
        to an optimized index (precomputed similarities). Similarity queries always
        query both indexes, so this split is transparent to clients.

        If you add documents later via `index`, they go to the fresh index again.
        To precompute top similarities for these new documents too, simply call
        `optimize` again.
        """
        if self.fresh_index is None:
            logger.warning("optimize called but there are no new documents")
            return  # nothing to do!

        if self.opt_index is None:
            logger.info("starting a new optimized index for %s" % self)
            self.opt_index = SimIndex(self.location('index_opt'),
                                      self.model.num_features)

        self.opt_index.merge(self.fresh_index)
        self.fresh_index.terminate()  # delete old files
        self.fresh_index = None
        self.flush(save_index=True)

    @gensim.utils.synchronous('lock_update')
    def drop_index(self, keep_model=True):
        """Drop all indexed documents. If `keep_model` is False, also dropped the model."""
        modelstr = "" if keep_model else "and model "
        logger.info("deleting similarity index " + modelstr + "from %s" % self.basename)

        # delete indexes
        for index in [self.fresh_index, self.opt_index]:
            if index is not None:
                index.terminate()
        self.fresh_index, self.opt_index = None, None

        # delete payload, then start over with an empty payload store
        if self.payload is not None:
            self.payload.close()
            self._remove_file(self.location('payload'))
        self.payload = SqliteDict(self.location('payload'), autocommit=True,
                                  journal_mode=JOURNAL_MODE)

        # optionally, delete the model as well
        if not keep_model and self.model is not None:
            self.model.close()
            self._remove_file(self.location('model'))
            self.model = None
import os

from sqlitedict import SqliteDict

# Optimizer name -> open history database.
# A plain dict is used as the container: assigning into a SqliteDict pickles
# the value, and an open SqliteDict handle cannot be pickled, so every
# assignment used to fail silently and ``db`` stayed empty.
db = {}
opts = ["ipopt", "slsqp", "snopt", "fsqp", "conmin", "nlpqlp", "psqp"]
for opt in opts:
    fileName = "%s_hs015_Hist.hst" % opt
    # SqliteDict creates the file when it is missing, so check first; an
    # optimizer that was never run is simply skipped.
    if not os.path.exists(fileName):
        continue
    try:
        db[opt] = SqliteDict(fileName)
    except Exception:
        # Best effort: an unreadable history file is skipped as well.
        pass

obj = {}
x1 = {}
x2 = {}

for opt in db.keys():
    n = int(db[opt]["last"])
    obj[opt] = []
    x1[opt] = []
    x2[opt] = []
    for i in range(n):
        try:
            # Deserialize the iteration record once instead of three times.
            entry = db[opt]["%d" % i]
            obj[opt].append(entry["funcs"]["obj"])
            x1[opt].append(entry["xuser"]["xvars"][0])
            x2[opt].append(entry["xuser"]["xvars"][1])
        except Exception:
            # Iterations without function/variable data are skipped.
            pass

# Generate the Rosenbrock contours
delta = 0.25
outputfile = args.outputfile files = args.file reset = args.reset if reset: flag = 'w' else: flag = 'c' mergehll = SqliteDict(outputfile, flag=flag, journal_mode='MEMORY', encode=hll_encode, decode=hll_decode) mergekeys = set() for k in mergehll.keys(): mergekeys.add(k) for f in files: if not os.path.exists(f): print("{}: File not found".format(f)) sys.exit() filehll = SqliteDict(f, encode=hll_encode, decode=hll_decode) # merge into outputfile for tld, tldhll in filehll.iteritems(): if tld in mergekeys: # key already in outputfile -> merge hll = mergehll[tld] if not hll.__eq__(tldhll): # merge only if HLL is different hll.update(tldhll) mergehll[tld] = hll else: # new key in outfile -> copy
class IMAPMailbox(ExtendedMaildir):
    """
    IMAP view of a single Maildir folder.

    Per-message state (``uid`` and ``flags``, keyed by message file name) is
    kept in ``self.msg_info``; per-mailbox state (``subscribed``, ``flags``,
    ``special``, ``uidvalidity``, ``uidnext``) is kept in ``self.mbox_info``.
    Both are SqliteDict stores; this class commits them explicitly in `close`.
    """
    implements(imap4.IMailbox, imap4.ICloseableMailbox)

    AppendFactory = SerpentAppendMessageTask

    def __init__(self, path):
        """Initialise the Maildir at `path` and load/create its metadata."""
        maildir.initializeMaildir(path)
        self.listeners = []
        self.path = path
        self.open_flags()
        self.lastadded = None
        self.__check_flags_()

    def open_flags(self):
        """Open the per-message and per-mailbox metadata stores."""
        self.msg_info = SqliteDict(os.path.join(self.path, conf.imap_msg_info))
        self.mbox_info = SqliteDict(os.path.join(self.path, conf.imap_mbox_info))

    def _start_monitor(self):
        """Watch the ``new`` and ``cur`` directories for changes via inotify."""
        self.notifier = inotify.INotify()
        self.notifier.startReading()
        self.notifier.watch(filepath.FilePath(os.path.join(self.path, 'new')),
                            callbacks=[self._new_files])
        self.notifier.watch(filepath.FilePath(os.path.join(self.path, 'cur')),
                            callbacks=[self._new_files])

    def _stop_monitor(self):
        """Stop the inotify watcher, if one was ever started."""
        # The notifier only exists after _start_monitor(); without this guard
        # close() would fail with AttributeError before committing anything.
        notifier = getattr(self, 'notifier', None)
        if notifier is None:
            return
        notifier.stopReading()
        notifier.loseConnection()

    def _new_files(self, wo, path, code):
        """inotify callback: tell every listener about the new counts."""
        if code == inotify.IN_MOVED_TO or code == inotify.IN_DELETE:
            for l in self.listeners:
                l.newMessages(self.getMessageCount(), self.getRecentCount())

    def __check_flags_(self):
        """Fill in missing mailbox defaults and register untracked messages."""
        # ``key in store`` is a single lookup; ``key in store.keys()`` walks
        # every key of the table.
        if 'subscribed' not in self.mbox_info:
            self.mbox_info['subscribed'] = False
        if 'flags' not in self.mbox_info:
            self.mbox_info['flags'] = []
        if 'special' not in self.mbox_info:
            self.mbox_info['special'] = ''
        if 'uidvalidity' not in self.mbox_info:
            self.mbox_info['uidvalidity'] = random.randint(0, 2**32)
        if 'uidnext' not in self.mbox_info:
            self.mbox_info['uidnext'] = 1
        for i in self.__msg_list_():
            parts = i.split('/')
            fn = parts[-1]
            if fn not in self.msg_info:
                val1 = {'uid': self.getUIDNext()}
                # Messages already moved to ``cur`` are treated as seen.
                if parts[-2] == 'new':
                    val1['flags'] = []
                else:
                    val1['flags'] = [misc.IMAP_FLAGS['SEEN']]
                self.msg_info[fn] = val1

    def subscribe(self):
        self.mbox_info['subscribed'] = True

    def unsubscribe(self):
        self.mbox_info['subscribed'] = False

    def is_subscribed(self):
        return self.mbox_info['subscribed']

    def __count_flagged_msgs_(self, flag):
        """Return the number of tracked messages carrying `flag`."""
        return sum(1 for info in self.msg_info.values() if flag in info['flags'])

    def getHierarchicalDelimiter(self):
        return misc.IMAP_HDELIM

    def setSpecial(self, special):
        self.mbox_info['special'] = special

    def getFlags(self):
        """Return the sorted list of *message* flags this mailbox supports."""
        return sorted(misc.IMAP_FLAGS.values())

    def getMboxFlags(self):
        """Return the *mailbox* flags, plus the special flag if one is set."""
        f = list(self.mbox_info['flags'])
        if self.mbox_info['special'] != '':
            f.append(self.mbox_info['special'])
        return f

    def addFlag(self, flag):
        self.mbox_info['flags'] = list(set(self.mbox_info['flags']).union([flag]))

    def removeFlag(self, flag):
        self.mbox_info['flags'] = list(set(self.mbox_info['flags']).difference([flag]))

    def __replace_mbox_flag_(self, add, remove):
        """Ensure mailbox flag `add` is set and mailbox flag `remove` is not."""
        # Must look at the mailbox flags here: getFlags() returns the message
        # flags, which never contain the HASCHILDREN/HASNOCHILDREN markers.
        flags = self.mbox_info['flags']
        if add not in flags:
            self.addFlag(add)
        if remove in flags:
            self.removeFlag(remove)

    def hasChildren(self):
        """Mark the mailbox as having child mailboxes."""
        self.__replace_mbox_flag_(misc.MBOX_FLAGS['HASCHILDREN'],
                                  misc.MBOX_FLAGS['HASNOCHILDREN'])

    def hasNoChildren(self):
        """Mark the mailbox as having no child mailboxes."""
        self.__replace_mbox_flag_(misc.MBOX_FLAGS['HASNOCHILDREN'],
                                  misc.MBOX_FLAGS['HASCHILDREN'])

    def getMessageCount(self):
        """Return the number of tracked messages not flagged as deleted."""
        deleted = misc.IMAP_FLAGS['DELETED']
        return sum(1 for info in self.msg_info.values() if deleted not in info['flags'])

    def getRecentCount(self):
        """Return the number of recent messages and clear their RECENT flag."""
        recent = misc.IMAP_FLAGS['RECENT']
        c = 0
        # Snapshot the keys first: entries are rewritten inside the loop.
        for fn in list(self.msg_info.keys()):
            info = self.msg_info[fn]
            if recent in info['flags']:
                c += 1
                # Stored as a list, like every other writer of ``flags``.
                info['flags'] = list(set(info['flags']).difference([recent]))
                self.msg_info[fn] = info
        return c

    def getUnseenCount(self):
        return self.getMessageCount() - self.__count_flagged_msgs_(misc.IMAP_FLAGS['SEEN'])

    def isWriteable(self):
        return True

    def getUIDValidity(self):
        return self.mbox_info['uidvalidity']

    def getUIDNext(self):
        """Return the next free UID and advance the stored counter."""
        un = self.mbox_info['uidnext']
        self.mbox_info['uidnext'] = un + 1
        return un

    def getUID(self, num):
        return num

    def addMessage(self, message, flags = (), date = None):
        """Append `message`; metadata is recorded once the append completes."""
        return self.appendMessage(message).addCallback(self._cbAddMessage, flags)

    def _cbAddMessage(self, obj, flags):
        """Record UID/flags for the file in ``self.lastadded``; move it to ``cur`` if seen."""
        path = self.lastadded
        self.lastadded = None
        parts = path.split('/')
        fn = parts[-1]
        self.msg_info[fn] = {'uid': self.getUIDNext(), 'flags': flags}
        if misc.IMAP_FLAGS['SEEN'] in flags and parts[-2] != 'cur':
            new_path = os.path.join(self.path, 'cur', fn)
            os.rename(path, new_path)

    def __msg_list_(self):
        """Return the paths of all message files: ``new`` first, then ``cur``."""
        a = []
        for sub in ('new', 'cur'):
            folder = os.path.join(self.path, sub)
            a.extend(os.path.join(folder, m) for m in os.listdir(folder))
        return a

    def _seqMessageSetToSeqDict(self, messageSet):
        """Map each requested sequence number to the matching message path."""
        # Count once; getMessageCount() scans the whole metadata store.
        count = self.getMessageCount()
        if not messageSet.last:
            messageSet.last = count
        msgs = self.__msg_list_()
        # Never index past the files actually on disk.
        upper = min(count, len(msgs))
        seqMap = {}
        for messageNum in messageSet:
            if 0 < messageNum <= upper:
                seqMap[messageNum] = msgs[messageNum - 1]
        return seqMap

    def fetch(self, messages, uid):
        """Return ``[id, MaildirMessage]`` pairs for the requested messages."""
        result = []
        for seq, filename in self.__fetch_(messages, uid).items():
            # Close the file deterministically instead of leaking the handle.
            with open(filename, 'rb') as fh:
                content = fh.read()
            flags = self.msg_info[filename.split('/')[-1]]['flags']
            result.append([seq, MaildirMessage(seq, content, flags, rfc822date())])
        return result

    def __fetch_(self, messages, uid):
        """Resolve `messages` to ``{id: path}``, by UID if `uid` else by sequence number."""
        if not uid:
            return self._seqMessageSetToSeqDict(messages)
        if not messages.last:
            messages.last = self.mbox_info['uidnext']
        # Invert once so each requested UID is a dict lookup rather than a
        # scan of all messages (getUIDNext hands every UID out only once).
        uid_to_name = dict((info['uid'], fn) for fn, info in self.msg_info.items())
        messagesToFetch = {}
        for wanted in messages:
            name = uid_to_name.get(wanted)
            if name is None:
                continue
            for sub in ('new', 'cur'):
                candidate = os.path.join(self.path, sub, name)
                if os.path.exists(candidate):
                    messagesToFetch[wanted] = candidate
                    break
        return messagesToFetch

    def store(self, messages, flags, mode, uid):
        """
        Update message flags and return ``{id: resulting flags}``.

        `mode` < 0 removes `flags`, `mode` == 0 replaces the flags with
        `flags`, `mode` > 0 adds `flags`.  Removing SEEN moves the file to
        ``new``; adding SEEN moves it to ``cur``.
        """
        seen = misc.IMAP_FLAGS['SEEN']
        d = {}
        for _id, path in self.__fetch_(messages, uid).items():
            parts = path.split('/')
            filename, folder = parts[-1], parts[-2]
            info = self.msg_info[filename]
            target = None
            if mode < 0:
                info['flags'] = list(set(info['flags']).difference(set(flags)))
                if seen in flags and folder != 'new':
                    target = 'new'
            elif mode == 0:
                info['flags'] = flags
            elif mode > 0:
                info['flags'] = list(set(info['flags']).union(set(flags)))
                if seen in flags and folder != 'cur':
                    target = 'cur'
            self.msg_info[filename] = info
            if target is not None:
                os.rename(path, os.path.join(self.path, target, filename))
            d[_id] = info['flags']
        return d

    def expunge(self):
        """Delete every message flagged as deleted and return their UIDs."""
        deleted = misc.IMAP_FLAGS['DELETED']
        uids = []
        for path in self.__msg_list_():
            fn = path.split('/')[-1]
            try:
                info = self.msg_info[fn]
            except KeyError:
                # File on disk that was never registered; leave it alone.
                continue
            if deleted in info['flags']:
                os.remove(path)
                del self.msg_info[fn]
                uids.append(info['uid'])
        return uids

    def addListener(self, listener):
        self.listeners.append(listener)
        return True

    def removeListener(self, listener):
        self.listeners.remove(listener)
        return True

    def requestStatus(self, names):
        return imap4.statusRequestHelper(self, names)

    def destroy(self):
        pass

    def close(self):
        """Once no listeners remain: stop monitoring, optionally expunge, commit and close."""
        print('!!! %s - %d !!!' % (self.path, len(self.listeners)))
        if len(self.listeners) == 0:
            self._stop_monitor()
            if conf.imap_expunge_on_close:
                self.expunge()
            self.msg_info.commit(blocking=False)
            self.mbox_info.commit(blocking = False)
            self.msg_info.close()
            self.mbox_info.close()
from sqlitedict import SqliteDict

# PageID -> [Parent1, Parent2, ...]
pageID2Parent = SqliteDict('./db/pageID2Parent.sqlite')
# PageID -> [PageTitle, LastModified, Size, WordFreqDict, Children]
pageID2Meta = SqliteDict('./db/pageID2Meta.sqlite')


def findParent(child_id):
    """Return the ids of all pages whose children (``meta[4]``) contain `child_id`.

    This scans the whole metadata table; the bulk build below uses
    `_build_parent_index` so that the table is read only once.
    """
    return [parent_id for parent_id, meta in pageID2Meta.items()
            if int(child_id) in meta[4]]


def _build_parent_index():
    """Return ``{child: [parent_id, ...]}`` from a single pass over the metadata."""
    # NOTE(review): assumes child ids in meta[4] are hashable (ints) -- confirm.
    index = {}
    for parent_id, meta in pageID2Meta.items():
        for child in meta[4]:
            bucket = index.setdefault(child, [])
            # One parent's children are processed back to back, so a repeated
            # child would put the same parent last in the bucket; skip it.
            if not bucket or bucket[-1] != parent_id:
                bucket.append(parent_id)
    return index


try:
    # One scan instead of one full scan (with unpickling) per page.
    parent_index = _build_parent_index()
    total = len(pageID2Meta)  # len() issues a COUNT(*) query; ask once
    for i, page_id in enumerate(pageID2Meta.keys()):
        print(str(i), total)
        pageID2Parent[page_id] = parent_index.get(int(page_id), [])
    pageID2Parent.commit()
finally:
    # Release both database handles even if the build fails half-way.
    pageID2Parent.close()
    pageID2Meta.close()
# ``print('')`` emits a blank line under both Python 2 and Python 3; a bare
# ``print`` is only an expression (and prints nothing) under Python 3.
print('')
print('bat out v: %f ' % top['battery.output_voltage'])
print('invert in v: %f' % top['inverter.input_voltage'])
print('mot in volt: %f' % top['motor.phase_voltage'])
print('invert out volt: %f' % top['inverter.output_voltage'])
print('')
print('invert in pow %f' % top['inverter.input_power'])
in_pow = top['inverter.input_voltage'] * top['inverter.input_current']
print('calc inverter in pow %f' % in_pow)
output_power = top['inverter.output_voltage'] * top[
    'inverter.output_current'] * 3.0 * np.sqrt(2.0 / 3.0)
print('calc output pow %f' % output_power)
print('bat des pow: %f' % top['battery.des_power'])
print('')
print('Inv in cur %f' % top['inverter.input_current'])
print('mot des pow %f' % top['design_power'])
print('mot input cur %f' % top['motor.phase_current'])
print('mot input volt %f' % top['motor.phase_voltage'])
# print('ncells %f' % top['Battery'])

# Dump the recorded case from the 'openmdao' table of the recorder database.
db = SqliteDict('drivetraindb', 'openmdao')
# keys() may be a lazy generator; materialise it so the keys get printed.
pprint(list(db.keys()))
data = db['rank0:Driver/1']
pprint(data['Parameters'])
print('')
print('')
pprint(data['Unknowns'])
# Close our handle before the database file is removed below.
db.close()

top.cleanup()
remove('drivetraindb')
class SimServer(object):
    """
    Top-level functionality for similarity services. A similarity server takes
    care of::

    1. creating semantic models
    2. indexing documents using these models
    3. finding the most similar documents in an index.

    An object of this class can be shared across network via Pyro, to answer remote
    client requests. It is thread safe. Using a server concurrently from multiple
    processes is safe for reading = answering similarity queries. Modifying
    (training/indexing) is realized via locking = serialized internally.
    """
    def __init__(self, basename, use_locks=False):
        """
        All data will be stored under directory `basename`. If there is a server
        there already, it will be loaded (resumed).

        The server object is stateless in RAM -- its state is defined entirely by its location.
        There is therefore no need to store the server object.

        Raises ValueError if `basename` is not an existing directory.
        """
        if not os.path.isdir(basename):
            raise ValueError("%r must be a writable directory" % basename)
        self.basename = basename
        self.use_locks = use_locks
        self.lock_update = threading.RLock() if use_locks else gensim.utils.nocm
        # Loading is best effort: anything that cannot be loaded starts empty.
        # `except Exception` (rather than a bare except) lets KeyboardInterrupt
        # and SystemExit propagate.
        try:
            self.fresh_index = SimIndex.load(self.location('index_fresh'))
        except Exception:
            logger.debug("starting a new fresh index")
            self.fresh_index = None
        try:
            self.opt_index = SimIndex.load(self.location('index_opt'))
        except Exception:
            logger.debug("starting a new optimized index")
            self.opt_index = None
        try:
            self.model = SimModel.load(self.location('model'))
        except Exception:
            self.model = None
        self.payload = SqliteDict(self.location('payload'), autocommit=True, journal_mode=JOURNAL_MODE)
        self.flush(save_index=False, save_model=False, clear_buffer=True)
        logger.info("loaded %s" % self)

    def location(self, name):
        """Return the path of file `name` inside the server directory."""
        return os.path.join(self.basename, name)

    @gensim.utils.synchronous('lock_update')
    def flush(self, save_index=False, save_model=False, clear_buffer=False):
        """Commit all changes, clear all caches."""
        if save_index:
            if self.fresh_index is not None:
                self.fresh_index.save(self.location('index_fresh'))
            if self.opt_index is not None:
                self.opt_index.save(self.location('index_opt'))
        if save_model:
            if self.model is not None:
                self.model.save(self.location('model'))
        self.payload.commit()
        if clear_buffer:
            if hasattr(self, 'fresh_docs'):
                try:
                    self.fresh_docs.terminate()  # erase all buffered documents + file on disk
                except Exception:
                    pass
            self.fresh_docs = SqliteDict(journal_mode=JOURNAL_MODE)  # buffer defaults to a random location in temp
        self.fresh_docs.sync()

    def close(self):
        """Explicitly close open file handles, databases etc."""
        # Every step is best effort: an attribute may be None, missing, or
        # already closed, and one failure must not prevent the other closes.
        try:
            self.payload.close()
        except Exception:
            pass
        try:
            self.model.close()
        except Exception:
            pass
        try:
            self.fresh_index.close()
        except Exception:
            pass
        try:
            self.opt_index.close()
        except Exception:
            pass
        try:
            self.fresh_docs.terminate()
        except Exception:
            pass

    def __del__(self):
        """When the server went out of scope, make an effort to close its DBs."""
        self.close()

    @gensim.utils.synchronous('lock_update')
    def buffer(self, documents):
        """
        Add a sequence of documents to be processed (indexed or trained on).

        Here, the documents are simply collected; real processing is done later,
        during the `self.index` or `self.train` calls.

        `buffer` can be called repeatedly; the result is the same as if it was
        called once, with a concatenation of all the partial document batches.
        The point is to save memory when sending large corpora over network: the
        entire `documents` must be serialized into RAM. See `utils.upload_chunked()`.

        A call to `flush()` clears this documents-to-be-processed buffer (`flush`
        is also implicitly called when you call `index()` and `train()`).
        """
        logger.info("adding documents to temporary buffer of %s" % (self))
        for doc in documents:
            docid = doc['id']
            # logger.debug("buffering document %r" % docid)
            if docid in self.fresh_docs:
                logger.warning("asked to re-add id %r; rewriting old value" % docid)
            self.fresh_docs[docid] = doc
        self.fresh_docs.sync()

    @gensim.utils.synchronous('lock_update')
    def train(self, corpus=None, method='auto', clear_buffer=True, params=None):
        """
        Create an indexing model. Will overwrite the model if it already exists.
        All indexes become invalid, because documents in them use a now-obsolete
        representation.

        The model is trained on documents previously entered via `buffer`,
        or directly on `corpus`, if specified.

        With `method='auto'`, fewer than 1000 buffered documents select the
        'logentropy' model, otherwise 'lsi'. Raises ValueError if there are no
        documents to train on.
        """
        if corpus is not None:
            # use the supplied corpus only (erase existing buffer, if any)
            self.flush(clear_buffer=True)
            self.buffer(corpus)
        if not self.fresh_docs:
            msg = "train called but no training corpus specified for %s" % self
            logger.error(msg)
            raise ValueError(msg)
        if method == 'auto':
            numdocs = len(self.fresh_docs)
            if numdocs < 1000:
                # Use the module logger, like every other message in this class.
                logger.warning("too few training documents; using simple log-entropy model instead of latent semantic indexing")
                method = 'logentropy'
            else:
                method = 'lsi'
        if params is None:
            params = {}
        self.model = SimModel(self.fresh_docs, method=method, params=params)
        self.flush(save_model=True, clear_buffer=clear_buffer)

    @gensim.utils.synchronous('lock_update')
    def index(self, corpus=None, clear_buffer=True):
        """
        Permanently index all documents previously added via `buffer`, or
        directly index documents from `corpus`, if specified.

        The indexing model must already exist (see `train`) before this function
        is called.

        Raises AttributeError if there is no model and ValueError if there are
        no documents to index.
        """
        if not self.model:
            msg = 'must initialize model for %s before indexing documents' % self.basename
            logger.error(msg)
            raise AttributeError(msg)

        if corpus is not None:
            # use the supplied corpus only (erase existing buffer, if any)
            self.flush(clear_buffer=True)
            self.buffer(corpus)

        if not self.fresh_docs:
            msg = "index called but no indexing corpus specified for %s" % self
            logger.error(msg)
            raise ValueError(msg)

        if not self.fresh_index:
            logger.info("starting a new fresh index for %s" % self)
            self.fresh_index = SimIndex(self.location('index_fresh'), self.model.num_features)
        self.fresh_index.index_documents(self.fresh_docs, self.model)
        # Re-indexed documents must not stay in the optimized index as well.
        if self.opt_index is not None:
            self.opt_index.delete(self.fresh_docs.keys())
        logger.info("storing document payloads")
        for docid in self.fresh_docs:
            payload = self.fresh_docs[docid].get('payload', None)
            if payload is None:
                # HACK: exit on first doc without a payload (=assume all docs have payload, or none does)
                break
            self.payload[docid] = payload
        self.flush(save_index=True, clear_buffer=clear_buffer)

    @gensim.utils.synchronous('lock_update')
    def optimize(self):
        """
        Precompute top similarities for all indexed documents. This speeds up
        `find_similar` queries by id (but not queries by fulltext).

        Internally, documents are moved from a fresh index (=no precomputed similarities)
        to an optimized index (precomputed similarities). Similarity queries always
        query both indexes, so this split is transparent to clients.

        If you add documents later via `index`, they go to the fresh index again.
        To precompute top similarities for these new documents too, simply call
        `optimize` again.
        """
        if self.fresh_index is None:
            logger.warning("optimize called but there are no new documents")
            return  # nothing to do!

        if self.opt_index is None:
            logger.info("starting a new optimized index for %s" % self)
            self.opt_index = SimIndex(self.location('index_opt'), self.model.num_features)

        self.opt_index.merge(self.fresh_index)
        self.fresh_index.terminate()  # delete old files
        self.fresh_index = None
        self.flush(save_index=True)

    def _remove_file(self, fname):
        """Delete `fname` if it exists; a failure is logged, never raised."""
        try:
            if os.path.exists(fname):
                os.remove(fname)
                logger.info("deleted %s" % fname)
        except Exception:
            logger.warning("failed to delete %s" % fname)

    @gensim.utils.synchronous('lock_update')
    def drop_index(self, keep_model=True):
        """Drop all indexed documents. If `keep_model` is False, also dropped the model."""
        modelstr = "" if keep_model else "and model "
        logger.info("deleting similarity index " + modelstr + "from %s" % self.basename)

        # delete indexes
        for index in [self.fresh_index, self.opt_index]:
            if index is not None:
                index.terminate()
        self.fresh_index, self.opt_index = None, None

        # delete payload, then start over with an empty payload store
        if self.payload is not None:
            self.payload.close()
            self._remove_file(self.location('payload'))
        self.payload = SqliteDict(self.location('payload'), autocommit=True, journal_mode=JOURNAL_MODE)

        # optionally, delete the model as well
        if not keep_model and self.model is not None:
            self.model.close()
            self._remove_file(self.location('model'))
            self.model = None
def OptimizationHistory(self):
    """
    Reads in database history file and stores contents.
    Function information is stored as a dict in func_data,
    variable information is stored as a dict in var_data,
    and bounds information is stored as a dict in bounds.

    Every file in ``self.histList`` is read.  A file is opened with shelve
    first; if that fails it is opened as a SqliteDict and treated as an
    OpenMDAO database when its 'iterations' table has entries, otherwise as
    a pyOptSparse database.  ``self.num_iter`` ends up as the length of the
    longest stored series.
    """

    # Initialize dictionaries for design variables and unknowns.
    # The data is saved redundantly in dicts for all iterations and then
    # for major iterations as well.
    self.func_data_all = {}
    self.func_data_major = {}
    self.var_data_all = {}
    self.var_data_major = {}
    db = {}
    self.num_iter = 0

    # Loop over each history file name provided by the user.
    for histIndex, histFileName in enumerate(self.histList):

        # If they only have one history file, we don't change the keys' names
        if len(self.histList) == 1:
            histIndex = ""
        else:
            # If multiple history files, append letters to the keys,
            # such that 'key' becomes 'key_A', 'key_B', etc
            histIndex = "_" + chr(histIndex + ord("A"))
        self.histIndex = histIndex

        keys = []
        try:
            # This is the classic method of storing history files
            db = shelve.open(histFileName, "r")
            OpenMDAO = False
        except Exception:
            # Broad except because error is not in standard Python.
            # If the db has the 'iterations' tag, it's an OpenMDAO db.
            db = SqliteDict(histFileName, "iterations")
            OpenMDAO = True

            # Need to do this since in py3 db.keys() is a generator object
            keys = list(db.keys())

            # If it has no 'iterations' tag, it's a pyOptSparse db.
            if not keys:
                OpenMDAO = False
                db = SqliteDict(histFileName)

        # Specific instructions for OpenMDAO databases
        if OpenMDAO:

            # The last key has the form '<solver>|...|<iteration number>';
            # take it directly instead of splitting every key on the way.
            last_key = keys[-1].split("|")
            nkey = int(last_key[-1])
            self.solver_name = last_key[0]

            # Initalize a list detailing if the iterations are major or minor
            self.iter_type = np.zeros(nkey)

            # Get the keys of the database where derivatives were evaluated.
            # These correspond to major iterations, while no derivative
            # info is calculated for gradient-free linesearches.
            deriv_db = SqliteDict(histFileName, "derivs")
            try:
                self.deriv_keys = [
                    int(key.split("|")[-1]) for key in deriv_db.keys()
                ]
            finally:
                # Only the keys are needed; do not leak the extra handle.
                deriv_db.close()

            # Save information from the history file for the funcs.
            self.DetermineMajorIterations(db, OpenMDAO=OpenMDAO)

            # Save information from the history file for the unknowns.
            self.SaveDBData(db, self.func_data_all, self.func_data_major,
                            OpenMDAO=OpenMDAO, data_str="Unknowns")

            # Save information from the history file for the design variables.
            self.SaveDBData(db, self.var_data_all, self.var_data_major,
                            OpenMDAO=OpenMDAO, data_str="Parameters")

            # Add labels to OpenMDAO variables.
            # Corresponds to constraints, design variables, and objective.
            try:
                db = SqliteDict(histFileName, "metadata")
                self.SaveOpenMDAOData(db)
            except KeyError:
                # Skip metadata info if not included in OpenMDAO hist file
                pass

        else:

            # Get the number of iterations
            nkey = int(db["last"]) + 1
            self.nkey = nkey

            # Initalize a list detailing if the iterations are major or minor
            # 1 = major, 2 = minor, 0 = sensitivity (or duplicated info by IPOPT)
            # The entries whose iter_type = 0 will be ignored.
            self.iter_type = np.zeros(nkey)

            # Check to see if there is bounds information in the db file.
            # If so, add them to self.bounds to plot later.
            try:
                try:
                    info_dict = db["varInfo"].copy()
                    info_dict.update(db["conInfo"])
                    scale_info = True
                except KeyError:
                    self.warning_display(
                        "This is an older optimization history file.\n"
                        + "Only bounds information has been stored, not scalar info."
                    )
                    info_dict = db["varBounds"].copy()
                    info_dict.update(db["conBounds"])
                    scale_info = False

                # Build fresh dicts with the suffixed names rather than
                # renaming keys of info_dict while iterating over it.
                bounds_dict = {}
                scaling_dict = {}
                for key, info in info_dict.items():
                    bounds_dict[key + histIndex] = {
                        "lower": info["lower"],
                        "upper": info["upper"],
                    }
                    if scale_info:
                        scaling_dict[key + histIndex] = info["scale"]

                self.bounds.update(bounds_dict)
                if scale_info:
                    self.scaling.update(scaling_dict)
            except KeyError:
                pass

            # Deserialize the first iteration record once; it is consulted twice.
            first_iter = db["0"]

            # Check to see if there is proper saved info about iter type
            self.storedIters = "isMajor" in first_iter

            # Raise warning for IPOPT's duplicated history
            if db["metadata"]["optimizer"] == "IPOPT" and "iter" not in first_iter:
                pyOptSparseWarning(
                    "The optimization history file has duplicated entries at every iteration, and the OptView plot is not correct. "
                    + "Re-run the optimization with a current version of pyOptSparse to generate a correct history file."
                )

            # Save information from the history file for the funcs.
            self.DetermineMajorIterations(db, OpenMDAO=OpenMDAO)

            # Save information from the history file for the funcs.
            self.SaveDBData(db, self.func_data_all, self.func_data_major,
                            OpenMDAO=OpenMDAO, data_str="funcs")

            # Save information from the history file for the design variables.
            self.SaveDBData(db, self.var_data_all, self.var_data_major,
                            OpenMDAO=OpenMDAO, data_str="xuser")

    # Set the initial dictionaries to reference all iterations.
    # Later this can be set to reference only the major iterations.
    self.func_data = self.func_data_all
    self.var_data = self.var_data_all

    # Find the maximum length of any variable in the dictionaries and
    # save this as the number of iterations.
    for data_dict in [self.func_data, self.var_data]:
        for series in data_dict.values():
            if len(series) > self.num_iter:
                self.num_iter = len(series)
def OptimizationHistory(self):
    """
    Reads in database history file and stores contents.
    Function information is stored as a dict in func_data,
    variable information is stored as a dict in var_data,
    and bounds information is stored as a dict in bounds.

    Every file in ``self.histList`` is read.  A file is opened with shelve
    first; if that fails it is opened as a SqliteDict and treated as an
    OpenMDAO database when its 'iterations' table has entries, otherwise as
    a pyOptSparse database.  ``self.num_iter`` ends up as the length of the
    longest stored series.
    """

    # Initialize dictionaries for design variables and unknowns.
    # The data is saved redundantly in dicts for all iterations and then
    # for major iterations as well.
    self.func_data_all = {}
    self.func_data_major = {}
    self.var_data_all = {}
    self.var_data_major = {}
    db = {}
    self.num_iter = 0

    # Loop over each history file name provided by the user.
    for histIndex, histFileName in enumerate(self.histList):

        # If they only have one history file, we don't change the keys' names
        if len(self.histList) == 1:
            histIndex = ''
        else:
            # If multiple history files, append letters to the keys,
            # such that 'key' becomes 'key_A', 'key_B', etc
            histIndex = '_' + chr(histIndex + ord('A'))
        self.histIndex = histIndex

        keys = []
        try:
            # This is the classic method of storing history files
            db = shelve.open(histFileName, 'r')
            OpenMDAO = False
        except Exception:
            # Broad except because error is not in standard Python.
            # If the db has the 'iterations' tag, it's an OpenMDAO db.
            db = SqliteDict(histFileName, 'iterations')
            OpenMDAO = True

            # keys() is a list on Python 2 but a generator on Python 3, where
            # comparing it with [] is never true; materialise it first.
            keys = list(db.keys())

            # If it has no 'iterations' tag, it's a pyOptSparse db.
            if not keys:
                OpenMDAO = False
                db = SqliteDict(histFileName)

        # Specific instructions for OpenMDAO databases
        if OpenMDAO:

            # The last key has the form '<solver>|...|<iteration number>'.
            # Using the materialised list works on both Python 2 and 3.
            last_key = keys[-1].split('|')
            nkey = int(last_key[-1])
            self.solver_name = last_key[0]

            # Initalize a list detailing if the iterations are major or minor
            self.iter_type = np.zeros(nkey)

            # Get the keys of the database where derivatives were evaluated.
            # These correspond to major iterations, while no derivative
            # info is calculated for gradient-free linesearches.
            deriv_db = SqliteDict(histFileName, 'derivs')
            try:
                self.deriv_keys = [int(key.split('|')[-1]) for key in deriv_db.keys()]
            finally:
                # Only the keys are needed; do not leak the extra handle.
                deriv_db.close()

            # Save information from the history file for the unknowns.
            self.SaveDBData(db, self.func_data_all, self.func_data_major,
                            OpenMDAO=OpenMDAO, data_str='Unknowns')

            # Save information from the history file for the design variables.
            self.SaveDBData(db, self.var_data_all, self.var_data_major,
                            OpenMDAO=OpenMDAO, data_str='Parameters')

            # Add labels to OpenMDAO variables.
            # Corresponds to constraints, design variables, and objective.
            try:
                db = SqliteDict(histFileName, 'metadata')
                self.SaveOpenMDAOData(db)
            except KeyError:
                # Skip metadata info if not included in OpenMDAO hist file
                pass

        else:

            # Get the number of iterations
            nkey = int(db['last']) + 1
            self.nkey = nkey

            # Initalize a list detailing if the iterations are major or minor
            self.iter_type = np.zeros(nkey)

            # Check to see if there is bounds information in the db file.
            # If so, add them to self.bounds to plot later.
            try:
                # Constraint bounds override variable bounds on a name clash.
                # copy + update works on Python 2 and 3; concatenating the
                # two items() results only works on Python 2.
                merged_bounds = dict(db['varBounds'])
                merged_bounds.update(db['conBounds'])
                # Build a new dict with the suffixed names rather than
                # renaming keys of the dict that is being iterated over.
                self.bounds.update(
                    {key + histIndex: value for key, value in merged_bounds.items()}
                )
            except KeyError:
                pass

            # Check to see if there is proper saved info about iter type
            self.storedIters = 'isMajor' in db['0']

            # Save information from the history file for the funcs.
            self.SaveDBData(db, self.func_data_all, self.func_data_major,
                            OpenMDAO=OpenMDAO, data_str='funcs')

            # Save information from the history file for the design variables.
            self.SaveDBData(db, self.var_data_all, self.var_data_major,
                            OpenMDAO=OpenMDAO, data_str='xuser')

    # Set the initial dictionaries to reference all iterations.
    # Later this can be set to reference only the major iterations.
    self.func_data = self.func_data_all
    self.var_data = self.var_data_all

    # Find the maximum length of any variable in the dictionaries and
    # save this as the number of iterations.
    for data_dict in [self.func_data, self.var_data]:
        for series in data_dict.values():
            if len(series) > self.num_iter:
                self.num_iter = len(series)
class LORELEIKBLoader:
    """
    Class for loading the LORELEI knowledge base (KB).

    The class allows the KB data to be accessed in two ways, either using the
    unique KB id or using the surface of a mention which may map to multiple
    records in the kb.
    """

    def __init__(self, kbfile):
        # map of entity id to kb record
        self.kb = {}
        # map of surface form of mention to list of kb records to which
        # it may refer.
        self.name2ent = {}
        self._load_kb(kbfile)

    def _load_kb(self, kbfile):
        """
        Helper function which builds the primary resources of the class.

        Loads the LORELEI kb data which is to be found at the provided path.
        Checks if the source data has already been preprocessed and stored as
        a dictionary. If it has, then those files are loaded in, if not then
        the dictionaries are first built and saved, then loaded.

        A KeyboardInterrupt during the build keeps the records read so far:
        they are committed and the stores are reopened read-only, exactly as
        after a complete build.

        @param: kbfile, the path to source kb which will be loaded
        """
        e2e_path = kbfile + "e2e.pkl"
        n2e_path = kbfile + "n2e.pkl"
        if os.path.exists(e2e_path):
            logging.info("pkl found! loading map %s", e2e_path)
            self._open_readonly(e2e_path, n2e_path)
            return

        logging.info("pkl not found ...")
        self.kb = SqliteDict(e2e_path, tablename='lorelei', autocommit=False)
        self.name2ent = SqliteDict(n2e_path, tablename='name2ent', autocommit=False)
        try:
            # ``with`` closes the source file even when the build is interrupted.
            with open(kbfile) as kb_source:
                for idx, line in enumerate(kb_source):
                    if idx > 0 and idx % 1000000 == 0:
                        logging.info("read %d lines", idx)
                    parts = line.rstrip('\n').split('\t')
                    if len(parts) != len(fields):
                        logging.info("bad line %d", idx)
                        continue
                    # Keep only the non-empty columns of the record.
                    endict = {}
                    for field, v in zip(fields, parts):
                        if len(v) != 0:
                            endict[field] = v
                    self.kb[endict['entityid']] = endict
                    name = endict['name']
                    # Values are stored serialized, so the list has to be
                    # read, extended and written back.
                    lst = self.name2ent.get(name, [])
                    lst.append(endict)
                    self.name2ent[name] = lst
        except KeyboardInterrupt:
            logging.info("ending prematurely.")
        logging.info("Writing KB dictionary to disk.")
        self.kb.commit()
        self.kb.close()
        self.name2ent.commit()
        self.name2ent.close()
        # reopen the kb now
        self._open_readonly(e2e_path, n2e_path)

    def _open_readonly(self, e2e_path, n2e_path):
        """Open both stores read-only, using the table names they were built with."""
        self.kb = SqliteDict(e2e_path, tablename='lorelei', flag='r')
        self.name2ent = SqliteDict(n2e_path, tablename='name2ent', flag='r')

    def __getitem__(self, item):
        """Return the kb record stored under entity id `item`."""
        return self.kb[item]

    def keys(self):
        """Return the entity ids present in the kb."""
        return self.kb.keys()
def object_list(self):
    '''
    returns a list of objects

    The result is the keys of the 'objects' table of the database at
    ``self._meta.filename``.
    '''
    objects_metadata = SqliteDict(self._meta.filename, 'objects')
    try:
        # keys() may be a lazy generator tied to the open connection;
        # materialise it so callers get the promised list after the close.
        return list(objects_metadata.keys())
    finally:
        objects_metadata.close()