def assertIterationDataRecorded(self, expected, tolerance, root):
    if self.comm.rank != 0:
        return

    db = SqliteDict(self.filename, self.tablename)
    _assertIterationDataRecorded(self, db, expected, tolerance)
    db.close()
def test_sqlitedict_write(text):
    d = SqliteDict(f'debug_{datasize}.sqlite')
    for j in range(3):
        for i, line in enumerate(text):
            d[str(i + j * len(text))] = line
    d.commit()
    d.close()
def assertMetadataRecorded(self, expected):
    if self.comm.rank != 0:
        return

    db = SqliteDict(self.filename, self.tablename_metadata)
    _assertMetadataRecorded(self, db, expected)
    db.close()
def adjust_evernote_font():
    """ Call for Evernote """
    note_info = SqliteDict(conf.db.db_file, autocommit=True)
    notes_in_evernote = list()
    for note in get_notes(get_notebooks()):
        guid = note.guid
        notes_in_evernote.append(guid)
        if guid not in note_info.keys() \
                or note_info[guid][FONT_SIZE] != conf.font_size \
                or note_info[guid][LINE_HEIGHT] != conf.line_height:
            adjust_note(note)
            note_info[guid] = {FONT_SIZE: conf.font_size, LINE_HEIGHT: conf.line_height}
    guids_to_forget = [guid for guid in note_info.keys() if guid not in notes_in_evernote]
    for guid in guids_to_forget:
        logging.debug("Delete guid from DB: {}".format(guid))
        del note_info[guid]
    note_info.close()
class EmbeddingIndexer(object):
    def __init__(self, embedding_file, index_file):
        self.embedding_file = embedding_file
        self.index_file = index_file
        logger.info("Input path: " + self.embedding_file)
        logger.info("Index path: " + self.index_file)
        self.embedding = SqliteDict(os.path.join(self.index_file, EMBEDDING), autocommit=True)

    def iterator(self):
        with open(self.embedding_file) as f:
            for line in f:
                tokens = line.strip().split(" ")
                if len(tokens) == 2:
                    continue
                uri = tokens[0]
                embedding = np.array(tokens[1:], dtype=np.float32)
                yield (uri, embedding)

    def run(self):
        count = 0
        for key, value in self.iterator():
            self.embedding[key] = value
            count += 1
            if count % 20000 == 0:
                self.embedding.commit()
                logger.info("[{}] {} index added.".format(
                    datetime.datetime.now(), count))
        self.embedding.close()
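# Hypothetical usage of EmbeddingIndexer above (file names are illustrative; the
# embedding file is assumed to be word2vec/GloVe-style text, "<token> <floats...>",
# whose two-token header line is skipped by iterator()):
#
#     indexer = EmbeddingIndexer("embeddings.txt", "./embedding_index")
#     indexer.run()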
def post(self):
    """Upload a file."""
    data = {"success": False}
    stats_dict = SqliteDict('./api_stats.sqlite', autocommit=True)
    # TODO: properly implement stats
    # output_types = ["pseudonymized", "tagged", "conll"]
    file = request.files['file']  # get file in the request
    if file and self.allowed_file(file.filename):
        filename = secure_filename(file.filename)  # make sure we have a proper filename
        logger.info(f'**found {filename}')
        full_filename = UPLOAD_DIRECTORY / filename
        file.save(full_filename)  # save the pdf in the upload folder
        pdf2txt(full_filename)  # call pdf2txt on the pdf
        with open(full_filename.with_suffix('.txt'), 'r', encoding='utf-8') as f:
            output = f.read()
        os.remove(full_filename)
        os.remove(full_filename.with_suffix('.txt'))
        data["text"] = str(output)
        data["success"] = True
        # TODO: add treatment for tabs
        # TODO: add spell checks
    stats_dict.close()
    return data
def __test_irregular_tablenames(tablename):
    filename = ':memory:'
    db = SqliteDict(filename, tablename=tablename)
    db['key'] = 'value'
    db.commit()
    self.assertEqual(db['key'], 'value')
    db.close()
class CDataBase(object):
    def __init__(self):
        try:
            self.close()
        except:
            pass
        self.mydict = SqliteDict('./DB/my_db.sqlite', autocommit=True)
        self.show()

    def set(self, key, value):
        self.mydict[key] = value

    def get(self, key):
        if key in self.mydict.keys():
            ret = self.mydict[key]
        else:
            ret = None
        return ret

    def show(self, start_with=''):
        for key, value in self.mydict.items():
            # str.find() returns 0 for a match at the start, so test against -1
            if key.find(start_with) != -1:
                print(key, '\t', value, '\n')

    def clear(self):
        self.mydict.clear()

    def close(self):
        self.mydict.close()
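# Hypothetical usage sketch for CDataBase above (keys and values are illustrative;
# assumes the ./DB directory already exists, since SqliteDict does not create parent dirs):
#
#     db = CDataBase()
#     db.set('user:1', {'name': 'alice'})
#     print(db.get('user:1'))          # -> {'name': 'alice'}
#     db.show(start_with='user:')      # prints every key containing 'user:'
#     db.close()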
def create_keywords_from_url(url_db):
    url_keywords = dict()
    db = SqliteDict(url_db, autocommit=False)
    urls = db.keys()
    lemmatizer = WordNetLemmatizer()
    for url in urls:
        entity = url.split('/')[-1]
        if "–" in entity:
            words = list()
            first_words = entity.split('_')
            for word in first_words:
                words.extend(word.split('–'))
        elif "-" in entity:
            words = list()
            first_words = entity.split('_')
            for word in first_words:
                words.extend(word.split('-'))
        else:
            words = entity.split('_')
        keywords = set()
        for word in words:
            prepro = word.strip(',').strip('.').strip('(').strip(')').lower()
            # keywords.add(prepro)
            keywords.add(lemmatizer.lemmatize(prepro))
        url_keywords[url] = keywords
    db.close()
    return url_keywords
def __init__(self, filename: Path = None):
    self.store = filename or DataStore.DEFAULT_FILE
    if not self.store.parent.exists():
        self.store.parent.mkdir(parents=True, exist_ok=True)
    sqlite = SqliteDict(self.store)
    sqlite.close()
    self.store.chmod(0o600)
def test_sqlitedict_read(datasize):
    d = SqliteDict(f'debug_{datasize}.sqlite')
    for j in range(3):
        for i in range(0, datasize):
            a = d[str(i + j * len(text))]
            assert a == text[i]
    d.close()
def lookup_knn(query, top_k=20,
               path_q2sig='./demo_data/q2sig.sqldict',
               path_sig2buckets='./demo_data/sig2buckets',
               path_feature_map='./demo_data/feature_map.sqldict'):
    candidates = SortedSet(key=lambda t: (-t[1], t[0]))
    with SqliteDict(path_q2sig) as db_q2sig:
        sigs = db_q2sig[query]
    fmap = SqliteDict(path_feature_map)
    bucket_maps = []
    for path in glob.glob(f"{path_sig2buckets}*"):
        bucket_maps.append(SqliteDict(path))
    for db_sig2buckets in bucket_maps:
        for sig in sigs:
            bucket = db_sig2buckets.get(sig, None)
            if bucket is None:
                continue
            for q in bucket:
                if q == query:
                    continue
                sim = pairwise_cosine_similarity(fmap[q], fmap[query])
                candidates.add((q, sim))
    candidates = candidates[:top_k]
    fmap.close()
    for db_sig2buckets in bucket_maps:
        db_sig2buckets.close()
    return candidates
def _import_sql_data(data_dir):
    file_path = os.path.join(data_dir, DATA_FILE)
    # Find out what format we have
    with sqlite3.connect(file_path) as conn:
        try:
            conn.execute('select count(*) from zipgun_info')
            zipgun_info = SqliteDict(file_path, tablename='zipgun_info')
            version = zipgun_info.get('version', 0)
        except sqlite3.OperationalError:
            version = 0
    if version == 0:
        country_postal_codes = SqliteDict(file_path)
    elif version == 1:
        country_postal_codes = {}
        for country_code in zipgun_info['country_codes']:
            if country_code in country_postal_codes:
                raise ValueError('Duplicate entry found for {}'.format(
                    country_code))
            country_postal_codes[country_code] = SqliteDict(
                file_path, tablename='zg_{}'.format(country_code),
                journal_mode='OFF')
        zipgun_info.close()
    else:
        raise ValueError('Unknown data file version {}'.format(version))
    return country_postal_codes
def _import_sql_data(data_dir):
    import sqlite3

    from sqlitedict import SqliteDict

    file_path = os.path.join(data_dir, DATA_FILE)
    # Find out what format we have
    with sqlite3.connect(file_path) as conn:
        try:
            conn.execute('select count(*) from zipgun_info')
            zipgun_info = SqliteDict(file_path, tablename='zipgun_info')
            version = zipgun_info.get('version', 0)
        except sqlite3.OperationalError:
            version = 0
    if version == 0:
        country_postal_codes = SqliteDict(file_path)
    elif version == 1:
        country_postal_codes = {}
        for country_code in zipgun_info['country_codes']:
            if country_code in country_postal_codes:
                raise ValueError(
                    'Duplicate entry found for {}'.format(country_code))
            country_postal_codes[country_code] = SqliteDict(
                file_path, tablename='zg_{}'.format(country_code),
                journal_mode='OFF')
        zipgun_info.close()
    else:
        raise ValueError('Unknown data file version {}'.format(version))
    return country_postal_codes
def clear_db(db_path_shadow: str) -> None:
    doc_vecs_db = SqliteDict(db_path_shadow)
    print("Clearing db {}".format(db_path_shadow))
    # materialize the keys first so we do not delete while iterating a live cursor
    for key in tqdm(list(doc_vecs_db.keys())):
        del doc_vecs_db[key]
    doc_vecs_db.commit()
    doc_vecs_db.close()
def test_readonly(self):
    fname = norm_file('tests/db/sqlitedict-override-test.sqlite')
    orig_db = SqliteDict(filename=fname)
    orig_db['key'] = 'value'
    orig_db['key_two'] = 2
    orig_db.commit()
    orig_db.close()

    readonly_db = SqliteDict(filename=fname, flag='r')
    self.assertTrue(readonly_db['key'] == 'value')
    self.assertTrue(readonly_db['key_two'] == 2)

    def attempt_write():
        readonly_db['key'] = ['new_value']

    def attempt_update():
        readonly_db.update(key='value2', key_two=2.1)

    def attempt_delete():
        del readonly_db['key']

    def attempt_clear():
        readonly_db.clear()

    def attempt_terminate():
        readonly_db.terminate()

    attempt_funcs = [
        attempt_write,
        attempt_update,
        attempt_delete,
        attempt_clear,
        attempt_terminate,
    ]

    for func in attempt_funcs:
        with self.assertRaises(RuntimeError):
            func()
def persistence_load(db_path=config.WN_FEATURE_CACHE_PATH):
    p_dict = {
        'hypernym_stems_dict': dict(),
        'hyponym_stems_dict': dict(),
        'hyper_lvl_dict': dict(),
        'hypo_lvl_dict': dict(),
        'ant_dict': dict(),
        'em_lemmas_dict': dict(),
    }
    # if em_dict:
    #     p_dict['em_dict'] = dict()

    for dict_name in p_dict.keys():
        print("Loading Persistent WN Feature Dict:", dict_name)
        if dict_name != 'em_dict':
            in_db_dict = SqliteDict(str(db_path / dict_name), autocommit=False,
                                    tablename='the_table', flag='c')
            for key, v in tqdm(in_db_dict.items()):
                p_dict[dict_name][key] = v
            in_db_dict.close()
        elif dict_name == 'em_dict':
            in_db_dict = SqliteDict(str(db_path / dict_name), autocommit=False,
                                    tablename='the_table', flag='c')
            for key, v in tqdm(in_db_dict.items()):
                p_dict[dict_name][key] = v
            in_db_dict.close()

    return p_dict
def search(self, obj, exact=False, db=None):
    """
    Search the database for partial matches of [obj], and return a list of matches
    in the tuple form:

        ("obj", {
            "filename_hash": string,
            "cryptographer": string,
            "key": string,
            "storage_provider": string,
            "bucket": string,
            "file_hash": string
        })

    If [exact] == True, then only exact matches will be returned. Since there should
    only ever be a single exact match for a path in the DB, a CstashCriticalException
    will be thrown if more than a single element is in the resulting list. This
    shouldn't be possible anyway, since the DB is a key/value store, but it's a
    safety measure.
    """
    db = db or self.db
    db_connection = SqliteDict(db, autocommit=True, flag='r')

    if exact is True:
        keys = [(k, db_connection[k]) for k in db_connection.keys() if obj == k]
    elif obj is None:
        keys = [(k, db_connection[k]) for k in db_connection.keys()]
    else:
        keys = [(k, db_connection[k]) for k in db_connection.keys() if obj in k]

    if exact is True and len(keys) > 1:
        raise exceptions.CstashCriticalException(message=(
            f"Found more than a single match "
            f"for {obj} in the database:\n\n{keys}"))

    db_connection.close()
    return keys
def assertIterationDataRecorded(self, expected, tolerance, root):
    if self.comm.rank != 0:
        return

    db = SqliteDict(self.filename, self.tablename_iterations)
    _assertIterationDataRecorded(self, db, expected, tolerance)
    db.close()
class SqliteDictDupesFilter(object):
    def __init__(self):
        """
        SqliteDict based dupes filter
        """
        self.dupes_db_file = tempfile.mktemp()
        self.__filter = None

    def __create_db(self):
        self.__filter = SqliteDict(self.dupes_db_file, flag='n',
                                   autocommit=True)

    def __contains__(self, element):
        if self.__filter is None:
            self.__create_db()
        return element in self.__filter

    def add(self, element):
        if self.__filter is None:
            self.__create_db()
        self.__filter[element] = '-'

    def close(self):
        if self.__filter is not None:
            try:
                self.__filter.close()
                os.remove(self.dupes_db_file)
            except:
                pass
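# Hypothetical usage sketch for SqliteDictDupesFilter above (the URLs are illustrative):
#
#     seen = SqliteDictDupesFilter()
#     for url in ("http://a", "http://b", "http://a"):
#         if url in seen:
#             continue  # already processed
#         seen.add(url)
#     seen.close()  # also removes the temporary backing file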
def sqliteFileIO(self, aKey, Val=None):
    # distinguish between reading and writing
    if Val is None:  # is reading
        key_dict = SqliteDict(self.dict_sqlite, autocommit=True)
        global card_data
        try:
            card_data = key_dict[aKey]
            self.ids._bios.reg.text = '{}'.format(f'{card_data[0]}')
            self.ids._bios.serial_no.text = '{}'.format(f'{card_data[1]}')
            self.ids._bios.phone_no.text = '{}'.format(f'{card_data[2]}')
            self.ids._bios.card.text = '{}'.format(f'{card_data[-1]}')
            self.ids._status.stat.text = "[INFO] Verification complete"
            # remove the key value pair
            del key_dict[str(aKey)]  # reset the status
        except KeyError:
            self.ids._scanner2.scan2.text = f"[{aKey}] Card unrecognized(Scan Key)"
        key_dict.close()
    else:  # is writing
        key_dict = SqliteDict(self.dict_sqlite, autocommit=True)
        try:
            key_dict[aKey] = Val
        except KeyError:
            self.ids._scanner1.scan1.text = f"[{aKey}] Key already scanned"
        key_dict.close()
def assertMetadataRecorded(self, expected):
    if self.comm.rank != 0:
        return

    db = SqliteDict(self.filename, self.tablename)
    _assertMetadataRecorded(self, db, expected)
    db.close()
def get_account(server: Guild, member: Member) -> int:
    """Return the account level for a given user.

    Intended for export to Cogs.
    """
    uid = str(member.id)
    sid = str(server.id)

    # Get a temporary instance of the main database
    database = SqliteDict(
        filename=f"db/{inst.database}",
        tablename="discord-bot",
        encode=json.dumps,
        decode=json.loads
    )

    if "accounts" not in database:
        database.close()
        raise KeyError("Database not initialized.")

    db_dict = database["accounts"]
    database.close()

    if sid not in db_dict:
        raise KeyError("Server has no accounts.")
    if uid not in db_dict[sid]:
        raise KeyError("User does not have an account for this server.")
    else:
        return db_dict[sid][uid]
def solve_in_mem(self):
    database = {}
    self.solveAll(database)
    sqldb = SqliteDict(self.dbName, autocommit=True)
    for k, v in database.items():
        sqldb[k] = v.simplify()
    sqldb.close()
    print("[Solver] Graph saved to disk")
class SessionState:
    @staticmethod
    def my_encode(obj):
        try:
            if isinstance(obj, list) and obj and isinstance(obj[0], RevComment):
                enc_obj = [v.encode() for v in obj]
            else:
                enc_obj = obj
            return dumps(enc_obj, ensure_ascii=False)
        except TypeError:
            return sqlite3.Binary(
                zlib.compress(pickle.dumps(obj, pickle.HIGHEST_PROTOCOL)))

    @staticmethod
    def my_decode(obj):
        try:
            obj = loads(obj)
        except ValueError:
            return pickle.loads(zlib.decompress(bytes(obj)))
        if isinstance(obj, list) and obj and isinstance(obj[0], dict) and 'comment' in obj[0]:
            obj = [RevComment.decode(v) for v in obj]
        return obj

    def __init__(self, cache_file: str, user_requested=False):
        self.user_requested = user_requested
        self.cache = SqliteDict(cache_file, autocommit=True,
                                encode=self.my_encode, decode=self.my_decode)
        self.session = Session()
        # noinspection PyTypeChecker
        self.session.mount(
            'https://',
            HTTPAdapter(max_retries=Retry(total=3, backoff_factor=0.1,
                                          status_forcelist=[500, 502, 503, 504])))
        self.sites = {}

    def __enter__(self):
        return self

    def __exit__(self, typ, value, traceback):
        self.cache.close()
        self.session.close()

    def get_site(self, domain: str) -> WikiSite:
        try:
            return self.sites[domain]
        except KeyError:
            # noinspection PyTypeChecker
            site = WikiSite(domain, self.session, domain == primary_domain)
            if self.user_requested:
                site.maxlag = None
            self.sites[domain] = site
            return site
def get_doc(doc_id):
    doc_db = SqliteDict(os.path.join(db_path, "docs.db"))
    try:
        document = doc_db[doc_id]
    except KeyError:
        doc_db.close()
        return Response(status=404)
    doc_db.close()
    return jsonify(document)
def test_as_str(self):
    """Verify SqliteDict.__str__()."""
    # given,
    db = SqliteDict()
    # exercise
    db.__str__()
    # test when db closed
    db.close()
    db.__str__()
def process_coins():
    coins_db = SqliteDict('./coins.db', autocommit=True)
    data = coins_db['coin_data']
    coins_db.close()

    all_coins = []
    for coin in data:
        coin["isover40"] = calculate_coin(coin['price_usd'])
        all_coins.append(coin)
    return all_coins
def return_all_entries(self, db=None):
    """ Return a dict of the database """
    db = db or self.db
    db_connection = SqliteDict(db, autocommit=True, flag='r')
    entries = dict(db_connection)
    db_connection.close()
    return entries
def set_query_results(query, k=20):
    query_map = SqliteDict(query_map_path)
    query_vec = query_map[query]
    query_map.close()

    query_db = SqliteDict(query_db_path)
    results = search(docs_tfidf, query_vec, hw2.cosine_sim)
    query_db[query] = results[:k]
    query_db.commit()
    query_db.close()
def add_docs_to_query_vector(query_vector: BagOfWordsVector,
                             docs: list[int],
                             alpha: float) -> BagOfWordsVector:
    doc_db = SqliteDict(doc_vecs_db_path)
    for doc_id in docs:
        doc_vector = try_to_get_doc_vector_from_db(doc_id, doc_db)
        query_vector = add_vectors(query_vector, scalar_multiply(doc_vector, alpha))
    doc_db.close()
    return query_vector
def try_to_get_query_from_db(q: str) -> BagOfWordsVector:
    query_map = SqliteDict(query_map_path)
    try:
        query_vector = query_map[q]
        query_map.close()
        return query_vector
    except KeyError:
        query_map.close()
        raise HTTPException
def criaTeste():
    mydict = SqliteDict('db/teste.sqlite', "mails", autocommit=True)
    mydict['1'] = "batata"
    mydict['2'] = "banana"
    mydict['3'] = "oi"
    mydict['4'] = "teste"
    for key, value in mydict.iteritems():
        print(key, value)
    print(len(mydict))
    # etc... all dict functions work
    mydict.close()
def process_coins():
    coins_db = SqliteDict('/home/phennaux115/Documents/Python-Flask/coins.db', autocommit=True)
    data = coins_db["coin_data"]
    coins_db.close()

    all_coins = []
    for coin in data:
        coin["isover40"] = calculate_coin(coin["price_usd"])
        all_coins.append(coin)
    return all_coins
def test_default_reuse_existing_flag_c(self):
    """Re-opening of a database does not destroy it."""
    # given,
    fname = norm_file('tests/db/sqlitedict-override-test.sqlite')
    orig_db = SqliteDict(filename=fname)
    orig_db['key'] = 'value'
    orig_db.commit()
    orig_db.close()

    next_db = SqliteDict(filename=fname)
    self.assertIn('key', next_db.keys())
    self.assertEqual(next_db['key'], 'value')
def test_overwrite_using_flag_n(self):
    """Re-opening of a database with flag='n' destroys it all."""
    # given,
    fname = norm_file('tests/db/sqlitedict-override-test.sqlite')
    orig_db = SqliteDict(filename=fname, tablename='sometable')
    orig_db['key'] = 'value'
    orig_db.commit()
    orig_db.close()

    # verify,
    next_db = SqliteDict(filename=fname, tablename='sometable', flag='n')
    self.assertNotIn('key', next_db.keys())
def basic_usage():
    """SqliteDict serializes arbitrary values before storing them."""
    mydict = SqliteDict("test.sqlite", autocommit=True)
    mydict["integer_value"] = 1
    mydict["real_value"] = 2.2
    mydict["text_value"] = "abc"
    mydict["date_value"] = date.today()
    mydict["datetime_value"] = datetime.now()

    # if you don't use ``with SqliteDict("test.sqlite") as mydict: ...``
    # you have to close the connection explicitly
    mydict.close()
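# A minimal sketch (not from the original source) of the context-manager form
# referred to in the comment above; it reuses the same "test.sqlite" file.
from sqlitedict import SqliteDict


def basic_usage_with_context_manager():
    with SqliteDict("test.sqlite") as mydict:
        mydict["text_value"] = "abc"
        mydict.commit()  # persist explicitly; the handle is closed when the block exits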
def assertDatasetEquals(self, expected, tolerance):
    # Close the file to ensure it is written to disk.
    self.recorder.close()
    # self.recorder.out = None

    sentinel = object()

    db = SqliteDict(self.filename, self.tablename)

    for coord, expect in expected:
        iter_coord = format_iteration_coordinate(coord)

        groupings = (
            ("Parameters", expect[0]),
            ("Unknowns", expect[1]),
            ("Residuals", expect[2]),
        )

        #### Need to get the record with the key of 'iter_coord'
        actual_group = db[iter_coord]
        timestamp = actual_group['timestamp']

        self.assertTrue(self.t0 <= timestamp and timestamp <= self.t1)

        for label, values in groupings:
            actual = actual_group[label]
            # If len(actual) == len(expected) and actual <= expected, then
            # actual == expected.
            self.assertEqual(len(actual), len(values))
            for key, val in values:
                found_val = actual.get(key, sentinel)
                if found_val is sentinel:
                    self.fail("Did not find key '{0}'".format(key))
                if isinstance(found_val, _ByObjWrapper):
                    found_val = found_val.val
                try:
                    assert_rel_error(self, found_val, val, tolerance)
                except TypeError as error:
                    self.assertEqual(found_val, val)

        ######## delete the record with the key 'iter_coord'
        del db[iter_coord]

    # Having deleted all found values, the file should now be empty.
    ###### Need a way to get the number of records in the main table
    self.assertEqual(len(db), 0)

    db.close()
def assertDatasetEquals(self, expected, tolerance):
    # Close the file to ensure it is written to disk.
    self.recorder.close()
    # self.recorder.out = None

    sentinel = object()

    db = SqliteDict(self.filename, self.tablename)

    ###### Need a way to get a list of the group_names in the order in which
    ###### they were written and put it in a variable named order
    order = db['order']
    del db['order']

    for coord, expect in expected:
        iter_coord = format_iteration_coordinate(coord)

        self.assertEqual(order.pop(0), iter_coord)

        groupings = (
            ("Parameters", expect[0]),
            ("Unknowns", expect[1]),
            ("Residuals", expect[2]),
        )

        #### Need to get the record with the key of 'iter_coord'
        actual_group = db[iter_coord]

        for label, values in groupings:
            actual = actual_group[label]
            # If len(actual) == len(expected) and actual <= expected, then
            # actual == expected.
            self.assertEqual(len(actual), len(values))
            for key, val in values:
                found_val = actual.get(key, sentinel)
                if found_val is sentinel:
                    self.fail("Did not find key '{0}'".format(key))
                assert_rel_error(self, found_val, val, tolerance)

        ######## delete the record with the key 'iter_coord'
        del db[iter_coord]

    # Having deleted all found values, the file should now be empty.
    ###### Need a way to get the number of records in the main table
    self.assertEqual(len(db), 0)

    # As should the ordering.
    self.assertEqual(len(order), 0)

    db.close()
def test_irregular_tablenames(self):
    """Irregular table names need to be quoted"""
    db = SqliteDict(':memory:', tablename='9nine')
    db['key'] = 'value'
    db.commit()
    self.assertEqual(db['key'], 'value')
    db.close()

    db = SqliteDict(':memory:', tablename='outer space')
    db['key'] = 'value'
    db.commit()
    self.assertEqual(db['key'], 'value')
    db.close()

    with self.assertRaisesRegexp(ValueError, r'^Invalid tablename '):
        SqliteDict(':memory:', '"')
def test_driver_records_model_viewer_data(self):
    size = 3

    prob = Problem(Group(), impl=impl)

    G1 = prob.root.add('G1', ParallelGroup())
    G1.add('P1', IndepVarComp('x', np.ones(size, float) * 1.0))
    G1.add('P2', IndepVarComp('x', np.ones(size, float) * 2.0))

    prob.root.add('C1', ABCDArrayComp(size))

    prob.root.connect('G1.P1.x', 'C1.a')
    prob.root.connect('G1.P2.x', 'C1.b')

    prob.driver.add_recorder(self.recorder)
    self.recorder.options['record_metadata'] = True

    prob.setup(check=False)
    prob.cleanup()

    # do some basic tests to make sure the model_viewer_data was recorded correctly
    if self.comm.rank == 0:
        db = SqliteDict(self.filename, self.tablename_metadata)
        model_viewer_data = db['model_viewer_data']
        tr = model_viewer_data['tree']
        self.assertEqual(set(['name', 'type', 'subsystem_type', 'children']), set(tr.keys()))

        names = []
        for ch1 in tr['children']:
            # each is an ordereddict
            names.append(ch1["name"])
            for ch2 in ch1["children"]:
                names.append(ch2["name"])
                if "children" in ch2:
                    for ch3 in ch2["children"]:
                        names.append(ch3["name"])

        expected_names = ['G1', 'P1', 'x', 'P2', 'x', 'C1', 'a', 'b',
                          'in_string', 'in_list', 'c', 'd', 'out_string', 'out_list']
        self.assertEqual(sorted(expected_names), sorted(names))

        cl = model_viewer_data['connections_list']
        for c in cl:
            self.assertEqual(set(['src', 'tgt']), set(c.keys()))

        db.close()
def test_recording_model_viewer_data(self):
    prob = Problem()
    prob.root = ConvergeDiverge()
    prob.driver.add_recorder(self.recorder)
    self.recorder.options["record_metadata"] = True
    prob.setup(check=False)
    prob.cleanup()  # closes recorders

    # do some basic tests to make sure the model_viewer_data was recorded
    db = SqliteDict(filename=self.filename, flag="r", tablename="metadata")
    model_viewer_data = db["model_viewer_data"]

    tr = model_viewer_data["tree"]
    self.assertEqual(set(["name", "type", "subsystem_type", "children"]), set(tr.keys()))

    cl = model_viewer_data["connections_list"]
    for c in cl:
        self.assertEqual(set(["src", "tgt"]), set(c.keys()))

    db.close()
def test_recording_system_metadata(self):
    prob = Problem()
    prob.root = ConvergeDiverge()
    prob.root.add_metadata("string", "just a test")
    prob.root.add_metadata("ints", [1, 2, 3])
    prob.driver.add_recorder(self.recorder)
    self.recorder.options["record_metadata"] = True
    prob.setup(check=False)
    prob.cleanup()  # closes recorders

    # check the system metadata recording
    sqlite_metadata = SqliteDict(filename=self.filename, flag="r", tablename="metadata")
    system_metadata = sqlite_metadata["system_metadata"]

    self.assertEqual(len(system_metadata), 2)
    self.assertEqual(system_metadata["string"], "just a test")
    self.assertEqual(system_metadata["ints"], [1, 2, 3])

    sqlite_metadata.close()
def test_mutiple_thread():
    """When several processes access the database, it is best if only one of them
    writes. If more than one process needs to write, every write has to be followed
    by a commit -- in other words, autocommit has to be enabled.
    """
    dict1 = SqliteDict("test.sqlite", autocommit=False)  # if False, then multiple-writer
    dict2 = SqliteDict("test.sqlite", autocommit=False)  # access is not allowed
    print(dict1["integer_value"])
    print(dict2["integer_value"])

    # dict1["integer_value"] = 2
    print(dict1["integer_value"], dict2["integer_value"])

    # dict2["integer_value"] = 3
    print(dict1["integer_value"], dict2["integer_value"])

    dict1.close()
    dict2.close()
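# A sketch (assumption-based, not from the original source) of the autocommit case
# described in the docstring above: both handles point at the same "test.sqlite" file,
# and an explicit commit() flushes the queued write before the second handle reads.
from sqlitedict import SqliteDict


def multiple_writers_with_autocommit_sketch():
    d1 = SqliteDict("test.sqlite", autocommit=True)
    d2 = SqliteDict("test.sqlite", autocommit=True)
    d1["integer_value"] = 2
    d1.commit()  # make sure the pending write hits disk before the other handle reads
    print(d1["integer_value"], d2["integer_value"])
    d1.close()
    d2.close()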
def _persist_v1(file_path, zg):
    print 'Creating meta db...'
    zipgun_info = SqliteDict(
        file_path, tablename='zipgun_info', autocommit=False)
    zipgun_info['version'] = 1
    zipgun_info['country_codes'] = zg.country_postal_codes.keys()
    zipgun_info.commit()
    for country_code in zg.country_postal_codes:
        print 'Creating {} db...'.format(country_code)
        country_data = SqliteDict(
            file_path, tablename='zg_{}'.format(country_code), autocommit=False)
        country_data.update(zg.country_postal_codes[country_code])
        country_data.commit()
        time.sleep(1.0)  # Pretty bullshit
        country_data.close()
    zipgun_info.close()
class SqliteDictJsonSerializationTest(unittest.TestCase):
    def setUp(self):
        self.fname = norm_file('tests/db-json/sqlitedict.sqlite')
        self.db = SqliteDict(
            filename=self.fname, tablename='test',
            encode=json.dumps, decode=json.loads
        )

    def tearDown(self):
        self.db.close()
        os.unlink(self.fname)
        os.rmdir(os.path.dirname(self.fname))

    def get_json(self, key):
        return self.db.conn.select_one('SELECT value FROM test WHERE key = ?', (key,))[0]

    def test_int(self):
        self.db['test'] = -42
        assert self.db['test'] == -42
        assert self.get_json('test') == '-42'

    def test_str(self):
        test_str = u'Test \u30c6\u30b9\u30c8'
        self.db['test'] = test_str
        assert self.db['test'] == test_str
        assert self.get_json('test') == r'"Test \u30c6\u30b9\u30c8"'

    def test_bool(self):
        self.db['test'] = False
        assert self.db['test'] is False
        assert self.get_json('test') == 'false'

    def test_none(self):
        self.db['test'] = None
        assert self.db['test'] is None
        assert self.get_json('test') == 'null'

    def test_complex_struct(self):
        test_value = {
            'version': 2.5,
            'items': ['one', 'two'],
        }
        self.db['test'] = test_value
        assert self.db['test'] == test_value
        assert self.get_json('test') == json.dumps(test_value)
class Scribe(object):
    def __init__(self, location, table_name, exp_name):
        filename = "{}/scribe.sqlite".format(location)
        self.book = SqliteDict(filename, autocommit=True, tablename=table_name)
        unique_id = datetime.now().strftime("date_%m.%d_time_%H.%M")
        self.exp_name = exp_name + "_" + unique_id
        self.observation_index = 0

    def record(self, value, type="general"):
        key = "{}; {}; {}".format(self.exp_name, self.observation_index, type)
        self.book[key] = value
        self.observation_index += 1

    observe = record  # sometimes i forget which

    def lookup(self, type=None, exp_name=None, ret_sorted=False, strip_keys=False):
        type_func = lambda *args: True
        name_func = lambda *args: True
        if type:
            type_func = lambda x: x[2] == type
        if exp_name:
            name_func = lambda x: exp_name in x[0]
        key_func = lambda x: type_func(x) and name_func(x)
        unpack = lambda x: [f(x.strip()) for f, x in zip([str, int, str], x.split(";"))]
        items = {k: v for k, v in self.book.iteritems() if key_func(unpack(k))}
        if ret_sorted:
            return self.sort_results(items, strip_keys)
        return items

    def sort_results(self, result_dict, only_val_return=False):
        unpack = lambda x: [f(x.strip()) for f, x in zip([str, int, str], x.split(";"))]
        ranker = lambda x: unpack(x[0])[1]
        sorted_items = sorted(result_dict.items(), key=ranker)
        if only_val_return:
            return [v for k, v in sorted_items]
        return sorted_items

    def close(self):
        self.book.close()
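# Hypothetical usage sketch for Scribe above (the location, table and experiment
# names are illustrative; the "./logs" directory is assumed to exist):
#
#     scribe = Scribe("./logs", table_name="runs", exp_name="baseline")
#     scribe.record(0.42, type="loss")
#     scribe.record(0.38, type="loss")
#     losses = scribe.lookup(type="loss", ret_sorted=True, strip_keys=True)
#     scribe.close()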
class FileCache(CacheInterface):
    def __init__(self, config):
        CacheInterface.__init__(self, config)
        self.db = SqliteDict(self.config[u"cache"][u"file"], autocommit=True)
        self.expiration = self.config[u"cache"].get(u"expiration", 86400)

        def closer():
            try:
                self.db.close()
            except Exception:
                logger.exception("Exception closing file cache")

        atexit.register(closer)

    def get(self, key):
        if int(self.db[key + "_expiration"]) - time.time() <= 0:
            raise KeyError("cache key expired")
        return self.db[key]

    def set(self, key, val):
        self.db[key] = val
        self.db[key + "_expiration"] = str(int(time.time()) + self.expiration)
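# Hypothetical usage sketch for FileCache above (the config shape is inferred
# from __init__; the file name and expiration value are illustrative):
#
#     config = {u"cache": {u"file": "api_cache.sqlite", u"expiration": 3600}}
#     cache = FileCache(config)
#     cache.set("token", "abc123")
#     value = cache.get("token")  # raises KeyError once the entry has expired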
def test_overwrite_using_flag_w(self):
    """Re-opening of a database with flag='w' destroys only the target table."""
    # given,
    fname = norm_file('tests/db/sqlitedict-override-test.sqlite')
    orig_db_1 = SqliteDict(filename=fname, tablename='one')
    orig_db_1['key'] = 'value'
    orig_db_1.commit()
    orig_db_1.close()

    orig_db_2 = SqliteDict(filename=fname, tablename='two')
    orig_db_2['key'] = 'value'
    orig_db_2.commit()
    orig_db_2.close()

    # verify, when re-opening table space 'one' with flag='w', we destroy
    # its contents. However, when re-opening table space 'two' with the
    # default flag='c', its contents remain.
    next_db_1 = SqliteDict(filename=fname, tablename='one', flag='w')
    self.assertNotIn('key', next_db_1.keys())

    next_db_2 = SqliteDict(filename=fname, tablename='two')
    self.assertIn('key', next_db_2.keys())
def test_readonly(self):
    fname = norm_file('tests/db/sqlitedict-override-test.sqlite')
    orig_db = SqliteDict(filename=fname)
    orig_db['key'] = 'value'
    orig_db['key_two'] = 2
    orig_db.commit()
    orig_db.close()

    readonly_db = SqliteDict(filename=fname, flag='r')
    self.assertTrue(readonly_db['key'] == 'value')
    self.assertTrue(readonly_db['key_two'] == 2)

    def attempt_write():
        readonly_db['key'] = ['new_value']

    def attempt_update():
        readonly_db.update(key='value2', key_two=2.1)

    def attempt_delete():
        del readonly_db['key']

    def attempt_clear():
        readonly_db.clear()

    def attempt_terminate():
        readonly_db.terminate()

    attempt_funcs = [attempt_write,
                     attempt_update,
                     attempt_delete,
                     attempt_clear,
                     attempt_terminate]

    for func in attempt_funcs:
        with self.assertRaises(RuntimeError):
            func()
def assertIterationDataRecorded(self, expected, tolerance):
    db = SqliteDict(self.filename, self.tablename)
    _assertIterationDataRecorded(self, db, expected, tolerance)
    db.close()
class SimIndex(gensim.utils.SaveLoad):
    """
    An index of documents. Used internally by SimServer.

    It uses the Similarity class to persist all document vectors to disk (via mmap).
    """

    def __init__(self, fname, num_features, shardsize=SHARD_SIZE, topsims=TOP_SIMS):
        """
        Spill index shards to disk after every `shardsize` documents.
        In similarity queries, return only the `topsims` most similar documents.
        """
        self.fname = fname
        self.shardsize = int(shardsize)
        self.topsims = int(topsims)
        self.id2pos = {}  # map document id (string) to index position (integer)
        self.pos2id = {}  # reverse mapping for id2pos; redundant, for performance
        self.id2sims = SqliteDict(self.fname + '.id2sims', journal_mode=JOURNAL_MODE)  # precomputed top similar: document id -> [(doc_id, similarity)]
        self.qindex = gensim.similarities.Similarity(self.fname + '.idx', corpus=None,
            num_best=None, num_features=num_features, shardsize=shardsize)
        self.length = 0

    def save(self, fname):
        tmp, self.id2sims = self.id2sims, None
        super(SimIndex, self).save(fname)
        self.id2sims = tmp

    @staticmethod
    def load(fname):
        result = gensim.utils.SaveLoad.load(fname)
        result.fname = fname
        result.check_moved()
        result.id2sims = SqliteDict(fname + '.id2sims', journal_mode=JOURNAL_MODE)
        return result

    def check_moved(self):
        output_prefix = self.fname + '.idx'
        if self.qindex.output_prefix != output_prefix:
            logger.info("index seems to have moved from %s to %s; updating locations" %
                        (self.qindex.output_prefix, output_prefix))
            self.qindex.output_prefix = output_prefix
            self.qindex.check_moved()

    def close(self):
        "Explicitly release important resources (file handles, db, ...)"
        try:
            self.id2sims.close()
        except:
            pass
        try:
            del self.qindex
        except:
            pass

    def terminate(self):
        """Delete all files created by this index, invalidating `self`. Use with care."""
        try:
            self.id2sims.terminate()
        except:
            pass
        import glob
        for fname in glob.glob(self.fname + '*'):
            try:
                os.remove(fname)
                logger.info("deleted %s" % fname)
            except Exception, e:
                logger.warning("failed to delete %s: %s" % (fname, e))
        for val in self.__dict__.keys():
            try:
                delattr(self, val)
            except:
                pass
class SimServer(object):
    """
    Top-level functionality for similarity services. A similarity server takes
    care of::

        1. creating semantic models
        2. indexing documents using these models
        3. finding the most similar documents in an index.

    An object of this class can be shared across network via Pyro, to answer remote
    client requests. It is thread safe. Using a server concurrently from multiple
    processes is safe for reading = answering similarity queries. Modifying
    (training/indexing) is realized via locking = serialized internally.
    """

    def __init__(self, basename, use_locks=False):
        """
        All data will be stored under directory `basename`. If there is a server
        there already, it will be loaded (resumed).

        The server object is stateless in RAM -- its state is defined entirely by
        its location. There is therefore no need to store the server object.
        """
        if not os.path.isdir(basename):
            raise ValueError("%r must be a writable directory" % basename)
        self.basename = basename
        self.use_locks = use_locks
        self.lock_update = threading.RLock() if use_locks else gensim.utils.nocm
        try:
            self.fresh_index = SimIndex.load(self.location('index_fresh'))
        except:
            logger.debug("starting a new fresh index")
            self.fresh_index = None
        try:
            self.opt_index = SimIndex.load(self.location('index_opt'))
        except:
            logger.debug("starting a new optimized index")
            self.opt_index = None
        try:
            self.model = SimModel.load(self.location('model'))
        except:
            self.model = None
        self.payload = SqliteDict(self.location('payload'), autocommit=True, journal_mode=JOURNAL_MODE)
        self.flush(save_index=False, save_model=False, clear_buffer=True)
        logger.info("loaded %s" % self)

    def location(self, name):
        return os.path.join(self.basename, name)

    @gensim.utils.synchronous('lock_update')
    def flush(self, save_index=False, save_model=False, clear_buffer=False):
        """Commit all changes, clear all caches."""
        if save_index:
            if self.fresh_index is not None:
                self.fresh_index.save(self.location('index_fresh'))
            if self.opt_index is not None:
                self.opt_index.save(self.location('index_opt'))
        if save_model:
            if self.model is not None:
                self.model.save(self.location('model'))
        self.payload.commit()
        if clear_buffer:
            if hasattr(self, 'fresh_docs'):
                try:
                    self.fresh_docs.terminate()  # erase all buffered documents + file on disk
                except:
                    pass
            self.fresh_docs = SqliteDict(journal_mode=JOURNAL_MODE)  # buffer defaults to a random location in temp
        self.fresh_docs.sync()

    def close(self):
        """Explicitly close open file handles, databases etc."""
        try:
            self.payload.close()
        except:
            pass
        try:
            self.model.close()
        except:
            pass
        try:
            self.fresh_index.close()
        except:
            pass
        try:
            self.opt_index.close()
        except:
            pass
        try:
            self.fresh_docs.terminate()
        except:
            pass

    def __del__(self):
        """When the server went out of scope, make an effort to close its DBs."""
        self.close()

    @gensim.utils.synchronous('lock_update')
    def buffer(self, documents):
        """
        Add a sequence of documents to be processed (indexed or trained on).

        Here, the documents are simply collected; real processing is done later,
        during the `self.index` or `self.train` calls.

        `buffer` can be called repeatedly; the result is the same as if it was called
        once, with a concatenation of all the partial document batches. The point is
        to save memory when sending large corpora over network: the entire `documents`
        must be serialized into RAM. See `utils.upload_chunked()`.

        A call to `flush()` clears this documents-to-be-processed buffer (`flush`
        is also implicitly called when you call `index()` and `train()`).
        """
        logger.info("adding documents to temporary buffer of %s" % (self))
        for doc in documents:
            docid = doc['id']
            # logger.debug("buffering document %r" % docid)
            if docid in self.fresh_docs:
                logger.warning("asked to re-add id %r; rewriting old value" % docid)
            self.fresh_docs[docid] = doc
        self.fresh_docs.sync()

    @gensim.utils.synchronous('lock_update')
    def train(self, corpus=None, method='auto', clear_buffer=True, params=None):
        """
        Create an indexing model. Will overwrite the model if it already exists.
        All indexes become invalid, because documents in them use a now-obsolete
        representation.

        The model is trained on documents previously entered via `buffer`,
        or directly on `corpus`, if specified.
        """
        if corpus is not None:
            # use the supplied corpus only (erase existing buffer, if any)
            self.flush(clear_buffer=True)
            self.buffer(corpus)
        if not self.fresh_docs:
            msg = "train called but no training corpus specified for %s" % self
            logger.error(msg)
            raise ValueError(msg)
        if method == 'auto':
            numdocs = len(self.fresh_docs)
            if numdocs < 1000:
                logging.warning("too few training documents; using simple log-entropy model instead of latent semantic indexing")
                method = 'logentropy'
            else:
                method = 'lsi'
        if params is None:
            params = {}
        self.model = SimModel(self.fresh_docs, method=method, params=params)
        self.flush(save_model=True, clear_buffer=clear_buffer)

    @gensim.utils.synchronous('lock_update')
    def index(self, corpus=None, clear_buffer=True):
        """
        Permanently index all documents previously added via `buffer`, or
        directly index documents from `corpus`, if specified.

        The indexing model must already exist (see `train`) before this function
        is called.
        """
        if not self.model:
            msg = 'must initialize model for %s before indexing documents' % self.basename
            logger.error(msg)
            raise AttributeError(msg)
        if corpus is not None:
            # use the supplied corpus only (erase existing buffer, if any)
            self.flush(clear_buffer=True)
            self.buffer(corpus)
        if not self.fresh_docs:
            msg = "index called but no indexing corpus specified for %s" % self
            logger.error(msg)
            raise ValueError(msg)
        if not self.fresh_index:
            logger.info("starting a new fresh index for %s" % self)
            self.fresh_index = SimIndex(self.location('index_fresh'), self.model.num_features)
        self.fresh_index.index_documents(self.fresh_docs, self.model)
        if self.opt_index is not None:
            self.opt_index.delete(self.fresh_docs.keys())
        logger.info("storing document payloads")
        for docid in self.fresh_docs:
            payload = self.fresh_docs[docid].get('payload', None)
            if payload is None:
                # HACK: exit on first doc without a payload (=assume all docs have payload, or none does)
                break
            self.payload[docid] = payload
        self.flush(save_index=True, clear_buffer=clear_buffer)

    @gensim.utils.synchronous('lock_update')
    def optimize(self):
        """
        Precompute top similarities for all indexed documents. This speeds up
        `find_similar` queries by id (but not queries by fulltext).

        Internally, documents are moved from a fresh index (=no precomputed
        similarities) to an optimized index (precomputed similarities). Similarity
        queries always query both indexes, so this split is transparent to clients.

        If you add documents later via `index`, they go to the fresh index again.
        To precompute top similarities for these new documents too, simply call
        `optimize` again.
        """
        if self.fresh_index is None:
            logger.warning("optimize called but there are no new documents")
            return  # nothing to do!
        if self.opt_index is None:
            logger.info("starting a new optimized index for %s" % self)
            self.opt_index = SimIndex(self.location('index_opt'), self.model.num_features)
        self.opt_index.merge(self.fresh_index)
        self.fresh_index.terminate()  # delete old files
        self.fresh_index = None
        self.flush(save_index=True)

    @gensim.utils.synchronous('lock_update')
    def drop_index(self, keep_model=True):
        """Drop all indexed documents. If `keep_model` is False, also drop the model."""
        modelstr = "" if keep_model else "and model "
        logger.info("deleting similarity index " + modelstr + "from %s" % self.basename)

        # delete indexes
        for index in [self.fresh_index, self.opt_index]:
            if index is not None:
                index.terminate()
        self.fresh_index, self.opt_index = None, None

        # delete payload
        if self.payload is not None:
            self.payload.close()
            fname = self.location('payload')
            try:
                if os.path.exists(fname):
                    os.remove(fname)
                    logger.info("deleted %s" % fname)
            except Exception, e:
                logger.warning("failed to delete %s" % fname)
        self.payload = SqliteDict(self.location('payload'), autocommit=True, journal_mode=JOURNAL_MODE)

        # optionally, delete the model as well
        if not keep_model and self.model is not None:
            self.model.close()
            fname = self.location('model')
            try:
                if os.path.exists(fname):
                    os.remove(fname)
                    logger.info("deleted %s" % fname)
            except Exception, e:
                logger.warning("failed to delete %s" % fname)
            self.model = None
class SqliteRecorder(BaseRecorder):
    """ Recorder that saves cases in an SQLite dictionary.

    Args
    ----
    sqlite_dict_args : dict
        Dictionary of any additional arguments for the SQL db.

    Options
    -------
    options['record_metadata'] :  bool(True)
        Tells recorder whether to record variable attribute metadata.
    options['record_unknowns'] :  bool(True)
        Tells recorder whether to record the unknowns vector.
    options['record_params'] :  bool(False)
        Tells recorder whether to record the params vector.
    options['record_resids'] :  bool(False)
        Tells recorder whether to record the residuals vector.
    options['record_derivs'] :  bool(True)
        Tells recorder whether to record derivatives that are requested by a `Driver`.
    options['includes'] :  list of strings
        Patterns for variables to include in recording.
    options['excludes'] :  list of strings
        Patterns for variables to exclude in recording (processed after includes).
    """

    def __init__(self, out, **sqlite_dict_args):
        super(SqliteRecorder, self).__init__()

        if MPI and MPI.COMM_WORLD.rank > 0:
            self._open_close_sqlitedict = False
        else:
            self._open_close_sqlitedict = True

        if self._open_close_sqlitedict:
            sqlite_dict_args.setdefault('autocommit', True)
            self.out = SqliteDict(filename=out, flag='n', tablename='openmdao',
                                  **sqlite_dict_args)
            self.out_derivs = SqliteDict(filename=out, flag='w', tablename='openmdao_derivs',
                                         **sqlite_dict_args)
        else:
            self.out = None

    def record_metadata(self, group):
        """Stores the metadata of the given group in a sqlite file using
        the variable name for the key.

        Args
        ----
        group : `System`
            `System` containing vectors
        """
        params = group.params.iteritems()
        # resids = group.resids.iteritems()
        unknowns = group.unknowns.iteritems()
        data = OrderedDict([
            ('format_version', format_version),
            ('Parameters', dict(params)),
            ('Unknowns', dict(unknowns)),
        ])

        self.out['metadata'] = data

    def record_iteration(self, params, unknowns, resids, metadata):
        """
        Stores the provided data in the sqlite file using the iteration
        coordinate for the key.

        Args
        ----
        params : dict
            Dictionary containing parameters. (p)

        unknowns : dict
            Dictionary containing outputs and states. (u)

        resids : dict
            Dictionary containing residuals. (r)

        metadata : dict, optional
            Dictionary containing execution metadata (e.g. iteration coordinate).
        """
        data = OrderedDict()
        iteration_coordinate = metadata['coord']
        timestamp = metadata['timestamp']

        group_name = format_iteration_coordinate(iteration_coordinate)

        data['timestamp'] = timestamp
        data['success'] = metadata['success']
        data['msg'] = metadata['msg']

        if self.options['record_params']:
            data['Parameters'] = self._filter_vector(params, 'p', iteration_coordinate)

        if self.options['record_unknowns']:
            data['Unknowns'] = self._filter_vector(unknowns, 'u', iteration_coordinate)

        if self.options['record_resids']:
            data['Residuals'] = self._filter_vector(resids, 'r', iteration_coordinate)

        self.out[group_name] = data

    def record_derivatives(self, derivs, metadata):
        """Writes the derivatives that were calculated for the driver.

        Args
        ----
        derivs : dict or ndarray depending on the optimizer
            Dictionary containing derivatives

        metadata : dict, optional
            Dictionary containing execution metadata (e.g. iteration coordinate).
        """
        data = OrderedDict()
        iteration_coordinate = metadata['coord']
        timestamp = metadata['timestamp']

        group_name = format_iteration_coordinate(iteration_coordinate)

        data['timestamp'] = timestamp
        data['success'] = metadata['success']
        data['msg'] = metadata['msg']
        data['Derivatives'] = derivs

        self.out_derivs[group_name] = data

    def close(self):
        """Closes `out`"""

        if self._open_close_sqlitedict:
            if self.out is not None:
                self.out.close()
                self.out = None
            if self.out_derivs is not None:
                self.out_derivs.close()
                self.out_derivs = None
class SqliteRecorder(BaseRecorder):
    """ Recorder that saves cases in an SQLite dictionary.

    Args
    ----
    sqlite_dict_args : dict
        Dictionary of any additional arguments for the SQL db.

    Options
    -------
    options['record_metadata'] :  bool(True)
        Tells recorder whether to record variable attribute metadata.
    options['record_unknowns'] :  bool(True)
        Tells recorder whether to record the unknowns vector.
    options['record_params'] :  bool(False)
        Tells recorder whether to record the params vector.
    options['record_resids'] :  bool(False)
        Tells recorder whether to record the residuals vector.
    options['record_derivs'] :  bool(True)
        Tells recorder whether to record derivatives that are requested by a `Driver`.
    options['includes'] :  list of strings
        Patterns for variables to include in recording.
    options['excludes'] :  list of strings
        Patterns for variables to exclude in recording (processed after includes).
    """

    def __init__(self, out, **sqlite_dict_args):
        super(SqliteRecorder, self).__init__()

        if MPI and MPI.COMM_WORLD.rank > 0:
            self._open_close_sqlitedict = False
        else:
            self._open_close_sqlitedict = True

        if self._open_close_sqlitedict:
            sqlite_dict_args.setdefault("autocommit", True)
            sqlite_dict_args.setdefault("tablename", "openmdao")
            self.out = SqliteDict(filename=out, flag="n", **sqlite_dict_args)
        else:
            self.out = None

    def record_metadata(self, group):
        """Stores the metadata of the given group in a sqlite file using
        the variable name for the key.

        Args
        ----
        group : `System`
            `System` containing vectors
        """
        params = group.params.iteritems()
        resids = group.resids.iteritems()
        unknowns = group.unknowns.iteritems()
        data = OrderedDict([("Parameters", dict(params)), ("Unknowns", dict(unknowns))])

        self.out["metadata"] = data

    def record_iteration(self, params, unknowns, resids, metadata):
        """
        Stores the provided data in the sqlite file using the iteration
        coordinate for the key.

        Args
        ----
        params : dict
            Dictionary containing parameters. (p)

        unknowns : dict
            Dictionary containing outputs and states. (u)

        resids : dict
            Dictionary containing residuals. (r)

        metadata : dict, optional
            Dictionary containing execution metadata (e.g. iteration coordinate).
        """
        data = OrderedDict()
        iteration_coordinate = metadata["coord"]
        timestamp = metadata["timestamp"]

        group_name = format_iteration_coordinate(iteration_coordinate)

        data["timestamp"] = timestamp
        data["success"] = metadata["success"]
        data["msg"] = metadata["msg"]

        if self.options["record_params"]:
            data["Parameters"] = self._filter_vector(params, "p", iteration_coordinate)

        if self.options["record_unknowns"]:
            data["Unknowns"] = self._filter_vector(unknowns, "u", iteration_coordinate)

        if self.options["record_resids"]:
            data["Residuals"] = self._filter_vector(resids, "r", iteration_coordinate)

        self.out[group_name] = data

    def record_derivatives(self, derivs, metadata):
        """Writes the derivatives that were calculated for the driver.

        Args
        ----
        derivs : dict
            Dictionary containing derivatives

        metadata : dict, optional
            Dictionary containing execution metadata (e.g. iteration coordinate).
        """
        data = OrderedDict()
        iteration_coordinate = metadata["coord"]
        timestamp = metadata["timestamp"]

        group_name = format_iteration_coordinate(iteration_coordinate)
        group_name = "%s/derivs" % group_name

        data["timestamp"] = timestamp
        data["success"] = metadata["success"]
        data["msg"] = metadata["msg"]
        data["Derivatives"] = derivs

        self.out[group_name] = data

    def close(self):
        """Closes `out`"""

        if self._open_close_sqlitedict:
            if self.out is not None:
                self.out.close()
                self.out = None
class IMAPMailbox(ExtendedMaildir):
    implements(imap4.IMailbox, imap4.ICloseableMailbox)

    AppendFactory = SerpentAppendMessageTask

    def __init__(self, path):
        maildir.initializeMaildir(path)
        self.listeners = []
        self.path = path
        self.open_flags()
        self.lastadded = None
        self.__check_flags_()

    def open_flags(self):
        self.msg_info = SqliteDict(os.path.join(self.path, conf.imap_msg_info))
        self.mbox_info = SqliteDict(os.path.join(self.path, conf.imap_mbox_info))

    def _start_monitor(self):
        self.notifier = inotify.INotify()
        self.notifier.startReading()
        self.notifier.watch(filepath.FilePath(os.path.join(self.path, 'new')),
                            callbacks=[self._new_files])
        self.notifier.watch(filepath.FilePath(os.path.join(self.path, 'cur')),
                            callbacks=[self._new_files])

    def _stop_monitor(self):
        self.notifier.stopReading()
        self.notifier.loseConnection()

    def _new_files(self, wo, path, code):
        if code == inotify.IN_MOVED_TO or code == inotify.IN_DELETE:
            for l in self.listeners:
                l.newMessages(self.getMessageCount(), self.getRecentCount())

    def __check_flags_(self):
        if 'subscribed' not in self.mbox_info.keys():
            self.mbox_info['subscribed'] = False
        if 'flags' not in self.mbox_info.keys():
            self.mbox_info['flags'] = []
        if 'special' not in self.mbox_info.keys():
            self.mbox_info['special'] = ''
        if 'uidvalidity' not in self.mbox_info.keys():
            self.mbox_info['uidvalidity'] = random.randint(0, 2**32)
        if 'uidnext' not in self.mbox_info.keys():
            self.mbox_info['uidnext'] = 1
        #self.mbox_info.commit(blocking=False)    # XXX
        l = [l for l in self.__msg_list_()]
        for i in l:
            fn = i.split('/')[-1]
            if fn not in self.msg_info.keys():
                val1 = {'uid': self.getUIDNext()}
                if i.split('/')[-2] == 'new':
                    val1['flags'] = []
                else:
                    val1['flags'] = [misc.IMAP_FLAGS['SEEN']]
                self.msg_info[fn] = val1
        #self.msg_info.commit(blocking=False)    # XXX

    def subscribe(self):
        self.mbox_info['subscribed'] = True
        #self.mbox_info.commit(blocking=False)    # XXX

    def unsubscribe(self):
        self.mbox_info['subscribed'] = False
        #self.mbox_info.commit(blocking=False)    # XXX

    def is_subscribed(self):
        return self.mbox_info['subscribed']

    def __count_flagged_msgs_(self, flag):
        val1 = [0 for fn in self.msg_info.keys() if flag in self.msg_info[fn]['flags']]
        return len(val1)

    def getHierarchicalDelimiter(self):
        return misc.IMAP_HDELIM

    def setSpecial(self, special):
        self.mbox_info['special'] = special
        #self.mbox_info.commit(blocking=False)    # XXX

    def getFlags(self):
        return sorted(misc.IMAP_FLAGS.values())

    def getMboxFlags(self):
        f = list(self.mbox_info['flags'])
        if self.mbox_info['special'] != '':
            f.append(self.mbox_info['special'])
        return f

    def addFlag(self, flag):
        self.mbox_info['flags'] = list(set(self.mbox_info['flags']).union([flag]))
        #self.mbox_info.commit(blocking=False)    # XXX

    def removeFlag(self, flag):
        self.mbox_info['flags'] = list(set(self.mbox_info['flags']).difference([flag]))
        #self.mbox_info.commit(blocking=False)    # XXX

    def hasChildren(self):
        flags = self.getFlags()
        if misc.MBOX_FLAGS['HASCHILDREN'] not in flags:
            self.addFlag(misc.MBOX_FLAGS['HASCHILDREN'])
        if misc.MBOX_FLAGS['HASNOCHILDREN'] in flags:
            self.removeFlag(misc.MBOX_FLAGS['HASNOCHILDREN'])

    def hasNoChildren(self):
        flags = self.getFlags()
        if misc.MBOX_FLAGS['HASNOCHILDREN'] not in flags:
            self.addFlag(misc.MBOX_FLAGS['HASNOCHILDREN'])
        if misc.MBOX_FLAGS['HASCHILDREN'] in flags:
            self.removeFlag(misc.MBOX_FLAGS['HASCHILDREN'])

    def getMessageCount(self):
        val1 = [0 for fn in self.msg_info.keys()
                if misc.IMAP_FLAGS['DELETED'] not in self.msg_info[fn]['flags']]
        return len(val1)

    def getRecentCount(self):
        c = 0
        for fn in self.msg_info.keys():
            if misc.IMAP_FLAGS['RECENT'] in self.msg_info[fn]['flags']:
                c += 1
                info = self.msg_info[fn]
                info['flags'] = set(info['flags']).difference(set([misc.IMAP_FLAGS['RECENT']]))
                self.msg_info[fn] = info
        #self.msg_info.commit(blocking=False)    # XXX
        return c

    def getUnseenCount(self):
        return self.getMessageCount() - self.__count_flagged_msgs_(misc.IMAP_FLAGS['SEEN'])

    def isWriteable(self):
        return True

    def getUIDValidity(self):
        return self.mbox_info['uidvalidity']

    def getUIDNext(self):
        un = self.mbox_info['uidnext']
        self.mbox_info['uidnext'] += 1
        #self.mbox_info.commit(blocking=False)    # XXX
        return un

    def getUID(self, num):
        return num

    def addMessage(self, message, flags=(), date=None):
        return self.appendMessage(message).addCallback(self._cbAddMessage, flags)

    def _cbAddMessage(self, obj, flags):
        path = self.lastadded
        self.lastadded = None
        fn = path.split('/')[-1]
        self.msg_info[fn] = {'uid': self.getUIDNext(), 'flags': flags}
        #self.msg_info.commit(blocking=False)    # XXX
        if misc.IMAP_FLAGS['SEEN'] in flags and path.split('/')[-2] != 'cur':
            new_path = os.path.join(self.path, 'cur', fn)
            os.rename(path, new_path)

    def __msg_list_(self):
        a = []
        for m in os.listdir(os.path.join(self.path, 'new')):
            a.append(os.path.join(self.path, 'new', m))
        for m in os.listdir(os.path.join(self.path, 'cur')):
            a.append(os.path.join(self.path, 'cur', m))
        return a

    def _seqMessageSetToSeqDict(self, messageSet):
        if not messageSet.last:
            messageSet.last = self.getMessageCount()

        seqMap = {}
        msgs = self.__msg_list_()
        for messageNum in messageSet:
            if messageNum > 0 and messageNum <= self.getMessageCount():
                seqMap[messageNum] = msgs[messageNum - 1]
        return seqMap

    def fetch(self, messages, uid):
        return [[seq, MaildirMessage(seq,
                                     file(filename, 'rb').read(),
                                     self.msg_info[filename.split('/')[-1]]['flags'],
                                     rfc822date())]
                for seq, filename in self.__fetch_(messages, uid).iteritems()]

    def __fetch_(self, messages, uid):
        if uid:
            messagesToFetch = {}
            if not messages.last:
                messages.last = self.mbox_info['uidnext']
            fn_uid = dict((fn, self.msg_info[fn]['uid']) for fn in self.msg_info.keys())
            for uid in messages:
                if uid in fn_uid.values():
                    for name, _id in fn_uid.iteritems():
                        if uid == _id:
                            if os.path.exists(os.path.join(self.path, 'new', name)):
                                messagesToFetch[uid] = os.path.join(self.path, 'new', name)
                            elif os.path.exists(os.path.join(self.path, 'cur', name)):
                                messagesToFetch[uid] = os.path.join(self.path, 'cur', name)
        else:
            messagesToFetch = self._seqMessageSetToSeqDict(messages)
        return messagesToFetch

    def store(self, messages, flags, mode, uid):
        d = {}
        for _id, path in self.__fetch_(messages, uid).iteritems():
            filename = path.split('/')[-1]
            if mode < 0:
                old_f = self.msg_info[filename]
                old_f['flags'] = list(set(old_f['flags']).difference(set(flags)))
                self.msg_info[filename] = old_f
                if misc.IMAP_FLAGS['SEEN'] in flags and path.split('/')[-2] != 'new':
                    new_path = os.path.join(self.path, 'new', filename)
                    os.rename(path, new_path)
            elif mode == 0:
                old_f = self.msg_info[filename]
                old_f['flags'] = flags
                self.msg_info[filename] = old_f
            elif mode > 0:
                old_f = self.msg_info[filename]
                old_f['flags'] = list(set(old_f['flags']).union(set(flags)))
                self.msg_info[filename] = old_f
                if misc.IMAP_FLAGS['SEEN'] in flags and path.split('/')[-2] != 'cur':
                    new_path = os.path.join(self.path, 'cur', filename)
                    os.rename(path, new_path)
            d[_id] = self.msg_info[filename]['flags']
        #self.msg_info.commit(blocking=False)    # XXX
        return d

    def expunge(self):
        uids = []
        for path in self.__msg_list_():
            fn = path.split('/')[-1]
            if fn not in self.msg_info.keys():
                continue
            uid = self.msg_info[fn]['uid']
            if misc.IMAP_FLAGS['DELETED'] in self.msg_info[fn]['flags']:
                os.remove(path)
                del self.msg_info[fn]
                uids.append(uid)
        #self.msg_info.commit(blocking=False)    # XXX
        return uids

    def addListener(self, listener):
        self.listeners.append(listener)
        return True

    def removeListener(self, listener):
        self.listeners.remove(listener)
        return True

    def requestStatus(self, names):
        return imap4.statusRequestHelper(self, names)

    def destroy(self):
        pass

    def close(self):
        print('!!! %s - %d !!!' % (self.path, len(self.listeners)))
        if len(self.listeners) == 0:
            self._stop_monitor()
            if conf.imap_expunge_on_close:
                self.expunge()
            self.msg_info.commit(blocking=False)
            self.mbox_info.commit(blocking=False)
            self.msg_info.close()
            self.mbox_info.close()
class SqliteRecorder(BaseRecorder):
    def __init__(self, out, **sqlite_dict_args):
        super(SqliteRecorder, self).__init__()

        if MPI and MPI.COMM_WORLD.rank > 0:
            self._open_close_sqlitedict = False
        else:
            self._open_close_sqlitedict = True

        if self._open_close_sqlitedict:
            sqlite_dict_args.setdefault('autocommit', True)
            sqlite_dict_args.setdefault('tablename', 'openmdao')
            self.out = SqliteDict(filename=out, flag='n', **sqlite_dict_args)
        else:
            self.out = None

    def record_metadata(self, group):
        """Stores the metadata of the given group in a sqlite file using
        the variable name for the key.

        Args
        ----
        group : `System`
            `System` containing vectors
        """
        params = group.params.iteritems()
        resids = group.resids.iteritems()
        unknowns = group.unknowns.iteritems()
        data = OrderedDict([('Parameters', dict(params)),
                            ('Unknowns', dict(unknowns)),
                            ])

        self.out['metadata'] = data

    def record_iteration(self, params, unknowns, resids, metadata):
        """
        Stores the provided data in the sqlite file using the iteration
        coordinate for the key.

        Args
        ----
        params : dict
            Dictionary containing parameters. (p)

        unknowns : dict
            Dictionary containing outputs and states. (u)

        resids : dict
            Dictionary containing residuals. (r)

        metadata : dict, optional
            Dictionary containing execution metadata (e.g. iteration coordinate).
        """
        data = OrderedDict()
        iteration_coordinate = metadata['coord']
        timestamp = metadata['timestamp']

        params, unknowns, resids = self._filter_vectors(params, unknowns, resids,
                                                        iteration_coordinate)
        group_name = format_iteration_coordinate(iteration_coordinate)

        data['timestamp'] = timestamp

        if self.options['record_params']:
            data['Parameters'] = params

        if self.options['record_unknowns']:
            data['Unknowns'] = unknowns

        if self.options['record_resids']:
            data['Residuals'] = resids

        self.out[group_name] = data

    def close(self):
        """Closes `out`"""

        if self._open_close_sqlitedict:
            if self.out is not None:
                self.out.close()
                self.out = None
def reset(texts, index_dic=True, tfidf=True, hdp=False, lda=True, sim=False):
    total_start = timeit.default_timer()
    make_index_time = 0
    make_dict_time = 0
    make_lda_time = 0
    make_tfidf_time = 0
    sim_time = 0
    hdptopicnum = 0

    if index_dic:
        f = [i.split(',') for i in texts.readlines()]
        logging.info('Create id & ac_id list')
        ids = [f[i][1] for i in range(len(f))]
        ac_ids = [f[i][0] for i in range(len(f))]
        logging.info('Create contents list')
        contents = []
        for i in range(len(f)):
            if len(f[i]) == 3:
                contents.append(f[i][2].strip().split(':'))
            else:
                contents.append([])

        # make index
        logging.info('***********Now Make Index by sqlitedict***********')
        timer_start = timeit.default_timer()
        pos2paid = zip(range(len(f)), ac_ids)
        paid2pos_rel = {}
        for key, paid in groupby(sorted(pos2paid, key=itemgetter(1)), key=itemgetter(1)):
            paid2pos_rel.update({int(key): [i[0] for i in paid]})
        id2pos_rel = dict(zip(ids, range(len(f))))
        pos2id_rel = dict(zip(range(len(f)), ids))

        id2pos = SqliteDict(filename=gl.res + '/resource/id2pos', autocommit=True)
        id2pos.clear()
        id2pos.update(id2pos_rel)
        id2pos.close()
        pos2id = SqliteDict(filename=gl.res + '/resource/pos2id', autocommit=True)
        pos2id.clear()
        pos2id.update(pos2id_rel)
        pos2id.close()
        paid2pos = SqliteDict(filename=gl.res + '/resource/paid2pos', autocommit=True)
        paid2pos.clear()
        paid2pos.update(paid2pos_rel)
        paid2pos.close()
        timer_end = timeit.default_timer()
        make_index_time = timer_end - timer_start

        # make dict
        logging.info('***********Now Make Dictionary***********')
        timer_start = timeit.default_timer()
        dic = corpora.Dictionary(contents)
        ############## optimized dictionary
        dic.filter_extremes(no_below=20, no_above=0.1, keep_n=None)
        ##############
        dic.save(gl.res + '/resource/dict')
        timer_end = timeit.default_timer()
        make_dict_time = timer_end - timer_start

        # make corpus
        logging.info('***********Now Make Corpus***********')
        temps = []
        for i, t in enumerate(contents):
            temps.append(dic.doc2bow(t))
            if i % 10000 == 0:
                logging.info('make corpus ' + str(i) + ' articles')
        corpus = temps
        corpora.MmCorpus.serialize(gl.res + '/resource/corpus', corpus)

    if tfidf:
        # do tfidf train
        logging.info('***********Now Training TF-IDF Model***********')
        timer_start = timeit.default_timer()
        corpus = corpora.MmCorpus(gl.res + '/resource/corpus')
        tfidf = models.TfidfModel(corpus)
        tfidf.save(gl.res + '/resource/tfidf')
        timer_end = timeit.default_timer()
        make_tfidf_time = timer_end - timer_start

    if hdp:
        gc.collect()
        corpus = corpora.MmCorpus(gl.res + '/resource/corpus')
        dic = corpora.Dictionary.load(gl.res + '/resource/dict')
        hdpmodel = models.hdpmodel.HdpModel(corpus, id2word=dic)
        hdptopicnum = len(hdpmodel.print_topics(topics=-1, topn=10))
        logging.info('hdptopicnum is {}'.format(hdptopicnum))

    if lda:
        # do lda train
        gc.collect()
        tfidf = models.TfidfModel.load(gl.res + '/resource/tfidf')
        corpus = corpora.MmCorpus(gl.res + '/resource/corpus')
        dic = corpora.Dictionary.load(gl.res + '/resource/dict')
        corpus_tfidf = tfidf[corpus]
        logging.info('***********Now Training LDA Model***********')
        timer_start = timeit.default_timer()
        if not hdptopicnum == 0:
            gl.topicCount = hdptopicnum
        lda = models.LdaMulticore(corpus_tfidf, id2word=dic, chunksize=gl.chunksize,
                                  num_topics=gl.topicCount, workers=gl.workers,
                                  passes=gl.lda_passes)
        # lda = models.LdaModel(corpus_tfidf, id2word=dic, chunksize=gl.chunksize,
        #                       num_topics=gl.topicCount, passes=gl.lda_passes, distributed=True)
        lda.save(gl.res + '/resource/lda')
        timer_end = timeit.default_timer()
        make_lda_time = timer_end - timer_start
        logging.info('lda training cost %.2f seconds' % make_lda_time)

    if sim:
        gc.collect()
        logging.info('***********Now Make Similarity Index***********')
        st = timeit.default_timer()
        corpus = corpora.MmCorpus(gl.res + '/resource/corpus')
        lda = models.LdaModel.load(gl.res + '/resource/lda')
        index = similarities.MatrixSimilarity(lda[corpus], num_features=gl.topicCount)
        index.save(gl.res + '/resource/simIndex')
        sim_time = timeit.default_timer() - st

    total_end = timeit.default_timer()
    total_time = total_end - total_start
    m = divmod(total_time, 60)
    h = divmod(m[0], 60)
    logging.info('\nReset LDA Model complete!!!\n'
                 '***Using time*** \n'
                 'index training {:.2f}\n'
                 'dict training {:.2f}\n'
                 'tfidf training {:.2f}\n'
                 'lda training {:.2f}\n'
                 'sim training {:.2f}\n'
                 'Total time: {:d}h {:d}m {:.2f}s'.format(make_index_time, make_dict_time,
                                                          make_tfidf_time, make_lda_time,
                                                          sim_time, int(h[0]), int(h[1]), m[1]))

    basicConfig = open(gl.res + '/resource/basicConfig.txt', mode='w+')
    basicConfig.write('FileName: {}'
                      '\nTopicNumber = {}'
                      '\nestTopicNumber = {}'
                      '\nldaPasses = {}'
                      .format(os.path.basename(texts.name), gl.topicCount,
                              hdptopicnum, gl.lda_passes))
    basicConfig.close()
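reset() leaves three lookup tables on disk (id2pos, pos2id, paid2pos), all plain SqliteDict files, so a later query path only needs to map an article id to its corpus position and back. Below is a minimal sketch of such a lookup under the same gl.res directory layout; the most_similar helper, its parameters, and the use of the saved simIndex are illustrative assumptions, not part of the original code.

from sqlitedict import SqliteDict
from gensim import corpora, models, similarities


def most_similar(article_id, res, topn=10):
    """Hypothetical query helper built on the files written by reset()."""
    id2pos = SqliteDict(filename=res + '/resource/id2pos')
    pos2id = SqliteDict(filename=res + '/resource/pos2id')
    try:
        pos = id2pos[article_id]  # article id -> corpus position
        # MmCorpus.serialize() also wrote a .index file, so random access works here.
        corpus = corpora.MmCorpus(res + '/resource/corpus')
        lda = models.LdaModel.load(res + '/resource/lda')
        index = similarities.MatrixSimilarity.load(res + '/resource/simIndex')
        sims = index[lda[corpus[pos]]]  # similarity of this document to every other
        ranked = sorted(enumerate(sims), key=lambda x: -x[1])[:topn]
        return [(pos2id[i], float(score)) for i, score in ranked]
    finally:
        id2pos.close()
        pos2id.close()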
def merge(texts, index_dic=True, tfidf=True, lda=True, sim=False):
    total_start = timeit.default_timer()
    make_index_time = 0
    make_dict_time = 0
    make_lda_time = 0
    make_tfidf_time = 0
    sim_time = 0

    if index_dic:
        f = [i.split(',') for i in texts.readlines()]
        logging.info('Create id & ac_id list')
        ids = [f[i][0] for i in range(len(f))]
        ac_ids = [f[i][1] for i in range(len(f))]
        logging.info('Create contents list')
        contents = []
        for i in range(len(f)):
            if len(f[i]) == 3:
                contents.append(f[i][2].strip().split(':'))
            else:
                contents.append([])

        # make index
        logging.info('***********Now merge index by sqlitedict***********')
        timer_start = timeit.default_timer()
        old_corpus_len = len(corpora.MmCorpus(gl.res + '/resource/corpus'))
        pos2paid = zip(range(old_corpus_len, old_corpus_len + len(f)), ac_ids)
        paid2pos_new = {}
        for key, paid in groupby(sorted(pos2paid, key=itemgetter(1)), key=itemgetter(1)):
            paid2pos_new.update({int(key): [i[0] for i in paid]})
        id2pos_new = dict(zip(ids, range(old_corpus_len, old_corpus_len + len(f))))
        pos2id_new = dict(zip(range(old_corpus_len, old_corpus_len + len(f)), ids))

        id2pos = SqliteDict(filename=gl.res + '/resource/id2pos', autocommit=True)
        id2pos.update(id2pos_new)
        id2pos.close()
        pos2id = SqliteDict(filename=gl.res + '/resource/pos2id', autocommit=True)
        pos2id.update(pos2id_new)
        pos2id.close()
        paid2pos = SqliteDict(filename=gl.res + '/resource/paid2pos', autocommit=True)
        x = [set(paid2pos_new.keys()), set([int(i) for i in paid2pos.iterkeys()])]
        for i in list(set.intersection(*x)):
            # update duplicate key
            temp = list(chain(paid2pos[i], paid2pos_new[i]))
            paid2pos.update({int(i): temp})
        paid2pos.close()
        timer_end = timeit.default_timer()
        make_index_time = timer_end - timer_start

        # Merge dictionary
        logging.info('***********Now merge Dictionary***********')
        timer_start = timeit.default_timer()
        newDict = corpora.Dictionary(contents)
        newDict.filter_extremes(no_below=20, no_above=0.1, keep_n=None)
        dic = corpora.Dictionary.load(gl.res + '/resource/dict')
        dic.merge_with(newDict)
        dic.save(gl.res + '/resource/dict')
        timer_end = timeit.default_timer()
        make_dict_time = timer_end - timer_start

        # merge corpus
        logging.info('***********Now merge Corpus***********')
        temps = []
        for i, t in enumerate(contents):
            temps.append(dic.doc2bow(t))
            if i % 10000 == 0:
                logging.info('make corpus ' + str(i) + ' articles')
        corpora.MmCorpus.serialize(gl.res + '/resource/new_c', temps)
        gc.collect()
        corpus = corpora.MmCorpus(gl.res + '/resource/corpus')
        new_corpus = corpora.MmCorpus(gl.res + '/resource/new_c')
        merged_corpus = chain(corpus, new_corpus)
        corpora.MmCorpus.serialize(gl.res + '/resource/merged_c', merged_corpus)

        # Overwrite corpus
        for filename in glob.glob(gl.res + '/resource/*'):
            if filename.endswith('corpus') or filename.endswith('corpus.index') \
                    or filename.endswith('new_c') or filename.endswith('new_c.index'):
                # rm useless corpus
                # os.remove(filename)
                os.unlink(filename)
            if filename.endswith('merged_c'):
                # rename to corpus
                os.rename(filename, gl.res + '/resource/corpus')
            if filename.endswith('merged_c.index'):
                os.rename(filename, gl.res + '/resource/corpus.index')

    if tfidf:
        # do tfidf merge
        gc.collect()
        logging.info('***********Now merge TF-IDF model***********')
        timer_start = timeit.default_timer()
        for filename in glob.glob(gl.res + '/resource/*'):
            # backup old model
            if filename.endswith('tfidf'):
                os.rename(filename, filename + '_' + gl.c_time)
        corpus = corpora.MmCorpus(gl.res + '/resource/corpus')  # reload corpus
        tfidf = models.TfidfModel(corpus)
        tfidf.save(gl.res + '/resource/tfidf')
        timer_end = timeit.default_timer()
        make_tfidf_time = timer_end - timer_start

    if lda:
        # do lda merge
        gc.collect()
        tfidf = models.TfidfModel.load(gl.res + '/resource/tfidf')
        corpus = corpora.MmCorpus(gl.res + '/resource/corpus')
        corpus_tfidf = tfidf[corpus]
        dic = corpora.Dictionary.load(gl.res + '/resource/dict')
        logging.info('***********Now merge LDA model***********')
        timer_start = timeit.default_timer()
        for filename in glob.glob(gl.res + '/resource/*'):
            # backup old model
            if filename.endswith('lda') or filename.endswith('lda.state'):
                os.rename(filename, filename + '_' + gl.c_time)
        # lda = models.LdaMulticore(corpus_tfidf, id2word=dic, chunksize=gl.chunksize,
        #                           num_topics=gl.topicCount, workers=gl.workers, passes=gl.lda_passes)
        lda = models.LdaModel(corpus_tfidf, id2word=dic, chunksize=gl.chunksize,
                              num_topics=gl.topicCount, passes=gl.lda_passes)
        lda.save(gl.res + '/resource/lda')
        timer_end = timeit.default_timer()
        make_lda_time = timer_end - timer_start
        logging.info('lda training cost %.2f seconds' % make_lda_time)

    if sim:
        gc.collect()
        logging.info('***********Now Make Similarity Index***********')
        st = timeit.default_timer()
        corpus = corpora.MmCorpus(gl.res + '/resource/corpus')
        lda = models.LdaModel.load(gl.res + '/resource/lda')
        index = similarities.MatrixSimilarity(lda[corpus], num_features=gl.topicCount)
        index.save(gl.res + '/resource/simIndex')
        sim_time = timeit.default_timer() - st

    total_end = timeit.default_timer()
    total_time = total_end - total_start
    m = divmod(total_time, 60)
    h = divmod(m[0], 60)
    logging.info('\nMerge LDA Model complete!!!\n'
                 '***Using time*** \n'
                 'index training {:.2f}\n'
                 'dict training {:.2f}\n'
                 'tfidf training {:.2f}\n'
                 'lda training {:.2f}\n'
                 'sim training {:.2f}\n'
                 'Total time: {:d}h {:d}m {:.2f}s'.format(make_index_time, make_dict_time,
                                                          make_tfidf_time, make_lda_time,
                                                          sim_time, int(h[0]), int(h[1]), m[1]))
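The delicate step in merge() is folding new corpus positions into the existing paid2pos table without losing the positions already stored for a given account id. The sketch below shows that update pattern in isolation; the merge_positions helper and the index_path argument are illustrative assumptions, and unlike the loop above it also inserts keys that are not yet present in the index.

from itertools import chain

from sqlitedict import SqliteDict


def merge_positions(index_path, new_positions):
    """Fold a {key: [positions]} mapping into an existing SqliteDict index.

    Keys already present get the concatenation of old and new positions;
    unseen keys are simply added. 'index_path' is an assumed file name.
    """
    with SqliteDict(index_path, autocommit=True) as index:
        for key, positions in new_positions.items():
            if key in index:
                index[key] = list(chain(index[key], positions))
            else:
                index[key] = positions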