def assertMetadataRecorded(self, expected):
    if self.comm.rank != 0:
        return

    db = SqliteDict(self.filename, self.tablename_metadata)
    _assertMetadataRecorded(self, db, expected)
    db.close()
def adjust_evernote_font():
    """Adjust font size and line height for every note in Evernote,
    then forget notes that no longer exist."""
    note_info = SqliteDict(conf.db.db_file, autocommit=True)
    notes_in_evernote = []
    for note in get_notes(get_notebooks()):
        guid = note.guid
        notes_in_evernote.append(guid)
        if guid not in note_info.keys() \
                or note_info[guid][FONT_SIZE] != conf.font_size \
                or note_info[guid][LINE_HEIGHT] != conf.line_height:
            adjust_note(note)
            note_info[guid] = {FONT_SIZE: conf.font_size,
                               LINE_HEIGHT: conf.line_height}

    guids_to_forget = [guid for guid in note_info.keys()
                       if guid not in notes_in_evernote]
    for guid in guids_to_forget:
        logging.debug("Delete guid from DB: {}".format(guid))
        del note_info[guid]

    note_info.close()
def __init__(self, bucket_name, storage_path=None):
    '''Bucket init - if the bucket exists, the meta parameter will be ignored.'''
    if bucket_name and isinstance(bucket_name, (str, unicode)) and re.match(r"^[a-z0-9\.\-_]+$", bucket_name, re.I):
        self._name = bucket_name.strip()
    else:
        raise falcon.HTTPInvalidParam(
            "The parameter shall contain only alpha-numeric characters, value: '%s'" % bucket_name,
            param_name='name'
        )

    self._bucket_path = None
    if storage_path and os.path.exists(storage_path):
        self._bucket_path = os.path.join(storage_path, self._name)
    else:
        raise falcon.HTTPInternalServerError(
            title='IncorrectStoragePath',
            description='The storage path is incorrect, "%s"' % storage_path
        )

    if self._bucket_path and os.path.exists(self._bucket_path):
        self._meta = SqliteDict(os.path.join(self._bucket_path, 'metadata.sqlite'),
                                'bucket', autocommit=True)
    else:
        self._meta = SqliteDict(':memory:', 'bucket', autocommit=True)
def _import_sql_data(data_dir):
    file_path = os.path.join(data_dir, DATA_FILE)

    # Find out what format we have
    with sqlite3.connect(file_path) as conn:
        try:
            conn.execute('select count(*) from zipgun_info')
            zipgun_info = SqliteDict(file_path, tablename='zipgun_info')
            version = zipgun_info.get('version', 0)
        except sqlite3.OperationalError:
            version = 0

    if version == 0:
        country_postal_codes = SqliteDict(file_path)
    elif version == 1:
        country_postal_codes = {}
        for country_code in zipgun_info['country_codes']:
            if country_code in country_postal_codes:
                raise ValueError('Duplicate entry found for {}'.format(
                    country_code))
            country_postal_codes[country_code] = SqliteDict(
                file_path, tablename='zg_{}'.format(country_code),
                journal_mode='OFF')
        zipgun_info.close()
    else:
        raise ValueError('Unknown data file version {}'.format(version))
    return country_postal_codes
def _persist_v0(file_path, zg):
    print('Creating db...')
    persisted = SqliteDict(file_path, autocommit=False)
    print('Updating data...')
    persisted.update(zg.country_postal_codes)
    print('Committing data...')
    persisted.commit()
def assertIterationDataRecorded(self, expected, tolerance, root):
    if self.comm.rank != 0:
        return

    db = SqliteDict(self.filename, self.tablename_iterations)
    _assertIterationDataRecorded(self, db, expected, tolerance)
    db.close()
def main(data_dir):
    print('Loading data...')
    zg = Zipgun(data_dir, force_text=True)
    print('Creating db...')
    persisted = SqliteDict(os.path.join(data_dir, DATA_FILE), autocommit=False)
    print('Updating data...')
    persisted.update(zg.country_postal_codes)
    print('Committing data...')
    persisted.commit()
def test_reopen_conn(self):
    """Verify using a contextmanager that a connection can be reopened."""
    fname = norm_file('tests/db/sqlitedict-override-test.sqlite')
    db = SqliteDict(filename=fname)
    with db:
        db['key'] = 'value'
        db.commit()
    with db:
        db['key'] = 'value'
        db.commit()
def test_tablenames(self):
    fname = norm_file('tests/db/tablenames-test-1.sqlite')
    SqliteDict(fname)
    self.assertEqual(SqliteDict.get_tablenames(fname), ['unnamed'])

    fname = norm_file('tests/db/tablenames-test-2.sqlite')
    with SqliteDict(fname, tablename='table1') as db1:
        self.assertEqual(SqliteDict.get_tablenames(fname), ['table1'])
    with SqliteDict(fname, tablename='table2') as db2:
        self.assertEqual(SqliteDict.get_tablenames(fname), ['table1', 'table2'])

    tablenames = SqliteDict.get_tablenames('tests/db/tablenames-test-2.sqlite')
    self.assertEqual(tablenames, ['table1', 'table2'])
def basic_usage():
    """The SqliteDict engine serializes arbitrary values (pickle by default)
    before storing them."""
    mydict = SqliteDict("test.sqlite", autocommit=True)
    mydict["integer_value"] = 1
    mydict["real_value"] = 2.2
    mydict["text_value"] = "abc"
    mydict["date_value"] = date.today()
    mydict["datetime_value"] = datetime.now()

    # Unless you use ``with SqliteDict("test.sqlite") as mydict: ...``,
    # you have to close the connection explicitly.
    mydict.close()
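# A minimal companion sketch (not part of the original snippet) showing the
# context-manager form that the closing comment in basic_usage() refers to:
# `with SqliteDict(...)` closes the connection automatically, so no explicit
# mydict.close() is needed. The filename "test.sqlite" is reused only for
# illustration.
def basic_usage_with_contextmanager():
    with SqliteDict("test.sqlite", autocommit=True) as mydict:
        mydict["integer_value"] = 1
        print(mydict["integer_value"])
    # connection is closed automatically when the block exits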
def assertDatasetEquals(self, expected, tolerance):
    # Close the file to ensure it is written to disk.
    self.recorder.close()
    # self.recorder.out = None

    sentinel = object()

    db = SqliteDict(self.filename, self.tablename)

    for coord, expect in expected:
        iter_coord = format_iteration_coordinate(coord)
        groupings = (
            ("Parameters", expect[0]),
            ("Unknowns", expect[1]),
            ("Residuals", expect[2])
        )

        #### Need to get the record with the key of 'iter_coord'
        actual_group = db[iter_coord]

        timestamp = actual_group['timestamp']
        self.assertTrue(self.t0 <= timestamp and timestamp <= self.t1)

        for label, values in groupings:
            actual = actual_group[label]
            # If len(actual) == len(expected) and actual <= expected, then
            # actual == expected.
            self.assertEqual(len(actual), len(values))
            for key, val in values:
                found_val = actual.get(key, sentinel)
                if found_val is sentinel:
                    self.fail("Did not find key '{0}'".format(key))
                if isinstance(found_val, _ByObjWrapper):
                    found_val = found_val.val
                try:
                    assert_rel_error(self, found_val, val, tolerance)
                except TypeError as error:
                    self.assertEqual(found_val, val)

        ######## delete the record with the key 'iter_coord'
        del db[iter_coord]

    # Having deleted all found values, the file should now be empty.
    ###### Need a way to get the number of records in the main table
    self.assertEqual(len(db), 0)

    db.close()
def __init__(self, out, **sqlite_dict_args):
    super(SqliteRecorder, self).__init__()

    if MPI and MPI.COMM_WORLD.rank > 0:
        self._open_close_sqlitedict = False
    else:
        self._open_close_sqlitedict = True

    if self._open_close_sqlitedict:
        sqlite_dict_args.setdefault('autocommit', True)
        self.out = SqliteDict(filename=out, flag='n', tablename='openmdao',
                              **sqlite_dict_args)
        self.out_derivs = SqliteDict(filename=out, flag='w',
                                     tablename='openmdao_derivs',
                                     **sqlite_dict_args)
    else:
        self.out = None
def assertDatasetEquals(self, expected, tolerance):
    # Close the file to ensure it is written to disk.
    self.recorder.close()
    # self.recorder.out = None

    sentinel = object()

    db = SqliteDict(self.filename, self.tablename)

    ###### Need a way to get a list of the group_names in the order in which
    ###### they were written and put it in a variable named order
    order = db['order']
    del db['order']

    for coord, expect in expected:
        iter_coord = format_iteration_coordinate(coord)
        self.assertEqual(order.pop(0), iter_coord)

        groupings = (
            ("Parameters", expect[0]),
            ("Unknowns", expect[1]),
            ("Residuals", expect[2])
        )

        #### Need to get the record with the key of 'iter_coord'
        actual_group = db[iter_coord]

        for label, values in groupings:
            actual = actual_group[label]
            # If len(actual) == len(expected) and actual <= expected, then
            # actual == expected.
            self.assertEqual(len(actual), len(values))
            for key, val in values:
                found_val = actual.get(key, sentinel)
                if found_val is sentinel:
                    self.fail("Did not find key '{0}'".format(key))
                assert_rel_error(self, found_val, val, tolerance)

        ######## delete the record with the key 'iter_coord'
        del db[iter_coord]

    # Having deleted all found values, the file should now be empty.
    ###### Need a way to get the number of records in the main table
    self.assertEqual(len(db), 0)

    # As should the ordering.
    self.assertEqual(len(order), 0)

    db.close()
def __init__(self, basename, use_locks=True):
    """
    All data will be stored under directory `basename`. If there is a server
    there already, it will be loaded (resumed).

    The server object is stateless in RAM -- its state is defined entirely by
    its location. There is therefore no need to store the server object.
    """
    if not os.path.isdir(basename):
        raise ValueError("%r must be a writable directory" % basename)
    self.basename = basename
    self.lock_update = threading.RLock() if use_locks else gensim.utils.nocm
    try:
        self.fresh_index = SimIndex.load(self.location('index_fresh'))
    except:
        self.fresh_index = None
    try:
        self.opt_index = SimIndex.load(self.location('index_opt'))
    except:
        self.opt_index = None
    try:
        self.model = SimModel.load(self.location('model'))
    except:
        self.model = None
    self.payload = SqliteDict(self.location('payload'), autocommit=True,
                              journal_mode=JOURNAL_MODE)
    # save the opened objects right back. this is not necessary and costs extra
    # time, but is cleaner when there are server location changes (see `check_moved`).
    self.flush(save_index=True, save_model=True, clear_buffer=True)
    logger.info("loaded %s" % self)
def __init__(self, basename, use_locks=False):
    """
    All data will be stored under directory `basename`. If there is a server
    there already, it will be loaded (resumed).

    The server object is stateless in RAM -- its state is defined entirely by
    its location. There is therefore no need to store the server object.
    """
    if not os.path.isdir(basename):
        raise ValueError("%r must be a writable directory" % basename)
    self.basename = basename
    self.use_locks = use_locks
    self.lock_update = threading.RLock() if use_locks else gensim.utils.nocm
    try:
        self.fresh_index = SimIndex.load(self.location('index_fresh'))
    except:
        logger.debug("starting a new fresh index")
        self.fresh_index = None
    try:
        self.opt_index = SimIndex.load(self.location('index_opt'))
    except:
        logger.debug("starting a new optimized index")
        self.opt_index = None
    try:
        self.model = SimModel.load(self.location('model'))
    except:
        self.model = None
    self.payload = SqliteDict(self.location('payload'), autocommit=True,
                              journal_mode=JOURNAL_MODE)
    self.flush(save_index=False, save_model=False, clear_buffer=True)
    logger.info("loaded %s" % self)
def test_driver_records_model_viewer_data(self):
    size = 3

    prob = Problem(Group(), impl=impl)

    G1 = prob.root.add('G1', ParallelGroup())
    G1.add('P1', IndepVarComp('x', np.ones(size, float) * 1.0))
    G1.add('P2', IndepVarComp('x', np.ones(size, float) * 2.0))

    prob.root.add('C1', ABCDArrayComp(size))

    prob.root.connect('G1.P1.x', 'C1.a')
    prob.root.connect('G1.P2.x', 'C1.b')

    prob.driver.add_recorder(self.recorder)
    self.recorder.options['record_metadata'] = True
    prob.setup(check=False)
    prob.cleanup()

    # do some basic tests to make sure the model_viewer_data was recorded correctly
    if self.comm.rank == 0:
        db = SqliteDict(self.filename, self.tablename_metadata)
        model_viewer_data = db['model_viewer_data']
        tr = model_viewer_data['tree']
        self.assertEqual(set(['name', 'type', 'subsystem_type', 'children']),
                         set(tr.keys()))

        names = []
        for ch1 in tr['children']:  # each is an ordereddict
            names.append(ch1["name"])
            for ch2 in ch1["children"]:
                names.append(ch2["name"])
                if "children" in ch2:
                    for ch3 in ch2["children"]:
                        names.append(ch3["name"])

        expected_names = ['G1', 'P1', 'x', 'P2', 'x', 'C1', 'a', 'b',
                          'in_string', 'in_list', 'c', 'd', 'out_string',
                          'out_list']
        self.assertEqual(sorted(expected_names), sorted(names))

        cl = model_viewer_data['connections_list']
        for c in cl:
            self.assertEqual(set(['src', 'tgt']), set(c.keys()))

        db.close()
def test_recording_system_metadata(self):
    prob = Problem()
    prob.root = ConvergeDiverge()
    prob.root.add_metadata("string", "just a test")
    prob.root.add_metadata("ints", [1, 2, 3])
    prob.driver.add_recorder(self.recorder)
    self.recorder.options["record_metadata"] = True
    prob.setup(check=False)
    prob.cleanup()  # closes recorders

    # check the system metadata recording
    sqlite_metadata = SqliteDict(filename=self.filename, flag="r",
                                 tablename="metadata")
    system_metadata = sqlite_metadata["system_metadata"]

    self.assertEqual(len(system_metadata), 2)
    self.assertEqual(system_metadata["string"], "just a test")
    self.assertEqual(system_metadata["ints"], [1, 2, 3])

    sqlite_metadata.close()
def test_recording_model_viewer_data(self):
    prob = Problem()
    prob.root = ConvergeDiverge()
    prob.driver.add_recorder(self.recorder)
    self.recorder.options["record_metadata"] = True
    prob.setup(check=False)
    prob.cleanup()  # closes recorders

    # do some basic tests to make sure the model_viewer_data was recorded
    db = SqliteDict(filename=self.filename, flag="r", tablename="metadata")
    model_viewer_data = db["model_viewer_data"]

    tr = model_viewer_data["tree"]
    self.assertEqual(set(["name", "type", "subsystem_type", "children"]),
                     set(tr.keys()))

    cl = model_viewer_data["connections_list"]
    for c in cl:
        self.assertEqual(set(["src", "tgt"]), set(c.keys()))

    db.close()
def test_1_theoretical_ion_space_step(self):
    print("test_1_theoretical_ion_space_step")
    ms_digest = MSDigestParameters.parse(self.protein_prospector_file)
    theo_ions = entry_point.generate_theoretical_ion_space(
        self.ms1_matching_output_file, self.glycosylation_sites_file,
        ms_digest.constant_modifications, ms_digest.variable_modifications,
        ms_digest.enzyme, self.num_procs)
    self.assertTrue(os.path.exists(theo_ions))
    self.theoretical_ion_space_file = theo_ions
    theoretical_ions = SqliteDict(theo_ions, tablename="theoretical_search_space")
    sequence_set = theoretical_ions.itervalues()
    peptide_sequences = [sequence.Sequence(s["Seq_with_mod"]) for s in sequence_set]
    peptide_mods = set()
    for seq in peptide_sequences:
        for resid, mod in seq:
            peptide_mods.update((m.rule for m in mod))
    print(peptide_mods)
class SqliteDictJsonSerializationTest(unittest.TestCase):
    def setUp(self):
        self.fname = norm_file('tests/db-json/sqlitedict.sqlite')
        self.db = SqliteDict(
            filename=self.fname, tablename='test',
            encode=json.dumps, decode=json.loads
        )

    def tearDown(self):
        self.db.close()
        os.unlink(self.fname)
        os.rmdir(os.path.dirname(self.fname))

    def get_json(self, key):
        return self.db.conn.select_one('SELECT value FROM test WHERE key = ?', (key,))[0]

    def test_int(self):
        self.db['test'] = -42
        assert self.db['test'] == -42
        assert self.get_json('test') == '-42'

    def test_str(self):
        test_str = u'Test \u30c6\u30b9\u30c8'
        self.db['test'] = test_str
        assert self.db['test'] == test_str
        assert self.get_json('test') == r'"Test \u30c6\u30b9\u30c8"'

    def test_bool(self):
        self.db['test'] = False
        assert self.db['test'] is False
        assert self.get_json('test') == 'false'

    def test_none(self):
        self.db['test'] = None
        assert self.db['test'] is None
        assert self.get_json('test') == 'null'

    def test_complex_struct(self):
        test_value = {
            'version': 2.5,
            'items': ['one', 'two'],
        }
        self.db['test'] = test_value
        assert self.db['test'] == test_value
        assert self.get_json('test') == json.dumps(test_value)
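# Hedged usage sketch based on the test class above: SqliteDict accepts custom
# `encode`/`decode` callables, so values can be stored as JSON text instead of
# pickles. The filename and keys below are illustrative only.
import json
from sqlitedict import SqliteDict

json_db = SqliteDict('example-json.sqlite', tablename='test',
                     encode=json.dumps, decode=json.loads)
json_db['config'] = {'version': 2.5, 'items': ['one', 'two']}
json_db.commit()
assert json_db['config']['items'] == ['one', 'two']
json_db.close()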
class ModelCacheStoreSqlite(ModelCacheStore):
    """B-tree lookup implementation"""

    def __init__(self, name):
        from sqlitedict import SqliteDict
        self.datadict = SqliteDict(name)

    def sync(self):
        return self.datadict.commit()  # instead of #sync
def __init__(self, config):
    CacheInterface.__init__(self, config)
    self.db = SqliteDict(self.config[u"cache"][u"file"], autocommit=True)
    self.expiration = self.config[u"cache"].get(u"expiration", 86400)

    def closer():
        try:
            self.db.close()
        except Exception:
            logger.exception("Exception closing file cache")

    atexit.register(closer)
class Scribe(object):
    def __init__(self, location, table_name, exp_name):
        filename = "{}/scribe.sqlite".format(location)
        self.book = SqliteDict(filename, autocommit=True, tablename=table_name)
        unique_id = datetime.now().strftime("date_%m.%d_time_%H.%M")
        self.exp_name = exp_name + "_" + unique_id
        self.observation_index = 0

    def record(self, value, type="general"):
        key = "{}; {}; {}".format(self.exp_name, self.observation_index, type)
        self.book[key] = value
        self.observation_index += 1

    observe = record  # sometimes i forget which

    def lookup(self, type=None, exp_name=None, ret_sorted=False, strip_keys=False):
        type_func = lambda *args: True
        name_func = lambda *args: True
        if type:
            type_func = lambda x: x[2] == type
        if exp_name:
            name_func = lambda x: exp_name in x[0]
        key_func = lambda x: type_func(x) and name_func(x)
        unpack = lambda x: [f(x.strip()) for f, x in zip([str, int, str], x.split(";"))]
        items = {k: v for k, v in self.book.iteritems() if key_func(unpack(k))}
        if ret_sorted:
            return self.sort_results(items, strip_keys)
        return items

    def sort_results(self, result_dict, only_val_return=False):
        unpack = lambda x: [f(x.strip()) for f, x in zip([str, int, str], x.split(";"))]
        ranker = lambda x: unpack(x[0])[1]
        sorted_items = sorted(result_dict.items(), key=ranker)
        if only_val_return:
            return [v for k, v in sorted_items]
        return sorted_items

    def close(self):
        self.book.close()
def test_as_str(self):
    """Verify SqliteDict.__str__()."""
    # given,
    db = SqliteDict()
    # exercise
    db.__str__()
    # test when db closed
    db.close()
    db.__str__()
class FileCache(CacheInterface):
    def __init__(self, config):
        CacheInterface.__init__(self, config)
        self.db = SqliteDict(self.config[u"cache"][u"file"], autocommit=True)
        self.expiration = self.config[u"cache"].get(u"expiration", 86400)

        def closer():
            try:
                self.db.close()
            except Exception:
                logger.exception("Exception closing file cache")

        atexit.register(closer)

    def get(self, key):
        if int(self.db[key + "_expiration"]) - time.time() <= 0:
            raise KeyError("cache key expired")
        return self.db[key]

    def set(self, key, val):
        self.db[key] = val
        self.db[key + "_expiration"] = str(int(time.time()) + self.expiration)
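# Hedged usage sketch for the FileCache above (the config layout is assumed
# from its constructor): each value is stored next to a "<key>_expiration"
# timestamp, and get() raises KeyError once that timestamp has passed.
config = {u"cache": {u"file": "/tmp/filecache.sqlite", u"expiration": 60}}
cache = FileCache(config)
cache.set("greeting", "hello")
print(cache.get("greeting"))  # "hello" until 60 seconds have elapsed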
def __init__(self, out, **sqlite_dict_args):
    super(SqliteRecorder, self).__init__()

    self.model_viewer_data = None

    if MPI and MPI.COMM_WORLD.rank > 0:
        self._open_close_sqlitedict = False
    else:
        self._open_close_sqlitedict = True

    if self._open_close_sqlitedict:
        sqlite_dict_args.setdefault('autocommit', True)
        self.out_metadata = SqliteDict(filename=out, flag='n',
                                       tablename='metadata',
                                       **sqlite_dict_args)
        self.out_metadata['format_version'] = format_version
        self.out_iterations = SqliteDict(filename=out, flag='w',
                                         tablename='iterations',
                                         **sqlite_dict_args)
        self.out_derivs = SqliteDict(filename=out, flag='w',
                                     tablename='derivs',
                                     **sqlite_dict_args)
    else:
        self.out_metadata = None
        self.out_iterations = None
        self.out_derivs = None
def test_default_reuse_existing_flag_c(self):
    """Re-opening of a database does not destroy it."""
    # given,
    fname = norm_file('tests/db/sqlitedict-override-test.sqlite')
    orig_db = SqliteDict(filename=fname)
    orig_db['key'] = 'value'
    orig_db.commit()
    orig_db.close()

    next_db = SqliteDict(filename=fname)
    self.assertIn('key', next_db.keys())
    self.assertEqual(next_db['key'], 'value')
def test_overwrite_using_flag_n(self):
    """Re-opening of a database with flag='n' destroys it all."""
    # given,
    fname = norm_file('tests/db/sqlitedict-override-test.sqlite')
    orig_db = SqliteDict(filename=fname, tablename='sometable')
    orig_db['key'] = 'value'
    orig_db.commit()
    orig_db.close()

    # verify,
    next_db = SqliteDict(filename=fname, tablename='sometable', flag='n')
    self.assertNotIn('key', next_db.keys())
def __init__(self, fname, num_features, shardsize=SHARD_SIZE, topsims=TOP_SIMS):
    """
    Spill index shards to disk after every `shardsize` documents.
    In similarity queries, return only the `topsims` most similar documents.
    """
    self.fname = fname
    self.shardsize = int(shardsize)
    self.topsims = int(topsims)
    self.id2pos = {}  # map document id (string) to index position (integer)
    self.pos2id = {}  # reverse mapping for id2pos; redundant, for performance
    # precomputed top similar: document id -> [(doc_id, similarity)]
    self.id2sims = SqliteDict(self.fname + '.id2sims', journal_mode=JOURNAL_MODE)
    self.qindex = gensim.similarities.Similarity(self.fname + '.idx',
                                                 corpus=None, num_best=None,
                                                 num_features=num_features,
                                                 shardsize=shardsize)
    self.length = 0
def build_with_bad_flag():
    fname = norm_file('tests/db/sqlitedict-override-test.sqlite')
    orig_db = SqliteDict(filename=fname, flag='FOO')
def run(config, db_name): with SqliteDict(db_name, tablename='tagged_data') as db: data = list(db.items()) with SqliteDict(db_name, tablename='linearized_data') as db: finished = set(db.keys()) data = [datum for datum in data if datum[0] not in finished] model = language_model.from_config(config) _safety = 2**32 beam = linearizer.decode.beam decode_one = linearizer.utils.decode_one gendist = linearizer.utils.gendist PartialTree = linearizer.partialtree.PartialTree editdist = editdistance.eval results = {} bar = tqdm(total=len(data), desc='partial to trellis to decoding') for idx, (datum, datum_str, _) in data: partial = PartialTree.from_list(datum) partial.prep() difficulty = partial.measure_difficulty() model.logger.debug(str(idx) + ' Difficulty: ' + str(difficulty)) if difficulty > 2**35: bad_str = "Skipping. index={}; difficulty={}".format( idx, difficulty) model.logger.debug(bad_str) bar.update(1) continue seqs, memos = linearizer.dp.run(partial) if len(seqs) == 0: bad_str = "Failure. index={}; difficulty={}".format( idx, difficulty) model.logger.debug(bad_str) bar.update(1) continue datumstr_as_list = datum_str.split(" ") datum_len = len(datumstr_as_list) beam_state, step_decisions, best_idx = beam(memos, model) genscores = {} edscores = {} saving_state = { 'datum': datum_str, 'beam_state': beam_state, 'difficulty': difficulty, 'generation_distance': [], 'edit_distance': [], 'beam_solutions': [], 'beam_scores': [] } seen = set() for score, beam_idx in beam_state: sentence = decode_one(memos, beam_idx) assert beam_idx not in seen seen.add(beam_idx) gval = gendist(datumstr_as_list, sentence) edval = editdist(datumstr_as_list, sentence) saving_state['generation_distance'].append(gval) saving_state['edit_distance'].append(edval) saving_state['beam_solutions'].append(sentence) saving_state['beam_scores'].append(score) results[idx] = saving_state bar.update(1) if len(results) > 10: with SqliteDict(db_name, tablename='linearized_data') as db: db.update(results) db.commit() results = {} if len(results) > 0: with SqliteDict(db_name, tablename='linearized_data') as db: db.update(results) db.commit() results = {} print("Finished.")
class SignupPlugin(Plugin):
    def load(self, ctx):
        self.guild_configs = SqliteDict('./guild_configs.sqlite', autocommit=True)
        self.signups = SqliteDict('./signups.sqlite', autocommit=True)

    def unload(self, ctx):
        self.guild_configs.close()
        self.signups.close()

    @Plugin.command(
        "config",
        "<admin_channel_id:snowflake> <signup_channel_id:snowflake> <announce_channel_id:snowflake>",
    )
    def on_config(self, event, admin_channel_id, signup_channel_id, announce_channel_id):
        guild_id = str(event.msg.guild.id)
        self.guild_configs[guild_id] = {
            "admin_channel_id": admin_channel_id,
            "signup_channel_id": signup_channel_id,
            "announce_channel_id": announce_channel_id,
        }
        event.msg.reply("Successfully configured this Discord for use!")

    @Plugin.command(
        "create",
        "<name:str> <tanks:int> <healers:int> <dps:int> <message:str...>")
    def on_create(self, event, name, tanks, healers, dps, message):
        guild_id = str(event.msg.guild.id)
        config = self.guild_configs[guild_id]
        if config is None:
            event.msg.reply(
                "I'm not configured! Please set up your channels first.")

        confirm_message = event.msg.reply(
            "You're creating an event named {} that requires {} tanks, {} healers, and {} dps. Your custom message is \n\n{}\n\nReact to confirm."
            .format(name, tanks, healers, dps, message))
        confirm_message.add_reaction("greentick:612799716161486888")

        self.signups[guild_id] = {
            str(confirm_message.id): {
                "name": name,
                "message": message,
                "tanks": tanks,
                "healers": healers,
                "dps": dps,
                "confirmed": False,
                "announced": False,
            }
        }
        print(self.signups)

    @Plugin.listen("MessageReactionAdd")
    def on_message_reaction_add(self, event):
        # Not the bot
        if event.user_id == 612451478485073925:
            return

        message_id = str(event.message_id)
        guild_id = str(event.guild.id)
        admin_channel_id = self.guild_configs[guild_id]["admin_channel_id"]
        admin_channel = self.client.api.channels_get(admin_channel_id)

        if self.signups[guild_id][message_id] is None:
            return
        if self.guild_configs[guild_id] is None:
            return

        # Green check emoji, not bot id
        if event.emoji.id == 612799716161486888:
            confirm_event(self, guild_id, admin_channel, message_id)

        # Cheer emoji, not bot id
        if event.emoji.id == 612778926640726024 and self.signups[guild_id][message_id]["confirmed"] is True:
            print('here')
            announce_event(self, guild_id, admin_channel, message_id)
class Baidu_ordinary_windows(object): """百度普通收录窗体""" def __init__(self, tree, site, token): # 展示等待窗体 self.newroot = tk.Toplevel() self.newroot.title('普通收录') self.newroot.iconbitmap("favicon.ico") win_width = self.newroot.winfo_screenwidth() win_higth = self.newroot.winfo_screenheight() width_adjust = (win_width - 800) / 2 higth_adjust = (win_higth - 250) / 2 self.newroot.geometry("%dx%d+%d+%d" % (800, 250, width_adjust, higth_adjust)) # 提示内容 self.content = tk.Label(self.newroot, text="正在普通收录中,请不要中断操作,请耐心等待......") self.content.place( x=10, y=30, ) self.content2 = tk.Label(self.newroot, text="") self.content2.place( x=10, y=60, ) # 窗体日志 self.ttlog = ttlog(master=self.newroot) self.ttlog.place(x=10, y=70, width=780, height=150) self.tree = tree self.site = site self.token = token self.mydict = SqliteDict('./my_db.sqlite', autocommit=True) # 开始处理线程 self.p = Thread(target=self.main) self.p.setDaemon(True) self.p.start() self.ttlog.log("普通收录-->开启普通收录线程.....") # 点击关闭右上角 self.newroot.protocol("WM_DELETE_WINDOW", self.close) def close(self): self.ttlog.stop_log() self.newroot.destroy() # 获取未提交的urls def get_url(self): url_list = [] for key, value in sorted(self.mydict.iteritems()): if value[1] == "未提交": url_list.append(value) self.ttlog.log("普通收录-->共有没推送的网页链接数 :{} 条!".format(len(url_list))) print("共有没普通收录推送的网页链接数 :{} 条!".format(len(url_list))) return url_list # 查询剩余次数 def get_remain(self): post_url = "http://data.zz.baidu.com/urls?site={}&token={}".format( self.site, self.token) headers = { 'User-Agent': 'curl/7.12.1', 'Host': 'data.zz.baidu.com', 'Content-Type': 'text/plain', 'Content-Length': '83', } response = requests.post(post_url, headers=headers, data=self.site) req = response.text if "success" in req: req_json = json.loads(req) if req_json["remain"] == 0: self.ttlog.log( "普通收录-->查询剩余次数,今天普通收录推送任务已经完成,\n当天剩余的可推送url条数: " + req_json["remain"] + "条。") else: self.ttlog.log( "普通收录-->查询剩余次数,推送成功:" + self.site + '\n当天剩余的可推送url条数: {}条'.format(req_json["remain"])) return req_json["remain"] else: return 0 # 提交urls def api(self, url): post_url = "http://data.zz.baidu.com/urls?site={}&token={}".format( self.site, self.token) headers = { 'User-Agent': 'curl/7.12.1', 'Host': 'data.zz.baidu.com', 'Content-Type': 'text/plain', 'Content-Length': '83', } response = requests.post(post_url, headers=headers, data=url[0]) req = response.text if "success" in req: req_json = json.loads(req) if req_json["remain"] == 0: self.ttlog.log("普通收录-->今天普通收录推送任务已经完成,当天剩余的可推送url条数: 0条。") else: # 是否修改列表,看是否加载列表 tree_len = len(self.tree.get_children()) if tree_len != 0: print("普通收录-->修改列表") self.tree.item(url[4], value=(url[0], "已提交", url[2], url[3])) # 修改数据库 self.mydict[url[0]] = [url[0], "已提交", url[2], url[3], url[4]] self.ttlog.log( "普通收录-->推送成功:" + url[0] + '\n当天剩余的可推送url条数: {}条'.format(req_json["remain"])) else: req_json = json.loads(req) self.ttlog.log(r"普通收录-->推送失败:" + req_json["message"] + ",当天可剩余推送数量为0条。") return None # 处理函数 def main(self): # 获取未提交的urls urls = self.get_url() # 查询剩余次数 num = self.get_remain() # 确定执行的urls post_urls = urls[:num] # 是否开始处理 flag = tk.messagebox.askquestion( "提交", "本地共有没推送的网页数 :{} 条!\n" "当前剩余主动推送的次数 :{} 条!\n" "选“是”开始提交,选“否”取消提交".format(len(urls), num)) if flag == "yes": try: # 窗体置顶 self.newroot.wm_attributes('-topmost', 1) cpu_num = multiprocessing.cpu_count() self.ttlog.log("CPU核心数:" + str(cpu_num)) self.ttlog.log("开启线程池,能一定程度加速") pool = ThreadPool(cpu_num) results = pool.map(self.api, post_urls) pool.close() pool.join() self.ttlog.stop_log() self.ttlog.log("普通收录-->今日的推送任务完成!") 
self.content.config(text="普通收录-->今日的推送任务完成!") except Exception as e: self.ttlog.log('错误代码:{}'.format(e)) self.ttlog.log("Error: unable to start thread") else: self.ttlog.log("你选择了否,没有推送网页链接")
class Update_window(object): """sitemap更新窗体""" def __init__(self, tree, eblog, sitemap): self.newroot = tk.Toplevel() self.newroot.title('下载文件中') self.newroot.iconbitmap("favicon.ico") self.newroot.wm_attributes('-topmost', 1) win_width = self.newroot.winfo_screenwidth() win_higth = self.newroot.winfo_screenheight() width_adjust = (win_width - 800) / 2 higth_adjust = (win_higth - 250) / 2 self.newroot.geometry("%dx%d+%d+%d" % (800, 250, width_adjust, higth_adjust)) # 进度条 self.bar = ttk.Progressbar(self.newroot, length=740, mode="indeterminate", orient=tk.HORIZONTAL) self.bar.place( x=30, y=150, ) self.bar.start(10) # 提示内容 self.content = tk.Label(self.newroot, text="正在下载Sitemap.xml文件...") self.content.place( x=30, y=30, ) self.content2 = tk.Label(self.newroot, text="下载速度和文件大小以及服务器带宽有关,请耐心等待......", wraplength=740, justify="left") self.content2.place( x=30, y=60, ) self.eblog = eblog self.sitemap = sitemap self.tree = tree self.mydict = SqliteDict('./my_db.sqlite', autocommit=True) # 开启处理线程 self.p = Thread(target=self.update) self.p.setDaemon(True) self.p.start() self.eblog.log("Sitemap线程:开启sitemap线程,下载Sitemap.xml中...") # 关闭右上角 self.newroot.protocol("WM_DELETE_WINDOW", self.close) # 列表添加item,返回iid def append_item(self, item_list): # 加最后/加前面都可以,因为要是前面iid全是1 item = self.tree.insert("", 0, values=(item_list[0], item_list[1], item_list[2], item_list[3])) return item # 处理函数 def update(self): try: with open("sitemap.xml", "wb") as f: f.write(requests.get(self.sitemap).content) with open("sitemap.xml", 'r', encoding='utf-8') as f: xml_data = f.read() self.content.configure(text="Sitemap文件下载完成,正在对比分析....") urls = re.findall(r'<loc>(.+?)</loc>', xml_data, re.S) self.eblog.log("Sitemap线程-->下载Sitemap.xml完成,正在解析xml文件...") tuple_list = list(self.mydict.iteritems()) tree_urls = [i[0] for i in tuple_list] # 求交集 c = list(set(urls).intersection(set(tree_urls))) # tree中多余的 tree_urls_ = list(set(tree_urls).difference(set(c))) # 交集不动,tree把tree中把不是交集的删掉,把siemaop中不是交集的增添上. for key, value in sorted(tuple_list): self.content2.config(text="当前处理-->检查" + key) if key not in c: # 是否删除,看有没有列表 if len(self.tree.get_children()) != 0: self.tree.delete(value[4]) self.mydict.__delitem__(key=key) self.eblog.log("Sitemap线程-->本地删除" + str(tree_urls_)) # sitemap中新添加的 urls_ = list(set(urls).difference(set(c))) for url in sorted(urls_): cur_time = datetime.datetime.now().strftime( '%Y-%m-%d %H:%M:%S') iid = self.append_item([url, "未提交", "未提交", cur_time]) self.mydict[url] = [url, "未提交", "未提交", cur_time, iid] self.content2.config(text="当前处理-->正在添加" + url) self.eblog.log("Sitemap线程-->本地添加" + str(urls_)) self.eblog.log("Sitemap线程-->关闭sitemap线程,更新完成。") self.close() except: self.eblog.log(traceback.format_exc()) self.eblog.log("Sitemap线程-->更新失败") def close(self): self.newroot.destroy()
class SimIndex(gensim.utils.SaveLoad): """ An index of documents. Used internally by SimServer. It uses the Similarity class to persist all document vectors to disk (via mmap). """ def __init__(self, fname, num_features, shardsize=SHARD_SIZE, topsims=TOP_SIMS): """ Spill index shards to disk after every `shardsize` documents. In similarity queries, return only the `topsims` most similar documents. """ self.fname = fname self.shardsize = int(shardsize) self.topsims = int(topsims) self.id2pos = { } # map document id (string) to index position (integer) self.pos2id = { } # reverse mapping for id2pos; redundant, for performance self.id2sims = SqliteDict( self.fname + '.id2sims', journal_mode=JOURNAL_MODE ) # precomputed top similar: document id -> [(doc_id, similarity)] self.qindex = gensim.similarities.Similarity(self.fname + '.idx', corpus=None, num_best=None, num_features=num_features, shardsize=shardsize) self.length = 0 def save(self, fname): tmp, self.id2sims = self.id2sims, None super(SimIndex, self).save(fname) self.id2sims = tmp @staticmethod def load(fname): result = gensim.utils.SaveLoad.load(fname) result.check_moved(fname) result.id2sims = SqliteDict(result.fname + '.id2sims', journal_mode=JOURNAL_MODE) return result def check_moved(self, fname): # Add extra logic to loading: if the location on filesystem changed, # update locations of all shard files. # The other option was making shard locations relative to a directory name. # That way we wouldn't have to update their locations on load, but on the # other hand we'd have to pass a dirname to each call that needs their # absolute location... annoying. if self.fname != fname: logger.info( "index seems to have moved from %s to %s; updating locations" % (self.fname, fname)) self.fname = fname output_prefix = fname + '.idx' for shard in self.qindex.shards: shard.fname = shard.fname.replace(self.qindex.output_prefix, output_prefix, 1) self.qindex.output_prefix = output_prefix def close(self): "Explicitly release important resources (file handles, db, ...)" try: self.id2sims.close() except: pass try: del self.qindex except: pass def terminate(self): """Delete all files created by this index, invalidating `self`. Use with care.""" try: self.id2sims.terminate() except: pass import glob for fname in glob.glob(self.fname + '*'): try: os.remove(fname) logger.info("deleted %s" % fname) except Exception, e: logger.warning("failed to delete %s: %s" % (fname, e)) for val in self.__dict__.keys(): try: delattr(self, val) except: pass
def __init__(self, name: str, author: str, table: str):
    # DB stores values directly (not encoded as a pickle)
    self.sqlite_dict = SqliteDict(get_sqlite_path(name, author), table,
                                  encode=lambda x: x, decode=lambda x: x)
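# Hedged sketch of what the identity encode/decode above implies: values are
# handed to sqlite3 unchanged, so they must already be types SQLite accepts
# (str, bytes, int, float, None). get_sqlite_path() is the helper referenced
# by the snippet above; the name/author/table values here are illustrative.
raw_store = SqliteDict(get_sqlite_path("myapp", "me"), "settings",
                       encode=lambda x: x, decode=lambda x: x)
raw_store["greeting"] = "hello"  # stored as plain TEXT, not a pickle
raw_store.commit()
assert raw_store["greeting"] == "hello"
raw_store.close()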
from flask import Flask, jsonify, make_response, abort, redirect, url_for, render_template
from flask_httpauth import HTTPBasicAuth
from flask_sqlalchemy import SQLAlchemy
from sqlitedict import SqliteDict

auth = HTTPBasicAuth()
app = Flask(__name__)
app.config['SECRET_KEY'] = 'the secret key'
app.config['SQLALCHEMY_DATABASE_URI'] = 'sqlite:///db.sqlite'

flixrdb = SqliteDict('./movies.sqlite', autocommit=True)
db = SQLAlchemy(app)


class Movies(db.Model):
    __tablename__ = 'movies'
    id = db.Column(db.Integer, primary_key=True)
    social_id = db.Column(db.String(64), nullable=False, unique=True)
    nickname = db.Column(db.String(64), nullable=False)
    email = db.Column(db.String(64), nullable=True)


@app.route('/api/v1.0/movies/<int:movie_id>', methods=['GET', 'POST'])
@auth.login_required
def get_movie(movie_id):
    try:
        return jsonify({'movie': flixrdb[movie_id]})
    except:
        return make_response(jsonify({'error': 'Movie not found'}), 404)
def OptimizationHistory(self): """ Reads in database history file and stores contents. Function information is stored as a dict in func_data, variable information is stored as a dict in var_data, and bounds information is stored as a dict in bounds. """ # Initialize dictionaries for design variables and unknowns. # The data is saved redundantly in dicts for all iterations and then # for major iterations as well. self.func_data_all = {} self.func_data_major = {} self.var_data_all = {} self.var_data_major = {} db = {} self.num_iter = 0 # Loop over each history file name provided by the user. for histIndex, histFileName in enumerate(self.histList): # If they only have one history file, we don't change the keys' names if len(self.histList) == 1: histIndex = "" else: # If multiple history files, append letters to the keys, # such that 'key' becomes 'key_A', 'key_B', etc histIndex = "_" + chr(histIndex + ord("A")) self.histIndex = histIndex try: # This is the classic method of storing history files db = shelve.open(histFileName, "r") OpenMDAO = False except: # Bare except because error is not in standard Python. # noqa: E722 # If the db has the 'iterations' tag, it's an OpenMDAO db. db = SqliteDict(histFileName, "iterations") OpenMDAO = True # Need to do this since in py3 db.keys() is a generator object keys = [i for i in db.keys()] # If it has no 'iterations' tag, it's a pyOptSparse db. if keys == []: OpenMDAO = False db = SqliteDict(histFileName) # Specific instructions for OpenMDAO databases if OpenMDAO: # Get the number of iterations by looking at the largest number # in the split string names for each entry in the db if major_python_version == 3: for string in db.keys(): string = string.split("|") else: string = db.keys()[-1].split("|") nkey = int(string[-1]) self.solver_name = string[0] # Initalize a list detailing if the iterations are major or minor self.iter_type = np.zeros(nkey) # Get the keys of the database where derivatives were evaluated. # These correspond to major iterations, while no derivative # info is calculated for gradient-free linesearches. deriv_keys = SqliteDict(histFileName, "derivs").keys() self.deriv_keys = [ int(key.split("|")[-1]) for key in deriv_keys ] # Save information from the history file for the funcs. self.DetermineMajorIterations(db, OpenMDAO=OpenMDAO) # Save information from the history file for the unknowns. self.SaveDBData(db, self.func_data_all, self.func_data_major, OpenMDAO=OpenMDAO, data_str="Unknowns") # Save information from the history file for the design variables. self.SaveDBData(db, self.var_data_all, self.var_data_major, OpenMDAO=OpenMDAO, data_str="Parameters") # Add labels to OpenMDAO variables. # Corresponds to constraints, design variables, and objective. try: db = SqliteDict(histFileName, "metadata") self.SaveOpenMDAOData(db) except KeyError: # Skip metadata info if not included in OpenMDAO hist file pass else: # Get the number of iterations nkey = int(db["last"]) + 1 self.nkey = nkey # Initalize a list detailing if the iterations are major or minor self.iter_type = np.zeros(nkey) # Check to see if there is bounds information in the db file. # If so, add them to self.bounds to plot later. try: info_dict = db["varInfo"].copy() info_dict.update(db["conInfo"]) # Got to be a little tricky here since we're modifying # info_dict; if we simply loop over it with the generator # from Python3, it will contain the new keys and then the # names will be mangled incorrectly. 
bounds_dict = {} scaling_dict = {} for key in info_dict.keys(): bounds_dict[key + histIndex] = { "lower": info_dict[key]["lower"], "upper": info_dict[key]["upper"], } scaling_dict[key + histIndex] = info_dict[key]["scale"] self.bounds.update(bounds_dict) self.scaling.update(scaling_dict) except KeyError: pass # Check to see if there is proper saved info about iter type if "iu0" in db["0"].keys(): if db[db["last"]]["iu0"] > 0: self.storedIters = True else: self.storedIters = False else: self.storedIters = False # Save information from the history file for the funcs. self.DetermineMajorIterations(db, OpenMDAO=OpenMDAO) # Save information from the history file for the funcs. self.SaveDBData(db, self.func_data_all, self.func_data_major, OpenMDAO=OpenMDAO, data_str="funcs") # Save information from the history file for the design variables. self.SaveDBData(db, self.var_data_all, self.var_data_major, OpenMDAO=OpenMDAO, data_str="xuser") # Set the initial dictionaries to reference all iterations. # Later this can be set to reference only the major iterations. self.func_data = self.func_data_all self.var_data = self.var_data_all # Find the maximum length of any variable in the dictionaries and # save this as the number of iterations. for data_dict in [self.func_data, self.var_data]: for key in data_dict.keys(): length = len(data_dict[key]) if length > self.num_iter: self.num_iter = length
limiter = Limiter(key_func=get_remote_address)

cache_supported_backends = {
    None: __cache_module.NullCache,
    'memcached': __cache_module.MemcachedCache,
    'redis': __cache_module.RedisCache
}

__cache_uri = os.environ.get('CACHE_SERVICE')
if __cache_uri:
    try:
        # example __cache_uri is 'redis:dev_redis_1:6379'
        [__cache_type, __url, __port] = __cache_uri.split(':')
    except ValueError:
        raise ImproperlyConfigured('CACHE_SERVICE is wrongly formatted. Use "redis:dev_redis_1:6379" as example.')
    if __cache_type == 'redis':
        cache = __cache_module.RedisCache(host=__url, port=__port,
                                          default_timeout=os.environ.get('CACHE_TIMEOUT'))
    elif __cache_type == 'memcached':
        cache = __cache_module.MemcachedCache(
            servers=["{url}:{port}".format(url=__url, port=__port)],
            default_timeout=os.environ.get('CACHE_TIMEOUT')
        )
    else:
        raise ImproperlyConfigured('Unknown cache service, only Memcached and Redis are supported at the moment.')
else:
    cache = __cache_module.NullCache

credentials_store = SqliteDict('flask_oidc.db', autocommit=True)
openid_connect = OpenIDConnect(credentials_store=credentials_store)
def grep_in_blogtree(blogtree_path, username, registered, string, case_sensitive):
    with SqliteDict(blogtree_path) as serialized:
        for blog in tqdm(serialized.itervalues(), total=len(serialized)):
            grep_in_blog(blog, username, registered, string, case_sensitive)
class BackendDbHandler(object): """Table structure target_pages: A table to save URL where folklore is. Key-value pair. {url_string: TargetPage object} target_html: A table to save HTML of folklore. Key-value pair. {url_string: ExtractedPage object} """ def __init__(self, path_db_file: str, interval: int = 3): self.db_target_pages = SqliteDict(path_db_file, autocommit=True, tablename='target_pages', encode=json.dumps, decode=json.loads) self.db_html = SqliteDict(path_db_file, autocommit=True, tablename='target_html', encode=json.dumps, decode=json.loads) self.interval = interval def save_target_urls(self, target_urls: List[str]): """Save target URL into DB.""" for url in target_urls: if url not in self.db_target_pages: data, errs = TargetPage(strict=True).load({ 'page_url': url, 'status': False, 'note': '', 'extracted_at': '' }) self.db_target_pages[url] = data else: logger.info('URL={} is already in target. Skip.'.format(url)) else: self.db_target_pages.commit() def run_html_extraction(self, is_force_retry: bool = False, limit: int = -1): """Gets all target page and save them into DB.""" default_i = 0 for url, page_obj in tqdm(list(self.db_target_pages.items())): _obj = TargetPage(strict=True).load(page_obj) if page_obj['status'] is False or is_force_retry is True: try: html_doc = requests.get(url).text error_msg = '' status = True except ExtractedPage as e: html_doc = '' error_msg = e.__str__() status = False data, errs = ExtractedPage(strict=True).load({ 'page_url': url, 'status': status, 'html_document': html_doc, 'note': error_msg, 'extracted_at': datetime.now().__str__() }) page_obj['status'] = True page_obj['extracted_at'] = datetime.now().__str__() self.db_target_pages[url] = data default_i += 1 time.sleep(self.interval) if default_i == limit: logger.info('Terminated by limit={}'.format(limit)) break else: logger.info('URL={} is already in target. Skip.'.format(url)) else: self.db_target_pages.commit() self.db_html.commit() def show_extracted_html(self) -> List[Dict[str, Any]]: __ = [] for url, obj_ in self.db_target_pages.items(): data, errs = ExtractedPage(strict=True).load(obj_) if data['status']: __.append(obj_) else: return __
def test_overwrite_using_flag_w(self):
    """Re-opening of a database with flag='w' destroys only the target table."""
    # given,
    fname = norm_file('tests/db/sqlitedict-override-test.sqlite')
    orig_db_1 = SqliteDict(filename=fname, tablename='one')
    orig_db_1['key'] = 'value'
    orig_db_1.commit()
    orig_db_1.close()

    orig_db_2 = SqliteDict(filename=fname, tablename='two')
    orig_db_2['key'] = 'value'
    orig_db_2.commit()
    orig_db_2.close()

    # verify, when re-opening table space 'one' with flag='w', we destroy
    # its contents. However, when re-opening table space 'two' with
    # default flag='c', its contents remain.
    next_db_1 = SqliteDict(filename=fname, tablename='one', flag='w')
    self.assertNotIn('key', next_db_1.keys())

    next_db_2 = SqliteDict(filename=fname, tablename='two')
    self.assertIn('key', next_db_2.keys())
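# Hedged summary sketch of the flag semantics exercised by the tests above
# (the path is illustrative): 'c' (the default) reuses an existing file,
# 'w' drops only the chosen table, 'n' drops every table in the file, and
# 'r' opens read-only.
fname = 'tests/db/flag-demo.sqlite'
db = SqliteDict(filename=fname, tablename='one')  # flag='c' by default
db['key'] = 'value'
db.commit()
db.close()

SqliteDict(filename=fname, tablename='one', flag='w')  # clears table 'one' only
SqliteDict(filename=fname, flag='n')                   # clears the whole file
readonly = SqliteDict(filename=fname, flag='r')        # no writes allowed
readonly.close()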
def test_irregular_tablenames(self):
    """Irregular table names need to be quoted"""
    db = SqliteDict(':memory:', tablename='9nine')
    db['key'] = 'value'
    db.commit()
    self.assertEqual(db['key'], 'value')
    db.close()

    db = SqliteDict(':memory:', tablename='outer space')
    db['key'] = 'value'
    db.commit()
    self.assertEqual(db['key'], 'value')
    db.close()

    with self.assertRaisesRegexp(ValueError, r'^Invalid tablename '):
        SqliteDict(':memory:', '"')
def __init__(self, lm, cache_db):
    self.lm = lm
    self.cache_db = cache_db
    os.makedirs(os.path.dirname(cache_db), exist_ok=True)
    self.dbdict = SqliteDict(cache_db, autocommit=True)
def get_db():
    return SqliteDict('db/mails_db.db', "mails", autocommit=True)
def load(fname):
    result = gensim.utils.SaveLoad.load(fname)
    result.check_moved(fname)
    result.id2sims = SqliteDict(result.fname + '.id2sims', journal_mode=JOURNAL_MODE)
    return result
def __init__(self, path):
    self.sd = SqliteDict(path, autocommit=True)
class SimServer(object): """ Top-level functionality for similarity services. A similarity server takes care of:: 1. creating semantic models 2. indexing documents using these models 3. finding the most similar documents in an index. An object of this class can be shared across network via Pyro, to answer remote client requests. It is thread safe. Using a server concurrently from multiple processes is safe for reading = answering similarity queries. Modifying (training/indexing) is realized via locking = serialized internally. """ def __init__(self, basename, use_locks=True): """ All data will be stored under directory `basename`. If there is a server there already, it will be loaded (resumed). The server object is stateless in RAM -- its state is defined entirely by its location. There is therefore no need to store the server object. """ if not os.path.isdir(basename): raise ValueError("%r must be a writable directory" % basename) self.basename = basename self.lock_update = threading.RLock( ) if use_locks else gensim.utils.nocm try: self.fresh_index = SimIndex.load(self.location('index_fresh')) except: self.fresh_index = None try: self.opt_index = SimIndex.load(self.location('index_opt')) except: self.opt_index = None try: self.model = SimModel.load(self.location('model')) except: self.model = None self.payload = SqliteDict(self.location('payload'), autocommit=True, journal_mode=JOURNAL_MODE) # save the opened objects right back. this is not necessary and costs extra # time, but is cleaner when there are server location changes (see `check_moved`). self.flush(save_index=True, save_model=True, clear_buffer=True) logger.info("loaded %s" % self) def location(self, name): return os.path.join(self.basename, name) @gensim.utils.synchronous('lock_update') def flush(self, save_index=False, save_model=False, clear_buffer=False): """Commit all changes, clear all caches.""" if save_index: if self.fresh_index is not None: self.fresh_index.save(self.location('index_fresh')) if self.opt_index is not None: self.opt_index.save(self.location('index_opt')) if save_model: if self.model is not None: self.model.save(self.location('model')) self.payload.commit() if clear_buffer: if hasattr(self, 'fresh_docs'): try: self.fresh_docs.terminate( ) # erase all buffered documents + file on disk except: pass self.fresh_docs = SqliteDict( journal_mode=JOURNAL_MODE ) # buffer defaults to a random location in temp self.fresh_docs.sync() def close(self): """Explicitly close open file handles, databases etc.""" try: self.payload.close() except: pass try: self.model.close() except: pass try: self.fresh_index.close() except: pass try: self.opt_index.close() except: pass try: self.fresh_docs.terminate() except: pass def __del__(self): """When the server went out of scope, make an effort to close its DBs.""" self.close() @gensim.utils.synchronous('lock_update') def buffer(self, documents): """ Add a sequence of documents to be processed (indexed or trained on). Here, the documents are simply collected; real processing is done later, during the `self.index` or `self.train` calls. `buffer` can be called repeatedly; the result is the same as if it was called once, with a concatenation of all the partial document batches. The point is to save memory when sending large corpora over network: the entire `documents` must be serialized into RAM. See `utils.upload_chunked()`. A call to `flush()` clears this documents-to-be-processed buffer (`flush` is also implicitly called when you call `index()` and `train()`). 
""" logger.info("adding documents to temporary buffer of %s" % (self)) for doc in documents: docid = doc['id'] # logger.debug("buffering document %r" % docid) if docid in self.fresh_docs: logger.warning("asked to re-add id %r; rewriting old value" % docid) self.fresh_docs[docid] = doc self.fresh_docs.sync() @gensim.utils.synchronous('lock_update') def train(self, corpus=None, method='auto', clear_buffer=True): """ Create an indexing model. Will overwrite the model if it already exists. All indexes become invalid, because documents in them use a now-obsolete representation. The model is trained on documents previously entered via `buffer`, or directly on `corpus`, if specified. """ if corpus is not None: # use the supplied corpus only (erase existing buffer, if any) self.flush(clear_buffer=True) self.buffer(corpus) if not self.fresh_docs: msg = "train called but no training corpus specified for %s" % self logger.error(msg) raise ValueError(msg) if method == 'auto': numdocs = len(self.fresh_docs) if numdocs < 1000: logging.warning( "too few training documents; using simple log-entropy model instead of latent semantic indexing" ) method = 'logentropy' else: method = 'lsi' self.model = SimModel(self.fresh_docs, method=method) self.flush(save_model=True, clear_buffer=clear_buffer) @gensim.utils.synchronous('lock_update') def index(self, corpus=None, clear_buffer=True): """ Permanently index all documents previously added via `buffer`, or directly index documents from `corpus`, if specified. The indexing model must already exist (see `train`) before this function is called. """ if not self.model: msg = 'must initialize model for %s before indexing documents' % self.basename logger.error(msg) raise AttributeError(msg) if corpus is not None: # use the supplied corpus only (erase existing buffer, if any) self.flush(clear_buffer=True) self.buffer(corpus) if not self.fresh_docs: msg = "index called but no training corpus specified for %s" % self logger.error(msg) raise ValueError(msg) if not self.fresh_index: logger.info("starting a new fresh index for %s" % self) self.fresh_index = SimIndex(self.location('index_fresh'), self.model.num_features) self.fresh_index.index_documents(self.fresh_docs, self.model) if self.opt_index is not None: self.opt_index.delete(self.fresh_docs.keys()) logger.info("storing document payloads") for docid in self.fresh_docs: payload = self.fresh_docs[docid].get('payload', None) if payload is None: # TODO HACK: exit on first doc without a payload (=assume all docs have payload, or none does) break self.payload[docid] = payload self.flush(save_index=True, clear_buffer=clear_buffer) @gensim.utils.synchronous('lock_update') def optimize(self): """ Precompute top similarities for all indexed documents. This speeds up `find_similar` queries by id (but not queries by fulltext). Internally, documents are moved from a fresh index (=no precomputed similarities) to an optimized index (precomputed similarities). Similarity queries always query both indexes, so this split is transparent to clients. If you add documents later via `index`, they go to the fresh index again. To precompute top similarities for these new documents too, simply call `optimize` again. """ if self.fresh_index is None: logger.warning("optimize called but there are no new documents") return # nothing to do! 
if self.opt_index is None: logger.info("starting a new optimized index for %s" % self) self.opt_index = SimIndex(self.location('index_opt'), self.model.num_features) self.opt_index.merge(self.fresh_index) self.fresh_index.terminate() # delete old files self.fresh_index = None self.flush(save_index=True) @gensim.utils.synchronous('lock_update') def drop_index(self, keep_model=True): """Drop all indexed documents. If `keep_model` is False, also dropped the model.""" modelstr = "" if keep_model else "and model " logger.info("deleting similarity index " + modelstr + "from %s" % self.basename) for index in [self.fresh_index, self.opt_index]: if index is not None: index.terminate() self.fresh_index, self.opt_index = None, None if not keep_model and self.model is not None: self.model.close() fname = self.location('model') try: os.remove(fname) logger.info("deleted %s" % fname) except Exception, e: logger.warning("failed to delete %s" % fname) self.model = None self.flush(save_index=True, save_model=True, clear_buffer=True)
class ToolDocumentCache: def __init__(self, cache_dir): self.cache_dir = cache_dir if not os.path.exists(self.cache_dir): os.makedirs(self.cache_dir) self.cache_file = os.path.join(self.cache_dir, 'cache.sqlite') self.writeable_cache_file = None self._cache = None self.disabled = False self._get_cache(create_if_necessary=True) def close(self): self._cache and self._cache.close() def _get_cache(self, flag='r', create_if_necessary=False): try: if create_if_necessary and not os.path.exists(self.cache_file): # Create database if necessary using 'c' flag self._cache = SqliteDict(self.cache_file, flag='c', encode=encoder, decode=decoder, autocommit=False) if flag == 'r': self._cache.flag = flag else: cache_file = self.writeable_cache_file.name if self.writeable_cache_file else self.cache_file self._cache = SqliteDict(cache_file, flag=flag, encode=encoder, decode=decoder, autocommit=False) except sqlite3.OperationalError: log.warning('Tool document cache unavailable') self._cache = None self.disabled = True @property def cache_file_is_writeable(self): return os.access(self.cache_file, os.W_OK) def reopen_ro(self): self._get_cache(flag='r') self.writeable_cache_file = None def get(self, config_file): try: tool_document = self._cache.get(config_file) except sqlite3.OperationalError: log.debug("Tool document cache unavailable") return None if not tool_document: return None if tool_document.get( 'tool_cache_version') != CURRENT_TOOL_CACHE_VERSION: return None if self.cache_file_is_writeable: for path, modtime in tool_document['paths_and_modtimes'].items(): if os.path.getmtime(path) != modtime: return None return tool_document def _make_writable(self): if not self.writeable_cache_file: self.writeable_cache_file = tempfile.NamedTemporaryFile( dir=self.cache_dir, suffix='cache.sqlite.tmp', delete=False) if os.path.exists(self.cache_file): shutil.copy(self.cache_file, self.writeable_cache_file.name) self._get_cache(flag='c') def persist(self): if self.writeable_cache_file: self._cache.commit() os.rename(self.writeable_cache_file.name, self.cache_file) self.reopen_ro() def set(self, config_file, tool_source): try: if self.cache_file_is_writeable: self._make_writable() to_persist = { 'document': tool_source.to_string(), 'macro_paths': tool_source.macro_paths, 'paths_and_modtimes': tool_source.paths_and_modtimes(), 'tool_cache_version': CURRENT_TOOL_CACHE_VERSION, } try: self._cache[config_file] = to_persist except RuntimeError: log.debug("Tool document cache not writeable") except sqlite3.OperationalError: log.debug("Tool document cache unavailable") def delete(self, config_file): if self.cache_file_is_writeable: self._make_writable() try: del self._cache[config_file] except (KeyError, RuntimeError): pass def __del__(self): if self.writeable_cache_file: try: os.unlink(self.writeable_cache_file.name) except Exception: pass
class Db:
    """The data handling object for pgpgram.

    Args:
        verbose (int): level of verbosity
    """
    config_path = config.get_config_dir()
    data_path = config.get_data_dir()
    cache_path = config.get_cache_dir()
    executable_path = dirname(realpath(__file__))
    files_db_path = path_join(config.get_config_dir(), "files.db")
    names_db_path = path_join(config.get_config_dir(), "names.db")

    def __init__(self, verbose=0):
        self.verbose = verbose

        if exists(self.files_db_path):
            self.files = SqliteDict(self.files_db_path, autocommit=False)
        else:
            self.files = SqliteDict(self.files_db_path, autocommit=False)
            self.from_pickle_to_db()

        if exists(self.names_db_path):
            self.file_names = SqliteDict(self.names_db_path, autocommit=False)
        else:
            self.rebuild_names_db()

        # Load configuration from disk into 'config' attribute
        try:
            self.config = load(path_join(self.config_path, "config.pkl"))
        except FileNotFoundError as e:
            # Init configuration
            if verbose > 0:
                pprint("Config file not found in path, initializing")
            self.config = {"db key": random_id(20)}

        # Paths
        index_dir = path_join(self.data_path, "index")
        tdlib_dir = path_join(self.data_path, 'tdlib')
        tdlib_config_symlink = path_join(self.config_path, "tdlib")
        tdlib_documents_dir = path_join(self.cache_path, "documents")
        tdlib_documents_symlink = path_join(tdlib_dir, "documents")

        # Init paths
        if not exists(index_dir):
            mkdir(index_dir)

        if not exists(tdlib_dir):
            mkdir(tdlib_dir)
            mkdir(tdlib_documents_dir)
            symlink(tdlib_dir, tdlib_config_symlink)
            symlink(tdlib_documents_dir, tdlib_documents_symlink)

        # Load index
        # try:
        #     self.index = load(path_join(self.data_path, "index.pkl"))
        # except:
        #     if verbose > 0:
        #         print("index still not built")

        self.save()

    def from_pickle_to_db(self):
        files_pickle_path = path_join(self.config_path, "files.pkl")
        if exists(files_pickle_path):
            if self.verbose:
                print("converting files pickle to proper db")
            pickle_files = load(files_pickle_path)
            for f in pickle_files:
                self.files[f['hash']] = [f]

    def rebuild_names_db(self):
        print("Building names database")
        try:
            rm(self.names_db_path)
        except FileNotFoundError as e:
            pass
        self.file_names = SqliteDict(self.names_db_path, autocommit=False)
        for hash in self.files:
            for document in self.files[hash]:
                try:
                    name = document['name']
                    db_name_documents = self.file_names[name]
                except KeyError as e:
                    db_name_documents = []
                db_name_documents.append(document)
                self.file_names[name] = db_name_documents
        print("read {} entries".format(len(self.files)))

    def save(self):
        """Save db

        Formats db in a format compatible with trovotutto,
        builds the trovotutto index and then saves the following
        to disk:
        - search index
        - files list
        - configuration
        """
        # pgpgram_db = PGPgramDb(self, filetype="any", exclude=[], update=True)
        # self.index = Index(pgpgram_db, slb=3, verbose=self.verbose)
        # save(self.index, path_join(self.data_path, "index.pkl"))
        self.files.commit()
        self.file_names.commit()
        save(self.config, path_join(self.config_path, "config.pkl"))

    def search(self, query,
               path=getcwd(),
               filetype="any",
               exclude=[],
               results_number=10,
               reverse=True,
               verbose=0):
        if filetype != "any" or path != getcwd():
            word_shortest = min([len(w) for w in query.split(" ")])
            pgpgram_db_kwargs = {'path': path,
                                 'filetype': filetype,
                                 'exclude': exclude,
                                 'update': True}
            # To update for db usage
            # pgpgram_db = PGPgramDb(self, **pgpgram_db_kwargs)
            # self.index = Index(pgpgram_db, slb=word_shortest, verbose=verbose)
        # results = self.index.search(query)
        # self.display_results(results[:results_number], reverse=reverse)
        # if results != []:
        #     choice = int(input("Select file to restore (number): "))
        #     f = next(self.files[d][0] for d in self.files
        #              if self.files[d][0]['path'] == results[choice])["name"]
        #     restore = Restore(f, download_directory=getcwd(), verbose=verbose)

    def display_results(self, results, reverse=True):
        lines = []
        for i, f in enumerate(results):
            g = f.split("/")
            result = {"title": "{}{}. {}{}{}".format(color.GREEN + color.BOLD,
                                                     i,
                                                     color.BLUE,
                                                     g[-1],
                                                     color.END),
                      "subtitle": "{}{}{}\n".format(color.GRAY, f, color.END)}
            lines.append(result)
        if reverse:
            lines.reverse()
        for result in lines:
            print(result['title'])
            print(result['subtitle'])

    def import_file(self, filename):
        if filename.endswith("pkl"):
            files = load(filename)
            for f in files:
                try:
                    self.files[f['hash']]
                except KeyError as e:
                    self.files[f['hash']] = [f]
                    print("adding {}".format(f['name']))
        else:
            files = SqliteDict(filename, autocommit=False)
            for k in files:
                try:
                    self.files[k]
                except KeyError as e:
                    self.files[k] = files[k]
                    print("adding {}".format(files[k][0]['name']))
        self.rebuild_names_db()
        self.save()
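# Hedged sketch (not part of pgpgram; file name and keys below are made up):
# SqliteDict only persists a value when the key is assigned, so nested
# structures such as the document lists above follow a read-modify-write pattern.
from sqlitedict import SqliteDict

names = SqliteDict("names-example.db", autocommit=False)
entry = names.get("report.pdf", [])
entry.append({"name": "report.pdf", "hash": "deadbeef"})
names["report.pdf"] = entry   # reassign so the change is actually written
names.commit()                # autocommit=False requires an explicit commit
names.close()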
class Gdrive:
    def __init__(self, config, token_path, cache_path):
        self.cfg = config
        self.token_path = token_path
        self.cache_path = cache_path
        self.token = None
        self.cache = None

    def first_run(self):
        # token file
        if not os.path.exists(self.token_path):
            # token.json does not exist, let's do the first run auth process
            print("Visit %s and authorize against the account you wish to use"
                  % self.authorize_url())
            auth_code = raw_input('Enter authorization code: ')
            if self.first_access_token(auth_code) and self.token is not None:
                self.dump_token()
            else:
                logger.error(
                    "Failed to authorize with the supplied client_id/client_secret/auth_code...")
                return False
        else:
            self.token = utils.load_json(self.token_path)

        # cache file
        self.cache = SqliteDict(self.cache_path,
                                tablename='cache',
                                encode=json.dumps,
                                decode=json.loads,
                                autocommit=False)
        return True

    def authorize_url(self):
        payload = {
            'client_id': self.cfg['GDRIVE']['CLIENT_ID'],
            'redirect_uri': 'urn:ietf:wg:oauth:2.0:oob',
            'response_type': 'code',
            'access_type': 'offline',
            'scope': 'https://www.googleapis.com/auth/drive'
        }
        url = 'https://accounts.google.com/o/oauth2/v2/auth?' + urlencode(payload)
        return url

    def first_access_token(self, auth_code):
        logger.info("Requesting access token for auth code %r", auth_code)
        payload = {
            'code': auth_code,
            'client_id': self.cfg['GDRIVE']['CLIENT_ID'],
            'client_secret': self.cfg['GDRIVE']['CLIENT_SECRET'],
            'grant_type': 'authorization_code',
            'redirect_uri': 'urn:ietf:wg:oauth:2.0:oob',
        }
        success, resp, data = self._make_request(
            'https://www.googleapis.com/oauth2/v4/token',
            data=payload, headers={}, request_type='post')
        if success and resp.status_code == 200:
            logger.info("Retrieved first access token!")
            self.token = data
            self.token['page_token'] = ''
            return True
        else:
            logger.error("Error retrieving first access_token:\n%s", data)
            return False

    def refresh_access_token(self):
        logger.debug("Renewing access token...")
        payload = {
            'refresh_token': self.token['refresh_token'],
            'client_id': self.cfg['GDRIVE']['CLIENT_ID'],
            'client_secret': self.cfg['GDRIVE']['CLIENT_SECRET'],
            'grant_type': 'refresh_token',
        }
        success, resp, data = self._make_request(
            'https://www.googleapis.com/oauth2/v4/token',
            data=payload, headers={}, request_type='post')
        if success and resp.status_code == 200 and 'access_token' in data:
            logger.info("Renewed access token!")
            refresh_token = self.token['refresh_token']
            page_token = self.token['page_token']
            self.token = data
            if 'refresh_token' not in self.token or not self.token['refresh_token']:
                self.token['refresh_token'] = refresh_token
            self.token['page_token'] = page_token
            self.dump_token()
            return True
        else:
            logger.error("Error renewing access token:\n%s", data)
            return False

    def get_changes_first_page_token(self):
        success, resp, data = self._make_request(
            'https://www.googleapis.com/drive/v3/changes/startPageToken',
            params={'supportsTeamDrives': self.cfg['GDRIVE']['TEAMDRIVE']})
        if success and resp.status_code == 200:
            if 'startPageToken' not in data:
                logger.error(
                    "Failed to retrieve startPageToken from returned startPageToken:\n%s",
                    data)
                return False
            self.token['page_token'] = data['startPageToken']
            self.dump_token()
            return True
        else:
            logger.error("Error retrieving first page token:\n%s", data)
            return False

    def get_changes(self):
        success, resp, data = self._make_request(
            'https://www.googleapis.com/drive/v3/changes',
            params={
                'pageToken': self.token['page_token'],
                'pageSize': 1000,
                'includeRemoved': True,
                'includeTeamDriveItems': self.cfg['GDRIVE']['TEAMDRIVE'],
                'supportsTeamDrives': self.cfg['GDRIVE']['TEAMDRIVE'],
                'fields': 'changes(file(md5Checksum,mimeType,modifiedTime,'
                          'name,parents,teamDriveId,trashed),'
                          'fileId,removed,teamDrive(id,name),'
                          'teamDriveId),newStartPageToken,nextPageToken'
            })
        if success and resp.status_code == 200:
            # page token logic
            if data is not None and 'nextPageToken' in data:
                self.token['page_token'] = data['nextPageToken']
                self.dump_token()
            elif data is not None and 'newStartPageToken' in data:
                self.token['page_token'] = data['newStartPageToken']
                self.dump_token()
            else:
                logger.error(
                    "Unexpected response while polling for changes from page %s:\n%s",
                    str(self.token['page_token']), data)
                return False, data
            return True, data
        else:
            logger.error("Error getting page changes for page_token %r:\n%s",
                         self.token['page_token'], data)
            return False, data

    def get_id_metadata(self, item_id, teamdrive_id=None):
        # return cache from metadata if available
        cached_metadata = self._get_cached_metdata(item_id)
        if cached_metadata:
            return True, cached_metadata

        # does item_id match teamdrive_id?
        if teamdrive_id is not None and item_id == teamdrive_id:
            success, resp, data = self._make_request(
                'https://www.googleapis.com/drive/v3/teamdrives/%s' % str(item_id))
            if success and resp.status_code == 200 and 'name' in data:
                # we successfully retrieved this teamdrive info, let's place a mimeType key
                # in the result so we know it needs to be cached
                data['mimeType'] = 'application/vnd.google-apps.folder'
        else:
            # retrieve file metadata
            success, resp, data = self._make_request(
                'https://www.googleapis.com/drive/v3/files/%s' % str(item_id),
                params={
                    'supportsTeamDrives': self.cfg['GDRIVE']['TEAMDRIVE'],
                    'fields': 'id,md5Checksum,mimeType,modifiedTime,name,parents,'
                              'trashed,teamDriveId'
                })
        if success and resp.status_code == 200:
            return True, data
        else:
            logger.error("Error retrieving metadata for item %r:\n%s", item_id, data)
            return False, data

    def get_id_file_paths(self, item_id, teamdrive_id=None):
        file_paths = []
        added_to_cache = 0

        try:
            def get_item_paths(obj_id, path, paths, new_cache_entries, teamdrive_id=None):
                success, obj = self.get_id_metadata(obj_id, teamdrive_id)
                if not success:
                    return new_cache_entries

                teamdrive_id = teamdrive_id if 'teamDriveId' not in obj else obj['teamDriveId']

                # add item object to cache if we know it's not from cache
                if 'mimeType' in obj:
                    # we know this is a new item fetched from the api,
                    # because the cache does not store this field
                    self.add_item_to_cache(obj['id'], obj['name'],
                                           [] if 'parents' not in obj else obj['parents'])
                    new_cache_entries += 1

                if path.strip() == '':
                    path = obj['name']
                else:
                    path = os.path.join(obj['name'], path)

                if 'parents' in obj and obj['parents']:
                    for parent in obj['parents']:
                        new_cache_entries += get_item_paths(
                            parent, path, paths, new_cache_entries, teamdrive_id)

                if (not obj or 'parents' not in obj or not obj['parents']) and len(path):
                    paths.append(path)
                    return new_cache_entries
                return new_cache_entries

            added_to_cache += get_item_paths(item_id, '', file_paths, added_to_cache, teamdrive_id)
            if added_to_cache:
                logger.debug("Dumping cache due to new entries!")
                self.dump_cache()

            if len(file_paths):
                return True, file_paths
            else:
                return False, file_paths

        except Exception:
            logger.exception("Exception retrieving filepaths for '%s': ", item_id)
            return False, []

    # cache
    def add_item_to_cache(self, item_id, item_name, item_parents):
        if self.cfg['GDRIVE']['SHOW_CACHE_MESSAGES'] and item_id not in self.cache:
            logger.info("Added '%s' to cache: %s", item_id, item_name)
        self.cache[item_id] = {'name': item_name, 'parents': item_parents}
        return

    def remove_item_from_cache(self, item_id):
        if self.cache.pop(item_id, None):
            return True
        return False

    # dump jsons
    def dump_token(self):
        utils.dump_json(self.token_path, self.token)
        return

    def dump_cache(self):
        self.cache.commit()
        return

    ############################################################
    # INTERNALS
    ############################################################

    # cache
    def _get_cached_metdata(self, item_id):
        if item_id in self.cache:
            return self.cache[item_id]
        return None

    # requests
    @backoff.on_predicate(backoff.expo,
                          lambda x: not x[0] and
                          ('error' in x[2] and 'code' in x[2]['error'] and
                           x[2]['error']['code'] != 401),
                          max_tries=8)
    def _make_request(self, url, headers=None, data=None, params=None, request_type='get'):
        refreshed_token = False
        while True:
            if headers is None and self.token:
                auth_headers = {
                    'Authorization': 'Bearer %s' % self.token['access_token'],
                }
            else:
                auth_headers = {}

            resp = None
            if request_type == 'get':
                resp = requests.get(url,
                                    params=params,
                                    headers=headers if headers is not None else auth_headers,
                                    timeout=30)
            elif request_type == 'post':
                resp = requests.post(url,
                                     data=data,
                                     headers=headers if headers is not None else auth_headers,
                                     timeout=30)
            else:
                return False, resp, {
                    'error': {
                        'code': 401,
                        'message': 'Invalid request_type was supplied to _make_request'
                    }
                }

            # response logic
            try:
                data = resp.json()
            except ValueError:
                logger.exception(
                    "Exception while decoding response from Google Drive for data:\n%s\nTraceback: ",
                    resp.text)
                return False, resp, {
                    'error': {
                        'code': resp.status_code,
                        'message': 'Failed to json decode Google Drive response'
                    }
                }

            if 'error' in data and 'code' in data['error'] and (
                    'message' in data['error'] and
                    'Invalid Credentials' in data['error']['message']):
                # the token has expired.
                if not refreshed_token:
                    refreshed_token = True
                    self.refresh_access_token()
                    continue
                else:
                    # attempt was already made to refresh token
                    return False, resp, data

            if resp.status_code == 200:
                return True, resp, data
            else:
                return False, resp, data
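# Hedged sketch (not part of the Gdrive class; file name and IDs are illustrative):
# storing plain-JSON values via encode/decode keeps the cache readable with any
# sqlite client, which is what the tablename='cache' dict above relies on.
import json
from sqlitedict import SqliteDict

cache = SqliteDict("cache-example.db", tablename="cache",
                   encode=json.dumps, decode=json.loads, autocommit=False)
cache["file-id-123"] = {"name": "My File", "parents": ["folder-id-456"]}
cache.commit()   # mirrors dump_cache(), which only calls commit()
print(cache.get("file-id-123"))
cache.close()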
class Tree_control(object):
    """Window that loads the list view."""

    def __init__(self, tree, eblog):
        self.newroot = tk.Toplevel()
        self.newroot.title('加载列表')  # "Load list"
        self.newroot.iconbitmap("favicon.ico")
        self.newroot.wm_attributes('-topmost', 1)
        win_width = self.newroot.winfo_screenwidth()
        win_higth = self.newroot.winfo_screenheight()
        width_adjust = (win_width - 400) / 2
        higth_adjust = (win_higth - 250) / 2
        self.newroot.geometry("%dx%d+%d+%d" % (400, 250, width_adjust, higth_adjust))

        # Progress bar
        self.__showFlag = True
        self.__width = 300
        self.__heigth = 20
        self.__sleep = 0
        self.bar = ttk.Progressbar(self.newroot,
                                   length=self.__width,
                                   mode="indeterminate",
                                   orient=tk.HORIZONTAL)
        self.bar.pack(expand=True)
        self.bar.start(10)

        # Status labels
        # (label text: "Loading the list, please do not interrupt; please wait...")
        self.content2 = tk.Label(self.newroot,
                                 text="正在加载列表中,请不要中断操作,请耐心等待......")
        self.content2.place(x=50, y=30)
        self.content = tk.Label(self.newroot, text="")
        self.content.place(x=50, y=60)

        self.eblog = eblog
        self.tree = tree
        self.mydict = SqliteDict('./my_db.sqlite', autocommit=True)

        # Start the worker thread
        self.p = Thread(target=self.add_item)
        self.p.setDaemon(True)
        self.p.start()

        # Handle the window close button
        self.newroot.protocol("WM_DELETE_WINDOW", self.close)

    # Load items into the tree view
    def add_item(self):
        len_items = len(sorted(self.mydict.iteritems()))
        i = 0
        for key, value in sorted(self.mydict.iteritems()):
            i = i + 1
            # label text: "Currently processing item <i> of <len_items> links"
            self.content.config(text="当前正处理:第" + str(i) + "个,共有" + str(len_items) + "个链接")
            self.tree.insert("", 0, iid=value[4],
                             values=(value[0], value[1], value[2], value[3]))
        self.close()
        return 1

    def close(self):
        self.newroot.destroy()
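# Hedged sketch (independent of the Tkinter class above; file name and values are
# made up): sqlitedict serializes access through an internal worker thread, so
# reading the dict from a background Thread, as add_item does, is safe though not
# any faster than sequential access.
from threading import Thread
from sqlitedict import SqliteDict

links = SqliteDict("links-example.db", autocommit=True)
links["a"] = ("title", "url", "date", "status", "row-id")

def read_links():
    for key, value in sorted(links.items()):
        print(key, value[0])

t = Thread(target=read_links, daemon=True)
t.start()
t.join()
links.close()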
class SqliteRecorder(BaseRecorder):
    """ Recorder that saves cases in an SQLite dictionary.

    Args
    ----
    sqlite_dict_args : dict
        Dictionary of any additional arguments for the SQL db.

    Options
    -------
    options['record_metadata'] : bool(True)
        Tells recorder whether to record variable attribute metadata.
    options['record_unknowns'] : bool(True)
        Tells recorder whether to record the unknowns vector.
    options['record_params'] : bool(False)
        Tells recorder whether to record the params vector.
    options['record_resids'] : bool(False)
        Tells recorder whether to record the residuals vector.
    options['record_derivs'] : bool(True)
        Tells recorder whether to record derivatives that are requested by a `Driver`.
    options['includes'] : list of strings
        Patterns for variables to include in recording.
    options['excludes'] : list of strings
        Patterns for variables to exclude in recording (processed after includes).
    """

    def __init__(self, out, **sqlite_dict_args):
        super(SqliteRecorder, self).__init__()

        if MPI and MPI.COMM_WORLD.rank > 0:
            self._open_close_sqlitedict = False
        else:
            self._open_close_sqlitedict = True

        if self._open_close_sqlitedict:
            sqlite_dict_args.setdefault('autocommit', True)
            self.out = SqliteDict(filename=out, flag='n', tablename='openmdao',
                                  **sqlite_dict_args)
            self.out_derivs = SqliteDict(filename=out, flag='w',
                                         tablename='openmdao_derivs',
                                         **sqlite_dict_args)
        else:
            self.out = None

    def record_metadata(self, group):
        """Stores the metadata of the given group in a sqlite file using
        the variable name for the key.

        Args
        ----
        group : `System`
            `System` containing vectors
        """
        params = group.params.iteritems()
        #resids = group.resids.iteritems()
        unknowns = group.unknowns.iteritems()
        data = OrderedDict([
            ('format_version', format_version),
            ('Parameters', dict(params)),
            ('Unknowns', dict(unknowns)),
        ])

        self.out['metadata'] = data

    def record_iteration(self, params, unknowns, resids, metadata):
        """
        Stores the provided data in the sqlite file using the iteration
        coordinate for the key.

        Args
        ----
        params : dict
            Dictionary containing parameters. (p)

        unknowns : dict
            Dictionary containing outputs and states. (u)

        resids : dict
            Dictionary containing residuals. (r)

        metadata : dict, optional
            Dictionary containing execution metadata (e.g. iteration coordinate).
        """
        data = OrderedDict()
        iteration_coordinate = metadata['coord']
        timestamp = metadata['timestamp']

        group_name = format_iteration_coordinate(iteration_coordinate)

        data['timestamp'] = timestamp
        data['success'] = metadata['success']
        data['msg'] = metadata['msg']

        if self.options['record_params']:
            data['Parameters'] = self._filter_vector(params, 'p', iteration_coordinate)

        if self.options['record_unknowns']:
            data['Unknowns'] = self._filter_vector(unknowns, 'u', iteration_coordinate)

        if self.options['record_resids']:
            data['Residuals'] = self._filter_vector(resids, 'r', iteration_coordinate)

        self.out[group_name] = data

    def record_derivatives(self, derivs, metadata):
        """Writes the derivatives that were calculated for the driver.

        Args
        ----
        derivs : dict or ndarray depending on the optimizer
            Dictionary containing derivatives

        metadata : dict, optional
            Dictionary containing execution metadata (e.g. iteration coordinate).
        """
        data = OrderedDict()
        iteration_coordinate = metadata['coord']
        timestamp = metadata['timestamp']

        group_name = format_iteration_coordinate(iteration_coordinate)

        data['timestamp'] = timestamp
        data['success'] = metadata['success']
        data['msg'] = metadata['msg']
        data['Derivatives'] = derivs

        self.out_derivs[group_name] = data

    def close(self):
        """Closes `out`"""
        if self._open_close_sqlitedict:
            if self.out is not None:
                self.out.close()
                self.out = None
            if self.out_derivs is not None:
                self.out_derivs.close()
                self.out_derivs = None
def file_reader_generator(file_object):
    while True:
        data = file_object.readline()
        if not data:
            break
        yield data


print('Loading claimToDocsDict')
claimToDocsDict_f = open('claimToDocsDict_train.pickle', 'rb')
claimToDocsDict = pickle.load(claimToDocsDict_f)
claimToDocsDict_f.close()

print('Loading Claims')
training_db = SqliteDict('training_db.sqlite', decode=decompress_set)

print('Loading wiki corpus')
conn = sqlite3.connect('wiki_corpus.db')
c = conn.cursor()


def flatten_list(lst):
    flattened = [item for nstd in lst for item in nstd]
    return flattened


translator = str.maketrans('', '', string.punctuation)


def tokenise_line(line):
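# Hedged sketch: decompress_set above is project-specific and not shown, but a
# custom SqliteDict codec is typically a pair of encode/decode callables like
# this zlib+pickle example (file name and keys are made up).
import pickle
import sqlite3
import zlib
from sqlitedict import SqliteDict

def compress_set(obj):
    return sqlite3.Binary(zlib.compress(pickle.dumps(obj, pickle.HIGHEST_PROTOCOL)))

def decompress_set(blob):
    return pickle.loads(zlib.decompress(bytes(blob)))

db = SqliteDict("training-example.sqlite", encode=compress_set, decode=decompress_set)
db["claim-1"] = {"doc-42", "doc-77"}
assert db["claim-1"] == {"doc-42", "doc-77"}
db.close()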
def __init__(self):
    SqliteDict.__init__(self, filename=ActiniaConfig.GRAPH_DB, autocommit=True)
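# Hedged sketch of the subclassing pattern above: GraphDB and the file name are
# illustrative stand-ins for the real class and ActiniaConfig.GRAPH_DB.
from sqlitedict import SqliteDict

class GraphDB(SqliteDict):
    def __init__(self):
        SqliteDict.__init__(self, filename="graph-example.db", autocommit=True)

graph = GraphDB()
graph["node:1"] = {"edges": ["node:2"]}
graph.close()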
def load(self, ctx):
    self.guild_configs = SqliteDict('./guild_configs.sqlite', autocommit=True)
    self.signups = SqliteDict('./signups.sqlite', autocommit=True)
import os
import pandas as pd
from math import inf
from sqlitedict import SqliteDict
from statsmodels.tsa.ar_model import AutoReg

cache = SqliteDict('precompute.db', autocommit=True)

# A US manufacturer buys raw materials in multiple currencies
purchases = pd.read_excel('Purchases.xlsx')

# For each of those currencies, find the best model to forecast prices
best_model = {}
for currency in purchases.currency:
    print('Currency', currency)
    data = pd.read_excel(f'{currency}.xlsx')
    data = data[data[currency] > 0]
    best_aic, best_fit, best_lags = inf, None, None
    check_lags = cache.get(
        currency, (3, 5, 7, 10, 14, 28, 60, 90, 120, 183, 365, 730, 1095))
    for lags in check_lags:
        print('    Lags', lags)
        model = AutoReg(data[currency], lags=lags)
        fit = model.fit()
        if fit.aic < best_aic:
            best_aic, best_fit, best_lags = fit.aic, fit, lags
    cache[currency] = (best_lags, )
    best_model[currency] = best_fit

# Estimate next month's price increase assuming the same volume as today
forecasted_value = 0
for index, row in purchases.iterrows():
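# Hedged sketch of the caching pattern used above: the first run scans the full
# lag grid, later runs only re-fit the lag stored in the SqliteDict. The currency
# code and the min() stand-in for the AIC search are illustrative.
from sqlitedict import SqliteDict

cache = SqliteDict("precompute-example.db", autocommit=True)

def candidate_lags(currency, default=(3, 7, 14, 28)):
    # cache.get falls back to the full grid when the currency was never fitted
    return cache.get(currency, default)

lags = candidate_lags("EUR")
best_lag = min(lags)            # placeholder for picking the lowest-AIC fit
cache["EUR"] = (best_lag,)      # the next run will only check this single lag
cache.close()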
# Creates a sqlite for each category
from sqlitedict import SqliteDict

splits = [i*10000000 for i in range(0, 8)]
source = './../data/sqlite/split_texts/'
path = "./../data/sqlite/community_texts/"
actual_category = "none"
category_dict = SqliteDict(f"{path}AL.sqlite", tablename="value", journal_mode="OFF")
text_dict = SqliteDict(f"{source}text_dict_{0}.sqlite", tablename="value", flag="r")
c = 0
for num in splits:
    category_dict.commit()
    text_dict.close()
    text_dict = SqliteDict(f"{source}text_dict_{num}.sqlite", tablename="value", flag="r")
    print(num)
    for id_c, value in text_dict.items():
        if value["category"] != actual_category:
            category_dict.commit()
            category_dict.close()
            category_dict = SqliteDict(f"{path}{value['category']}.sqlite",
                                       tablename="value", journal_mode="OFF")
        category_dict[id_c] = value
category_dict.commit()
category_dict.close()
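# Hedged sketch of the two open modes used above (file names are made up):
# flag="r" opens a source shard read-only, and journal_mode="OFF" trades crash
# safety for faster bulk writes on the destination dict.
from sqlitedict import SqliteDict

# build a tiny source shard first so the read-only open below has something to read
with SqliteDict("text_dict_0-example.sqlite", tablename="value") as seed:
    seed["doc-1"] = {"category": "AL", "text": "hello"}
    seed.commit()

source = SqliteDict("text_dict_0-example.sqlite", tablename="value", flag="r")
target = SqliteDict("AL-example.sqlite", tablename="value", journal_mode="OFF")
for key, value in source.items():
    target[key] = value
target.commit()
target.close()
source.close()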
def init_db(base_path):
    mydict = SqliteDict(base_path, autocommit=True)
    return mydict
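# Hedged sketch: the helper above returns an open handle that the caller must
# close; wrapping it in a with-block closes the connection automatically.
# The file name is illustrative.
from sqlitedict import SqliteDict

def init_db(base_path):
    return SqliteDict(base_path, autocommit=True)

with init_db("example-base.db") as mydict:
    mydict["key"] = "value"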