import os
import time

from sqlitedict import SqliteDict

# Zipgun and DATA_FILE come from the surrounding project.


def _persist_v0(file_path, zg):
    print 'Creating db...'
    persisted = SqliteDict(file_path, autocommit=False)
    print 'Updating data...'
    persisted.update(zg.country_postal_codes)
    print 'Committing data...'
    persisted.commit()
def main(data_dir):
    print 'Loading data...'
    zg = Zipgun(data_dir, force_text=True)
    print 'Creating db...'
    persisted = SqliteDict(os.path.join(data_dir, DATA_FILE), autocommit=False)
    print 'Updating data...'
    persisted.update(zg.country_postal_codes)
    print 'Committing data...'
    persisted.commit()
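# A minimal read-back sketch for the v0 layout above, assuming the db was
# written by _persist_v0/main. _load_v0, the path and the 'US' key are
# illustrative, not part of the source.
def _load_v0(file_path):
    # flag='r' opens the db read-only so a reader cannot clobber the data
    with SqliteDict(file_path, flag='r') as persisted:
        return dict(persisted)

# usage (hypothetical path):
# data = _load_v0('/tmp/zipgun.db')
# print(data.get('US'))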
def _persist_v1(file_path, zg):
    print 'Creating meta db...'
    zipgun_info = SqliteDict(
        file_path, tablename='zipgun_info', autocommit=False)
    zipgun_info['version'] = 1
    zipgun_info['country_codes'] = zg.country_postal_codes.keys()
    zipgun_info.commit()
    for country_code in zg.country_postal_codes:
        print 'Creating {} db...'.format(country_code)
        country_data = SqliteDict(
            file_path, tablename='zg_{}'.format(country_code),
            autocommit=False)
        country_data.update(zg.country_postal_codes[country_code])
        country_data.commit()
        time.sleep(1.0)  # crude workaround: give SQLite a moment between tables
        country_data.close()
    zipgun_info.close()
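# Sketch of reading the v1 layout back, assuming the table names written by
# _persist_v1 above; _load_v1 is a hypothetical helper, not part of the source.
def _load_v1(file_path):
    country_postal_codes = {}
    with SqliteDict(file_path, tablename='zipgun_info', flag='r') as info:
        country_codes = info['country_codes']
    for country_code in country_codes:
        with SqliteDict(file_path, tablename='zg_{}'.format(country_code),
                        flag='r') as country_data:
            country_postal_codes[country_code] = dict(country_data)
    return country_postal_codes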
import gc
import glob
import logging
import os
import timeit
from itertools import chain, groupby
from operator import itemgetter

from gensim import corpora, models, similarities
from sqlitedict import SqliteDict

# gl is the project's global config module (paths, topic counts, etc.)


def reset(texts, index_dic=True, tfidf=True, hdp=False, lda=True, sim=False):
    total_start = timeit.default_timer()
    make_index_time = 0
    make_dict_time = 0
    make_lda_time = 0
    make_tfidf_time = 0
    sim_time = 0
    hdptopicnum = 0

    if index_dic:
        f = [i.split(',') for i in texts.readlines()]
        logging.info('Create id & ac_id list')
        ids = [f[i][1] for i in range(len(f))]
        ac_ids = [f[i][0] for i in range(len(f))]
        logging.info('Create contents list')
        contents = []
        for i in range(len(f)):
            if len(f[i]) == 3:
                contents.append(f[i][2].strip().split(':'))
            else:
                contents.append([])

        # make index
        logging.info('***********Now Make Index by sqlitedict***********')
        timer_start = timeit.default_timer()
        pos2paid = zip(range(len(f)), ac_ids)
        paid2pos_rel = {}
        for key, paid in groupby(sorted(pos2paid, key=itemgetter(1)),
                                 key=itemgetter(1)):
            paid2pos_rel.update({int(key): [i[0] for i in paid]})
        id2pos_rel = dict(zip(ids, range(len(f))))
        pos2id_rel = dict(zip(range(len(f)), ids))
        id2pos = SqliteDict(filename=gl.res + '/resource/id2pos', autocommit=True)
        id2pos.clear()
        id2pos.update(id2pos_rel)
        id2pos.close()
        pos2id = SqliteDict(filename=gl.res + '/resource/pos2id', autocommit=True)
        pos2id.clear()
        pos2id.update(pos2id_rel)
        pos2id.close()
        paid2pos = SqliteDict(filename=gl.res + '/resource/paid2pos', autocommit=True)
        paid2pos.clear()
        paid2pos.update(paid2pos_rel)
        paid2pos.close()
        timer_end = timeit.default_timer()
        make_index_time = timer_end - timer_start

        # make dict
        logging.info('***********Now Make Dictionary***********')
        timer_start = timeit.default_timer()
        dic = corpora.Dictionary(contents)
        # optimized dictionary: drop very rare and very common tokens
        dic.filter_extremes(no_below=20, no_above=0.1, keep_n=None)
        dic.save(gl.res + '/resource/dict')
        timer_end = timeit.default_timer()
        make_dict_time = timer_end - timer_start

        # make corpus
        logging.info('***********Now Make Corpus***********')
        temps = []
        for i, t in enumerate(contents):
            temps.append(dic.doc2bow(t))
            if i % 10000 == 0:
                logging.info('make corpus ' + str(i) + ' articles')
        corpus = temps
        corpora.MmCorpus.serialize(gl.res + '/resource/corpus', corpus)

    if tfidf:
        # do tfidf train
        logging.info('***********Now Training TF-IDF Model***********')
        timer_start = timeit.default_timer()
        corpus = corpora.MmCorpus(gl.res + '/resource/corpus')
        tfidf = models.TfidfModel(corpus)
        tfidf.save(gl.res + '/resource/tfidf')
        timer_end = timeit.default_timer()
        make_tfidf_time = timer_end - timer_start

    if hdp:
        gc.collect()
        corpus = corpora.MmCorpus(gl.res + '/resource/corpus')
        dic = corpora.Dictionary.load(gl.res + '/resource/dict')
        hdpmodel = models.hdpmodel.HdpModel(corpus, id2word=dic)
        hdptopicnum = len(hdpmodel.print_topics(topics=-1, topn=10))
        logging.info('hdptopicnum is {}'.format(hdptopicnum))

    if lda:
        # do lda train
        gc.collect()
        tfidf = models.TfidfModel.load(gl.res + '/resource/tfidf')
        corpus = corpora.MmCorpus(gl.res + '/resource/corpus')
        dic = corpora.Dictionary.load(gl.res + '/resource/dict')
        corpus_tfidf = tfidf[corpus]
        logging.info('***********Now Training LDA Model***********')
        timer_start = timeit.default_timer()
        if hdptopicnum != 0:
            gl.topicCount = hdptopicnum
        lda = models.LdaMulticore(corpus_tfidf, id2word=dic, chunksize=gl.chunksize,
                                  num_topics=gl.topicCount, workers=gl.workers,
                                  passes=gl.lda_passes)
        # lda = models.LdaModel(corpus_tfidf, id2word=dic, chunksize=gl.chunksize,
        #                       num_topics=gl.topicCount, passes=gl.lda_passes,
        #                       distributed=True)
        lda.save(gl.res + '/resource/lda')
        timer_end = timeit.default_timer()
        make_lda_time = timer_end - timer_start
        logging.info('lda training cost %.2f seconds' % make_lda_time)

    if sim:
        gc.collect()
        logging.info('***********Now Make Similarity Index***********')
        st = timeit.default_timer()
        corpus = corpora.MmCorpus(gl.res + '/resource/corpus')
        lda = models.LdaModel.load(gl.res + '/resource/lda')
        index = similarities.MatrixSimilarity(lda[corpus],
                                              num_features=gl.topicCount)
        index.save(gl.res + '/resource/simIndex')
        sim_time = timeit.default_timer() - st

    total_end = timeit.default_timer()
    total_time = total_end - total_start
    m = divmod(total_time, 60)
    h = divmod(m[0], 60)
    logging.info('\nReset LDA Model complete!!!\n'
                 '***Using time*** \n'
                 'index training {:.2f}\n'
                 'dict training {:.2f}\n'
                 'tfidf training {:.2f}\n'
                 'lda training {:.2f}\n'
                 'sim training {:.2f}\n'
                 'Total time: {:d}h {:d}m {:.2f}s'.format(
                     make_index_time, make_dict_time, make_tfidf_time,
                     make_lda_time, sim_time, int(h[0]), int(h[1]), m[1]))
    basicConfig = open(gl.res + '/resource/basicConfig.txt', mode='w+')
    basicConfig.write('FileName: {}'
                      '\nTopicNumber = {}'
                      '\nestTopicNumber = {}'
                      '\nldaPasses = {}'.format(os.path.basename(texts.name),
                                                gl.topicCount, hdptopicnum,
                                                gl.lda_passes))
    basicConfig.close()
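# Hedged usage sketch (not in the source): querying the artifacts that
# reset() persists. query_similar is a hypothetical helper; the resource
# paths mirror those above. sqlitedict stores keys as text, hence the str()
# around integer positions.
def query_similar(doc_id, topn=10):
    lda = models.LdaModel.load(gl.res + '/resource/lda')
    index = similarities.MatrixSimilarity.load(gl.res + '/resource/simIndex')
    corpus = corpora.MmCorpus(gl.res + '/resource/corpus')
    with SqliteDict(gl.res + '/resource/id2pos', flag='r') as id2pos:
        pos = id2pos[doc_id]
    # project the stored bag-of-words through the LDA model, then rank
    sims = index[lda[corpus[pos]]]
    best = sorted(enumerate(sims), key=lambda pair: -pair[1])[:topn]
    with SqliteDict(gl.res + '/resource/pos2id', flag='r') as pos2id:
        return [(pos2id[str(p)], float(score)) for p, score in best]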
def merge(texts, index_dic=True, tfidf=True, lda=True, sim=False):
    total_start = timeit.default_timer()
    make_index_time = 0
    make_dict_time = 0
    make_lda_time = 0
    make_tfidf_time = 0
    sim_time = 0

    if index_dic:
        f = [i.split(',') for i in texts.readlines()]
        logging.info('Create id & ac_id list')
        ids = [f[i][0] for i in range(len(f))]
        ac_ids = [f[i][1] for i in range(len(f))]
        logging.info('Create contents list')
        contents = []
        for i in range(len(f)):
            if len(f[i]) == 3:
                contents.append(f[i][2].strip().split(':'))
            else:
                contents.append([])

        # make index
        logging.info('***********Now merge index by sqlitedict***********')
        timer_start = timeit.default_timer()
        old_corpus_len = len(corpora.MmCorpus(gl.res + '/resource/corpus'))
        pos2paid = zip(range(old_corpus_len, old_corpus_len + len(f)), ac_ids)
        paid2pos_new = {}
        for key, paid in groupby(sorted(pos2paid, key=itemgetter(1)),
                                 key=itemgetter(1)):
            paid2pos_new.update({int(key): [i[0] for i in paid]})
        id2pos_new = dict(zip(ids, range(old_corpus_len, old_corpus_len + len(f))))
        pos2id_new = dict(zip(range(old_corpus_len, old_corpus_len + len(f)), ids))
        id2pos = SqliteDict(filename=gl.res + '/resource/id2pos', autocommit=True)
        id2pos.update(id2pos_new)
        id2pos.close()
        pos2id = SqliteDict(filename=gl.res + '/resource/pos2id', autocommit=True)
        pos2id.update(pos2id_new)
        pos2id.close()
        paid2pos = SqliteDict(filename=gl.res + '/resource/paid2pos', autocommit=True)
        x = [set(paid2pos_new.keys()),
             set([int(i) for i in paid2pos.iterkeys()])]
        for i in list(set.intersection(*x)):
            # update duplicate keys by appending the new positions
            temp = list(chain(paid2pos[i], paid2pos_new[i]))
            paid2pos.update({int(i): temp})
        # insert keys that only exist in the new batch (the original loop
        # handled duplicate keys only, so new account ids were never indexed)
        for i in x[0] - x[1]:
            paid2pos.update({int(i): paid2pos_new[i]})
        paid2pos.close()
        timer_end = timeit.default_timer()
        make_index_time = timer_end - timer_start

        # merge dictionary
        logging.info('***********Now merge Dictionary***********')
        timer_start = timeit.default_timer()
        newDict = corpora.Dictionary(contents)
        newDict.filter_extremes(no_below=20, no_above=0.1, keep_n=None)
        dic = corpora.Dictionary.load(gl.res + '/resource/dict')
        dic.merge_with(newDict)
        dic.save(gl.res + '/resource/dict')
        timer_end = timeit.default_timer()
        make_dict_time = timer_end - timer_start

        # merge corpus
        logging.info('***********Now merge Corpus***********')
        temps = []
        for i, t in enumerate(contents):
            temps.append(dic.doc2bow(t))
            if i % 10000 == 0:
                logging.info('make corpus ' + str(i) + ' articles')
        corpora.MmCorpus.serialize(gl.res + '/resource/new_c', temps)
        gc.collect()
        corpus = corpora.MmCorpus(gl.res + '/resource/corpus')
        new_corpus = corpora.MmCorpus(gl.res + '/resource/new_c')
        merged_corpus = chain(corpus, new_corpus)
        corpora.MmCorpus.serialize(gl.res + '/resource/merged_c', merged_corpus)

        # overwrite corpus
        for filename in glob.glob(gl.res + '/resource/*'):
            if filename.endswith('corpus') or filename.endswith('corpus.index') \
                    or filename.endswith('new_c') or filename.endswith('new_c.index'):
                # remove the superseded corpus files
                os.unlink(filename)
            if filename.endswith('merged_c'):
                # rename the merged corpus to the canonical name
                os.rename(filename, gl.res + '/resource/corpus')
            if filename.endswith('merged_c.index'):
                os.rename(filename, gl.res + '/resource/corpus.index')

    if tfidf:
        # do tfidf merge
        gc.collect()
        logging.info('***********Now merge TF-IDF model***********')
        timer_start = timeit.default_timer()
        for filename in glob.glob(gl.res + '/resource/*'):
            # backup old model
            if filename.endswith('tfidf'):
                os.rename(filename, filename + '_' + gl.c_time)
        corpus = corpora.MmCorpus(gl.res + '/resource/corpus')  # reload corpus
        tfidf = models.TfidfModel(corpus)
        tfidf.save(gl.res + '/resource/tfidf')
        timer_end = timeit.default_timer()
        make_tfidf_time = timer_end - timer_start

    if lda:
        # do lda merge
        gc.collect()
        tfidf = models.TfidfModel.load(gl.res + '/resource/tfidf')
        corpus = corpora.MmCorpus(gl.res + '/resource/corpus')
        corpus_tfidf = tfidf[corpus]
        dic = corpora.Dictionary.load(gl.res + '/resource/dict')
        logging.info('***********Now merge LDA model***********')
        timer_start = timeit.default_timer()
        for filename in glob.glob(gl.res + '/resource/*'):
            # backup old model
            if filename.endswith('lda') or filename.endswith('lda.state'):
                os.rename(filename, filename + '_' + gl.c_time)
        # lda = models.LdaMulticore(corpus_tfidf, id2word=dic, chunksize=gl.chunksize,
        #                           num_topics=gl.topicCount, workers=gl.workers,
        #                           passes=gl.lda_passes)
        lda = models.LdaModel(corpus_tfidf, id2word=dic, chunksize=gl.chunksize,
                              num_topics=gl.topicCount, passes=gl.lda_passes)
        lda.save(gl.res + '/resource/lda')
        timer_end = timeit.default_timer()
        make_lda_time = timer_end - timer_start
        logging.info('lda training cost %.2f seconds' % make_lda_time)

    if sim:
        gc.collect()
        logging.info('***********Now Make Similarity Index***********')
        st = timeit.default_timer()
        corpus = corpora.MmCorpus(gl.res + '/resource/corpus')
        lda = models.LdaModel.load(gl.res + '/resource/lda')
        index = similarities.MatrixSimilarity(lda[corpus],
                                              num_features=gl.topicCount)
        index.save(gl.res + '/resource/simIndex')
        sim_time = timeit.default_timer() - st

    total_end = timeit.default_timer()
    total_time = total_end - total_start
    m = divmod(total_time, 60)
    h = divmod(m[0], 60)
    logging.info('\nMerge LDA Model complete!!!\n'
                 '***Using time*** \n'
                 'index training {:.2f}\n'
                 'dict training {:.2f}\n'
                 'tfidf training {:.2f}\n'
                 'lda training {:.2f}\n'
                 'sim training {:.2f}\n'
                 'Total time: {:d}h {:d}m {:.2f}s'.format(
                     make_index_time, make_dict_time, make_tfidf_time,
                     make_lda_time, sim_time, int(h[0]), int(h[1]), m[1]))
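# Hedged usage sketch (not in the source): one way to drive reset()/merge().
# Both expect an open file of comma-separated lines whose third field holds
# ':'-joined tokens; note the source reads the id/ac_id columns in opposite
# order in the two functions, so the file name and flow here are illustrative.
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    with open('articles.csv') as texts:
        if os.path.exists(gl.res + '/resource/corpus'):
            merge(texts)  # incremental update of index, corpus and models
        else:
            reset(texts)  # full rebuild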
    # (fragment) tail of get_businessday(); the logic computing `now` above
    # this return is not included in the source
    return now.date()


if __name__ == '__main__':
    update_stock_code()
    stock_codes = get_all_code()
    agency_db = SqliteDict(Path('..') / 'db' / 'agency_db.sqlite',
                           autocommit=True)
    for stock_code in tqdm(stock_codes):
        today = get_businessday()
        if stock_code not in agency_db:
            agency_db.update({
                stock_code: {
                    'agency_meta': pd.DataFrame(),
                    'agency_detail': pd.DataFrame()
                }
            })
        is_not_duplicated = today not in agency_db[stock_code]['agency_detail'].index
        if is_not_duplicated:
            agency_volume_meta, agency_volume_detail = crawl_agency_volume(
                stock_code)
            if not agency_volume_meta.empty:
                agency_volume_meta_ = update_diff_only(
                    agency_db[stock_code]['agency_meta'], agency_volume_meta)
                agency_volume_detail_ = update_diff_only(
                    agency_db[stock_code]['agency_detail'],
                    agency_volume_detail)
                agency_db.update({
                    # (truncated in the source) presumably writes the merged
                    # frames back under stock_code, mirroring the update above
                    stock_code: {
                        'agency_meta': agency_volume_meta_,
                        'agency_detail': agency_volume_detail_
                    }
                })
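# Hedged read-back sketch (not in the source): inspecting what the crawler
# stored for one code. The '005930' key is illustrative; pandas DataFrames
# round-trip through sqlitedict's pickle serialization.
with SqliteDict(Path('..') / 'db' / 'agency_db.sqlite', flag='r') as db:
    record = db['005930']
    print(record['agency_detail'].tail())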
import csv
import datetime
import functools
import itertools
import multiprocessing
import os

from sqlitedict import SqliteDict

# RestrictedModificationTable, ModificationTable, get_glycan_identities,
# get_enzyme, process_predicted_ms1_ion, constants and logger come from the
# surrounding package.


def main(result_file, site_file, constant_modification_list=None,
         variable_modification_list=None, enzyme_info=None,
         n_processes=4, output_file=None):
    if output_file is None:
        # output_file = os.path.splitext(result_file)[0] + '.theoretical_ions'
        output_file = os.path.splitext(result_file)[0] + ".db"
    else:
        output_file += ".db"

    modification_table = RestrictedModificationTable.bootstrap(
        constant_modification_list, variable_modification_list)
    if constant_modification_list is None and variable_modification_list is None:
        modification_table = ModificationTable.bootstrap()
    if isinstance(site_file, basestring):
        site_list = [line.strip() for line in open(site_file, "r")]
        site_list = list(map(int, site_list))
    else:
        site_list = site_file

    compo_dict = csv.DictReader(open(result_file, "r"), delimiter=",")
    colnames = compo_dict.fieldnames
    glycan_identity = get_glycan_identities(colnames)
    enzyme_info = map(get_enzyme, enzyme_info)
    tag = datetime.datetime.strftime(datetime.datetime.now(), "%Y%m%d-%H%M%S")
    metadata = {
        "glycan_identities": glycan_identity,
        "constant_modifications": constant_modification_list,
        "variable_modifications": variable_modification_list,
        "site_list": site_list,
        "ms1_output_file": result_file,
        "enzyme": enzyme_info,
        "tag": tag,
        "enable_partial_hexnac_match": constants.PARTIAL_HEXNAC_LOSS
    }
    # flag='n' drops any existing metadata table and starts fresh
    metadata_store = SqliteDict(output_file, tablename="metadata", flag='n')
    metadata_store.update(metadata)
    metadata_store.commit()

    theoretical_search_space_store = SqliteDict(
        output_file, tablename="theoretical_search_space")
    pool = multiprocessing.Pool(n_processes)
    task_fn = functools.partial(process_predicted_ms1_ion,
                                modification_table=modification_table,
                                site_list=site_list,
                                glycan_identity=glycan_identity)
    cntr = 0
    if n_processes > 1:
        logger.debug("Building theoretical sequences concurrently")
        for res in itertools.chain.from_iterable(
                pool.imap(task_fn, compo_dict, chunksize=500)):
            theoretical_search_space_store[cntr] = res
            cntr += 1
    else:
        logger.debug("Building theoretical sequences sequentially")
        for row in compo_dict:
            res = task_fn(row)
            for item in res:
                theoretical_search_space_store[cntr] = item
                cntr += 1
                if (cntr % 10000) == 0:
                    theoretical_search_space_store.commit()
                    logger.info("Committing, %d records made", cntr)
    theoretical_search_space_store.commit()
    theoretical_search_space_store.close()

    pool.close()
    pool.join()
    pool.terminate()
    logger.info("Hypothesis building complete")
    return output_file
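# Hedged sketch (not in the source): reading the hypothesis database that
# main() returns. load_hypothesis is a hypothetical helper; the table names
# match those used above. The caller is responsible for closing search_space.
def load_hypothesis(db_path):
    with SqliteDict(db_path, tablename="metadata", flag='r') as meta:
        metadata = dict(meta)
    search_space = SqliteDict(db_path, tablename="theoretical_search_space",
                              flag='r')
    return metadata, search_space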
start_68 = datetime.datetime(year=2016, month=4, day=22)
end_68 = datetime.datetime(year=2016, month=5, day=2)
timestamp_68_start = unix_time_millis(start_68)
timestamp_68_end = unix_time_millis(end_68)
done_players = 0

if os.path.isfile("games.sqlite"):
    # with open("game_data.pkl", "rb") as fp:
    player_tiers = SqliteDict("player_tiers.sqlite", autocommit=True)
    done_games = SqliteDict("games.sqlite", autocommit=True)
    oldie = players.qsize()
    player_tiers.update({x['playerOrTeamId']: 1 for x in masters['entries']})
    players = Queue.PriorityQueue()
    a = [players.put((player_tiers[x], x)) for x in player_tiers]
    print "Queue length has been modified from %d to %d" % (oldie, players.qsize())
else:
    player_tiers = SqliteDict("player_tiers.sqlite")
    player_tiers.update({x['playerOrTeamId']: 0 for x in challenger['entries']})
    done_games = SqliteDict("games.sqlite")

gold = False
# for i in range(2090): players.get(); done_players += 1  # 19942923
while not players.empty():
    curr = players.get()
    if curr[0] > 3 and not gold:
        # switch to a separate db once the queue moves past tier 3
        done_games = SqliteDict("games_gold.sqlite")
        gold = True
import os
import re

import falcon
from sqlitedict import SqliteDict


class Bucket(object):

    def __init__(self, bucket_name, storage_path=None):
        ''' Bucket init - if the bucket exists, the meta parameter will be ignored '''
        if bucket_name and isinstance(bucket_name, (str, unicode)) \
                and re.match(r"^[a-z0-9\.\-_]+$", bucket_name, re.I):
            self._name = bucket_name.strip()
        else:
            raise falcon.HTTPInvalidParam(
                "The parameter shall contain only alpha-numeric characters, "
                "value: '%s'" % bucket_name,
                param_name='name'
            )

        self._bucket_path = None
        if storage_path and os.path.exists(storage_path):
            self._bucket_path = os.path.join(storage_path, self._name)
        else:
            raise falcon.HTTPInternalServerError(
                title='IncorrectStoragePath',
                description='The storage path is incorrect, "%s"' % storage_path
            )

        if self._bucket_path and os.path.exists(self._bucket_path):
            self._meta = SqliteDict(
                os.path.join(self._bucket_path, 'metadata.sqlite'),
                'bucket', autocommit=True)
        else:
            # bucket does not exist on disk yet; keep metadata in memory
            # until create() persists it
            self._meta = SqliteDict(':memory:', 'bucket', autocommit=True)

    @property
    def bucket_path(self):
        return self._bucket_path

    @property
    def metadata(self):
        return dict(self._meta)

    @metadata.setter
    def metadata(self, value):
        if value and isinstance(value, dict):
            self._meta.update(value)
        else:
            raise RuntimeError(
                'Incorrect metadata type. Found "%s", expected "dict"' % type(value))

    def exists(self):
        ''' check if the bucket exists '''
        return bool(self.bucket_path and os.path.exists(self.bucket_path))

    def create(self):
        ''' create a new bucket '''
        if self.exists():
            raise falcon.HTTPConflict(
                title='BucketAlreadyExists',
                description="The requested bucket name '%s' is not available. "
                            "Please select a different name and try again." % self._name
            )
        # prepare bucket directory
        try:
            os.makedirs(self.bucket_path)
            os.makedirs(os.path.join(self.bucket_path, 'data'))
            os.makedirs(os.path.join(self.bucket_path, 'tmp'))
        except OSError, err:  # os.makedirs raises OSError, not IOError
            raise falcon.HTTPInternalServerError(
                title='BucketCreationError',
                description='The path to bucket cannot be created, "%s"' % self.bucket_path
            )
        # move the in-memory metadata into the bucket directory
        _meta = self._meta
        self._meta = SqliteDict(
            os.path.join(self.bucket_path, 'metadata.sqlite'),
            'bucket', autocommit=True)
        self._meta.update(_meta)
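# Hedged usage sketch (not in the source): creating a bucket and attaching
# metadata. The '/var/storage' path and the metadata dict are illustrative.
bucket = Bucket('my-bucket', storage_path='/var/storage')
if not bucket.exists():
    bucket.create()  # persists the in-memory metadata to metadata.sqlite
bucket.metadata = {'owner': 'alice', 'created': '2016-05-02'}
print bucket.metadata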