import os
import threading

from sqlitedict import SqliteDict


class DiskQueue:
    """Disk-backed work queue with todo / in-progress / seen states."""

    DIR_PATH = './diskqueue'
    IN_PROGRESS_DB_NAME = 'inprogress.sqlite'
    TODO_DB_NAME = 'todo.sqlite'
    SEEN_DB_NAME = 'seen.sqlite'

    def __init__(self, load: bool = False):
        self.iterLock = threading.Lock()
        if not os.path.exists(self.DIR_PATH):
            os.makedirs(self.DIR_PATH)
        # Unless we are resuming, start from a clean slate.
        if not load:
            for path in [self.IN_PROGRESS_DB_NAME, self.TODO_DB_NAME, self.SEEN_DB_NAME]:
                try:
                    os.remove('{}/{}'.format(self.DIR_PATH, path))
                except OSError:
                    continue
        self.inProgress = SqliteDict('{}/{}'.format(self.DIR_PATH, self.IN_PROGRESS_DB_NAME), autocommit=True)
        self.todo = SqliteDict('{}/{}'.format(self.DIR_PATH, self.TODO_DB_NAME), autocommit=True)
        self.seen = SqliteDict('{}/{}'.format(self.DIR_PATH, self.SEEN_DB_NAME), autocommit=True)
        # If we need to load state, move everything that was in progress back to the todo queue.
        if load:
            for key in list(self.inProgress.iterkeys()):  # materialize keys before mutating
                self.todo[key] = True
                del self.inProgress[key]

    def Push(self, key):
        if (key not in self.todo) and (key not in self.inProgress) and (key not in self.seen):
            self.todo[key] = True

    def Next(self):
        with self.iterLock:
            # SqliteDict.keys() returns a generator, so next() yields one key or None.
            toReturn = next(self.todo.keys(), None)
            if toReturn:
                self.inProgress[toReturn] = True
                del self.todo[toReturn]
            return toReturn

    def Done(self, key):
        self.seen[key] = True
        del self.inProgress[key]

    def Close(self):
        self.inProgress.close()
        self.todo.close()
        self.seen.close()

    def IsDone(self):
        with self.iterLock:
            return len(self.todo) == 0 and len(self.inProgress) == 0
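
# A minimal usage sketch for DiskQueue above (the URLs are made-up
# placeholders, not from the original): push work, claim it, mark it done.
queue = DiskQueue()
queue.Push('https://example.com/page1')
queue.Push('https://example.com/page1')   # deduplicated: already in todo

while not queue.IsDone():
    item = queue.Next()                   # claim one key, moving it to inProgress
    if item is None:
        break                             # todo is empty (items may still be in progress)
    # ... process item ...
    queue.Done(item)                      # move it to seen so it is never re-queued

queue.Close()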
import datetime
import sys
import atexit
import traceback
from queue import Queue

import yaml
from flask import Flask
from pytimeparse.timeparse import timeparse
from sqlitedict import SqliteDict

app = Flask(__name__)
# Explicit Loader: a bare yaml.load() is deprecated and unsafe.
config = yaml.load(open('config.yml'), Loader=yaml.SafeLoader)

# Index machine groups from the config by name for quick lookup.
machine_groups = {}
for mg in config['machine_groups']:
    machine_groups[mg['name']] = mg

machines = SqliteDict('./machines.sqlite', autocommit=True)
print(list(machines.iterkeys()))

task_queue = Queue()
timeout_threads = {}


def get_machines_in_group(group_name):
    ret = []
    for _, m in machines.iteritems():
        if m['machine_group']['name'] == group_name:
            ret.append(m)
    return ret


def get_machine_name_by_ip(ip):
    ...  # body truncated in the original snippet
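
# A hedged sketch of how the machines store above might be populated and
# queried; the record fields ('name', 'ip', 'machine_group') and the
# 'builders' group are inferred from the helpers, not confirmed by the original.
machines['worker-1'] = {
    'name': 'worker-1',
    'ip': '10.0.0.5',
    'machine_group': machine_groups['builders'],  # assumes a 'builders' group in config.yml
}
print(get_machines_in_group('builders'))  # -> [{'name': 'worker-1', ...}]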
import gc
import glob
import os
import logging
import timeit
from itertools import chain, groupby
from operator import itemgetter

from gensim import corpora, models, similarities
from sqlitedict import SqliteDict

import gl  # project-level globals: resource paths and model hyperparameters


def merge(texts, index_dic=True, tfidf=True, lda=True, sim=False):
    total_start = timeit.default_timer()
    make_index_time = 0
    make_dict_time = 0
    make_lda_time = 0
    make_tfidf_time = 0
    sim_time = 0

    if index_dic:
        # Each input line is "id,ac_id,token1:token2:..."
        f = [i.split(',') for i in texts.readlines()]
        logging.info('Create id & ac_id list')
        ids = [row[0] for row in f]
        ac_ids = [row[1] for row in f]
        logging.info('Create contents list')
        contents = []
        for row in f:
            if len(row) == 3:
                contents.append(row[2].strip().split(':'))
            else:
                contents.append([])

        # make index
        logging.info('***********Now merge index by sqlitedict***********')
        timer_start = timeit.default_timer()
        old_corpus_len = len(corpora.MmCorpus(gl.res + '/resource/corpus'))
        new_positions = range(old_corpus_len, old_corpus_len + len(f))
        pos2paid = list(zip(new_positions, ac_ids))
        paid2pos_new = {}
        for key, group in groupby(sorted(pos2paid, key=itemgetter(1)), key=itemgetter(1)):
            paid2pos_new[int(key)] = [pos for pos, _ in group]
        id2pos_new = dict(zip(ids, new_positions))
        pos2id_new = dict(zip(new_positions, ids))

        id2pos = SqliteDict(filename=gl.res + '/resource/id2pos', autocommit=True)
        id2pos.update(id2pos_new)
        id2pos.close()
        pos2id = SqliteDict(filename=gl.res + '/resource/pos2id', autocommit=True)
        pos2id.update(pos2id_new)
        pos2id.close()

        paid2pos = SqliteDict(filename=gl.res + '/resource/paid2pos', autocommit=True)
        existing = set(int(i) for i in paid2pos.iterkeys())
        for i in set(paid2pos_new.keys()) & existing:
            # update duplicate key: append the new positions to the existing ones
            paid2pos[int(i)] = list(chain(paid2pos[i], paid2pos_new[i]))
        for i in set(paid2pos_new.keys()) - existing:
            # previously unseen keys were dropped by the original loop; store them too
            paid2pos[int(i)] = paid2pos_new[i]
        paid2pos.close()
        timer_end = timeit.default_timer()
        make_index_time = timer_end - timer_start

        # Merge dictionary
        logging.info('***********Now merge Dictionary***********')
        timer_start = timeit.default_timer()
        newDict = corpora.Dictionary(contents)
        newDict.filter_extremes(no_below=20, no_above=0.1, keep_n=None)
        dic = corpora.Dictionary.load(gl.res + '/resource/dict')
        dic.merge_with(newDict)
        dic.save(gl.res + '/resource/dict')
        timer_end = timeit.default_timer()
        make_dict_time = timer_end - timer_start

        # merge corpus
        logging.info('***********Now merge Corpus***********')
        temps = []
        for i, t in enumerate(contents):
            temps.append(dic.doc2bow(t))
            if i % 10000 == 0:
                logging.info('make corpus ' + str(i) + ' articles')
        corpora.MmCorpus.serialize(gl.res + '/resource/new_c', temps)
        gc.collect()
        corpus = corpora.MmCorpus(gl.res + '/resource/corpus')
        new_corpus = corpora.MmCorpus(gl.res + '/resource/new_c')
        merged_corpus = chain(corpus, new_corpus)
        corpora.MmCorpus.serialize(gl.res + '/resource/merged_c', merged_corpus)

        # Overwrite corpus: remove the old and intermediate corpora,
        # then rename the merged corpus into their place.
        for filename in glob.glob(gl.res + '/resource/*'):
            if filename.endswith('corpus') or filename.endswith('corpus.index') \
                    or filename.endswith('new_c') or filename.endswith('new_c.index'):
                os.unlink(filename)  # rm useless corpus
            if filename.endswith('merged_c'):  # rename to corpus
                os.rename(filename, gl.res + '/resource/corpus')
            if filename.endswith('merged_c.index'):
                os.rename(filename, gl.res + '/resource/corpus.index')

    if tfidf:  # do tfidf merge
        gc.collect()
        logging.info('***********Now merge TF-IDF model***********')
        timer_start = timeit.default_timer()
        for filename in glob.glob(gl.res + '/resource/*'):  # backup old model
            if filename.endswith('tfidf'):
                os.rename(filename, filename + '_' + gl.c_time)
        corpus = corpora.MmCorpus(gl.res + '/resource/corpus')  # reload corpus
        tfidf_model = models.TfidfModel(corpus)  # renamed so the boolean flag is not shadowed
        tfidf_model.save(gl.res + '/resource/tfidf')
        timer_end = timeit.default_timer()
        make_tfidf_time = timer_end - timer_start

    if lda:  # do lda merge
        gc.collect()
        tfidf_model = models.TfidfModel.load(gl.res + '/resource/tfidf')
        corpus = corpora.MmCorpus(gl.res + '/resource/corpus')
        corpus_tfidf = tfidf_model[corpus]
        dic = corpora.Dictionary.load(gl.res + '/resource/dict')
        logging.info('***********Now merge LDA model***********')
        timer_start = timeit.default_timer()
        for filename in glob.glob(gl.res + '/resource/*'):  # backup old model
            if filename.endswith('lda') or filename.endswith('lda.state'):
                os.rename(filename, filename + '_' + gl.c_time)
        # lda_model = models.LdaMulticore(corpus_tfidf, id2word=dic, chunksize=gl.chunksize,
        #                                 num_topics=gl.topicCount, workers=gl.workers, passes=gl.lda_passes)
        lda_model = models.LdaModel(corpus_tfidf, id2word=dic, chunksize=gl.chunksize,
                                    num_topics=gl.topicCount, passes=gl.lda_passes)
        lda_model.save(gl.res + '/resource/lda')
        timer_end = timeit.default_timer()
        make_lda_time = timer_end - timer_start
        logging.info('lda training cost %.2f seconds' % make_lda_time)

    if sim:
        gc.collect()
        logging.info('***********Now Make Similarity Index***********')
        st = timeit.default_timer()
        corpus = corpora.MmCorpus(gl.res + '/resource/corpus')
        lda_model = models.LdaModel.load(gl.res + '/resource/lda')
        index = similarities.MatrixSimilarity(lda_model[corpus], num_features=gl.topicCount)
        index.save(gl.res + '/resource/simIndex')
        sim_time = timeit.default_timer() - st

    total_end = timeit.default_timer()
    total_time = total_end - total_start
    m = divmod(total_time, 60)
    h = divmod(m[0], 60)
    logging.info('\nMerge LDA Model complete!!!\n'
                 '***Using time*** \n'
                 'index training {:.2f}\n'
                 'dict training {:.2f}\n'
                 'tfidf training {:.2f}\n'
                 'lda training {:.2f}\n'
                 'sim training {:.2f}\n'
                 'Total time: {:d}h {:d}m {:.2f}s'.format(make_index_time, make_dict_time,
                                                          make_tfidf_time, make_lda_time, sim_time,
                                                          int(h[0]), int(h[1]), m[1]))
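
# A hedged usage sketch for merge() above: the resource layout under gl.res
# ('/resource/corpus', '/resource/dict', ...) must already exist from a
# previous build; the input file name here is a made-up placeholder.
# Each line of the file is expected to be "id,ac_id,token1:token2:...".
with open('new_articles.csv') as texts:
    # Rebuild the index, dictionary/corpus, TF-IDF and LDA models,
    # and also refresh the similarity index.
    merge(texts, index_dic=True, tfidf=True, lda=True, sim=True)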
import logging

from sqlitedict import SqliteDict

# DEFAULT_DATABASE_NAME, cache_encode and cache_decode are defined elsewhere
# in the module (custom (de)serializers passed through to SqliteDict).
logger = logging.getLogger(__name__)


class Cache(object):
    def __init__(self, **kwargs):
        self.name = kwargs.get('name', DEFAULT_DATABASE_NAME)
        self._db = SqliteDict('./%s.sqlite' % self.name,
                              encode=cache_encode, decode=cache_decode,
                              autocommit=True)
        logger.info('name=%s size=%s', self.name, len(self._db))

    def close(self):
        self._db.close()

    def __del__(self):
        pass  # self.close()

    def __iter__(self):
        return self._db.__iter__()

    def iteritems(self):
        return self._db.iteritems()

    def iterkeys(self):
        return self._db.iterkeys()

    def itervalues(self):
        return self._db.itervalues()

    def items(self):
        return self.iteritems()

    def __getitem__(self, key):
        return self._db.__getitem__(key)

    def get(self, key, default=None):
        try:
            return self.__getitem__(key)
        except KeyError:
            return default

    def __setitem__(self, key, value):
        return self._db.__setitem__(key, value)

    def set(self, key, value):
        return self.__setitem__(key, value)

    def __len__(self):
        return len(self._db)

    def __contains__(self, key):
        return key in self._db

    def __delitem__(self, key):
        if key in self._db:
            del self._db[key]

    def list(self, limit=None):
        logger.info('list')
        keys = []
        for key in self:
            keys.append(key)
            if limit is not None and len(keys) == limit:
                break
        return keys
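
# A minimal sketch of the Cache wrapper in use, assuming the module defines
# cache_encode/cache_decode (e.g. pickle- or JSON-based) and
# DEFAULT_DATABASE_NAME; the keys and values below are illustrative only.
cache = Cache(name='responses')
cache.set('user:42', {'status': 'active'})
print(cache.get('user:42'))              # -> {'status': 'active'}
print(cache.get('missing', default={}))  # -> {} (KeyError swallowed by get)
print(cache.list(limit=10))              # at most the first 10 keys
cache.close()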