Example 1
import os
import threading

from sqlitedict import SqliteDict


class DiskQueue:
    DIR_PATH = './diskqueue'
    IN_PROGRESS_DB_NAME = 'inprogress.sqlite'
    TODO_DB_NAME = 'todo.sqlite'
    SEEN_DB_NAME = 'seen.sqlite'

    def __init__(self, load: bool = False):
        self.iterLock = threading.Lock()

        if not os.path.exists(self.DIR_PATH):
            os.makedirs(self.DIR_PATH)

        if not load:
            for path in [self.IN_PROGRESS_DB_NAME, self.TODO_DB_NAME, self.SEEN_DB_NAME]:
                try:
                    os.remove('{}/{}'.format(self.DIR_PATH, path))
                except FileNotFoundError:
                    continue

        self.inProgress = SqliteDict('{}/{}'.format(self.DIR_PATH, self.IN_PROGRESS_DB_NAME), autocommit=True)
        self.todo = SqliteDict('{}/{}'.format(self.DIR_PATH, self.TODO_DB_NAME), autocommit=True)
        self.seen = SqliteDict('{}/{}'.format(self.DIR_PATH, self.SEEN_DB_NAME), autocommit=True)

        # If we need to load state, add everything that was in progress to the todo queue
        if load:
            # materialize the keys first so deleting inside the loop is safe
            for key in list(self.inProgress.keys()):
                self.todo[key] = True
                del self.inProgress[key]

    def Push(self, key):
        if (key not in self.todo) and (key not in self.inProgress) and (key not in self.seen):
            self.todo[key] = True

    def Next(self):
        toReturn = None
        with self.iterLock:
            toReturn = next(self.todo.keys(), None)
            if toReturn is not None:
                self.inProgress[toReturn] = True
                del self.todo[toReturn]
        return toReturn

    def Done(self, key):
        self.seen[key] = True
        del self.inProgress[key]

    def Close(self):
        self.inProgress.close()
        self.todo.close()
        self.seen.close()

    def IsDone(self):
        tmp = False
        with self.iterLock:
            tmp = len(self.todo) == 0 and len(self.inProgress) == 0
        return tmp
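
A minimal usage sketch for DiskQueue; the queued URLs are hypothetical placeholders, and a real consumer would typically call Next/Done from worker threads:

# Hypothetical single-threaded walkthrough of the queue above.
q = DiskQueue(load=False)
q.Push('https://example.com/a')
q.Push('https://example.com/b')

while not q.IsDone():
    item = q.Next()
    if item is None:
        continue  # todo is empty; another worker may still be mid-item
    # ... process item ...
    q.Done(item)

q.Close()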
Example 2
from flask import Flask
from sqlitedict import SqliteDict
from queue import Queue
import datetime
import sys
import yaml
from pytimeparse.timeparse import timeparse
import atexit
import traceback

app = Flask(__name__)

with open('config.yml') as f:
    config = yaml.safe_load(f)
machine_groups = {}
for mg in config['machine_groups']:
    machine_groups[mg['name']] = mg

machines = SqliteDict('./machines.sqlite', autocommit=True)
print(list(machines.iterkeys()))

task_queue = Queue()

timeout_threads = {}


def get_machines_in_group(group_name):
    ret = []
    for _, m in machines.iteritems():
        if m['machine_group']['name'] == group_name:
            ret.append(m)
    return ret


def get_machine_name_by_ip(ip):
    # body not shown in the source; a plausible sketch, assuming each machine
    # record stores its address under a (hypothetical) 'ip' key
    for name, m in machines.iteritems():
        if m.get('ip') == ip:
            return name
    return None
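
For context, a hypothetical sketch of how a machine record might look and how the two lookups above behave; the host name, IP, and group name are made-up values:

# Hypothetical record, shaped after the lookups above.
machines['host-1'] = {
    'ip': '10.0.0.5',
    'machine_group': {'name': 'workers'},
}
print(get_machines_in_group('workers'))    # -> [{'ip': '10.0.0.5', ...}]
print(get_machine_name_by_ip('10.0.0.5'))  # -> 'host-1'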
Example 3
import gc
import glob
import logging
import os
import timeit
from itertools import chain, groupby
from operator import itemgetter

from gensim import corpora, models, similarities
from sqlitedict import SqliteDict

import gl  # project-wide config module (paths, timestamps, LDA hyperparameters), defined elsewhere


def merge(texts, index_dic=True, tfidf=True, lda=True, sim=False):
    total_start = timeit.default_timer()
    make_index_time = 0
    make_dict_time = 0
    make_lda_time = 0
    make_tfidf_time = 0
    sim_time = 0

    if index_dic:
        f = [line.split(',') for line in texts.readlines()]
        logging.info('Create id & ac_id list')
        ids = [row[0] for row in f]
        ac_ids = [row[1] for row in f]
        logging.info('Create contents list')
        contents = []
        for row in f:
            if len(row) == 3:
                contents.append(row[2].strip().split(':'))
            else:
                contents.append([])

        # make index
        logging.info('***********Now merge index by sqlitedict***********')
        timer_start = timeit.default_timer()
        old_corpus_len = len(corpora.MmCorpus(gl.res + '/resource/corpus'))
        pos2paid = zip(range(old_corpus_len, old_corpus_len + len(f)), ac_ids)
        paid2pos_new = {}
        for key, paid in groupby(sorted(pos2paid, key=itemgetter(1)), key=itemgetter(1)):
            paid2pos_new.update({int(key): [i[0] for i in paid]})
        id2pos_new = dict(zip(ids, range(old_corpus_len, old_corpus_len + len(f))))
        pos2id_new = dict(zip(range(old_corpus_len, old_corpus_len + len(f)), ids))

        id2pos = SqliteDict(filename=gl.res + '/resource/id2pos', autocommit=True)
        id2pos.update(id2pos_new)
        id2pos.close()
        pos2id = SqliteDict(filename=gl.res + '/resource/pos2id', autocommit=True)
        pos2id.update(pos2id_new)
        pos2id.close()
        paid2pos = SqliteDict(filename=gl.res + '/resource/paid2pos', autocommit=True)
        # extend the position lists of ac_ids already in the index, then write
        # every entry, new keys included
        existing = set(int(i) for i in paid2pos.iterkeys())
        for i in set(paid2pos_new.keys()) & existing:
            paid2pos_new[i] = list(chain(paid2pos[i], paid2pos_new[i]))
        paid2pos.update(paid2pos_new)
        paid2pos.close()
        timer_end = timeit.default_timer()
        make_index_time = timer_end - timer_start

        # Merge dictionary
        logging.info('***********Now merge Dictionary***********')
        timer_start = timeit.default_timer()
        newDict = corpora.Dictionary(contents)
        newDict.filter_extremes(no_below=20, no_above=0.1, keep_n=None)
        dic = corpora.Dictionary.load(gl.res + '/resource/dict')
        dic.merge_with(newDict)
        dic.save(gl.res + '/resource/dict')
        timer_end = timeit.default_timer()
        make_dict_time = timer_end - timer_start

        # merge corpus
        logging.info('***********Now merge Corpus***********')
        temps = []
        for i, t in enumerate(contents):
            temps.append(dic.doc2bow(t))
            if i % 10000 == 0:
                logging.info('make corpus ' + str(i) + ' articles')
        corpora.MmCorpus.serialize(gl.res + '/resource/new_c', temps)
        gc.collect()
        corpus = corpora.MmCorpus(gl.res + '/resource/corpus')
        new_corpus = corpora.MmCorpus(gl.res + '/resource/new_c')
        merged_corpus = chain(corpus, new_corpus)
        corpora.MmCorpus.serialize(gl.res + '/resource/merged_c', merged_corpus)  # Overwrite corpus

        # delete the stale corpus files first, then promote the merged corpus;
        # two passes avoid depending on glob's unspecified file ordering
        for filename in glob.glob(gl.res + '/resource/*'):
            if filename.endswith('corpus') or filename.endswith('corpus.index') \
                    or filename.endswith('new_c') or filename.endswith('new_c.index'):  # rm useless corpus
                os.unlink(filename)
        for filename in glob.glob(gl.res + '/resource/*'):
            if filename.endswith('merged_c'):  # rename to corpus
                os.rename(filename, gl.res + '/resource/corpus')
            elif filename.endswith('merged_c.index'):
                os.rename(filename, gl.res + '/resource/corpus.index')

    if tfidf:
        # do tfidf merge
        gc.collect()
        logging.info('***********Now merge TF-IDF model***********')
        timer_start = timeit.default_timer()
        for filename in glob.glob(gl.res + '/resource/*'):  # backup old model
            if filename.endswith('tfidf'):
                os.rename(filename, filename + '_' + gl.c_time)
        corpus = corpora.MmCorpus(gl.res + '/resource/corpus')  # reload corpus
        tfidf_model = models.TfidfModel(corpus)
        tfidf_model.save(gl.res + '/resource/tfidf')
        timer_end = timeit.default_timer()
        make_tfidf_time = timer_end - timer_start

    if lda:
        # do lda merge
        gc.collect()
        tfidf_model = models.TfidfModel.load(gl.res + '/resource/tfidf')
        corpus = corpora.MmCorpus(gl.res + '/resource/corpus')
        corpus_tfidf = tfidf_model[corpus]
        dic = corpora.Dictionary.load(gl.res + '/resource/dict')
        logging.info('***********Now merge LDA model***********')
        timer_start = timeit.default_timer()
        for filename in glob.glob(gl.res + '/resource/*'):  # backup old model
            if filename.endswith('lda') or filename.endswith('lda.state'):
                os.rename(filename, filename + '_' + gl.c_time)
        # lda_model = models.LdaMulticore(corpus_tfidf, id2word=dic, chunksize=gl.chunksize,
        #                                 num_topics=gl.topicCount, workers=gl.workers, passes=gl.lda_passes)
        lda_model = models.LdaModel(corpus_tfidf, id2word=dic, chunksize=gl.chunksize,
                                    num_topics=gl.topicCount, passes=gl.lda_passes)
        lda_model.save(gl.res + '/resource/lda')
        timer_end = timeit.default_timer()
        make_lda_time = timer_end - timer_start
        logging.info('lda training cost %.2f seconds' % make_lda_time)

    if sim:
        gc.collect()
        logging.info('***********Now Make Similarity Index***********')
        st = timeit.default_timer()
        corpus = corpora.MmCorpus(gl.res + '/resource/corpus')
        lda_model = models.LdaModel.load(gl.res + '/resource/lda')
        index = similarities.MatrixSimilarity(lda_model[corpus], num_features=gl.topicCount)
        index.save(gl.res + '/resource/simIndex')
        sim_time = timeit.default_timer() - st

    total_end = timeit.default_timer()
    total_time = total_end - total_start
    m = divmod(total_time, 60)
    h = divmod(m[0], 60)
    logging.info('\nMerge LDA Model complete!!!\n'
                 '***Using time*** \n'
                 'index training    {:.2f}\n'
                 'dict training     {:.2f}\n'
                 'tfidf training    {:.2f}\n'
                 'lda training      {:.2f}\n'
                 'sim training      {:.2f}\n'
                 'Total time:       {:d}h {:d}m {:.2f}s'.format(make_index_time, make_dict_time, make_tfidf_time,
                                                                make_lda_time, sim_time, int(h[0]), int(h[1]), m[1]))
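
A sketch of how merge might be invoked; new_docs.txt is a hypothetical input file whose lines follow the format the parser above expects:

# Hypothetical call; each line of the file is "id,ac_id,tok1:tok2:...".
with open('new_docs.txt') as texts:
    merge(texts, index_dic=True, tfidf=True, lda=True, sim=False)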
Example 4
import logging

from sqlitedict import SqliteDict

logger = logging.getLogger(__name__)

# DEFAULT_DATABASE_NAME, cache_encode and cache_decode are module-level
# definitions not shown here: a default database name plus the custom
# (de)serializers handed to SqliteDict.


class Cache(object):
    def __init__(self, **kwargs):
        self.name = kwargs.get('name', DEFAULT_DATABASE_NAME)
        self._db = SqliteDict('./%s.sqlite' % self.name,
                              encode=cache_encode,
                              decode=cache_decode,
                              autocommit=True)
        logger.info('name=%s size=%s', self.name, len(self._db))

    def close(self):
        self._db.close()

    def __del__(self):
        pass  # self.close()

    def __iter__(self):
        return self._db.__iter__()

    def iteritems(self):
        return self._db.iteritems()

    def iterkeys(self):
        return self._db.iterkeys()

    def itervalues(self):
        return self._db.itervalues()

    def items(self):
        return self.iteritems()

    def __getitem__(self, key):
        return self._db.__getitem__(key)

    def get(self, key, default=None):
        try:
            return self.__getitem__(key)
        except KeyError:
            return default

    def __setitem__(self, key, value):
        return self._db.__setitem__(key, value)

    def set(self, key, value):
        return self.__setitem__(key, value)

    def __len__(self):
        return len(self._db)

    def __contains__(self, key):
        return key in self._db

    def __delitem__(self, key):
        if key in self._db:
            del self._db[key]

    def list(self, limit=None):
        logger.info('list')
        keys = []
        for key in self:
            keys.append(key)
            if limit is not None and len(keys) == limit:
                break
        return keys
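
A minimal usage sketch for Cache; the name and stored values are placeholders, and cache_encode/cache_decode must be able to serialize whatever is stored:

# Hypothetical usage of the wrapper above.
cache = Cache(name='demo')
cache.set('user:42', {'visits': 3})
print(cache.get('user:42'))         # -> {'visits': 3}
print(cache.get('missing', 'n/a'))  # -> 'n/a'
print(cache.list(limit=10))         # at most the first 10 keys
cache.close()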