Example #1
def _persist_v0(file_path, zg):
    print('Creating db...')
    persisted = SqliteDict(file_path, autocommit=False)
    print('Updating data...')
    persisted.update(zg.country_postal_codes)
    print('Committing data...')
    persisted.commit()
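A minimal sketch of reading the persisted data back; the path is illustrative, and the read-only 'r' flag and context-manager support are standard SqliteDict features:
from sqlitedict import SqliteDict

# Illustrative: re-open the file written by _persist_v0 read-only and list its contents.
with SqliteDict('zipgun.db', flag='r') as persisted:
    for country_code in persisted.keys():
        print('{}: {} postal codes'.format(country_code, len(persisted[country_code])))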
Example #2
def main(data_dir):
    print('Loading data...')
    zg = Zipgun(data_dir, force_text=True)
    print('Creating db...')
    persisted = SqliteDict(os.path.join(data_dir, DATA_FILE), autocommit=False)
    print('Updating data...')
    persisted.update(zg.country_postal_codes)
    print('Committing data...')
    persisted.commit()
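The same step also works with SqliteDict's context manager, which closes the handle automatically; a sketch under the same assumptions (Zipgun, DATA_FILE) as the example above:
def main(data_dir):
    print('Loading data...')
    zg = Zipgun(data_dir, force_text=True)
    # autocommit is off, so an explicit commit() is still required;
    # the with-block closes the SqliteDict on exit.
    with SqliteDict(os.path.join(data_dir, DATA_FILE), autocommit=False) as persisted:
        persisted.update(zg.country_postal_codes)
        persisted.commit()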
Example #3
def _persist_v1(file_path, zg):
    print('Creating meta db...')
    zipgun_info = SqliteDict(
        file_path, tablename='zipgun_info', autocommit=False)
    zipgun_info['version'] = 1
    zipgun_info['country_codes'] = list(zg.country_postal_codes.keys())  # list() so the value pickles
    zipgun_info.commit()

    for country_code in zg.country_postal_codes:
        print('Creating {} db...'.format(country_code))
        country_data = SqliteDict(
            file_path, tablename='zg_{}'.format(country_code),
            autocommit=False)
        country_data.update(zg.country_postal_codes[country_code])
        country_data.commit()
        time.sleep(1.0)                   # crude pause between table commits
        country_data.close()
    zipgun_info.close()
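A hedged sketch of reading the layout written by _persist_v1 back: the 'zipgun_info' table holds the country list and each country sits in its own 'zg_<code>' table (the loader function itself is illustrative):
def _load_v1(file_path):
    # Read the country list from the metadata table written above.
    with SqliteDict(file_path, tablename='zipgun_info', flag='r') as zipgun_info:
        country_codes = zipgun_info['country_codes']
    data = {}
    for country_code in country_codes:
        # Each country was committed to its own 'zg_<code>' table.
        with SqliteDict(file_path, tablename='zg_{}'.format(country_code), flag='r') as country_data:
            data[country_code] = dict(country_data)
    return data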
Example #4
def reset(texts, index_dic=True, tfidf=True, hdp=False, lda=True, sim=False):
    total_start = timeit.default_timer()
    make_index_time = 0
    make_dict_time = 0
    make_lda_time = 0
    make_tfidf_time = 0
    sim_time = 0
    hdptopicnum = 0

    if index_dic:
        f = [i.split(',') for i in texts.readlines()]
        logging.info('Create id & ac_id list')
        ids = [f[i][1] for i in range(len(f))]
        ac_ids = [f[i][0] for i in range(len(f))]
        logging.info('Create contents list')
        contents = []
        for i in range(len(f)):
            if len(f[i]) == 3:
                contents.append(f[i][2].strip().split(':'))
            else:
                contents.append([])

        # make index
        logging.info('***********Now Make Index by sqlitedict***********')
        timer_start = timeit.default_timer()
        pos2paid = zip(range(len(f)), ac_ids)
        paid2pos_rel = {}
        for key, paid in groupby(sorted(pos2paid, key=itemgetter(1)), key=itemgetter(1)):
            paid2pos_rel.update({int(key): [i[0] for i in paid]})
        id2pos_rel = dict(zip(ids, range(len(f))))
        pos2id_rel = dict(zip(range(len(f)), ids))

        id2pos = SqliteDict(filename=gl.res + '/resource/id2pos', autocommit=True)
        id2pos.clear()
        id2pos.update(id2pos_rel)
        id2pos.close()
        pos2id = SqliteDict(filename=gl.res + '/resource/pos2id', autocommit=True)
        pos2id.clear()
        pos2id.update(pos2id_rel)
        pos2id.close()
        paid2pos = SqliteDict(filename=gl.res + '/resource/paid2pos', autocommit=True)
        paid2pos.clear()
        paid2pos.update(paid2pos_rel)
        paid2pos.close()
        timer_end = timeit.default_timer()
        make_index_time = timer_end - timer_start

        # make dict
        logging.info('***********Now Make Dictionary***********')
        timer_start = timeit.default_timer()
        dic = corpora.Dictionary(contents)
        ############## optimized dictionary
        dic.filter_extremes(no_below=20, no_above=0.1, keep_n=None)
        ##############
        dic.save(gl.res + '/resource/dict')
        timer_end = timeit.default_timer()
        make_dict_time = timer_end - timer_start

        # make corpus
        logging.info('***********Now Make Corpus***********')

        temps = []
        for i, t in enumerate(contents):
            temps.append(dic.doc2bow(t))
            if i % 10000 == 0:
                logging.info('make corpus ' + str(i) + ' articles')
        corpus = temps
        corpora.MmCorpus.serialize(gl.res + '/resource/corpus', corpus)

    if tfidf:
        # do tfidf train
        logging.info('***********Now Training TF-IDF Model***********')
        timer_start = timeit.default_timer()
        corpus = corpora.MmCorpus(gl.res + '/resource/corpus')
        tfidf = models.TfidfModel(corpus)
        tfidf.save(gl.res + '/resource/tfidf')

        timer_end = timeit.default_timer()
        make_tfidf_time = timer_end - timer_start

    if hdp:
        gc.collect()
        corpus = corpora.MmCorpus(gl.res + '/resource/corpus')
        dic = corpora.Dictionary.load(gl.res + '/resource/dict')
        hdpmodel = models.hdpmodel.HdpModel(corpus, id2word=dic)
        hdptopicnum = len(hdpmodel.print_topics(topics=-1, topn=10))
        logging.info('hdptopicnum is {}'.format(hdptopicnum))

    if lda:
        # do lda train
        gc.collect()
        tfidf = models.TfidfModel.load(gl.res + '/resource/tfidf')
        corpus = corpora.MmCorpus(gl.res + '/resource/corpus')
        dic = corpora.Dictionary.load(gl.res + '/resource/dict')
        corpus_tfidf = tfidf[corpus]
        logging.info('***********Now Training LDA Model***********')
        timer_start = timeit.default_timer()
        if hdptopicnum != 0:
            gl.topicCount = hdptopicnum
        lda = models.LdaMulticore(corpus_tfidf, id2word=dic, chunksize=gl.chunksize,
                                  num_topics=gl.topicCount, workers=gl.workers, passes=gl.lda_passes)
        # lda = models.LdaModel(corpus_tfidf, id2word=dic, chunksize=gl.chunksize,
        #                       num_topics=gl.topicCount, passes=gl.lda_passes, distributed=True)
        lda.save(gl.res + '/resource/lda')
        timer_end = timeit.default_timer()
        make_lda_time = timer_end - timer_start
        logging.info('lda training cost %.2f seconds' % make_lda_time)

    if sim:
        gc.collect()
        logging.info('***********Now Make Similarity Index***********')
        st = timeit.default_timer()
        corpus = corpora.MmCorpus(gl.res + '/resource/corpus')
        lda = models.LdaModel.load(gl.res + '/resource/lda')
        index = similarities.MatrixSimilarity(lda[corpus], num_features=gl.topicCount)
        index.save(gl.res + '/resource/simIndex')
        sim_time = timeit.default_timer() - st

    total_end = timeit.default_timer()
    total_time = total_end - total_start
    m = divmod(total_time, 60)
    h = divmod(m[0], 60)
    logging.info('\nReset LDA Model complete!!!\n'
                 '***Using time*** \n'
                 'index training    {:.2f}\n'
                 'dict training     {:.2f}\n'
                 'tfidf training    {:.2f}\n'
                 'lda training      {:.2f}\n'
                 'sim training      {:.2f}\n'
                 'Total time:       {:d}h {:d}m {:.2f}s'.format(make_index_time, make_dict_time, make_tfidf_time,
                                                                make_lda_time, sim_time, int(h[0]), int(h[1]), m[1]))

    basicConfig = open(gl.res + '/resource/basicConfig.txt', mode='w+')
    basicConfig.write('FileName: {}'
                      '\nTopicNumber = {}'
                      '\nestTopicNumber = {}'
                      '\nldaPasses = {}'
                      .format(os.path.basename(texts.name), gl.topicCount, hdptopicnum, gl.lda_passes))
    basicConfig.close()
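For orientation, a sketch of reloading the artifacts that reset() saves and inferring topics for a new tokenized document; gl.res is assumed to point at the same resource directory:
from gensim import corpora, models

# Illustrative only: reload the saved dictionary, TF-IDF and LDA models.
dic = corpora.Dictionary.load(gl.res + '/resource/dict')
tfidf = models.TfidfModel.load(gl.res + '/resource/tfidf')
lda = models.LdaModel.load(gl.res + '/resource/lda')

def topics_for(tokens):
    # Bag-of-words, TF-IDF weighting, then LDA inference, mirroring the training pipeline above.
    bow = dic.doc2bow(tokens)
    return lda[tfidf[bow]]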
Example #5
def merge(texts, index_dic=True, tfidf=True, lda=True, sim=False):
    total_start = timeit.default_timer()
    make_index_time = 0
    make_dict_time = 0
    make_lda_time = 0
    make_tfidf_time = 0
    sim_time = 0

    if index_dic:
        f = [i.split(',') for i in texts.readlines()]
        logging.info('Create id & ac_id list')
        ids = [f[i][0] for i in range(len(f))]
        ac_ids = [f[i][1] for i in range(len(f))]
        logging.info('Create contents list')
        contents = []
        for i in range(len(f)):
            if len(f[i]) == 3:
                contents.append(f[i][2].strip().split(':'))
            else:
                contents.append([])

        # make index
        logging.info('***********Now merge index by sqlitedict***********')
        timer_start = timeit.default_timer()
        old_corpus_len = len(corpora.MmCorpus(gl.res + '/resource/corpus'))
        pos2paid = zip(range(old_corpus_len, old_corpus_len + len(f)), ac_ids)
        paid2pos_new = {}
        for key, paid in groupby(sorted(pos2paid, key=itemgetter(1)), key=itemgetter(1)):
            paid2pos_new.update({int(key): [i[0] for i in paid]})
        id2pos_new = dict(zip(ids, range(old_corpus_len, old_corpus_len + len(f))))
        pos2id_new = dict(zip(range(old_corpus_len, old_corpus_len + len(f)), ids))

        id2pos = SqliteDict(filename=gl.res + '/resource/id2pos', autocommit=True)
        id2pos.update(id2pos_new)
        id2pos.close()
        pos2id = SqliteDict(filename=gl.res + '/resource/pos2id', autocommit=True)
        pos2id.update(pos2id_new)
        pos2id.close()
        paid2pos = SqliteDict(filename=gl.res + '/resource/paid2pos', autocommit=True)
        x = [set(paid2pos_new.keys()), set([int(i) for i in paid2pos.keys()])]
        for i in list(set.intersection(*x)):  # update duplicate key
            temp = list(chain(paid2pos[i], paid2pos_new[i]))
            paid2pos.update({int(i): temp})
        paid2pos.close()
        timer_end = timeit.default_timer()
        make_index_time = timer_end - timer_start

        # Merge dictionary
        logging.info('***********Now merge Dictionary***********')
        timer_start = timeit.default_timer()
        newDict = corpora.Dictionary(contents)
        newDict.filter_extremes(no_below=20, no_above=0.1, keep_n=None)
        dic = corpora.Dictionary.load(gl.res + '/resource/dict')
        dic.merge_with(newDict)
        dic.save(gl.res + '/resource/dict')
        timer_end = timeit.default_timer()
        make_dict_time = timer_end - timer_start

        # merge corpus
        logging.info('***********Now merge Corpus***********')
        temps = []
        for i, t in enumerate(contents):
            temps.append(dic.doc2bow(t))
            if i % 10000 == 0:
                logging.info('make corpus ' + str(i) + ' articles')
        corpora.MmCorpus.serialize(gl.res + '/resource/new_c', temps)
        gc.collect()
        corpus = corpora.MmCorpus(gl.res + '/resource/corpus')
        new_corpus = corpora.MmCorpus(gl.res + '/resource/new_c')
        merged_corpus = chain(corpus, new_corpus)
        corpora.MmCorpus.serialize(gl.res + '/resource/merged_c', merged_corpus)  # Overwrite corpus

        for filename in glob.glob(gl.res + '/resource/*'):  # remove superseded corpus files first
            if filename.endswith('corpus') or filename.endswith('corpus.index') \
                    or filename.endswith('new_c') or filename.endswith('new_c.index'):
                os.unlink(filename)
        for filename in glob.glob(gl.res + '/resource/*'):  # then rename the merged corpus into place
            if filename.endswith('merged_c'):  # rename to corpus
                os.rename(filename, gl.res + '/resource/corpus')
            if filename.endswith('merged_c.index'):
                os.rename(filename, gl.res + '/resource/corpus.index')

    if tfidf:
        # do tfidf merge
        gc.collect()
        logging.info('***********Now merge TF-IDF model***********')
        timer_start = timeit.default_timer()
        for filename in glob.glob(gl.res + '/resource/*'):  # backup old model
            if filename.endswith('tfidf'):
                os.rename(filename, filename + '_' + gl.c_time)
        corpus = corpora.MmCorpus(gl.res + '/resource/corpus')  # reload corpus
        tfidf = models.TfidfModel(corpus)
        tfidf.save(gl.res + '/resource/tfidf')
        timer_end = timeit.default_timer()
        make_tfidf_time = timer_end - timer_start

    if lda:
        # do lda merge
        gc.collect()
        tfidf = models.TfidfModel.load(gl.res + '/resource/tfidf')
        corpus = corpora.MmCorpus(gl.res + '/resource/corpus')
        corpus_tfidf = tfidf[corpus]
        dic = corpora.Dictionary.load(gl.res + '/resource/dict')
        logging.info('***********Now merge LDA model***********')
        timer_start = timeit.default_timer()
        for filename in glob.glob(gl.res + '/resource/*'):  # backup old model
            if filename.endswith('lda') or filename.endswith('lda.state'):
                os.rename(filename, filename + '_' + gl.c_time)
        # lda = models.LdaMulticore(corpus_tfidf, id2word=dic, chunksize=gl.chunksize,
        #                           num_topics=gl.topicCount, workers=gl.workers, passes=gl.lda_passes)
        lda = models.LdaModel(corpus_tfidf, id2word=dic, chunksize=gl.chunksize,
                              num_topics=gl.topicCount, passes=gl.lda_passes)
        lda.save(gl.res + '/resource/lda')
        timer_end = timeit.default_timer()
        make_lda_time = timer_end - timer_start
        logging.info('lda training cost %.2f seconds' % make_lda_time)

    if sim:
        gc.collect()
        logging.info('***********Now Make Similarity Index***********')
        st = timeit.default_timer()
        corpus = corpora.MmCorpus(gl.res + '/resource/corpus')
        lda = models.LdaModel.load(gl.res + '/resource/lda')
        index = similarities.MatrixSimilarity(lda[corpus], num_features=gl.topicCount)
        index.save(gl.res + '/resource/simIndex')
        sim_time = timeit.default_timer() - st

    total_end = timeit.default_timer()
    total_time = total_end - total_start
    m = divmod(total_time, 60)
    h = divmod(m[0], 60)
    logging.info('\nMerge LDA Model complete!!!\n'
                 '***Using time*** \n'
                 'index training    {:.2f}\n'
                 'dict training     {:.2f}\n'
                 'tfidf training    {:.2f}\n'
                 'lda training      {:.2f}\n'
                 'sim training      {:.2f}\n'
                 'Total time:       {:d}h {:d}m {:.2f}s'.format(make_index_time, make_dict_time, make_tfidf_time,
                                                                make_lda_time, sim_time, int(h[0]), int(h[1]), m[1]))
    return now.date()  # 'now' is assumed to be a module-level datetime defined elsewhere in this file
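Both reset() and merge() can optionally build a MatrixSimilarity index; a hedged sketch of querying it afterwards, reusing the same resource paths:
from gensim import corpora, models, similarities

# Illustrative only: load the saved LDA model and similarity index,
# then rank the stored documents against a new tokenized document.
dic = corpora.Dictionary.load(gl.res + '/resource/dict')
lda = models.LdaModel.load(gl.res + '/resource/lda')
index = similarities.MatrixSimilarity.load(gl.res + '/resource/simIndex')

def most_similar(tokens, topn=10):
    # Mirror how the index was built above: LDA topics of the bag-of-words vector.
    sims = index[lda[dic.doc2bow(tokens)]]
    return sorted(enumerate(sims), key=lambda pair: -pair[1])[:topn]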


Example #6
if __name__ == '__main__':
    update_stock_code()
    stock_codes = get_all_code()
    agency_db = SqliteDict(Path('..') / 'db' / 'agency_db.sqlite',
                           autocommit=True)

    for stock_code in tqdm(stock_codes):
        today = get_businessday()

        if stock_code not in agency_db:
            agency_db.update({
                stock_code: {
                    'agency_meta': pd.DataFrame(),
                    'agency_detail': pd.DataFrame()
                }
            })

        is_not_duplicated = today not in agency_db[stock_code][
            'agency_detail'].index
        if is_not_duplicated:
            agency_volume_meta, agency_volume_detail = crawl_agency_volume(
                stock_code)
            if not agency_volume_meta.empty:
                agency_volume_meta_ = update_diff_only(
                    agency_db[stock_code]['agency_meta'], agency_volume_meta)
                agency_volume_detail_ = update_diff_only(
                    agency_db[stock_code]['agency_detail'],
                    agency_volume_detail)
                agency_db.update({
Example #7
def main(result_file, site_file, constant_modification_list=None, variable_modification_list=None,
         enzyme_info=None, n_processes=4, output_file=None):
    if output_file is None:
        # output_file = os.path.splitext(result_file)[0] + '.theoretical_ions'
        output_file = os.path.splitext(result_file)[0] + ".db"
    else:
        output_file += ".db"
    modification_table = RestrictedModificationTable.bootstrap(constant_modification_list, variable_modification_list)
    if constant_modification_list is None and variable_modification_list is None:
        modification_table = ModificationTable.bootstrap()

    if isinstance(site_file, str):
        site_list = [line.strip() for line in open(site_file, "r")]
        site_list = list(map(int, site_list))
    else:
        site_list = site_file

    compo_dict = csv.DictReader(open(result_file, "r"), delimiter=",")
    colnames = compo_dict.fieldnames
    glycan_identity = get_glycan_identities(colnames)
    enzyme_info = list(map(get_enzyme, enzyme_info or []))  # list() so the value pickles; tolerate the None default
    tag = datetime.datetime.strftime(datetime.datetime.now(), "%Y%m%d-%H%M%S")
    metadata = {
        "glycan_identities": glycan_identity,
        "constant_modifications": constant_modification_list,
        "variable_modifications": variable_modification_list,
        "site_list": site_list,
        "ms1_output_file": result_file,
        "enzyme": enzyme_info,
        "tag": tag,
        "enable_partial_hexnac_match": constants.PARTIAL_HEXNAC_LOSS
    }

    metadata_store = SqliteDict(output_file, tablename="metadata", flag='n')
    metadata_store.update(metadata)
    metadata_store.commit()

    theoretical_search_space_store = SqliteDict(output_file, tablename="theoretical_search_space")
    pool = multiprocessing.Pool(n_processes)

    task_fn = functools.partial(process_predicted_ms1_ion, modification_table=modification_table,
                                site_list=site_list, glycan_identity=glycan_identity)

    cntr = 0
    if n_processes > 1:
        logger.debug("Building theoretical sequences concurrently")
        for res in (itertools.chain.from_iterable(pool.imap(task_fn, compo_dict, chunksize=500))):
            theoretical_search_space_store[cntr] = res
            cntr += 1
    else:
        logger.debug("Building theoretical sequences sequentially")
        for row in compo_dict:
            res = task_fn(row)
            for item in res:
                theoretical_search_space_store[cntr] = item
                cntr += 1
                if (cntr % 10000) == 0:
                    theoretical_search_space_store.commit()
                    logger.info("Committing, %d records made", cntr)
    theoretical_search_space_store.commit()
    theoretical_search_space_store.close()

    pool.close()
    pool.join()
    pool.terminate()

    logger.info("Hypothesis building complete")

    return output_file
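A brief, hedged sketch of reading the stored hypothesis back out of the two tables that main() writes ('metadata' and 'theoretical_search_space'); the loader function itself is illustrative:
from sqlitedict import SqliteDict

def load_hypothesis(db_file):
    # Read the run metadata and the generated records back, read-only.
    with SqliteDict(db_file, tablename="metadata", flag='r') as metadata_store:
        metadata = dict(metadata_store)
    with SqliteDict(db_file, tablename="theoretical_search_space", flag='r') as store:
        records = [store[key] for key in store.keys()]
    return metadata, records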
Example #8
start_68 = datetime.datetime(year=2016, month=4, day=22)
end_68 = datetime.datetime(year=2016, month=5, day=2)

timestamp_68_start = unix_time_millis(start_68)
timestamp_68_end = unix_time_millis(end_68)

done_players = 0

if os.path.isfile("games.sqlite"):
	#with open("game_data.pkl", "rb") as fp:
	player_tiers = SqliteDict("player_tiers.sqlite", autocommit=True)
	done_games = SqliteDict("games.sqlite", autocommit=True)
	oldie = players.qsize()
	player_tiers.update({x['playerOrTeamId']:1 for x in masters['entries']})
	players = Queue.PriorityQueue()
	a = [players.put((player_tiers[x], x)) for x in player_tiers]
	print "Queue length has been modified from %d to %d" % (oldie, players.qsize())
else:
	player_tiers = SqliteDict("player_tiers.sqlite")
	player_tiers.update({x['playerOrTeamId']:0 for x in challenger['entries']})
	done_games = SqliteDict("games.sqlite")

gold=False

#for i in range(2090): players.get(); done_players += 1 # 19942923

while not players.empty():
	curr = players.get()
	if curr[0] > 3 and not gold:
		done_games = SqliteDict("games_gold.sqlite")
		gold = True
Example #9
class Bucket(object):

    def __init__(self, bucket_name, storage_path=None):
        ''' Bucket init

        - if the bucket exists, meta parameter will be ignored

        '''
        if bucket_name and isinstance(bucket_name, str) and re.match(r"^[a-z0-9\.\-_]+$", bucket_name, re.I):
            self._name = bucket_name.strip()
        else:
            raise falcon.HTTPInvalidParam(
                "The parameter may contain only letters, digits, dots, dashes and underscores, value: '%s'" % bucket_name,
                param_name='name'
            )

        self._bucket_path = None
        if storage_path and os.path.exists(storage_path):
            self._bucket_path = os.path.join(storage_path, self._name)
        else:
            raise falcon.HTTPInternalServerError(
                title='IncorrectStoragePath',
                description='The storage path is incorrect, "%s"' % storage_path
            )

        if self._bucket_path and os.path.exists(self._bucket_path):
            self._meta = SqliteDict(os.path.join(self._bucket_path,'metadata.sqlite'), 'bucket', autocommit=True)
        else:
            self._meta = SqliteDict(':memory:', 'bucket', autocommit=True)

    @property
    def bucket_path(self):

        return self._bucket_path

    
    @property
    def metadata(self):
        
        return dict(self._meta)
    

    @metadata.setter
    def metadata(self, value):

        if value and isinstance(value, dict):
            self._meta.update(value)
        else:
            raise RuntimeError('Incorrect metadata type. Found "%s", expected "dict"' % type(value))


    def exists(self):
        ''' check if the bucket exists
        ''' 
        if self.bucket_path and os.path.exists(self.bucket_path):       
            return True
        else:
            return False


    def create(self):
        ''' create new bucket
        '''
        if self.exists():
            raise falcon.HTTPConflict(
                title='BucketAlreadyExists',
                description="The requested bucket name '%s' is not available. Please select a different name and try again." % self._name
            )

        # prepare bucket directory
        try:
            os.makedirs(self.bucket_path)
            os.makedirs(os.path.join(self.bucket_path, 'data'))
            os.makedirs(os.path.join(self.bucket_path, 'tmp'))
        except IOError as err:
            raise falcon.HTTPInternalServerError(
                title='BucketCreationError',
                description='The path to bucket cannot be created, "%s"' % self.bucket_path
            )

        # create metadata file in bucket directory
        _meta = self._meta
        self._meta = SqliteDict(os.path.join(self.bucket_path, 'metadata.sqlite'), 'bucket', autocommit=True) 
        self._meta.update(_meta)
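A minimal usage sketch for the Bucket class above; the storage path and metadata values are illustrative:
# Illustrative only: create a bucket on disk and attach some metadata.
bucket = Bucket('reports-2016', storage_path='/var/data/buckets')
if not bucket.exists():
    bucket.create()
bucket.metadata = {'owner': 'analytics', 'created': '2016-05-02'}
print(bucket.metadata)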