def load_items_from_mongo(): db = _default_mongo(MONGOD_HOST, MONGOD_PORT, usedb=schema['db']) collection = schema['collection'] items = getattr(db, collection).find(timeout=False) print 'prod mode: 从mongodb加载[%s]里的所有数据' % collection return items
def load_weibos_from_mongo(limit): weibos = [] mongo = _default_mongo(usedb='master_timeline') for weibo in mongo.master_timeline_weibo.find().limit(limit): weibos.append(weibo) print 'load', len(weibos), 'weibos' return weibos
def __init__(self, dbpath, schema_version, refresh_db=False):
    """Store configuration and open the MongoDB handle for this instance.

    :param dbpath: filesystem path used as the database root.
    :param schema_version: selects the `Schema` class attribute named
        'v<schema_version>' (e.g. 2 -> Schema.v2).
    :param refresh_db: flag stored as-is for later use.
    """
    schema = getattr(Schema, 'v%s' % schema_version)
    self.path = dbpath
    self.schema = schema
    self.refresh_db = refresh_db
    # Bookkeeping containers, filled elsewhere.
    self.databases = {}
    self.ts_and_dbfolders = []
    self.mgdb = _default_mongo(MONGOD_HOST, MONGOD_PORT, usedb=schema['db'])
    self.collection = schema['collection']
def __init__(self, gt, lt):
    """Store an integer id range, an optional bloom filter and a db handle.

    :param gt: lower bound, coerced to int.
    :param lt: upper bound, coerced to int.
    """
    self.gt = int(gt)
    self.lt = int(lt)
    # Only build the dablooms filter when the module was actually imported
    # (import is presumably optional/guarded elsewhere — confirm).
    # Idiom fix: test membership on sys.modules directly; .keys() built a
    # throwaway list in Python 2 for no benefit.
    if 'pydablooms' in sys.modules:
        self.bloom = pydablooms.Dablooms(capacity=DABLOOMS_CAPACITY,
                                         error_rate=DABLOOMS_ERROR_RATE,
                                         filepath=DABLOOMS_FILEPATH)
    else:
        self.bloom = None
    # Project settings may override the module-level connection defaults.
    host = settings.get('MONGOD_HOST', MONGOD_HOST)
    port = settings.get('MONGOD_PORT', MONGOD_PORT)
    self.db = _default_mongo(host, port, usedb='master_timeline')
# -*- coding: utf-8 -*- import datetime from utils4scrapy.tk_maintain import _default_mongo db = _default_mongo(usedb='master_timeline') count = 0 for weibo in db.master_timeline_weibo.find(): if 'user' not in weibo: print '.' count += 1 print 'del', weibo['_id'], datetime.date.fromtimestamp(weibo['first_in']), count db.master_timeline_weibo.remove({'_id': weibo['_id']})
# -*- coding: utf-8 -*-
# Seed the dablooms set with every weibo already stored in master_timeline.
import time

import pydablooms

from utils4scrapy.tk_maintain import _default_mongo

MONGOD_HOST = 'localhost'
MONGOD_PORT = 27017
DABLOOMS_CAPACITY = 2000000000
DABLOOMS_ERROR_RATE = .001
DABLOOMS_FILEPATH = '/opt/scrapy_weibo/scrapy_weibo/bloom.bin'
#DABLOOMS_FILEPATH = '/tmp/bloom.bin'

bloom = pydablooms.Dablooms(capacity=DABLOOMS_CAPACITY,
                            error_rate=DABLOOMS_ERROR_RATE,
                            filepath=DABLOOMS_FILEPATH)
db = _default_mongo(MONGOD_HOST, MONGOD_PORT, usedb='master_timeline')

for status in db.master_timeline_weibo.find():
    # Second argument is a millisecond timestamp, matching int(time*1000).
    bloom.add(status['mid'], int(time.time() * 1000))
weibos = db.MGet(weibo_ids) # weibos = [msgpack.unpackb(weibo) if weibo else None for weibo in weibos] weibos = [json.loads(weibo) if weibo else None for weibo in weibos] return weibos def test_rw(n): weibos_from_mongo = load_weibos_from_mongo(n) elevator_multi_write(weibos_from_mongo) weibo_ids = [str(weibo['id']) for weibo in weibos_from_mongo] weibos_from_elevator = elevator_multi_read(weibo_ids) for i in xrange(len(weibos_from_mongo)): if weibos_from_mongo[i] != weibos_from_elevator[i]: print '** ' * 10, i if __name__ == '__main__': mongo = _default_mongo(usedb='master_timeline') db = Elevator(timeout=1000) db.createdb('testdb') db.connect('testdb') test_rw(10000) db.dropdb('testdb') """ load 100000 weibos 'load_weibos_from_mongo' args: 7.71 sec 'elevator_multi_read' args: 14.73 sec 结论是elevator并不足以投入prod使用 """
# -*- coding:utf-8 -*- import sys sys.path.append('../xapian_weibo') from xapian_backend import XapianSearch from utils4scrapy.tk_maintain import _default_mongo # 默认schema_version为2 s = XapianSearch(path='../data/', name='master_timeline_weibo') mongo = _default_mongo(host='219.224.135.60', usedb='master_timeline') existed_file = open('2011_emotion_users_existed_20130615.txt', 'w') missing_file = open('2011_emotion_users_missing_20130615.txt', 'w') with open('/home/arthas/dev/scrapy_weibo/test/2011_emotion_users.txt') as f: missing = 0 not_exist = 0 per_page_missing = 30 iter_count = 0 for line in f: iter_count += 1 if iter_count % 100 == 0: print iter_count, missing, not_exist uid = line.split()[0] uid = int(uid) count = s.search(query={'user': uid}, count_only=True) r = mongo.master_timeline_user.find_one({'_id': uid}) if r: page = r['statuses_count'] / 100 if r['statuses_count'] % 100 > 0: page += 1