def load_items_from_mongo():
    db = _default_mongo(MONGOD_HOST, MONGOD_PORT, usedb=schema['db'])
    collection = schema['collection']

    items = getattr(db, collection).find(timeout=False)
    print 'prod mode: 从mongodb加载[%s]里的所有数据' % collection
    return items
Esempio n. 2
0
def load_weibos_from_mongo(limit):
    weibos = []
    mongo = _default_mongo(usedb='master_timeline')
    for weibo in mongo.master_timeline_weibo.find().limit(limit):
        weibos.append(weibo)

    print 'load', len(weibos), 'weibos'
    return weibos
Esempio n. 3
0
def load_weibos_from_mongo(limit):
    weibos = []
    mongo = _default_mongo(usedb='master_timeline')
    for weibo in mongo.master_timeline_weibo.find().limit(limit):
        weibos.append(weibo)

    print 'load', len(weibos), 'weibos'
    return weibos
Esempio n. 4
0
    def __init__(self, dbpath, schema_version, refresh_db=False):
        """Record the db path, resolve the versioned schema, and open mongo.

        ``schema_version`` selects the ``Schema.v<N>`` attribute; the mongo
        handle and collection name are taken from that schema dict.
        """
        self.path = dbpath
        self.refresh_db = refresh_db
        # Look up the schema object by version, e.g. Schema.v2.
        self.schema = getattr(Schema, 'v%s' % schema_version)
        self.collection = self.schema['collection']
        self.mgdb = _default_mongo(MONGOD_HOST, MONGOD_PORT, usedb=self.schema['db'])
        # Bookkeeping containers, filled in later by other methods.
        self.databases = {}
        self.ts_and_dbfolders = []
 def __init__(self, gt, lt):
     """Store integer bounds and choose the de-dup backend.

     If pydablooms was importable, de-duplication goes through a bloom
     filter; otherwise ``self.bloom`` is None and a mongo connection to
     master_timeline is opened instead.
     NOTE(review): ``self.db`` is only assigned on the fallback path —
     confirm no code reads it when the bloom filter is active.
     """
     self.gt = int(gt)
     self.lt = int(lt)
     # Membership test on sys.modules detects whether the optional
     # pydablooms import succeeded earlier in the file.
     if 'pydablooms' in sys.modules:
         self.bloom = pydablooms.Dablooms(capacity=DABLOOMS_CAPACITY,
                                          error_rate=DABLOOMS_ERROR_RATE,
                                          filepath=DABLOOMS_FILEPATH)
     else:
         self.bloom = None
         host = settings.get('MONGOD_HOST', MONGOD_HOST)
         port = settings.get('MONGOD_PORT', MONGOD_PORT)
         self.db = _default_mongo(host, port, usedb='master_timeline')
 def __init__(self, gt, lt):
     """Keep the [gt, lt] id range and set up bloom-or-mongo de-duplication."""
     self.gt = int(gt)
     self.lt = int(lt)
     has_bloom = 'pydablooms' in sys.modules
     if has_bloom:
         # Optional C-backed bloom filter for fast membership checks.
         self.bloom = pydablooms.Dablooms(filepath=DABLOOMS_FILEPATH,
                                          capacity=DABLOOMS_CAPACITY,
                                          error_rate=DABLOOMS_ERROR_RATE)
     else:
         # Fall back to querying mongo directly; settings may override host/port.
         self.bloom = None
         host = settings.get('MONGOD_HOST', MONGOD_HOST)
         port = settings.get('MONGOD_PORT', MONGOD_PORT)
         self.db = _default_mongo(host, port, usedb='master_timeline')
# -*- coding: utf-8 -*-

import datetime

from utils4scrapy.tk_maintain import _default_mongo

db = _default_mongo(usedb='master_timeline')

count = 0
for weibo in db.master_timeline_weibo.find():
    if 'user' not in weibo:
        print '.'
        count += 1
        print 'del', weibo['_id'], datetime.date.fromtimestamp(weibo['first_in']), count
        db.master_timeline_weibo.remove({'_id': weibo['_id']})
# -*- coding: utf-8 -*-

# 将已有master_timeline的微博加入dablooms的集合

import pydablooms
import time
from utils4scrapy.tk_maintain import _default_mongo

MONGOD_HOST = 'localhost'
MONGOD_PORT = 27017
DABLOOMS_CAPACITY = 2000000000
DABLOOMS_ERROR_RATE = .001
DABLOOMS_FILEPATH = '/opt/scrapy_weibo/scrapy_weibo/bloom.bin'
#DABLOOMS_FILEPATH = '/tmp/bloom.bin'

# Build (or reopen) the on-disk bloom filter, then fold every existing
# weibo mid from master_timeline into it so future scrapes can de-dup.
bloom = pydablooms.Dablooms(filepath=DABLOOMS_FILEPATH,
                            capacity=DABLOOMS_CAPACITY,
                            error_rate=DABLOOMS_ERROR_RATE)
db = _default_mongo(MONGOD_HOST, MONGOD_PORT, usedb='master_timeline')

for status in db.master_timeline_weibo.find():
    # Millisecond timestamp accompanies each key as pydablooms' item id.
    bloom.add(status['mid'], int(time.time() * 1000))
# -*- coding: utf-8 -*-

# 将已有master_timeline的微博加入dablooms的集合

import pydablooms
import time
from utils4scrapy.tk_maintain import _default_mongo

MONGOD_HOST = 'localhost'
MONGOD_PORT = 27017
DABLOOMS_CAPACITY = 2000000000
DABLOOMS_ERROR_RATE = .001
DABLOOMS_FILEPATH = '/opt/scrapy_weibo/scrapy_weibo/bloom.bin'
#DABLOOMS_FILEPATH = '/tmp/bloom.bin'

# Populate the persistent bloom filter with the mid of every weibo already
# stored in master_timeline (duplicate of the seeding script above).
bloom = pydablooms.Dablooms(capacity=DABLOOMS_CAPACITY,
                            error_rate=DABLOOMS_ERROR_RATE,
                            filepath=DABLOOMS_FILEPATH)
db = _default_mongo(MONGOD_HOST, MONGOD_PORT, usedb='master_timeline')

cursor = db.master_timeline_weibo.find()
for status in cursor:
    bloom.add(status['mid'], int(time.time() * 1000))
Esempio n. 10
0
    weibos = db.MGet(weibo_ids)
    # weibos = [msgpack.unpackb(weibo) if weibo else None for weibo in weibos]
    weibos = [json.loads(weibo) if weibo else None for weibo in weibos]
    return weibos


def test_rw(n):
    weibos_from_mongo = load_weibos_from_mongo(n)
    elevator_multi_write(weibos_from_mongo)
    weibo_ids = [str(weibo['id']) for weibo in weibos_from_mongo]
    weibos_from_elevator = elevator_multi_read(weibo_ids)

    for i in xrange(len(weibos_from_mongo)):
        if weibos_from_mongo[i] != weibos_from_elevator[i]:
            print '** ' * 10, i


if __name__ == '__main__':
    # Benchmark driver: create a throw-away elevator db, round-trip 10k
    # weibos through it via test_rw, then drop the db again.
    mongo = _default_mongo(usedb='master_timeline')
    db = Elevator(timeout=1000)
    db.createdb('testdb')
    db.connect('testdb')
    test_rw(10000)
    db.dropdb('testdb')
    # Recorded benchmark results (last line: "conclusion: elevator is not
    # good enough for production use").
    """
    load 100000 weibos
    'load_weibos_from_mongo' args: 7.71 sec
    'elevator_multi_read' args: 14.73 sec
    结论是elevator并不足以投入prod使用
    """
# -*- coding:utf-8 -*-

import sys

sys.path.append('../xapian_weibo')
from xapian_backend import XapianSearch
from utils4scrapy.tk_maintain import _default_mongo

# schema_version defaults to 2
# Open the xapian index and the mongo user collection used by the scan below.
s = XapianSearch(path='../data/', name='master_timeline_weibo')
mongo = _default_mongo(host='219.224.135.60', usedb='master_timeline')

# Output files for users found / missing in the index.
# NOTE(review): these handles are never closed in the visible code — verify
# the script relies on interpreter exit to flush them.
existed_file = open('2011_emotion_users_existed_20130615.txt', 'w')
missing_file = open('2011_emotion_users_missing_20130615.txt', 'w')
with open('/home/arthas/dev/scrapy_weibo/test/2011_emotion_users.txt') as f:
    missing = 0
    not_exist = 0
    per_page_missing = 30
    iter_count = 0
    for line in f:
        iter_count += 1
        if iter_count % 100 == 0:
            print iter_count, missing, not_exist
        uid = line.split()[0]
        uid = int(uid)
        count = s.search(query={'user': uid}, count_only=True)
        r = mongo.master_timeline_user.find_one({'_id': uid})
        if r:
            page = r['statuses_count'] / 100
            if r['statuses_count'] % 100 > 0:
                page += 1