def setUp(self):
    """Open the weibo and user Xapian indexes and set the query time window.

    ``begin_ts`` / ``end_ts`` are local-time epoch seconds for
    2011-12-01 00:00 and 2011-12-31 00:00 respectively.
    """
    data_dir = '/home/arthas/dev/xapian_weibo/data/'
    self.weibo_x = XapianSearch(path=data_dir, name='master_timeline_weibo')
    self.user_x = XapianSearch(
        path=data_dir,
        name='master_timeline_user',
        schema_version=1
    )

    window_start = datetime.datetime(2011, 12, 1)
    window_end = datetime.datetime(2011, 12, 31)
    self.begin_ts = time.mktime(window_start.timetuple())
    self.end_ts = time.mktime(window_end.timetuple())
def init_xapian():
    """Return a XapianSearch for the 'master_timeline' index in /opt/xapian_weibo/data/."""
    search = XapianSearch(
        path='/opt/xapian_weibo/data/',
        name='master_timeline'
    )
    return search
REDIS_PORT = 6379 SQLALCHEMY_DATABASE_URI = 'mysql+mysqldb://root:@localhost/weibo?charset=utf8' DYNAMIC_XAPIAN_WEIBO_STUB_PATH = '/home/mirage/dev/data/stub/master_timeline_weibo_' elif IS_PROD == 3: XAPIAN_WEIBO_DATA_PATH = '/home/ubuntu3/huxiaoqian/case/20140724/20140804/' XAPIAN_USER_DATA_PATH = '/home/xapian/xapian_user/' XAPIAN_DOMAIN_DATA_PATH = '/opt/xapian_weibo/data/20131130/' #无 MASTER_TIMELINE_STUB = '/home/mirage/dev/data/stub/master_timeline_weibo_stub' #无 LEVELDBPATH = '/home/ubuntu3/huxiaoqian/case_test/data/leveldbpath/' # 无 REDIS_HOST = '219.224.135.49' #索引的redis服务器为49,应该用不到 REDIS_PORT = 6379 MONGODB_HOST = '219.224.135.47' MONGODB_PORT = 27019 SQLALCHEMY_DATABASE_URI = 'mysql+mysqldb://root:@219.224.134.222/weibocase?charset=utf8' DYNAMIC_XAPIAN_WEIBO_STUB_PATH = '/home/ubuntu4/ljh/csv/stub/topic/master_timeline_weibo_topic' xapian_search_user = XapianSearch(path=XAPIAN_USER_DATA_PATH, name='master_timeline_user', schema_version=1) # Create application app = Flask('xxx') # Create dummy secrey key so we can use sessions app.config['SECRET_KEY'] = 'A0Zr98j/3yX R~XHH!jmN]LWX/,?RT' # Create database app.config['SQLALCHEMY_DATABASE_URI'] = SQLALCHEMY_DATABASE_URI app.config['SQLALCHEMY_ECHO'] = False db = SQLAlchemy(app)
# -*- coding:utf-8 -*- import time import datetime from xapian_weibo.xapian_backend import XapianSearch s = XapianSearch(path='/opt/xapian_weibo/data/', name='master_timeline_user', schema_version=1) begin_ts = time.mktime(datetime.datetime(2011, 1, 1).timetuple()) end_ts = time.mktime(datetime.datetime(2011, 12, 31).timetuple()) """ query_dict = { 'created_at': { '$gt': begin_ts, '$lt': end_ts, } } count, get_results = s.search(query=query_dict, max_offset=1, fields=['_id', 'name'], sort_by=['created_at']) print count for r in get_results(): print r['_id'], r['name'] """ """ query_dict = { '$or': [ {'_id': 1934744637},
def setUp(self):
    """Prepare the fixture: sample size, weibo index handle, and loaded weibo ids."""
    # Number of ids requested from the index; must be set before the load below.
    self.n = 10000

    index_dir = '/home/arthas/dev/xapian_weibo/data/'
    self.s = XapianSearch(path=index_dir, name='master_timeline_weibo')

    loaded_ids = self._load_weibo_ids_from_xapian(self.n)
    self.weibo_ids = loaded_ids
# -*- coding: utf-8 -*-

import pymongo
import time
import codecs
import datetime

# The Xapian index is optional for this script: if the backend cannot be
# imported or the index cannot be opened, carry on without it.
try:
    from xapian_weibo.xapian_backend import XapianSearch
    statuses_search = XapianSearch(path='/opt/xapian_weibo/data/',
                                   name='master_timeline_weibo',
                                   schema_version=2)
except Exception:
    # Deliberately best-effort, but no longer swallows KeyboardInterrupt /
    # SystemExit the way a bare ``except:`` did.
    pass


def con_database():
    """Connect to MongoDB, authenticate against ``admin`` and return the crawler database.

    Returns:
        The ``test_crawler_liwenwen`` database object of the connection.
    """
    # NOTE(review): host and credentials are hard-coded here; consider
    # moving them to configuration.
    DB_HOST = '219.224.135.60'
    DB_PORT = 27017
    DB_USER = '******'
    DB_PWD = 'root'
    connection = pymongo.Connection(DB_HOST, DB_PORT)
    db = connection.admin
    db.authenticate(DB_USER, DB_PWD)
    return connection.test_crawler_liwenwen


def main(uid, startdate, enddate):
    """Print the count of, then each, ``users`` document for ``uid`` in a date range.

    Args:
        uid: user id; it is converted with ``str()`` before querying.
        startdate: inclusive lower bound, formatted ``YYYY-MM-DD``.
        enddate: inclusive upper bound, formatted ``YYYY-MM-DD``.
    """
    startts = date2ts(startdate)
    endts = date2ts(enddate)
    db = con_database()

    # Build the filter once so the count and the listing cannot drift apart.
    query = {'uid': str(uid), 'ts': {'$gte': startts, '$lte': endts}}
    cursor = db.users.find(query)

    # Single-argument parenthesized print behaves the same on Python 2 and 3.
    print(cursor.count())
    for weibo in cursor:
        print(weibo)


def date2ts(date):
    """Convert a ``YYYY-MM-DD`` string to integer epoch seconds (local time).

    Raises:
        ValueError: if ``date`` does not match ``%Y-%m-%d``.
    """
    return int(time.mktime(time.strptime(date, '%Y-%m-%d')))
import os
from xapian_weibo.xapian_backend import XapianSearch
from operator import itemgetter
import datetime
import time
import leveldb

LEVELDBPATH = '/home/mirage/leveldb'
# Default location of the prototype-user list read by readProtoUser().
PROTO_USER_PATH = "/home/mirage/linhao/project_bishe/weibo/profile/user_classify/protou.txt"

global_user_field_bucket = leveldb.LevelDB(
    os.path.join(LEVELDBPATH, 'linhao_global_user_field_20131012'),
    block_cache_size=8 * (2 << 25),
    write_buffer_size=8 * (2 << 25))
xapian_search_weibo = XapianSearch(path='/opt/xapian_weibo/data/', name='master_timeline_weibo')  # search by index
xapian_search_user = XapianSearch(path='/opt/xapian_weibo/data/', name='master_timeline_user', schema_version=1)  # search by index

fields_value = ['culture', 'education', 'entertainment', 'fashion', 'finance', 'media', 'sports', 'technology']


def readProtoUser(path=PROTO_USER_PATH):
    """Load the prototype-user list and group user ids by area.

    Each non-blank line is split on ":"; the first field is the area name
    and the second field is a whitespace-separated list of integer user ids.

    Args:
        path: file to read; defaults to the original hard-coded location.

    Returns:
        dict mapping area name to a ``set`` of ``int`` user ids.

    Raises:
        IndexError: if a non-blank line contains no ":" separator.
        ValueError: if a user id is not a valid integer.
    """
    protou = {}
    with open(path) as f:
        for line in f:
            # Blank lines (e.g. a trailing empty line) carry no data; the
            # original crashed on them with IndexError.
            if not line.strip():
                continue
            parts = line.split(":")
            area = parts[0]
            members = protou.setdefault(area, set())
            members.update(int(u) for u in parts[1].split())
    return protou