コード例 #1
0
def _load_weibos_from_xapian():
    begin_ts = time.mktime(datetime.datetime(2012, 9, 1).timetuple())
    end_ts = time.mktime(datetime.datetime(2013, 1, 1).timetuple())

    query_dict = {
        'timestamp': {'$gt': begin_ts, '$lt': end_ts},
    }

    s = XapianSearch(path='/opt/xapian_weibo/data/20130616/', name='master_timeline_weibo')
    count, get_results = s.search(query=query_dict, fields=['_id', 'user', 'text', 'timestamp'])
    print count
    return get_results
コード例 #2
0
def _load_weibos_from_xapian():
    """Fetch weibos posted between 2012-09-01 and 2013-01-01 from Xapian.

    Prints the hit count and returns the result getter produced by
    XapianSearch.search().
    """
    # Window boundaries as Unix timestamps (local time, per time.mktime).
    begin_ts = time.mktime(datetime.datetime(2012, 9, 1).timetuple())
    end_ts = time.mktime(datetime.datetime(2013, 1, 1).timetuple())

    # Range filter: timestamp strictly between the two bounds.
    query_dict = {
        'timestamp': {
            '$gt': begin_ts,
            '$lt': end_ts
        },
    }

    s = XapianSearch(path='/opt/xapian_weibo/data/20130616/',
                     name='master_timeline_weibo')
    count, get_results = s.search(query=query_dict,
                                  fields=['_id', 'user', 'text', 'timestamp'])
    print count  # number of hits
    return get_results
コード例 #3
0
    print r['terms']

print 'hits: %s' % count

stub = '/home/arthas/dev/xapian_weibo/stub/master_timeline_weibo_20130929'
s = XapianSearch(stub=stub, include_remote=True)
count, get_results = s.search(query={'text': [u'中国']}, sort_by=['-timestamp'], fields=['text', 'timestamp', 'user', 'terms', '_id'])

print 'query2:'

for r in get_results():
    print "** " * 10
    print r['_id']
    print r['user']
    print r['text']
    print r['timestamp']
    print r['terms']

print 'hits: %s' % count
"""

print "query3:"

stub = '/home/arthas/dev/xapian_weibo/stub/master_timeline_weibo_20130929'
s = XapianSearch(stub=stub, include_remote=True)
results = s.iter_all_docs()
count = 0
for r in results:
    count += 1
print 'hits: ', count
コード例 #4
0
ファイル: query_test.py プロジェクト: likeafool/xapian_weibo
# -*- coding:utf-8 -*-

import sys
import time
import datetime

sys.path.append('../xapian_weibo')
from xapian_backend import XapianSearch
from utils import top_keywords, not_low_freq_keywords

# 默认schema_version为2
s = XapianSearch(path='../data/', name='master_timeline_weibo')

# import和初始化, 请使用下面的用法
# from xapian_weibo.xapian_backend import XapianSearch
# s = XapianSearch(path='/opt/xapian_weibo/data/', name='master_timeline_weibo')
# 查询条件有user(id),retweeted_status(id),text,timestamp,reposts_count,comments_count,attitudes_count(从timestamp开始后面四个查询指标可以指定范围和排序)
# 返回字段基本和新浪api的返回字段相同,注意没有created_at,而是timestamp
# 值得注意的是新增返回字段terms,返回的是每条微博里的词和以及词频的dict(字典),所有不用自己取出来之后再分词
# 若fields参数不指定,或者为None,则返回所有字段,除terms之外
# 如果需要返回terms,请一一指定需要的字段,并包括terms
# 简单示例如下
"""
count, get_results = s.search(query={'text': [u'中国'], 'user': 1217743083, 'timestamp': {'$gt': 0, '$lt': 1334450340}}, sort_by=['-timestamp'], fields=['text', 'timestamp', 'user', 'terms', '_id'])

print 'query1:'

for r in get_results():
    print "** " * 10
    print r['_id']
    print r['user']
コード例 #5
0
    print r['terms']

print 'hits: %s' % count

stub = '/home/arthas/dev/xapian_weibo/stub/master_timeline_weibo_20130929'
s = XapianSearch(stub=stub, include_remote=True)
count, get_results = s.search(query={'text': [u'中国']}, sort_by=['-timestamp'], fields=['text', 'timestamp', 'user', 'terms', '_id'])

print 'query2:'

for r in get_results():
    print "** " * 10
    print r['_id']
    print r['user']
    print r['text']
    print r['timestamp']
    print r['terms']

print 'hits: %s' % count
"""

print "query3:"

stub = '/home/arthas/dev/xapian_weibo/stub/master_timeline_weibo_20130929'
s = XapianSearch(stub=stub, include_remote=True)
results = s.iter_all_docs()
count = 0
for r in results:
    count += 1
print 'hits: ', count
コード例 #6
0
# -*- coding:utf-8 -*-

import sys

sys.path.append('../xapian_weibo')
from xapian_backend import XapianSearch

# Open the local "statuses" Xapian index.
s = XapianSearch(path='../data/', name='statuses')

# Sample compound query: one $and group, one $not group, and a plain
# field match ('name').  Values are dummy strings for demonstration.
query_dict1 = {
    '$and': [{
        'text': '1',
        'uid': '2'
    }],
    '$not': {
        'name': '3',
        'text': '4',
    },
    'name': '5',
}

# Show the intermediate query tree, then the fully parsed query.
print s.build_query_tree(query_dict1)
print s.parse_query(query_dict1)

query_dict2 = {
    '$and': [{
        'text': '1',
        'ts': {
            '$gt': 0,
            '$lt': 1
        }
コード例 #7
0
ファイル: query_test.py プロジェクト: movingHera/xapian_weibo
# -*- coding:utf-8 -*-

import sys
import time
import datetime

sys.path.append('../xapian_weibo')
from xapian_backend import XapianSearch
from utils import top_keywords, not_low_freq_keywords, gen_mset_iter

# 默认schema_version为2
s = XapianSearch(path='/opt/xapian_weibo/data/20131207/', name='master_timeline_weibo')

# import和初始化, 请使用下面的用法
# from xapian_weibo.xapian_backend import XapianSearch
# s = XapianSearch(path='/opt/xapian_weibo/data/', name='master_timeline_weibo')
# 查询条件有user(id),retweeted_status(id),text,timestamp,reposts_count,comments_count,attitudes_count(从timestamp开始后面四个查询指标可以指定范围和排序)
# 返回字段基本和新浪api的返回字段相同,注意没有created_at,而是timestamp
# 值得注意的是新增返回字段terms,返回的是每条微博里的词和以及词频的dict(字典),所有不用自己取出来之后再分词
# 若fields参数不指定,或者为None,则返回所有字段,除terms之外
# 如果需要返回terms,请一一指定需要的字段,并包括terms
# 简单示例如下

"""
count, get_results = s.search(query={'text': [u'中国'], 'user': 1217743083, 'timestamp': {'$gt': 0, '$lt': 1334450340}}, sort_by=['-timestamp'], fields=['text', 'timestamp', 'user', 'terms', '_id'])

print 'query1:'

for r in get_results():
    print "** " * 10
    print r['_id']
コード例 #8
0
ファイル: query_test.py プロジェクト: bleachyin/xapian_weibo
# -*- coding:utf-8 -*-

import sys
import  calendar
import datetime

sys.path.append('../xapian_weibo')
from xapian_backend import XapianSearch

s = XapianSearch(path='../data/', name='statuses')

results = s.search(query={'text': [u'中国'], 'uid': 1217743083, 'ts': {'$gt': 0, '$lt': 1334450340}}, sort_by=['-ts'], fields=['text', 'ts', 'name'])

print 'query1:'

for r in results['results']:
    print r['ts']

print 'hits: %s' % results['hits']

print 'query2:'
query_dict = {'$and': [{'text': [u'中国'], 'uid': 1217743083},
                       {'uid': 1217743083},
                       {'$or': [{'ts': {'gt': 0,
                                      'lt': 1334450340}},
                                {'uid': 0000000000}]}],
              '$not': {'text': u'宝马', 'name': u'白之兔'},
              'name': u'袁岳'
              }

results = s.search(query=query_dict, sort_by=['-ts'], fields=['text', 'ts'])
コード例 #9
0
# -*- coding:utf-8 -*-

import sys

sys.path.append('../xapian_weibo')
from xapian_backend import XapianSearch

s = XapianSearch(path='../data/', name='statuses')

query_dict1 = {
    '$and': [{'text': '1',
              'uid': '2'}],
    '$not': {
        'name': '3',
        'text': '4',
    },
    'name': '5',
}

print s.build_query_tree(query_dict1)
print s.parse_query(query_dict1)

query_dict2 = {
    '$and': [{'text': '1', 'ts': {'$gt': 0, '$lt': 1}},
             {'$or': [{'uid': 3},
                      {'uid': 4}]}],
    '$not': {
        'name': '3',
        'text': '4',
    },
    'name': '5',
コード例 #10
0
# -*- coding:utf-8 -*-

import sys

sys.path.append('../xapian_weibo')
from xapian_backend import XapianSearch
from utils4scrapy.tk_maintain import _default_mongo

# 默认schema_version为2
# Xapian index of weibos and the MongoDB holding user profiles.
s = XapianSearch(path='../data/', name='master_timeline_weibo')
mongo = _default_mongo(host='219.224.135.60', usedb='master_timeline')

# Output files for users found / not found during the scan below.
# NOTE(review): these handles are never closed in the visible code —
# confirm they are closed (or the script exits) after the scan.
existed_file = open('2011_emotion_users_existed_20130615.txt', 'w')
missing_file = open('2011_emotion_users_missing_20130615.txt', 'w')
with open('/home/arthas/dev/scrapy_weibo/test/2011_emotion_users.txt') as f:
    missing = 0
    not_exist = 0
    per_page_missing = 30
    iter_count = 0
    for line in f:
        iter_count += 1
        if iter_count % 100 == 0:
            print iter_count, missing, not_exist
        uid = line.split()[0]
        uid = int(uid)
        count = s.search(query={'user': uid}, count_only=True)
        r = mongo.master_timeline_user.find_one({'_id': uid})
        if r:
            page = r['statuses_count'] / 100
            if r['statuses_count'] % 100 > 0:
                page += 1