Ejemplo n.º 1
0
def whole_followers_task(top_n, date, window_size):
    """Return user ids from the user index, ordered by '-followers_count'.

    Only users whose ``followers_count`` is greater than
    ``FOLLOWERS_MIN_SUPPORT`` are matched, and ``top_n`` is passed to the
    search as ``max_offset``.

    ``date`` and ``window_size`` are accepted but not used by this function.
    """
    searcher = XapianSearch(path='/opt/xapian_weibo/data/', name='master_timeline_user', schema_version=1)
    popular_users = {'followers_count': {'$gt': FOLLOWERS_MIN_SUPPORT}}
    _, fetch_users = searcher.search(query=popular_users, sort_by=['-followers_count'], fields=['_id'], max_offset=top_n)
    return [row['_id'] for row in fetch_users()]
Ejemplo n.º 2
0
def load_friendship_to_leveldb():
    from xapian_weibo.xapian_backend import XapianSearch
    s_user = XapianSearch(path='/opt/xapian_weibo/data/', name='master_timeline_user', schema_version=1)
    
    query_dict = {
        '_id': {
            '$gt': 0,
        }
    }
    count, get_results = s_user.search(query=query_dict, fields=['_id', 'friends', 'followers'])
    print count

    count = 0
    ts = te = time.time()
    for r in get_results():
        uid = r['_id']
        friends = r['friends']
        followers = r['followers']
        if friends and len(friends):
            k = str(uid) + '_' + 'friends'
            v = json.dumps(friends)
            friendship_bucket.Put(k, str(v))
        if followers and len(followers):
            k = str(uid) + '_' + 'followers'
            v = json.dumps(followers)
            friendship_bucket.Put(k, str(v))
        count += 1
        if count % 10000 == 0:
            te = time.time()
            print count, '%s sec' % (te - ts)
            ts = te
Ejemplo n.º 3
0
def get_user(uid):
    """Look up a user by ``_id`` and return a profile dict, or None.

    The first search result is converted; text fields (description,
    verified_reason, location, name) are decoded from UTF-8. An empty name is
    replaced by a placeholder, and ``userField`` is always the placeholder
    value. Returns None when the search yields no result.
    """
    searcher = XapianSearch(path='/opt/xapian_weibo/data/', name='master_timeline_user')
    _, fetch = searcher.search(query={'_id': uid})
    for doc in fetch():
        break  # only the first match is used
    else:
        return None

    return {
        'id': doc['_id'],
        'province': doc['province'],
        'bi_followers_count': doc['bi_followers_count'],
        'verified': doc['verified'],
        'description': doc['description'].decode("utf-8"),
        'friends_count': doc['friends_count'],
        'city': doc['city'],
        'gender': doc['gender'],
        'profile_image_url': doc['profile_image_url'],
        'verified_reason': doc['verified_reason'].decode("utf-8"),
        'followers_count': doc['followers_count'],
        'location': doc['location'].decode("utf-8"),
        'active': doc['active'],
        'statuses_count': doc['statuses_count'],
        'name': doc['name'].decode("utf-8") if doc['name'] else u'未知用户',
        'userField': u'未知领域',
    }
Ejemplo n.º 4
0
def make_network(topic, date, window_size, max_size=100000, ts=False):
    end_time = datetime2ts(date)
    start_time = end_time - window2time(window_size)

    g = nx.DiGraph()

    #need repost index
    topic = cut(s, topic.encode('utf-8'))
    statuses_search = XapianSearch(path='/opt/xapian_weibo/data/', name='master_timeline_weibo', schema_version=2)
    query_dict = {'text': topic, 'timestamp': {'$gt': start_time, '$lt': end_time}}

    if ts:
        count, get_statuses_results = statuses_search.search(query=query_dict, field=['text', 'user', 'timestamp', 'retweeted_status'], max_offset=max_size)
    else:
        count, get_statuses_results = statuses_search.search(query=query_dict, field=['text', 'user', 'retweeted_status'], max_offset=max_size)
    print 'topic statuses count %s' % count

    if ts:
        uid_ts = {}
        for status in get_statuses_results():
            try:
                if status['retweeted_status']:
                    repost_uid = status['user']
                    rt_mid = status['retweeted_status']
                    repost_ts = int(status['timestamp'])
                    source_status = acquire_status_by_id(rt_mid)
                    source_uid = source_status['user']
                    source_ts = int(source_status['timestamp'])
                    if is_in_trash_list(repost_uid) or is_in_trash_list(source_uid):
                        continue
                    if repost_uid not in uid_ts:
                        uid_ts[repost_uid] = repost_ts
                    else:
                        if uid_ts[repost_uid] > repost_ts:
                            uid_ts[repost_uid] = repost_ts
                    if source_uid not in uid_ts:
                        uid_ts[source_uid] = source_ts   
                    else:
                        if uid_ts[source_uid] > source_ts:
                            uid_ts[source_uid] = source_ts
                    g.add_edge(repost_uid, source_uid)
            except (TypeError, KeyError):
                continue
        return uid_ts, g
    else:
        for status in get_statuses_results():
            try:
                if status['retweeted_status']:
                    repost_uid = status['user']
                    rt_mid = status['retweeted_status']
                    source_uid = acquire_status_by_id(rt_mid)['user']
                    if is_in_trash_list(repost_uid) or is_in_trash_list(source_uid):
                        continue
                    g.add_edge(repost_uid, source_uid)
            except (TypeError, KeyError):
                continue
        return g
Ejemplo n.º 5
0
def search_test(date):
    start_ts = datetime2ts(date)
    end_ts = start_ts + 24*60*60
    statuses_search = XapianSearch(path=XAPIAN_STATUSES_PATH, name='master_timeline_weibo', schema_version=2)
    query_dict = {'timestamp': {'$gt': start_ts, '$lt': end_ts}}
    statuses_count, get_statuses_results = statuses_search.search(query=query_dict, field=['user', '_id', 'retweeted_status'])
    count = 0
    start_time = time.time()
    for status in get_statuses_results():
        count += 1
    print 'total statuses %s' % count
def search_single():
    search_start_ts = time.time()
    xapian_weibo = XapianSearch(stub=stub_files, schema_version=5)
    count, get_results = xapian_weibo.search(query={"text": keywords_arg}, fields=xapian_fields)
    count = 0
    search_end_ts = time.time()
    print "search single %d" % (search_end_ts - search_start_ts)
    tb = time.time()
    ts = tb
    for r in get_results():
        if count % 10000 == 0:
            te = time.time()
            print "[%s] read speed: %s sec/per %s" % (datetime.now().strftime("%Y-%m-%d %H:%M:%S"), te - ts, 10000)
            ts = te
        count += 1
    print count
def make(date, hour):
    ts = datetime2ts(date)
    start_ts = datetime2ts(date) + (hour-1)*60*60
    end_ts = start_ts + hour*60*60

    db_name = get_leveldb(ts, hour)
        
    hourly_user_burst_bucket = leveldb.LevelDB(os.path.join(LEVELDBPATH, db_name),
                                                    block_cache_size=8 * (2 << 25), write_buffer_size=8 * (2 << 25))

    statuses_search = XapianSearch(path='/opt/xapian_weibo/data/', name='master_timeline_weibo', schema_version=2)

    user_search = XapianSearch(path='/opt/xapian_weibo/data/', name='master_timeline_user', schema_version=1)

    batch = leveldb.WriteBatch()

    query_dict = {'timestamp': {'$gt': start_ts, '$lt': end_ts}, 'reposts_count': {'$gt': 100}}

    statuses_count, get_statuses_results = statuses_search.search(query=query_dict, field=['user', '_id', 'reposts_count'])

    print 'total statuses: %s' % statuses_count

    print 'writing to levelDB %s...' % db_name

    batch.Put('size', str(statuses_count))

    count = 0
    uid_burst = {}
    for status in get_statuses_results():
        if count % 10000 == 0:
            print 'current count: %s' % count
        uid = status['user']
        reposts_count = status['reposts_count']
        followers_count = 0
        if uid not in uid_burst:
            uid_burst[uid] = 0
        reposts_count += uid_burst[uid]
        uid_burst[uid] = reposts_count
        batch.Put(str(uid), str(reposts_count))
        count += 1

    hourly_user_burst_bucket.Write(batch, sync=True)

    print 'done.'
def test_search(stub_file):
    search_start_ts = time.time()
    xapian_weibo = XapianSearch(stub=stub_file, schema_version=5)
    count, get_results = xapian_weibo.search(query={"text": keywords_arg}, fields=xapian_fields)
    pid_num = os.getpid()
    search_end_ts = time.time()
    print "Working in Process #%d, %d, search uses %d seconds" % (pid_num, count, search_end_ts - search_start_ts)

    fw = open("./cache/%s.txt" % pid_num, "w")
    count = 0
    tb = time.time()
    ts = tb
    for r in get_results():
        fw.write("%s\n" % json.dumps(r))

        if count % 10000 == 0:
            te = time.time()
            print "process [%s] write speed: %s sec/per %s" % (pid_num, te - ts, 10000)
            ts = te
        count += 1
    fw.close()

    return pid_num
 def setUp(self):
     """Open the weibo index and preload the ids used by the benchmarks."""
     sample_size = 10000
     self.n = sample_size
     self.s = XapianSearch(path='/home/arthas/dev/xapian_weibo/data/', name='master_timeline_weibo')
     self.weibo_ids = self._load_weibo_ids_from_xapian(sample_size)
class BenchXapianGetByIds(hurdles.BenchCase):
    def setUp(self):
        self.n = 10000
        self.s = XapianSearch(path='/home/arthas/dev/xapian_weibo/data/', name='master_timeline_weibo')
        self.weibo_ids = self._load_weibo_ids_from_xapian(self.n)

    def tearDown(self):
        pass

    def _load_weibo_ids_from_xapian(self, limit):
        begin_ts = time.mktime(datetime.datetime(2013, 1, 1).timetuple())
        end_ts = time.mktime(datetime.datetime(2013, 1, 2).timetuple())

        query_dict = {
            'timestamp': {'$gt': begin_ts, '$lt': end_ts},
        }
        count, get_results = self.s.search(query=query_dict, max_offset=limit, fields=['_id'])
        print count
        ids = []
        for r in get_results():
            ids.append(r['_id'])

        return ids

    def bench_1(self):
        for _id in self.weibo_ids:
            query_dict = {'_id': _id}
            count, get_results = self.s.search(query=query_dict, fields=['_id', 'text'])

    def bench_10(self):
        size = 10
        for i in xrange(self.n / size):
            query_dict = {
                '$or': [],
            }

            for _id in self.weibo_ids[i * size: (i + 1) * size]:
                query_dict['$or'].append({'_id': _id})

            count, get_results = self.s.search(query=query_dict, fields=['_id', 'text'])

    def bench_20(self):
        size = 20
        for i in xrange(self.n / size):
            query_dict = {
                '$or': [],
            }

            for _id in self.weibo_ids[i * size: (i + 1) * size]:
                query_dict['$or'].append({'_id': _id})

            count, get_results = self.s.search(query=query_dict, fields=['_id', 'text'])

    def bench_30(self):
        size = 30
        for i in xrange(self.n / size):
            query_dict = {
                '$or': [],
            }

            for _id in self.weibo_ids[i * size: (i + 1) * size]:
                query_dict['$or'].append({'_id': _id})

            count, get_results = self.s.search(query=query_dict, fields=['_id', 'text'])

    def bench_50(self):
        size = 50
        for i in xrange(self.n / size):
            query_dict = {
                '$or': [],
            }

            for _id in self.weibo_ids[i * size: (i + 1) * size]:
                query_dict['$or'].append({'_id': _id})

            count, get_results = self.s.search(query=query_dict, fields=['_id', 'text'])
Ejemplo n.º 11
0
class BenchXapianGetByIds(hurdles.BenchCase):
    def setUp(self):
        self.n = 10000
        self.s = XapianSearch(path='/home/arthas/dev/xapian_weibo/data/',
                              name='master_timeline_weibo')
        self.weibo_ids = self._load_weibo_ids_from_xapian(self.n)

    def tearDown(self):
        pass

    def _load_weibo_ids_from_xapian(self, limit):
        begin_ts = time.mktime(datetime.datetime(2013, 1, 1).timetuple())
        end_ts = time.mktime(datetime.datetime(2013, 1, 2).timetuple())

        query_dict = {
            'timestamp': {
                '$gt': begin_ts,
                '$lt': end_ts
            },
        }
        count, get_results = self.s.search(query=query_dict,
                                           max_offset=limit,
                                           fields=['_id'])
        print count
        ids = []
        for r in get_results():
            ids.append(r['_id'])

        return ids

    def bench_1(self):
        for _id in self.weibo_ids:
            query_dict = {'_id': _id}
            count, get_results = self.s.search(query=query_dict,
                                               fields=['_id', 'text'])

    def bench_10(self):
        size = 10
        for i in xrange(self.n / size):
            query_dict = {
                '$or': [],
            }

            for _id in self.weibo_ids[i * size:(i + 1) * size]:
                query_dict['$or'].append({'_id': _id})

            count, get_results = self.s.search(query=query_dict,
                                               fields=['_id', 'text'])

    def bench_20(self):
        size = 20
        for i in xrange(self.n / size):
            query_dict = {
                '$or': [],
            }

            for _id in self.weibo_ids[i * size:(i + 1) * size]:
                query_dict['$or'].append({'_id': _id})

            count, get_results = self.s.search(query=query_dict,
                                               fields=['_id', 'text'])

    def bench_30(self):
        size = 30
        for i in xrange(self.n / size):
            query_dict = {
                '$or': [],
            }

            for _id in self.weibo_ids[i * size:(i + 1) * size]:
                query_dict['$or'].append({'_id': _id})

            count, get_results = self.s.search(query=query_dict,
                                               fields=['_id', 'text'])

    def bench_50(self):
        size = 50
        for i in xrange(self.n / size):
            query_dict = {
                '$or': [],
            }

            for _id in self.weibo_ids[i * size:(i + 1) * size]:
                query_dict['$or'].append({'_id': _id})

            count, get_results = self.s.search(query=query_dict,
                                               fields=['_id', 'text'])
Ejemplo n.º 12
0
        if not line:
            break
        else:
            record=line.split()
            swds.append(record[0])

    classes=["education","culture", "fashion",'entertainment',"finance", "media", "sports", "technology"]
    ##cc = opencc.OpenCC('mix2s')## traditional/simplified Chinese conversion
   
    ####generate seed users
    ###period
    b=datetime.datetime(2012,10,1)
    tb=time.mktime(b.timetuple())
    e=datetime.datetime(2013,10,1)
    te=time.mktime(e.timetuple())
    s = XapianSearch(path='/opt/xapian_weibo/data/', name='master_timeline')##search by index
    s1 = XapianSearch(path='/opt/xapian_weibo/data/', name='master_timeline')##search for original tweet
    total_uid_set = set()
    u_seed=[]##seed users
    u_cls={}##user and class mapping
    w_user={}##word segmentation
    n_s=0##number of seed users for each class

    sw_cls={}##seed user words statistics for each class
    sw_cls=cinitialize(classes,sw_cls,2)  
    sw={}##words from all seed users
    wp_cls={}##protowords for each class
    wp_cls=cinitialize(classes,wp_cls,1)
    Nswds=0
    exception=0
    for area in classes:
Ejemplo n.º 13
0
from operator import itemgetter, attrgetter  
import pymongo
import datetime
import time
import sys
import leveldb
import os
from xapian_weibo.xapian_backend import XapianSearch


# Directory that holds the LevelDB databases used by this module.
LEVELDBPATH = '/home/mirage/leveldb'
# Opened at import time. 8 * (2 << 25) == 536870912 for both the block cache
# and the write buffer (presumably bytes, i.e. 512 MiB each -- confirm).
global_user_field_bucket = leveldb.LevelDB(os.path.join(LEVELDBPATH, 'linhao_global_user_field_20131012'),
                                           block_cache_size=8 * (2 << 25), write_buffer_size=8 * (2 << 25))

# Module-level search handles for the status and user indexes, also created at import time.
xapian_search_weibo = XapianSearch(path='/opt/xapian_weibo/data/', name='master_timeline_weibo', schema_version=2)##search by index
xapian_search_user = XapianSearch(path='/opt/xapian_weibo/data/', name='master_timeline_user', schema_version=1)##search by index

# Per-field counters, all starting at zero; same eight field names as fields_value below.
mbr = {"culture":0, "entertainment":0, "fashion":0,'education':0,"finance":0, "sports":0, "technology":0,'media':0}
# The eight field names in alphabetical order.
fields_value = ['culture', 'education', 'entertainment', 'fashion', 'finance', 'media', 'sports', 'technology']


def readProtoUser(path="/home/mirage/linhao/project_bishe/weibo/profile/user_classify/protou.txt"):
    """Load the area -> user-id mapping from a "proto user" file.

    Each non-blank line has the form ``area:uid uid uid ...``. Lines for the
    same area are merged.

    Args:
        path: file to read; defaults to the location the original code
            hard-coded, so existing zero-argument calls behave as before.

    Returns:
        dict mapping each area name to a set of integer user ids.

    Raises:
        IndexError: a non-blank line contains no ``:`` separator.
        ValueError: a user id is not an integer.
    """
    protou = {}
    # `with` closes the file on every path; the original never closed it.
    with open(path, "r") as f:
        for line in f:
            if not line.strip():
                # A blank (e.g. trailing) line used to crash with IndexError.
                continue
            parts = line.split(":")
            protou.setdefault(parts[0], set()).update(int(u) for u in parts[1].split())
    return protou
# -*- coding: utf-8 -*-

from xapian_weibo.xapian_backend import XapianSearch
import leveldb
import datetime
import time
import os

# Search handle on the 'master_timeline' index, opened when the script starts.
s = XapianSearch(path='/opt/xapian_weibo/data/', name='master_timeline')

LEVELDBPATH = '/home/mirage/leveldb'
# Two LevelDB databases; 8 * (2 << 25) == 536870912 for cache and write buffer.
weibo_multi_sentiment_bucket = leveldb.LevelDB(os.path.join(LEVELDBPATH, 'huyue_weibo_multi_sentiment'),
                                               block_cache_size=8 * (2 << 25), write_buffer_size=8 * (2 << 25))
user_daily_sentiment_count_bucket = leveldb.LevelDB(os.path.join(LEVELDBPATH, 'linhao_user_daily_sentiment_count'),
                                                    block_cache_size=8 * (2 << 25), write_buffer_size=8 * (2 << 25))

# Emotion name -> integer code.
emotions_kv = {'happy': 1, 'angry': 2, 'sad': 3}
# Number of days covered by the query window below.
total_days = 89

# The window ends at 02:00 local time today and reaches back total_days days.
today = datetime.datetime.today()
now_ts = time.mktime(datetime.datetime(today.year, today.month, today.day, 2, 0).timetuple())
now_ts = int(now_ts)
during = 24 * 3600  # seconds in one day
begin_ts = now_ts - total_days * during

query_dict = {
    'timestamp': {'$gt': begin_ts, '$lt': now_ts}
}
# NOTE(review): other queries in this file request '_id'; this one asks for 'id' -- confirm the field name.
count, get_results = s.search(query=query_dict, fields=['user', 'id', 'timestamp'])
print count
Ejemplo n.º 15
0
 def setUp(self):
     """Open the weibo and user indexes and fix the December 2011 time window."""
     data_dir = '/home/arthas/dev/xapian_weibo/data/'
     self.weibo_x = XapianSearch(path=data_dir, name='master_timeline_weibo')
     self.user_x = XapianSearch(path=data_dir, name='master_timeline_user', schema_version=1)
     window_start = datetime.datetime(2011, 12, 1)
     window_end = datetime.datetime(2011, 12, 31)
     # time.mktime interprets the time tuples as local time.
     self.begin_ts = time.mktime(window_start.timetuple())
     self.end_ts = time.mktime(window_end.timetuple())
Ejemplo n.º 16
0
        tks = [
            token for token in s.participle(cut_filter(text))
            if 3 < len(token[0]) < 30 or token[0] in single_word_whitelist
        ]
    if cx:
        return tks
    else:
        return [tk[0] for tk in tks]


## End of loading the word-segmentation tool

## Load xapian to read users' verification type
# Directory of the user index; `xs` is a module-level search handle created at import time.
XAPIAN_USER_DATA_PATH = '/home/xapian/xapian_user/'
xs = XapianSearch(path=XAPIAN_USER_DATA_PATH,
                  name='master_timeline_user',
                  schema_version=1)


def read_by_xapian(xs, uid):
    """Look up a user's background record in xapian by user id.

    Args:
        xs: search handle exposing ``search(query=...)`` that returns
            ``(count, get_results)``.
        uid: user id matched against the ``_id`` field.

    Returns:
        The first matching record, or the string ``'other'`` when there is
        none.
    """
    count, get_results = xs.search(query={'_id': uid})
    if count:
        for r in get_results():
            return r
    # Also reached when count is non-zero but no record is yielded; the
    # original fell off the end and returned None in that case.
    return 'other'


## End of loading xapian data
Ejemplo n.º 17
0
class BenchXapianR(hurdles.BenchCase):
    def setUp(self):
        """Open the weibo and user indexes and fix the December 2011 time window."""
        data_dir = '/home/arthas/dev/xapian_weibo/data/'
        self.weibo_x = XapianSearch(path=data_dir, name='master_timeline_weibo')
        self.user_x = XapianSearch(path=data_dir, name='master_timeline_user', schema_version=1)
        window_start = datetime.datetime(2011, 12, 1)
        window_end = datetime.datetime(2011, 12, 31)
        # time.mktime interprets the time tuples as local time.
        self.begin_ts = time.mktime(window_start.timetuple())
        self.end_ts = time.mktime(window_end.timetuple())

    def tearDown(self):
        """Intentionally empty: nothing is released after a benchmark."""
        pass

    """
    def bench_load_users(self):
        query_dict = {
            'created_at': {
                '$gt': self.begin_ts,
                '$lt': self.end_ts,
            }
        }
        count, get_results = self.user_x.search(query=query_dict, fields=['_id', 'name'])
        print count

    def bench_load_users_then_sort(self):
        query_dict = {
            'created_at': {
                '$gt': self.begin_ts,
                '$lt': self.end_ts,
            }
        }
        count, get_results = self.user_x.search(query=query_dict, fields=['_id', 'name'], sort_by=['created_at'])
        print count

    def bench_load_weibos(self):
        query_dict = {
            'timestamp': {
                '$gt': self.begin_ts,
                '$lt': self.end_ts,
            }
        }
        count, get_results = self.weibo_x.search(query=query_dict, fields=['_id', 'user'])
        print count
    """

    def bench_get_results_weibos(self):
        """Search statuses in the configured time window and read every ``_id``."""
        time_window = {'$gt': self.begin_ts, '$lt': self.end_ts}
        _, fetch = self.weibo_x.search(query={'timestamp': time_window},
                                       fields=['_id', 'user'])
        for doc in fetch():
            # The lookup itself is the measured work; the value is discarded.
            _id = doc['_id']

    def bench_get_results_users(self, *args, **kwargs):
        """Search users created in the configured time window and read every ``_id``.

        Extra positional and keyword arguments are accepted and ignored.
        """
        time_window = {'$gt': self.begin_ts, '$lt': self.end_ts}
        _, fetch = self.user_x.search(query={'created_at': time_window},
                                      fields=['_id', 'name'])
        for doc in fetch():
            # The lookup itself is the measured work; the value is discarded.
            _id = doc['_id']

    """
Ejemplo n.º 18
0
# -*- coding:utf-8 -*-

import time
import datetime

from xapian_weibo.xapian_backend import XapianSearch

# Search handle on the user index, created when the script starts.
s = XapianSearch(path='/opt/xapian_weibo/data/', name='master_timeline_user', schema_version=1)

# Query window: 2011-01-01 to 2011-12-31, converted with time.mktime (local time).
begin_ts = time.mktime(datetime.datetime(2011, 1, 1).timetuple())
end_ts = time.mktime(datetime.datetime(2011, 12, 31).timetuple())


"""
query_dict = {
    'created_at': {
        '$gt': begin_ts,
        '$lt': end_ts,
    }
}
count, get_results = s.search(query=query_dict, max_offset=1, fields=['_id', 'name'], sort_by=['created_at'])

print count
for r in get_results():
    print r['_id'], r['name']
"""

"""
query_dict = {
    '$or': [
        {'_id': 1934744637},
Ejemplo n.º 19
0
# -*- coding: utf-8 -*-

from xapian_weibo.xapian_backend import XapianSearch
'''
search_weibo = XapianSearch(path='/opt/xapian_weibo/data/', name='master_timeline_weibo', schema_version=2)
print search_weibo.search(query={'_id': {'gt': 0, 'lt': 30000000000000000000000000}}, count_only=True)

search_user = XapianSearch(path='/opt/xapian_weibo/data/', name='master_timeline_user', schema_version=1)
print search_user.search(query={'_id': {'gt': 0, 'lt': 300000000000000}}, count_only=True)
'''
# Very wide timestamp bounds -- presumably meant to match every document (confirm).
begin_ts = 0
end_ts = 12349873894898
query_dict = {
    'timestamp': {'$gt': begin_ts, '$lt': end_ts},
}
# Open the sentiment index from the 20130807 directory and print the result of a count-only search.
xapian_search_sentiment = XapianSearch(path='/opt/xapian_weibo/data/20130807', name='master_timeline_sentiment', schema_version=3)
print xapian_search_sentiment.search(query=query_dict, count_only=True)
Ejemplo n.º 20
0

import leveldb
import os
import random
import sys

# Directory that holds the LevelDB databases used by this script.
LEVELDBPATH = '/home/mirage/leveldb'
# Both databases are opened at start-up; 8 * (2 << 25) == 536870912 for cache and write buffer.
global_user_field_bucket = leveldb.LevelDB(os.path.join(LEVELDBPATH, 'linhao_global_user_field'),
                                           block_cache_size=8 * (2 << 25), write_buffer_size=8 * (2 << 25))
user_daily_field_bucket = leveldb.LevelDB(os.path.join(LEVELDBPATH, 'linhao_user_daily_field'),
                                          block_cache_size=8 * (2 << 25), write_buffer_size=8 * (2 << 25))

from xapian_weibo.xapian_backend import XapianSearch

user_search = XapianSearch(path='/opt/xapian_weibo/data/', name='master_timeline_user', schema_version=1)

# Very wide _id range -- presumably meant to match every user (confirm).
query_dict = {
	'_id':{
		'$gt': 0,
		'$lt': 100000000000000000000
	}
}

count, get_results = user_search.search(query=query_dict)
print 'user count: ', count

# The eight field names in alphabetical order.
fields_value = ['culture', 'education', 'entertainment', 'fashion', 'finance', 'media', 'sports', 'technology']
# `count` is reset to 0 here, discarding the search's match count printed above.
count = 0
for r in get_results():
	uid = r['_id']
Ejemplo n.º 21
0
# -*- coding: utf-8 -*-

import json
import time
from xapian_weibo.xapian_backend import XapianSearch

XAPIAN_USER_DATA_PATH = '/home/xapian/xapian_user/'
xs = XapianSearch(path=XAPIAN_USER_DATA_PATH, name='master_timeline_user', schema_version=1)
with open('total_users.json', 'w') as f:
    record = 1
    ts = time.time()
    tm = ts
    for count, item in enumerate(xs.iter_all_docs()):#遍历数组的索引值和元素
        f.write(json.dumps(item) + '\n')
        if (count + 1) % 100000 == 0:
            te = time.time()
            span = round(te - tm)
            print '%s chunk spend: %s' % (record, span)
            record += 1
            tm = time.time()
    print 'total docs count: ', (count+1)
    te = time.time()
    total_span = round(te - ts)
    print 'total spend: %s' % total_span
Ejemplo n.º 22
0
class BenchXapianR(hurdles.BenchCase):
    def setUp(self):
        """Open the weibo and user indexes and fix the December 2011 time window."""
        data_dir = '/home/arthas/dev/xapian_weibo/data/'
        self.weibo_x = XapianSearch(path=data_dir, name='master_timeline_weibo')
        self.user_x = XapianSearch(path=data_dir, name='master_timeline_user', schema_version=1)
        window_start = datetime.datetime(2011, 12, 1)
        window_end = datetime.datetime(2011, 12, 31)
        # time.mktime interprets the time tuples as local time.
        self.begin_ts = time.mktime(window_start.timetuple())
        self.end_ts = time.mktime(window_end.timetuple())

    def tearDown(self):
        """Intentionally empty: nothing is released after a benchmark."""
        pass

    """
    def bench_load_users(self):
        query_dict = {
            'created_at': {
                '$gt': self.begin_ts,
                '$lt': self.end_ts,
            }
        }
        count, get_results = self.user_x.search(query=query_dict, fields=['_id', 'name'])
        print count

    def bench_load_users_then_sort(self):
        query_dict = {
            'created_at': {
                '$gt': self.begin_ts,
                '$lt': self.end_ts,
            }
        }
        count, get_results = self.user_x.search(query=query_dict, fields=['_id', 'name'], sort_by=['created_at'])
        print count

    def bench_load_weibos(self):
        query_dict = {
            'timestamp': {
                '$gt': self.begin_ts,
                '$lt': self.end_ts,
            }
        }
        count, get_results = self.weibo_x.search(query=query_dict, fields=['_id', 'user'])
        print count
    """

    def bench_get_results_weibos(self):
        """Search statuses in the configured time window and read every ``_id``."""
        time_window = {'$gt': self.begin_ts, '$lt': self.end_ts}
        _, fetch = self.weibo_x.search(query={'timestamp': time_window}, fields=['_id', 'user'])
        for doc in fetch():
            # The lookup itself is the measured work; the value is discarded.
            _id = doc['_id']

    def bench_get_results_users(self, *args, **kwargs):
        """Search users created in the configured time window and read every ``_id``.

        Extra positional and keyword arguments are accepted and ignored.
        """
        time_window = {'$gt': self.begin_ts, '$lt': self.end_ts}
        _, fetch = self.user_x.search(query={'created_at': time_window}, fields=['_id', 'name'])
        for doc in fetch():
            # The lookup itself is the measured work; the value is discarded.
            _id = doc['_id']

    """
Ejemplo n.º 23
0
# -*- coding: utf-8 -*-

import pymongo, time, codecs, datetime
# Best effort: the xapian backend (or its data directory) may be unavailable.
# In that case neither XapianSearch nor statuses_search is defined and the
# rest of the module, which only needs MongoDB, still loads.
try:
    from xapian_weibo.xapian_backend import XapianSearch
    statuses_search = XapianSearch(path='/opt/xapian_weibo/data/', name='master_timeline_weibo', schema_version=2)
except Exception:
    # Narrowed from a bare `except:` so that SystemExit and KeyboardInterrupt
    # are no longer swallowed here.
    pass

def con_database():
    """Connect to MongoDB, authenticate on ``admin`` and return the crawler database.

    Host, port and credentials are hard-coded below. The returned object is
    the ``test_crawler_liwenwen`` database of the new connection.
    """
    host, port = '219.224.135.60', 27017
    user, password = '******', 'root'
    connection = pymongo.Connection(host, port)
    connection.admin.authenticate(user, password)
    return connection.test_crawler_liwenwen

def main(uid, startdate, enddate):
    startts = date2ts(startdate)
    endts = date2ts(enddate)
    db = con_database()
    print db.users.find({'uid': str(uid), 'ts':{'$gte': startts, '$lte': endts}}).count()
    cursor = db.users.find({'uid': str(uid), 'ts':{'$gte': startts, '$lte': endts}})
    for weibo in cursor:
        print weibo

def date2ts(date):
    """Convert a ``'YYYY-MM-DD'`` string to an integer local-time Unix timestamp.

    Raises ValueError when ``date`` does not match the ``'%Y-%m-%d'`` format.
    """
    parsed = time.strptime(date, '%Y-%m-%d')
    seconds = time.mktime(parsed)
    return int(seconds)
Ejemplo n.º 24
0
def _as_unicode(text):
    """Return *text* as unicode, decoding UTF-8 byte strings; other values pass through."""
    if isinstance(text, bytes):
        return text.decode('utf-8')
    return text


def _record_post_day(date_list, perday_count_list, day):
    """Count one post on *day*, keeping both lists date-sorted and index-aligned.

    A day later than the current last entry is appended, padding the gap with
    zero-count days. A day earlier than the last entry is incremented when
    already present, otherwise inserted at its sorted position (no padding is
    added on that side).
    """
    if not date_list:
        date_list.append(day)
        perday_count_list.append(1)
        return

    last_day = date_list[-1]
    if day == last_day:
        perday_count_list[-1] += 1
    elif day > last_day:
        # A one-day step built from two dates, so only ``date`` is needed.
        one_day = date(2000, 1, 2) - date(2000, 1, 1)
        while date_list[-1] != day:
            date_list.append(date_list[-1] + one_day)
            perday_count_list.append(0)
        perday_count_list[-1] = 1
    elif day in date_list:
        perday_count_list[date_list.index(day)] += 1
    else:
        # day < last_day and is absent, so the scan always stops in range.
        position = 0
        while date_list[position] < day:
            position += 1
        date_list.insert(position, day)
        perday_count_list.insert(position, 1)


def calculate_topic(kw):
    """Collect statistics about weibos matching keyword *kw*.

    Searches the ``master_timeline_weibo`` index for posts whose text matches
    *kw* between 2012-01-01 and 2012-01-10 (UTC boundaries) and aggregates
    them.

    Returns a dict with the keys:
      topic_poster / topic_post_date -- name of the first participant and the
          earliest day seen (``None`` when there is nothing to report)
      topic_participents / topic_leader_count / topic_ori_blog_count /
          blogs_sum -- counts
      topic_leader -- users whose post had reposts + comments > 1500
      topic_rel_blog -- ``{'status', 'user'}`` dicts for posts with more than
          1000 reposts
      topic_url -- ``bmiddle_pic`` values of the posts
      date_list / perday_count_list -- aligned per-day post counts
      geo -- ``province_color_map`` applied to the per-province counts
      topic_working_list / topic_working_count -- field names and how many
          participants carry each one
      topic_index -- persistent, sudden, coverage, media and leader indexes
      gt / lt -- the timestamp bounds used for the search
    """
    # Initialisation
    date_list = []
    perday_count_list = []
    topic_rel_blog = []
    topic_url = []
    topic_participents = []
    topic_leader = []
    topic_ori_blog = []
    blogs_sum = 0
    comments_sum = 0

    city_count={}
    html = '''<select name="province" id="province" defvalue="11"><option value="34">安徽</option><option value="11">北京</option><option value="50">重庆</option><option value="35">福建</option><option value="62">甘肃</option>
                <option value="44">广东</option><option value="45">广西</option><option value="52">贵州</option><option value="46">海南</option><option value="13">河北</option>
                <option value="23">黑龙江</option><option value="41">河南</option><option value="42">湖北</option><option value="43">湖南</option><option value="15">内蒙古</option><option value="32">江苏</option>
                <option value="36">江西</option><option value="22">吉林</option><option value="21">辽宁</option><option value="64">宁夏</option><option value="63">青海</option><option value="14">山西</option><option value="37">山东</option>
                <option value="31">上海</option><option value="51">四川</option><option value="12">天津</option><option value="54">西藏</option><option value="65">新疆</option><option value="53">云南</option><option value="33">浙江</option>
                <option value="61">陕西</option><option value="71">台湾</option><option value="81">香港</option><option value="82">澳门</option><option value="400">海外</option><option value="100">其他</option></select>'''
    province_soup = BeautifulSoup(html)
    for province in province_soup.findAll('option'):
        pp = province.string
        # "Overseas" and "other" are not counted as provinces.
        if pp == u'海外' or pp == u'其他':
            continue
        city_count[pp] = 0

    gt = calendar.timegm(datetime(2012, 1, 1).timetuple())
    lt = calendar.timegm(datetime(2012, 1, 10).timetuple())

    s = XapianSearch(path='/opt/xapian_weibo/data/', name='master_timeline_weibo', schema_version=2)
    count, get_results = s.search(query={'text': [u'%s'%kw], 'timestamp': {'$gt': gt, '$lt': lt}}, sort_by=['timestamp'], fields=['text', 'timestamp','reposts_count','comments_count','user', 'terms', '_id','retweeted_status','bmiddle_pic','geo','source','attitudes_count'])

    for r in get_results():
        # Track the post's day and the number of posts per day.
        _record_post_day(date_list, perday_count_list, date.fromtimestamp(r['timestamp']))

        user = get_user(int(r['user'])) if r['user'] else None
        if user is not None:
            if user not in topic_participents:
                topic_participents.append(user)
            if r['retweeted_status'] is None:
                # Not a repost: this is an original post on the topic.
                topic_ori_blog.append({'status': r, 'user': user})

            reposts = r['reposts_count']
            comments = r['comments_count']
            if reposts is not None and comments is not None and reposts + comments > 1500:
                topic_leader.append(user)
            if reposts is not None and reposts > 1000:
                # The result dict itself is enriched in place, so the same
                # object stored in topic_ori_blog sees these fields too.
                r['created_at'] = datetime.fromtimestamp(r['timestamp'])
                r['text'] = _as_unicode(r['text'])
                source_match = re.match('<.*?>(.*)<.*?>', r['source'])
                # Fall back to the raw value when the source is not wrapped in a tag.
                source_text = source_match.group(1) if source_match else r['source']
                r['source'] = _as_unicode(source_text)
                topic_rel_blog.append({'status': r, 'user': user})
            if r['bmiddle_pic']:
                topic_url.append(r['bmiddle_pic'])

            # Prefer the post's own geo information, else the user's location.
            # Values are normalised to unicode because the province table is
            # keyed by unicode names (assumes UTF-8 byte strings -- TODO confirm).
            if r['geo'] is not None and 'province_name' in r['geo']:
                province_name = _as_unicode(r['geo']['province_name']).split(u'省')[0]
            elif user['location']:
                province_name = _as_unicode(user['location']).split(u' ')[0]
            else:
                province_name = None
            # Membership test skips "overseas"/"other" (never in the table)
            # and any unknown name instead of raising KeyError.
            if province_name in city_count:
                city_count[province_name] += 1

        comments_sum += r['comments_count'] or 0
        blogs_sum += 1

    # Persistence / suddenness / coverage indexes over the per-day series.
    day_count = len(date_list)
    persistent_index = 0
    sudden_ratio = 0.0
    coverage = 0.0
    if day_count:
        avg = blogs_sum / float(day_count)
        surplus = 0
        for day_posts in perday_count_list:
            if day_posts > avg:
                persistent_index += 1
                surplus += day_posts - avg
        sudden_ratio = surplus / float(blogs_sum)
        coverage = (blogs_sum + comments_sum) / (24 * float(day_count))
    sudden_index = '%10.2f' % sudden_ratio
    coverage_index = '%10.2f' % coverage

    # Media index: how many of the 20 most engaged original posts were
    # written by an account listed in the Media table.
    top_medias = set(media.mediaName for media in db.session.query(Media))
    media_list = []
    for entry in topic_ori_blog:
        status = entry['status']
        engagement = (status['comments_count'] or 0) + (status['reposts_count'] or 0)
        media_list.append([entry['user']['name'], engagement])
    media_list.sort(key=lambda item: item[1], reverse=True)
    media_index = sum(1 for name, _ in media_list[:20] if name in top_medias)

    leader_index = len(topic_leader)

    # Distribution of participants over the configured fields.
    work_list = [field.fieldName for field in db.session.query(Field)]
    work_count = [0] * len(work_list)
    for participant in topic_participents:
        user_field = participant['userField']
        if user_field in work_list:
            work_count[work_list.index(user_field)] += 1

    topic_index = {
        'persistent_index': persistent_index,
        'sudden_index': sudden_index,
        'coverage_index': coverage_index,
        'media_index': media_index,
        'leader_index': leader_index,
    }

    map_data = province_color_map(city_count)

    topic_info = {
        'topic_poster': topic_participents[0]['name'] if topic_participents else None,
        'topic_post_date': date_list[0] if date_list else None,
        'topic_leader_count': len(topic_leader),
        'topic_participents': len(topic_participents),
        'blogs_sum': blogs_sum,
        'topic_ori_blog_count': len(topic_ori_blog),
        'topic_url': topic_url,
        'perday_count_list': perday_count_list,
        'date_list': date_list,
        'topic_rel_blog': topic_rel_blog,
        'geo': map_data,
        'topic_leader': topic_leader,
        'topic_working_list': work_list,
        'topic_working_count': work_count,
        'topic_index': topic_index,
        'gt': gt,
        'lt': lt,
    }
    return topic_info
Ejemplo n.º 25
0
 def setUp(self):
     """Set the sample size, open the weibo index and load that many weibo ids."""
     self.n = 10000
     index_location = '/home/arthas/dev/xapian_weibo/data/'
     self.s = XapianSearch(name='master_timeline_weibo', path=index_location)
     # The loader is called after self.s is assigned, as in the original order.
     self.weibo_ids = self._load_weibo_ids_from_xapian(self.n)
# -*- coding: utf-8 -*-

from xapian_weibo.xapian_backend_extra import Schema
from xapian_weibo.xapian_backend import XapianSearch
import leveldb
import datetime
import time
import os

# Root directory that holds the LevelDB databases.
LEVELDBPATH = '/home/mirage/leveldb'
# LevelDB handle opened at import time. Both sizes evaluate to
# 8 * (2 << 25) == 536870912 (presumably bytes, i.e. 512 MiB -- TODO confirm).
weibo_daily_sentiment_count_global_bucket = leveldb.LevelDB(os.path.join(LEVELDBPATH, 'lijun_weibo_daily_sentiment_count_global'),
                                                            block_cache_size=8 * (2 << 25), write_buffer_size=8 * (2 << 25))

# Number of one-day windows walked by the loop below.
total_days = 90
# Sentiment label -> value used for the 'sentiment' query field.
emotions_kv = {'happy': 1, 'angry': 2, 'sad': 3}
s = XapianSearch(path='/opt/xapian_weibo/data/', name='master_timeline_sentiment', schema=Schema, schema_version=1)

# Reference instant: 02:00 (local time) of the current day, as integer seconds.
today = datetime.datetime.today()
now_ts = time.mktime(datetime.datetime(today.year, today.month, today.day, 2, 0).timetuple())
now_ts = int(now_ts)
# Length of one window in seconds (24 hours).
during = 24 * 3600

# i runs from -(total_days - 1) to 0; each iteration covers the 24-hour window
# ending at now_ts + during * i, so the last window ends at now_ts.
for i in xrange(-total_days + 1, 1):
    begin_ts = now_ts + during * (i - 1)
    end_ts = now_ts + during * i
    print i, begin_ts, end_ts
    for emotion in emotions_kv.keys():
        # Query for one sentiment inside the current window (bounds exclusive).
        # NOTE(review): the code that consumes query_dict is not visible here.
        query_dict = {
            'timestamp': {'$gt': begin_ts, '$lt': end_ts},
            'sentiment': emotions_kv[emotion],
        }
Ejemplo n.º 27
0
import os

from xapian_weibo.xapian_backend import XapianSearch

from operator import itemgetter
import datetime
import time
import leveldb


# Root directory that holds the LevelDB databases.
LEVELDBPATH = '/home/mirage/leveldb'
# LevelDB handle opened at import time. Both sizes evaluate to
# 8 * (2 << 25) == 536870912 (presumably bytes, i.e. 512 MiB -- TODO confirm).
global_user_field_bucket = leveldb.LevelDB(os.path.join(LEVELDBPATH, 'linhao_global_user_field_20131012'),
                                           block_cache_size=8 * (2 << 25), write_buffer_size=8 * (2 << 25))

# Xapian search handles for the weibo and user indexes, created at import time.
xapian_search_weibo = XapianSearch(path='/opt/xapian_weibo/data/', name='master_timeline_weibo')  # search by index
xapian_search_user = XapianSearch(path='/opt/xapian_weibo/data/', name='master_timeline_user', schema_version=1)  # search by index
# Field labels used by this module (presumably the user classification
# categories -- verify against the code that reads fields_value).
fields_value = ['culture', 'education', 'entertainment', 'fashion', 'finance', 'media', 'sports', 'technology']


def readProtoUser(path="/home/mirage/linhao/project_bishe/weibo/profile/user_classify/protou.txt"):
    """Load the prototype users of each area from a text file.

    Each non-blank line has the form ``area:uid uid uid ...``. Only the text
    between the first and second ``:`` is read as the uid list. Lines for the
    same area are merged.

    Args:
        path: file to read; defaults to the original hard-coded location.

    Returns:
        dict mapping each area name to a set of integer user ids.

    Raises:
        IndexError: a non-blank line contains no ``:``.
        ValueError: a uid is not an integer.
    """
    protou = {}
    with open(path) as f:
        for line in f:
            # Blank lines (e.g. a trailing newline) carry no data.
            if not line.strip():
                continue
            parts = line.split(":")
            area, members = parts[0], parts[1]
            protou.setdefault(area, set()).update(int(u) for u in members.split())
    return protou