Example #1
0
def acquire_user_by_id(uid):
    """Fetch selected profile fields for one user from the Xapian user index.

    ``uid`` is converted with ``int()`` before the lookup.  Returns a dict
    with the keys ``name``, ``location``, ``count1`` (followers count) and
    ``count2`` (friends count); an empty dict is returned when the lookup
    gives a falsy result.
    """
    XAPIAN_USER_DATA_PATH = "/home/xapian/xapian_user/"
    searcher = XapianSearch(path=XAPIAN_USER_DATA_PATH, name="master_timeline_user", schema_version=1)
    hit = searcher.search_by_id(
        int(uid),
        fields=["name", "location", "followers_count", "friends_count"],
    )
    if not hit:
        return {}

    return {
        "name": hit["name"],
        "location": hit["location"],
        "count1": hit["followers_count"],
        "count2": hit["friends_count"],
    }
Example #2
0
def acquire_user_by_id(uid):
    """Fetch selected profile fields for one user from the Xapian user index.

    ``uid`` is converted with ``int()`` before the lookup.  Returns a dict
    with ``name``, ``location``, ``followers_count``, ``friends_count`` and
    ``profile_image_url``, or ``None`` when the lookup gives a falsy result.
    """
    wanted = ('name', 'location', 'followers_count', 'friends_count', 'profile_image_url')
    searcher = XapianSearch(path=XAPIAN_USER_DATA_PATH, name='master_timeline_user', schema_version=1)
    hit = searcher.search_by_id(int(uid), fields=list(wanted))
    if not hit:
        return None

    # Copy the requested fields, in the same order, into a plain dict.
    return {key: hit[key] for key in wanted}
Example #3
0
def acquire_user_by_id(uid):
    """Return basic profile data for ``uid`` from the Xapian user index.

    The result maps ``name`` and ``location`` to the stored values,
    ``count1`` to the followers count and ``count2`` to the friends count.
    When the lookup gives a falsy result an empty dict is returned.
    """
    XAPIAN_USER_DATA_PATH = '/home/xapian/xapian_user/'
    index = XapianSearch(path=XAPIAN_USER_DATA_PATH, name='master_timeline_user', schema_version=1)
    record = index.search_by_id(int(uid), fields=['name', 'location', 'followers_count', 'friends_count'])

    profile = {}
    if not record:
        return profile

    profile.update(name=record['name'], location=record['location'])
    profile['count1'] = record['followers_count']
    profile['count2'] = record['friends_count']
    return profile
Example #4
0
def getXapianWeiboByDate(datestr):
    """Open the weibo stub for one day.

    ``datestr`` is a date string such as ``20130908``; it is appended to the
    ``path`` prefix defined outside this function to form the stub file
    name.  Returns a ``XapianSearch`` on that stub, or ``None`` when the
    stub file does not exist.
    """
    stub_location = path + datestr
    if not os.path.exists(stub_location):
        return None
    return XapianSearch(stub=stub_location, schema_version='5')
Example #5
0
def getXapianWeiboByTopic(topic_id='545f4c22cf198b18c57b8014'):
    """Open the weibo stub that belongs to ``topic_id``.

    The stub file name is ``XAPIAN_WEIBO_TOPIC_DATA_PATH`` followed by
    ``stub/xapian_weibo_topic_stub_<topic_id>``.  Returns a ``XapianSearch``
    on that stub, or ``None`` when the file does not exist.
    """
    stub_location = XAPIAN_WEIBO_TOPIC_DATA_PATH + 'stub/xapian_weibo_topic_stub_' + str(topic_id)
    if not os.path.exists(stub_location):
        return None
    return XapianSearch(stub=stub_location, schema_version='5')
Example #6
0
def getXapianWeiboByTopic(topic_id='54635178e74050a373a1b939'):
    """Open the weibo stub that belongs to ``topic_id``.

    Prints whether the stub file exists.  Returns a ``XapianSearch`` on the
    stub, or ``None`` when the file is missing.
    """
    stub_location = XAPIAN_WEIBO_TOPIC_DATA_PATH + 'stub/xapian_weibo_topic_stub_' + str(topic_id)
    if not os.path.exists(stub_location):
        print('stub not exist')
        return None

    print('stub exist')
    return XapianSearch(stub=stub_location, schema_version='5')
def getXapianWeiboByTopic(topic_id='54ccbfab5a220134d9f7fc1b37'):
    """Open the weibo stub that belongs to ``topic_id``.

    Prints whether the stub file exists.  Returns a ``XapianSearch`` on the
    stub, or ``None`` when the file is missing.
    """
    stub_location = XAPIAN_WEIBO_TOPIC_DATA_PATH + 'stub/xapian_weibo_topic_stub_' + str(topic_id)
    found = os.path.exists(stub_location)
    print('stub exist' if found else 'stub not exist')
    if found:
        return XapianSearch(stub=stub_location, schema_version='5')
    return None
Example #8
0
def getXapianWeiboByTopic(topic):
    """Open the fixed topic stub ``master_timeline_weibo_topic``.

    ``topic`` is accepted but not used: the stub location is hard-coded.
    The location and whether it exists are printed.  Returns a
    ``XapianSearch`` on the stub, or ``None`` when the file is missing.
    """
    stub_location = '/home/ubuntu4/ljh/csv/stub/master_timeline_weibo_topic'
    print(stub_location)
    if not os.path.exists(stub_location):
        print('stub not exist')
        return None

    print('stub exist')
    return XapianSearch(stub=stub_location, schema_version='5')
Example #9
0
def getXapianWeiboByDate(datestr):
    """Open the weibo stub for one day, printing progress information.

    ``datestr`` is a date string such as ``20130908``; it is appended to the
    ``path`` prefix defined outside this function.  Returns a
    ``XapianSearch`` on the stub, or ``None`` when the file is missing.
    """
    stub_location = path + datestr
    print(stub_location)
    if not os.path.exists(stub_location):
        print('stub not exist')
        return None

    print('step--stub exist')
    return XapianSearch(stub=stub_location, schema_version='5')
Example #10
0
def getXapianWeiboByTopic(topic, start_ts, end_ts):
    """Open the weibo stub for a topic within a time range.

    The topic id is obtained from ``topic2xapian(topic, start_ts, end_ts)``
    and used to build the stub file name under
    ``/home/xapian/xapian_weibo_topic/stub/``.  Prints whether the stub
    exists.  Returns a ``XapianSearch`` on it, or ``None`` when it is
    missing.
    """
    resolved_id = topic2xapian(topic, start_ts, end_ts)
    XAPIAN_WEIBO_TOPIC_DATA_PATH = '/home/xapian/xapian_weibo_topic/'
    stub_location = XAPIAN_WEIBO_TOPIC_DATA_PATH + 'stub/xapian_weibo_topic_stub_' + str(resolved_id)
    if not os.path.exists(stub_location):
        print('stub not exist')
        return None

    print('stub exist')
    return XapianSearch(stub=stub_location, schema_version='5')
Example #11
0
def acquire_user_by_id(uid):
    """Return profile data for ``uid`` from the Xapian user index.

    The returned dict carries ``name``, ``location``, ``followers_count``,
    ``friends_count`` and ``profile_image_url``.  ``None`` is returned when
    the lookup gives a falsy result.
    """
    index = XapianSearch(path=XAPIAN_USER_DATA_PATH, name='master_timeline_user', schema_version=1)
    record = index.search_by_id(
        int(uid),
        fields=['name', 'location', 'followers_count', 'friends_count', 'profile_image_url'])
    if not record:
        return None

    return {
        'name': record['name'],
        'location': record['location'],
        'followers_count': record['followers_count'],
        'friends_count': record['friends_count'],
        'profile_image_url': record['profile_image_url'],
    }
Example #12
0
def getXapianWeiboByDuration(datestr_list):
    """Open one searcher spanning the stubs of several days.

    Each entry of ``datestr_list`` is appended to the ``path`` prefix
    defined outside this function; only stub files that exist are kept.
    Returns a ``XapianSearch`` built from the list of existing stubs (with
    ``include_remote=True``), or ``None`` when none of them exists.
    """
    candidates = (path + datestr for datestr in datestr_list)
    existing_stubs = [stub for stub in candidates if os.path.exists(stub)]
    if not existing_stubs:
        return None

    return XapianSearch(stub=existing_stubs, include_remote=True, schema_version='5')
Example #13
0
               'university', 'homeadmin', 'abroadadmin', 'homemedia', 'abroadmedia', 'folkorg', \
               'lawyer', 'politician', 'mediaworker', 'activer', 'grassroot', 'other']
# Chinese display names of the domain categories.
# NOTE(review): presumably parallel, entry by entry, to the English list that
# ends just above -- confirm, since the start of that list is not visible here.
DOMAIN_ZH_LIST = [u'文化', u'教育', u'娱乐', u'时尚', u'财经', u'媒体', u'体育', u'科技', u'境外', \
                  u'高校微博', u'境内机构', u'境外机构', u'境内媒体', u'境外媒体', u'民间组织', u'律师', \
                  u'政府官员', u'媒体人士', u'活跃人士', u'草根', u'其它']

# Connection settings for the MySQL, MongoDB and SSDB services.
MYSQL_HOST = '219.224.135.47'
MYSQL_USER = '******'
MYSQL_DB = 'weibocase'
MONGODB_HOST = '219.224.135.47'
MONGODB_PORT = 27019
SSDB_PORT = 8888
SSDB_HOST = '219.224.135.47'  # the SSDB server is on host 47
# Directories of the Xapian user index and of the per-topic weibo indexes.
XAPIAN_USER_DATA_PATH = '/home/xapian/xapian_user/'
XAPIAN_WEIBO_TOPIC_DATA_PATH = '/home/xapian/xapian_weibo_topic/'

# Searcher over the user index, created once when this module is imported.
xapian_search_user = XapianSearch(path=XAPIAN_USER_DATA_PATH,
                                  name='master_timeline_user',
                                  schema_version=1)

# Settings for the API service and the "54api" MongoDB database/collections.
API_HOST = '219.224.135.47'
API_PORT = 9115
MASTER_TIMELINE_54API_MONGOD_HOST = '219.224.135.47'
MASTER_TIMELINE_54API_MONGOD_PORT = 27019
MASTER_TIMELINE_54API_WEIBO_DB = '54api_weibo_v2'
MASTER_TIMELINE_54API_USER_COLLECTION = 'master_timeline_user'
# NOTE(review): the constant is named DAILY but its value says "weekly" -- confirm which is intended.
MASTER_TIMELINE_54API_WEIBO_DAILY_COLLECTION_PREFIX = 'master_timeline_weibo_weekly_'
MASTER_TIMELINE_54API_WEIBO_TOPIC_COLLECTION_PREFIX = 'master_timeline_weibo_topic_'
MASTER_TIMELINE_54API_TOPIC_COLLECTION = 'master_timeline_topic'
MASTER_TIMELINE_54API_WEIBO_REPOST_COLLECTION = 'master_timeline_weibo_repost'
Example #14
0
# Time spans expressed in seconds.
MINUTE = 60
FIFTEENMINUTES = 15 * MINUTE
HOUR = 3600
SIXHOURS = 6 * HOUR
DAY = 24 * HOUR
# TENSECONDS is not defined in the visible part of this file; presumably it
# is declared just above -- TODO confirm.
INTERVAL = TENSECONDS

# Redis connection settings.
REDIS_HOST = '219.224.135.48'
REDIS_PORT = 6379
USER_DOMAIN = 'user_domain' # user domain hash

# Default query window: 2013-09-01 00:00:00 to 00:01:00.  time.mktime
# interprets the time tuple as local time.
BEGIN_TS = time.mktime(datetime.datetime(2013, 9, 1, 0, 0, 0).timetuple())
END_TS = time.mktime(datetime.datetime(2013, 9, 1, 0, 1, 0).timetuple())


# Module-level searcher created at import time; PATH is defined outside the
# visible code.
s = XapianSearch(stub = PATH, schema_version = '5')

def _default_redis(host=REDIS_HOST, port=REDIS_PORT, db=0):
    """Create a ``redis.StrictRedis`` client for the given host, port and db."""
    client = redis.StrictRedis(host=host, port=port, db=db)
    return client
r = _default_redis()


def cron_index_topic(topic, begin_ts = BEGIN_TS, end_ts = END_TS):
    """Search the module-level index ``s`` for weibos of one topic.

    Nothing happens when ``topic`` is falsy.  Otherwise the topic is stripped
    and a query is issued with a ``timestamp`` range (``$gt`` ``begin_ts``,
    ``$lt`` ``end_ts``) and ``topics`` set to the stripped topic, sorted by
    ``SORT_FIELD`` and limited to the fields in ``RESP_ITER_KEYS``.

    NOTE(review): the visible body ends right after the search call and never
    uses ``count`` or ``results``; the rest of the function may be missing
    from this excerpt.
    """
    # For a str topic the second test is redundant: a truthy string is never ''.
    if topic and topic != '':
        topic = topic.strip()
        query_dict = {
                'timestamp':{'$gt':begin_ts,'$lt':end_ts},
                'topics':topic
                }

        count,results = s.search(query=query_dict, sort_by=[SORT_FIELD], fields=RESP_ITER_KEYS)
Example #15
0
# -*- coding:utf-8 -*-

import sys
import time
import datetime

sys.path.append('../xapian_case')
from xapian_case.xapian_backend import XapianSearch
from xapian_case.utils import top_keywords, not_low_freq_keywords, gen_mset_iter

# The default schema_version is 2
s = XapianSearch(path='/home/ubuntu3/huxiaoqian/data/20140724/20140724/',
                 name='master_timeline_weibo',
                 schema_version='5')

# For import and initialization, please use the usage below
# from xapian_weibo.xapian_backend import XapianSearch
# s = XapianSearch(path='/opt/xapian_weibo/data/', name='master_timeline_weibo')
# Query conditions: user (id), text, topic, timestamp, reposts_count, comments_count, attitudes_count (the last four, starting from timestamp, can be given a range and used for sorting)
# The returned fields are basically the same as those of the Sina API; note there is no created_at, timestamp is used instead
# Note the additional returned field terms: a dict of the words in each weibo and their frequencies, so there is no need to tokenize the text yourself after fetching it
# If the fields argument is omitted or None, all fields except terms are returned
# To get terms back, list the needed fields one by one and include terms
# A simple example follows
'''
count, get_results = s.search(query={'text': [u'男士']}, sort_by=['-timestamp'], fields=['text', 'timestamp', 'user', 'terms', '_id'])

print 'query1:'
#根据text查询
if count!=0:
    for r in get_results():
Example #16
0
def save_weibos(excel_name, topic, child_topic_list, w_limit):
    """Load weighted weibo texts from an Excel workbook and store them per child topic.

    For every entry of ``child_topic_list`` the sheet named
    ``str(int(entry))`` is read and at most ``w_limit`` leading rows are kept
    as ``(text, weight)`` pairs (column 1 is the text, column 0 the weight).
    Test weibos are then fetched from a hard-coded Xapian index, their text is
    overwritten with the Excel texts, user and link information is attached,
    and one ``OpinionTestWeibos`` row per child topic index is written through
    ``db.session``, replacing an existing row for the same topic/child topic.

    Args:
        excel_name: Workbook path passed to ``xlrd.open_workbook``.
        topic: Topic value stored on every ``OpinionTestWeibos`` row.
        child_topic_list: Child topic identifiers; each must be convertible
            with ``int()`` to obtain its sheet name.
        w_limit: Maximum number of rows read from each sheet.

    TODO: the related weibo ids etc. still need to be looked up by text content.
    """
    data = xlrd.open_workbook(excel_name)
    weibos_dict = {}
    for i in child_topic_list:
        table_weibos = data.sheet_by_name(str(int(i)))
        # Keep at most w_limit rows; the data is considered already sorted by
        # weight in descending order.  (The previous code assigned the
        # undefined name ``n_row_weibo`` here, which raised NameError whenever
        # a sheet had no more than w_limit rows.)
        n_rows = min(table_weibos.nrows, w_limit)
        # TODO: the remaining weibo fields matching each text are not looked
        # up yet; ideally the complete weibo (username etc.) is stored here.
        pairs = []
        for j in range(n_rows):
            line = table_weibos.row_values(j)
            pairs.append((line[1], line[0]))  # (text, weight)
        weibos_dict[i] = pairs

    # Fetch concrete weibo data -- for testing only.
    s = XapianSearch(path='/home/ubuntu3/huxiaoqian/case/20140724/20140804/', name='master_timeline_weibo',schema_version='5')
    begin_ts = 1378050300
    end_ts = 1378051200
    query_dict = {
        'timestamp': {'$gt':begin_ts, '$lt': end_ts},
        'message_type' : 2
    }
    weibos_dict_new = {}
    scount, weibo_results =s.search(query=query_dict, fields=fields_list)
    # i walks the child topics and j the position inside a topic: five weibos
    # are assigned per topic and at most eleven topics (0..10) are filled.
    # NOTE(review): the lookups below use str(i) as key, so child_topic_list
    # presumably holds the strings '0', '1', ... -- confirm against callers.
    i = 0
    j = 0
    for weibo in weibo_results():
        if i == 11:
            break
        weibo['text'] = weibos_dict[str(i)][j][0]
        # Attach username, profile image and weibo URL.
        username, profileimage = getuserinfo(weibo['user'])
        weibo['username'] = username
        weibo['profile_image_url'] = profileimage
        weibo['timestamp'] = ts2date(weibo['timestamp'])
        weibo['weibo_link'] = weiboinfo2url(weibo['user'],weibo['_id'])
        weight = weibos_dict[str(i)][j][1]
        weibos_dict_new.setdefault(i, []).append((weibo, weight))
        if j == 4:
            j = 0
            i += 1
        else:
            j += 1

    # Replace any stored row for (topic, child topic index) with the new data.
    for i in range(len(child_topic_list)):
        item = OpinionTestWeibos(topic, i, json.dumps(weibos_dict_new[i]))
        item_exist = db.session.query(OpinionTestWeibos).filter(OpinionTestWeibos.topic==topic, \
                                                                OpinionTestWeibos.child_topic==i).first()
        if item_exist:
            db.session.delete(item_exist)
        db.session.add(item)
    db.session.commit()
Example #17
0
# -*- coding:utf-8 -*-

import sys
import time
import datetime

sys.path.append('../xapian_case')
from xapian_case.xapian_backend import XapianSearch
from xapian_case.utils import top_keywords, not_low_freq_keywords, gen_mset_iter

# The default schema_version is 2
s = XapianSearch(path='/home/ubuntu3/huxiaoqian/data/20140724/20140724/', name='master_timeline_weibo',schema_version='5')

# For import and initialization, please use the usage below
# from xapian_weibo.xapian_backend import XapianSearch
# s = XapianSearch(path='/opt/xapian_weibo/data/', name='master_timeline_weibo')
# Query conditions: user (id), text, topic, timestamp, reposts_count, comments_count, attitudes_count (the last four, starting from timestamp, can be given a range and used for sorting)
# The returned fields are basically the same as those of the Sina API; note there is no created_at, timestamp is used instead
# Note the additional returned field terms: a dict of the words in each weibo and their frequencies, so there is no need to tokenize the text yourself after fetching it
# If the fields argument is omitted or None, all fields except terms are returned
# To get terms back, list the needed fields one by one and include terms
# A simple example follows

'''
count, get_results = s.search(query={'text': [u'男士']}, sort_by=['-timestamp'], fields=['text', 'timestamp', 'user', 'terms', '_id'])

print 'query1:'
#根据text查询
if count!=0:
    for r in get_results():
        print "** " * 10
Example #18
0
def save_weibos(excel_name, topic, child_topic_list,
                w_limit):
    """Load weighted weibo texts from an Excel workbook and store them per child topic.

    For every entry of ``child_topic_list`` the sheet named
    ``str(int(entry))`` is read and at most ``w_limit`` leading rows are kept
    as ``(text, weight)`` pairs (column 1 is the text, column 0 the weight).
    Test weibos are then fetched from a hard-coded Xapian index, their text is
    overwritten with the Excel texts, user and link information is attached,
    and one ``OpinionTestWeibos`` row per child topic index is written through
    ``db.session``, replacing an existing row for the same topic/child topic.

    Args:
        excel_name: Workbook path passed to ``xlrd.open_workbook``.
        topic: Topic value stored on every ``OpinionTestWeibos`` row.
        child_topic_list: Child topic identifiers; each must be convertible
            with ``int()`` to obtain its sheet name.
        w_limit: Maximum number of rows read from each sheet.

    TODO: the related weibo ids etc. still need to be looked up by text content.
    """
    data = xlrd.open_workbook(excel_name)
    weibos_dict = {}
    for i in child_topic_list:
        table_weibos = data.sheet_by_name(str(int(i)))
        # Keep at most w_limit rows; the data is considered already sorted by
        # weight in descending order.  (The previous code assigned the
        # undefined name ``n_row_weibo`` here, which raised NameError whenever
        # a sheet had no more than w_limit rows.)
        n_rows = min(table_weibos.nrows, w_limit)
        # TODO: the remaining weibo fields matching each text are not looked
        # up yet; ideally the complete weibo (username etc.) is stored here.
        pairs = []
        for j in range(n_rows):
            line = table_weibos.row_values(j)
            pairs.append((line[1], line[0]))  # (text, weight)
        weibos_dict[i] = pairs

    # Fetch concrete weibo data -- for testing only.
    s = XapianSearch(path='/home/ubuntu3/huxiaoqian/case/20140724/20140804/',
                     name='master_timeline_weibo',
                     schema_version='5')
    begin_ts = 1378050300
    end_ts = 1378051200
    query_dict = {
        'timestamp': {
            '$gt': begin_ts,
            '$lt': end_ts
        },
        'message_type': 2
    }
    weibos_dict_new = {}
    scount, weibo_results = s.search(query=query_dict, fields=fields_list)
    # i walks the child topics and j the position inside a topic: five weibos
    # are assigned per topic and at most eleven topics (0..10) are filled.
    # NOTE(review): the lookups below use str(i) as key, so child_topic_list
    # presumably holds the strings '0', '1', ... -- confirm against callers.
    i = 0
    j = 0
    for weibo in weibo_results():
        if i == 11:
            break
        weibo['text'] = weibos_dict[str(i)][j][0]
        # Attach username, profile image and weibo URL.
        username, profileimage = getuserinfo(weibo['user'])
        weibo['username'] = username
        weibo['profile_image_url'] = profileimage
        weibo['timestamp'] = ts2date(weibo['timestamp'])
        weibo['weibo_link'] = weiboinfo2url(weibo['user'], weibo['_id'])
        weight = weibos_dict[str(i)][j][1]
        weibos_dict_new.setdefault(i, []).append((weibo, weight))
        if j == 4:
            j = 0
            i += 1
        else:
            j += 1

    # Replace any stored row for (topic, child topic index) with the new data.
    for i in range(len(child_topic_list)):
        item = OpinionTestWeibos(topic, i, json.dumps(weibos_dict_new[i]))
        item_exist = db.session.query(OpinionTestWeibos).filter(OpinionTestWeibos.topic==topic, \
                                                                OpinionTestWeibos.child_topic==i).first()
        if item_exist:
            db.session.delete(item_exist)
        db.session.add(item)
    db.session.commit()
Example #19
0
# -*- coding:utf-8 -*-

import sys
import time
import datetime

sys.path.append('../xapian_case')
from xapian_case.xapian_backend import XapianSearch
from xapian_case.utils import top_keywords, not_low_freq_keywords, gen_mset_iter

# The default schema_version is 2
s = XapianSearch(path='/home/ubuntu3/huxiaoqian/case/20140724/20140724/', name='master_timeline_weibo',schema_version='5')
# user
'''
count, get_results = s.search(query={'user': 1811093512}, fields=['text', 'timestamp', 'user', 'terms', '_id'])
print 'query1:'
if count!=0:
    for r in get_results():
        print "** " * 10
        print r['_id']
        print r['user']
        print r['text']
        print r['timestamp']
        print r['terms']

    print 'hits: %s' % count
else:
    print 'no results'
'''
get_results = s.iter_all_docs(fields=['_id', 'user', 'retweeted_uid', 'retweeted_mid', 'text', 'timestamp', 'reposts_count', 'source', 'bmiddle_pic', 'geo', 'attitudes_count', 'comments_count', 'sentiment', 'topics', 'message_type', 'terms' ])
for r in get_results: