Example #1
def my_topic_classfiy(uid_list, datetime_list):
    topic_dict_results = {}
    topic_string_results = {}
    # Save processed results to the database; before processing, check whether previously stored results already exist, to improve efficiency.
    uids = uid_list
    unresolved_uids = []
    res = es.mget(index=tw_portrait_index_name, doc_type=tw_portrait_index_type, body={'ids': uids})['docs']
    for r in res:
        uid = r['_id']
        if 'found' in r:
            found = r['found']
            if found and 'topic' in r['_source']:
                topic = r['_source']['topic']
                topic_string = r['_source']['topic_string']
                topic_dict_results[uid] = json.loads(topic)
                topic_string_results[uid] = [topic_ch2en_dict[ch_topic] for ch_topic in topic_string.split('&')]
            else:
                unresolved_uids.append(uid)
        else:  # no record for this uid in the ES index yet
            unresolved_uids.append(uid)

    # For uids not found in the database, compute and store the results.
    user_topic_dict = {}
    user_topic_list = {}
    if unresolved_uids:
        tw_flow_text_index_list = [flow_text_index_name_pre + dt for dt in datetime_list]
        user_topic_data = get_filter_keywords(tw_flow_text_index_list, unresolved_uids)
        user_topic_dict, user_topic_list = topic_classfiy(unresolved_uids, user_topic_data)

        user_topic_string = {}
        for uid, topic_list in user_topic_list.items():
            li = []
            for t in topic_list:
                li.append(zh_data[name_list.index(t)].decode('utf8'))
            user_topic_string[uid] = '&'.join(li)
        user_topic = {}
        for uid in unresolved_uids:
            if uid in user_topic_dict:
                user_topic[uid] = {
                    'filter_keywords': json.dumps(user_topic_data[uid]),
                    'topic': json.dumps(user_topic_dict[uid]),
                    'topic_string': user_topic_string[uid]
                }
            else:
                user_topic[uid] = {
                    'filter_keywords': json.dumps({}),
                    'topic': json.dumps({}),
                    'topic_string': ''
                }
        save_data2es(user_topic)

    # Merge cached and newly computed results.
    user_topic_dict.update(topic_dict_results)
    user_topic_list.update(topic_string_results)
    return user_topic_dict, user_topic_list
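
The computed results above are written back through a save_data2es helper that the listing does not show. A minimal sketch of what such a helper might look like, assuming the portrait index/type names used above and the standard elasticsearch-py bulk helper (the upsert action format here is an assumption, not the original implementation):

from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk

es = Elasticsearch()

def save_data2es(user_data, index=tw_portrait_index_name, doc_type=tw_portrait_index_type):
    # Upsert one portrait document per uid so repeated runs stay idempotent.
    actions = [{
        '_op_type': 'update',
        '_index': index,
        '_type': doc_type,
        '_id': uid,
        'doc': fields,
        'doc_as_upsert': True,
    } for uid, fields in user_data.items()]
    bulk(es, actions)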
Example #2
def my_domain_classfiy(uid_list, datetime_list):
    domain_results = {}
    # Save processed results to the database; before processing, check whether previously stored results already exist, to improve efficiency.
    uids = uid_list
    unresolved_uids = []
    res = es.mget(index=fb_portrait_index_name,
                  doc_type=fb_portrait_index_type,
                  body={'ids': uids})['docs']
    for r in res:
        uid = r['_id']
        if 'found' in r:
            found = r['found']
            if found and 'domain' in r['_source']:
                domain = r['_source']['domain']
                domain_results[uid] = domain
            else:
                unresolved_uids.append(uid)
        else:  # no record for this uid in the ES index yet
            unresolved_uids.append(uid)

    # For uids not found in the database, compute and store the results.
    user_domain = {}
    user_domain_temp = {}
    if unresolved_uids:
        fb_flow_text_index_list = [flow_text_index_name_pre + dt for dt in datetime_list]

        user_domain_data = {}
        #load num of text
        count_result = count_text_num(unresolved_uids, fb_flow_text_index_list)
        #load baseinfo
        fb_user_query_body = {
            'query': {
                "filtered": {
                    "filter": {
                        "bool": {
                            "must": [
                                {
                                    "terms": {
                                        "uid": unresolved_uids
                                    }
                                },
                            ]
                        }
                    }
                }
            },
            'size': MAX_SEARCH_SIZE,
            "fields": ["bio_str", "category", "uid"]
        }
        try:
            search_results = es.search(index=facebook_user_index_name,
                                       doc_type=facebook_user_index_type,
                                       body=fb_user_query_body)['hits']['hits']
            for item in search_results:
                content = item['fields']
                uid = content['uid'][0]
                if uid not in user_domain_data:
                    text_num = count_result[uid]
                    user_domain_data[uid] = {
                        'bio_str': '',
                        'category': '',
                        'number_of_text': text_num
                    }
                # For long texts, Goslate splits the input at punctuation/newline boundaries into sub-texts of close to 2000 bytes, queries them one by one, and concatenates the translations before returning. This is how Goslate works around the length limit.
                if 'category' in content:
                    category = content.get('category')[0]
                else:
                    category = ''
                if 'bio_str' in content:
                    bio_str = content.get('bio_str')[0]
                else:
                    bio_str = '____'
                user_domain_data[uid]['bio_str'] = bio_str
                user_domain_data[uid]['category'] = category
        except Exception as e:
            print(e)
        # compute domain labels
        user_domain_temp = domain_main(user_domain_data)
        for uid in unresolved_uids:
            if uid in user_domain_temp:
                user_domain[uid] = {'domain': user_domain_temp[uid]}
            else:
                user_domain_temp[uid] = 'other'
                user_domain[uid] = {'domain': 'other'}
        save_data2es(user_domain)

    # Merge cached and newly computed results (as in Example #1).
    user_domain_temp.update(domain_results)
    return user_domain_temp
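
The comment inside the loop above describes how Goslate handles long inputs. For reference, a minimal Goslate call looks like the following; the batch translation in these examples presumably goes through a wrapper such as trans_bio_data (used in Example #4):

import goslate

gs = goslate.Goslate()
# Translate one string to English. Internally, Goslate splits long inputs
# into sub-texts of roughly 2000 bytes at punctuation/newline boundaries,
# translates each, and rejoins the results.
print(gs.translate('这是一段需要翻译的文本', 'en'))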
Example #3
def my_domain_classfiy(uid_list, datetime_list):
    domain_results = {}
    # Save processed results to the database; before processing, check whether previously stored results already exist, to improve efficiency.
    uids = uid_list
    unresolved_uids = []
    res = es.mget(index=tw_portrait_index_name,
                  doc_type=tw_portrait_index_type,
                  body={'ids': uids})['docs']
    for r in res:
        uid = r['_id']
        if 'found' in r:
            found = r['found']
            if found and 'domain' in r['_source']:
                domain = r['_source']['domain']
                domain_results[uid] = domain
            else:
                unresolved_uids.append(uid)
        else:  # no record for this uid in the ES index yet
            unresolved_uids.append(uid)

    # For uids not found in the database, compute and store the results.
    user_domain = {}
    user_domain_temp = {}
    if unresolved_uids:
        tw_flow_text_index_list = [flow_text_index_name_pre + dt for dt in datetime_list]

        user_domain_data = {}
        #load num of text
        count_result = count_text_num(unresolved_uids, tw_flow_text_index_list)
        #load baseinfo
        tw_user_query_body = {
            'query': {
                "filtered": {
                    "filter": {
                        "bool": {
                            "must": [
                                {
                                    "terms": {
                                        "uid": unresolved_uids
                                    }
                                },
                            ]
                        }
                    }
                }
            },
            'size': MAX_SEARCH_SIZE,
            "fields": ["location", "username", "description", "uid"]
        }
        try:
            search_results = es.search(index=twitter_user_index_name,
                                       doc_type=twitter_user_index_type,
                                       body=tw_user_query_body)['hits']['hits']
            for item in search_results:
                content = item['fields']
                uid = content['uid'][0]
                if uid not in user_domain_data:
                    text_num = count_result[uid]
                    user_domain_data[uid] = {
                        'location': '',
                        'username': '',
                        'description': '',
                        'number_of_text': text_num
                    }
                if 'location' in content:
                    location = content.get('location')[0]
                else:
                    location = ''
                if 'description' in content:
                    description = content.get('description')[0]
                else:
                    description = ''
                if 'username' in content:
                    username = content.get('username')[0]
                else:
                    username = ''
                user_domain_data[uid]['location'] = location
                user_domain_data[uid]['username'] = username
                user_domain_data[uid]['description'] = description
        except Exception as e:
            print(e)
        # compute domain labels
        user_domain_temp = domain_main(user_domain_data)
        for uid in unresolved_uids:
            if uid in user_domain_temp:
                user_domain[uid] = {'domain': user_domain_temp[uid]}
            else:
                user_domain_temp[uid] = 'other'
                user_domain[uid] = {'domain': 'other'}
        save_data2es(user_domain)

    # Merge cached and newly computed results (as in Example #1).
    user_domain_temp.update(domain_results)
    return user_domain_temp
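
The count_text_num helper that seeds number_of_text above is also not shown. A plausible sketch, assuming each flow-text document carries a uid field and using a terms aggregation across the day-suffixed indexes (the aggregation name and the default of 0 are assumptions):

def count_text_num(uid_list, index_list):
    # Count flow-text documents per uid across all requested day indexes.
    query_body = {
        'query': {'filtered': {'filter': {'terms': {'uid': uid_list}}}},
        'size': 0,
        'aggs': {'per_uid': {'terms': {'field': 'uid', 'size': len(uid_list)}}},
    }
    counts = dict((uid, 0) for uid in uid_list)  # uids with no text stay 0
    res = es.search(index=index_list, body=query_body)
    for bucket in res['aggregations']['per_uid']['buckets']:
        counts[bucket['key']] = bucket['doc_count']
    return counts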
Example #4
def my_domain_classfiy(uid_list, datetime_list):
    domain_results = {}
    # Save processed results to the database; before processing, check whether previously stored results already exist, to improve efficiency.
    uids = uid_list
    unresolved_uids = []
    res = es.mget(index=fb_portrait_index_name,
                  doc_type=fb_portrait_index_type,
                  body={'ids': uids})['docs']
    for r in res:
        uid = r['_id']
        if 'found' in r:
            found = r['found']
            if found and 'domain' in r['_source']:
                domain = r['_source']['domain']
                domain_results[uid] = domain
            else:
                unresolved_uids.append(uid)
        else:  # no record for this uid in the ES index yet
            unresolved_uids.append(uid)

    # For uids not found in the database, compute and store the results.
    user_domain = {}
    user_domain_temp = {}
    if unresolved_uids:
        fb_flow_text_index_list = [flow_text_index_name_pre + dt for dt in datetime_list]

        user_domain_data = {}
        #load num of text
        count_result = count_text_num(unresolved_uids, fb_flow_text_index_list)
        #load baseinfo
        fb_user_query_body = {
            'query': {
                "filtered": {
                    "filter": {
                        "bool": {
                            "must": [
                                {
                                    "terms": {
                                        "uid": unresolved_uids
                                    }
                                },
                            ]
                        }
                    }
                }
            },
            'size': MAX_SEARCH_SIZE,
            "fields":
            ["bio", "about", "description", "quotes", "category", "uid"]
        }
        try:
            search_results = es.search(index=facebook_user_index_name,
                                       doc_type=facebook_user_index_type,
                                       body=fb_user_query_body)['hits']['hits']
            for item in search_results:
                content = item['fields']
                uid = content['uid'][0]
                if uid not in user_domain_data:
                    text_num = count_result[uid]
                    user_domain_data[uid] = {
                        'bio_str': '',
                        'bio_list': [],
                        'category': '',
                        'number_of_text': text_num
                    }
                # For long texts, Goslate splits the input at punctuation/newline boundaries into sub-texts of close to 2000 bytes, queries them one by one, and concatenates the translations before returning. This is how Goslate works around the length limit.
                if 'category' in content:
                    category = content.get('category')[0]
                else:
                    category = ''
                if 'description' in content:
                    # Some users' descriptions run past 3000 characters; the excess
                    # is useless and causes translation problems, so keep the first 1000.
                    description = content.get('description')[0][:1000]
                else:
                    description = ''
                if 'quotes' in content:
                    quotes = content.get('quotes')[0][:1000]
                else:
                    quotes = ''
                if 'bio' in content:
                    bio = content.get('bio')[0][:1000]
                else:
                    bio = ''
                if 'about' in content:
                    about = content.get('about')[0][:1000]
                else:
                    about = ''
                user_domain_data[uid]['bio_list'] = [quotes, bio, about, description]
                user_domain_data[uid]['category'] = category
        except Exception as e:
            print(e)
        # Translating one user per request is too slow, so translate in batches.
        trans_uid_list = []
        untrans_bio_data = []
        cut = 100
        n = len(user_domain_data) // cut  # number of full batches
        for uid, content in user_domain_data.items():
            trans_uid_list.append(uid)
            untrans_bio_data.extend(content['bio_list'])
            content.pop('bio_list')
            if n:
                if len(trans_uid_list) % cut == 0:
                    temp_trans_bio_data = trans_bio_data(untrans_bio_data)
                    for i in range(len(trans_uid_list)):
                        uid = trans_uid_list[i]
                        user_domain_data[uid]['bio_str'] = '_'.join(
                            temp_trans_bio_data[4 * i:4 * i + 4])
                    trans_uid_list = []
                    untrans_bio_data = []
                    n = n - 1
            else:
                if len(trans_uid_list) == (len(user_domain_data) % cut):
                    temp_trans_bio_data = trans_bio_data(untrans_bio_data)
                    for i in range(len(trans_uid_list)):
                        uid = trans_uid_list[i]
                        user_domain_data[uid]['bio_str'] = '_'.join(
                            temp_trans_bio_data[4 * i:4 * i + 4])
                    trans_uid_list = []
                    untrans_bio_data = []
        # compute domain labels
        user_domain_temp = domain_main(user_domain_data)
        for uid in unresolved_uids:
            if uid in user_domain_temp:
                user_domain[uid] = {'domain': user_domain_temp[uid]}
            else:
                user_domain_temp[uid] = 'other'
                user_domain[uid] = {'domain': 'other'}
        save_data2es(user_domain)

    # Merge cached and newly computed results (as in Example #1).
    user_domain_temp.update(domain_results)
    return user_domain_temp
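
The two-branch chunking above (full batches while n > 0, then one remainder batch) can be expressed as a single loop over fixed-size slices. An equivalent sketch using the same trans_bio_data helper, with four bio fields per uid joined by '_':

def translate_bios_in_batches(user_domain_data, batch_size=100):
    # Flatten each user's four bio fields, translate one uid batch per call,
    # then stitch every user's four translations back into a single bio_str.
    uids = list(user_domain_data.keys())
    for start in range(0, len(uids), batch_size):
        batch = uids[start:start + batch_size]
        untranslated = []
        for uid in batch:
            untranslated.extend(user_domain_data[uid].pop('bio_list'))
        translated = trans_bio_data(untranslated)
        for i, uid in enumerate(batch):
            user_domain_data[uid]['bio_str'] = '_'.join(translated[4 * i:4 * i + 4])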