def user_sort_interface(username , time ,sort_scope , sort_norm , arg = None, st = None, et = None, isall = False, task_number=0, number=100):

    task_number = int(task_number)
    print "user_interface:", number
    user_list = []
    if isall:
        #deal with the situation of all net user
        if sort_scope == 'all_limit_keyword':
            #offline job
            #add job to es index
            during = ( datetime2ts(et) - datetime2ts(st) ) / DAY + 1
            time = 1
            if during > 3:
                time = 7
            elif during > 16:
                time = 30
            running_number = es_user_portrait.count(index='user_rank_keyword_task', doc_type='user_rank_task', body=query_task_number(username))['count']
            if running_number > task_number-1:
                return "more than limit"
            search_id = add_task( username ,"keyword" , "all" ,'flow_text_' , during , st ,et, arg , sort_norm , sort_scope, time, isall, number)
            #deal with the offline task   
            return {"flag":True , "search_id" : search_id }
        elif sort_scope == 'all_nolimit':
            #online job
            print "all_sort, ", number
            user_list = all_sort_filter(None,sort_norm,time,False,number)
    else:
        if sort_scope == 'in_limit_keyword':
            #offline job
            #deal with the offline task
            during = ( datetime2ts(et) - datetime2ts(st) ) / DAY + 1
            time = 1
            if during > 3:
                time = 7
            elif during > 16:
                time = 30
            running_number = es_user_portrait.count(index='user_rank_keyword_task', doc_type='user_rank_task', body=query_task_number(username))['count']
            if running_number > task_number-1:
                return "more than limit"
            search_id = add_task( username ,"keyword" , "in" ,'flow_text_' , during , st ,et , arg , sort_norm , sort_scope, time, isall, number)
            return {"flag":True , "search_id" : search_id }
        elif sort_scope == 'in_limit_hashtag':
            during = ( datetime2ts(et) - datetime2ts(st) ) / DAY + 1
            time = 1
            if during > 3:
                time = 7
            elif during > 16:
                time = 30
            running_number = es_user_portrait.count(index='user_rank_keyword_task', doc_type='user_rank_task', body=query_task_number(username))['count']
            if running_number > task_number-1:
                return "more than limit"
            search_id = add_task( username ,"hashtag" , "in" ,'flow_text_' , during , st ,et, arg , sort_norm , sort_scope, time, isall,  number)
            return {"flag":True , "search_id" : search_id }
        else:
            #find the scope
            user_list = in_sort_filter(time , sort_norm,sort_scope , arg,[], False, number)
    
    result = make_up_user_info(user_list,isall , time , sort_norm)
    print "user_list:", len(user_list)
    return result
def user_sort_interface(username , time ,sort_scope , sort_norm , arg = None, st = None, et = None, isall = False):

    user_list = []
    if isall:
        #deal with the situation of all net user
        if sort_scope == 'all_limit_keyword':
            #offline job
            #add job to es index
            during = ( datetime2ts(et) - datetime2ts(st) ) / DAY + 1
            time = 1
            if during > 3:
                time = 7
            elif during > 16:
                time = 30
            search_id = add_task( username ,"keyword" , "all" ,'flow_text_' , during , st ,et, arg , sort_norm , sort_scope, time, isall)
            #deal with the offline task   
            return {"flag":True , "search_id" : search_id }
        elif sort_scope == 'all_nolimit':
            #online job
            user_list = all_sort_filter(None,sort_norm,time)
    else:
        if sort_scope == 'in_limit_keyword':
            #offline job
            #deal with the offline task
            during = ( datetime2ts(et) - datetime2ts(st) ) / DAY + 1
            time = 1
            if during > 3:
                time = 7
            elif during > 16:
                time = 30
            search_id = add_task( username ,"keyword" , "in" ,'flow_text_' , during , st ,et , arg , sort_norm , sort_scope, time, isall)
            return {"flag":True , "search_id" : search_id }
        elif sort_scope == 'in_limit_hashtag':
            during = ( datetime2ts(et) - datetime2ts(st) ) / DAY + 1
            time = 1
            if during > 3:
                time = 7
            elif during > 16:
                time = 30
            search_id = add_task( username ,"hashtag" , "in" ,'flow_text_' , during , st ,et, arg , sort_norm , sort_scope, time, isall)
            return {"flag":True , "search_id" : search_id }
        else:
            #find the scope
            user_list = in_sort_filter(time , sort_norm,sort_scope , arg)
    
    result = make_up_user_info(user_list,isall , time , sort_norm)
    return result
Exemple #3
0
def user_sort_interface(username,
                        time,
                        sort_scope,
                        sort_norm,
                        arg=None,
                        st=None,
                        et=None,
                        isall=False,
                        task_number=0,
                        number=100):

    task_number = int(task_number)
    print "user_interface:", number
    user_list = []
    if isall:
        #deal with the situation of all net user
        if sort_scope == 'all_limit_keyword':
            #offline job
            #add job to es index
            during = (datetime2ts(et) - datetime2ts(st)) / DAY + 1
            time = 7
            if during > 3:
                time = 7
            elif during > 16:
                time = 30
            running_number = es_user_portrait.count(
                index='user_rank_keyword_task',
                doc_type='user_rank_task',
                body=query_body)['count']
            if running_number > task_number - 1:
                return "more than limit"
            search_id = add_task(username, "keyword", "all", 'flow_text_',
                                 during, st, et, arg, sort_norm, sort_scope,
                                 time, isall, number)
            #deal with the offline task
            return {"flag": True, "search_id": search_id}
        elif sort_scope == 'all_nolimit':
            #online job
            print "all_sort, ", number
            user_list = all_sort_filter(None, sort_norm, time, False, number)
    else:
        if sort_scope == 'in_limit_keyword':
            #offline job
            #deal with the offline task
            during = (datetime2ts(et) - datetime2ts(st)) / DAY + 1
            time = 7
            if during > 3:
                time = 7
            elif during > 16:
                time = 30
            running_number = es_user_portrait.count(
                index='user_rank_keyword_task',
                doc_type='user_rank_task',
                body=query_body)['count']
            if running_number > task_number - 1:
                return "more than limit"
            search_id = add_task(username, "keyword", "in", 'flow_text_',
                                 during, st, et, arg, sort_norm, sort_scope,
                                 time, isall, number)
            return {"flag": True, "search_id": search_id}
        elif sort_scope == 'in_limit_hashtag':
            during = (datetime2ts(et) - datetime2ts(st)) / DAY + 1
            time = 7
            if during > 3:
                time = 7
            elif during > 16:
                time = 30
            running_number = es_user_portrait.count(
                index='user_rank_keyword_task',
                doc_type='user_rank_task',
                body=query_body)['count']
            if running_number > task_number - 1:
                return "more than limit"
            search_id = add_task(username, "hashtag", "in", 'flow_text_',
                                 during, st, et, arg, sort_norm, sort_scope,
                                 time, isall, number)
            return {"flag": True, "search_id": search_id}
        else:
            #find the scope
            user_list = in_sort_filter(time, sort_norm, sort_scope, arg, [],
                                       False, number)

    result = make_up_user_info(user_list, isall, time, sort_norm)
    print "user_list:", len(user_list)
    return result
Exemple #4
0
def key_words_search(task_id,
                     search_type,
                     pre,
                     during,
                     start_time,
                     keyword_list,
                     search_key='',
                     sort_norm='',
                     sort_scope='',
                     time=1,
                     isall=False,
                     number=100):
    number = int(number)
    should = []
    for key in keyword_list:
        if search_type == "hashtag":
            should.append({"prefix": {"text": "#" + key + "#"}})
        else:
            should.append({"wildcard": {"text": "*" + key + "*"}})
    index_list = []
    date = ts2datetime(start_time)
    index_name = pre + date
    while during:
        if es_flow_text.indices.exists(index=index_name):
            index_list.append(index_name)
            start_time = start_time + DAY
            date = ts2datetime(start_time)
            index_name = pre + date
            during -= 1

    print index_list
    uid_set = set()
    text_results = []

    query_body = {
        "query": {
            "bool": {
                "must": should
            }
        },
        "sort": {
            "user_fansnum": {
                "order": "desc"
            }
        },
        "size": 5000
    }

    results = es_flow_text.search(index=index_list,
                                  doc_type='text',
                                  body=query_body,
                                  _source=False,
                                  fields=[
                                      "uid", "user_fansnum", "text",
                                      "message_type", "sentiment", "timestamp",
                                      "geo", "retweeted", "comment"
                                  ])["hits"]["hits"]

    id_index = 0
    index_list = []
    un_uid_list = []
    for item in results:
        if item['fields']['uid'][0] not in uid_set:
            uid_set.add(item['fields']['uid'][0])
            un_uid_list.append(item['fields']['uid'][0])
            index_list.append(id_index)
        id_index += 1

    uid_list = []
    print "un_uid_list: ", len(un_uid_list)
    portrait_list = []
    count = 0
    in_index = 0
    if not isall and un_uid_list:  # 库内
        portrait_results = es_user_portrait.mget(index=USER_INDEX_NAME,
                                                 doc_type=USER_INDEX_TYPE,
                                                 body={"ids": un_uid_list},
                                                 _source=False,
                                                 fields=['uname'])["docs"]
        for item in portrait_results:
            if item["found"]:
                portrait_list.append(item['_id'])
                nick_name = item['fields']['uname'][0]
                if nick_name == 'unknown':
                    nick_name = item['_id']
                index = index_list[in_index]
                weibo_url = weiboinfo2url(results[index]['fields']['uid'][0],
                                          results[index]['_id'])
                text_results.extend([
                    results[index]['fields']['uid'][0],
                    results[index]['fields']['user_fansnum'][0],
                    results[index]['fields']['text'][0],
                    results[index]['fields']['message_type'][0],
                    results[index]['fields']['sentiment'][0],
                    ts2date(results[index]['fields']['timestamp'][0]),
                    results[index]['fields']['geo'][0],
                    results[index]['fields']['retweeted'][0],
                    results[index]['fields']['comment'][0], nick_name,
                    weibo_url
                ])
                count += 1
                if count == number:
                    break
                print "portrait_len, ", len(portrait_list)
            in_index += 1
        if portrait_list:
            uid_list = in_sort_filter(time, sort_norm, sort_scope, None,
                                      portrait_list, True, number)  # sort
    elif un_uid_list:
        profile_result = es_user_profile.mget(index="weibo_user",
                                              doc_type="user",
                                              body={"ids": un_uid_list},
                                              fields=['nick_name'])["docs"]
        for i in range(len(profile_result)):
            index = index_list[i]
            try:
                nick_name = profile_result[i]['fields']['nick_name'][0]
            except:
                nick_name = un_uid_list[i]
            item = results[index]
            weibo_url = weiboinfo2url(item['fields']['uid'][0],
                                      results[index]['_id'])
            text_results.append([
                item['fields']['uid'][0], item['fields']['user_fansnum'][0],
                item['fields']['text'][0], item['fields']['message_type'][0],
                item['fields']['sentiment'][0],
                ts2date(item['fields']['timestamp'][0]),
                results[index]['fields']['geo'][0],
                results[index]['fields']['retweeted'][0],
                results[index]['fields']['comment'][0], nick_name, weibo_url
            ])
            if i == number:
                break
        uid_list = all_sort_filter(un_uid_list[:number], sort_norm, time, True,
                                   number)

    print "filter_uid_list: ", len(uid_list)
    if uid_list:
        results = make_up_user_info(uid_list, isall, time, sort_norm)
    else:
        results = []
    print "results: ", len(results)
    # 修改状态
    task_detail = es_user_portrait.get(index=USER_RANK_KEYWORD_TASK_INDEX,
                                       doc_type=USER_RANK_KEYWORD_TASK_TYPE,
                                       id=task_id)
    item = task_detail['_source']
    item['status'] = 1
    item['result'] = json.dumps(results)
    item['text_results'] = json.dumps(text_results)
    item['number'] = len(results)
    es_user_portrait.index(index=USER_RANK_KEYWORD_TASK_INDEX,
                           doc_type=USER_RANK_KEYWORD_TASK_TYPE,
                           id=task_id,
                           body=item)

    return "1"
Exemple #5
0
def key_words_search(task_id, search_type , pre , during , start_time , keyword_list , search_key = '' , sort_norm = '', sort_scope = ''  ,time = 7 , isall = False, number = 100):
    number = int(number)
    should = []
    for key in keyword_list:
        if search_type == "hashtag":
            should.append({"prefix":{"text": "#" +  key + "#"}})
        else:    
            should.append({"wildcard":{"text": "*" +key + "*"}})    
    index_list = []
    date = ts2datetime(start_time)
    index_name = pre + date
    while during:
        if es_flow_text.indices.exists(index=index_name):
            index_list.append(index_name)
            start_time = start_time + DAY
            date = ts2datetime(start_time)
            index_name = pre + date
            during -= 1

    print index_list
    uid_set = set()
    text_results = []
    sorted_text_results = []

    query_body = {
        "query":{
            "bool":{
                "must":should
             }
        },
        "sort":{"user_fansnum":{"order":"desc"}},
        "size":5000
    }
                    
    results = es_flow_text.search(index = index_list , doc_type = 'text' , body = query_body, _source=False, fields=["uid", "user_fansnum","text", "message_type", "sentiment","timestamp", "geo", "retweeted", "comment"])["hits"]["hits"]
    id_index = 0
    index_list = []
    un_uid_list = []
    for item in results :
        if item['fields']['uid'][0] not in uid_set:
            uid_set.add(item['fields']['uid'][0])
            un_uid_list.append(item['fields']['uid'][0])
            index_list.append(id_index)
        id_index += 1
    
    #get_all_filed(sort_norm , time)
    uid_list = []
    print "un_uid_list: ", len(un_uid_list)
    portrait_list = []
    count = 0
    in_index = 0
    if not isall and un_uid_list : # 库内
        portrait_results = es_user_portrait.mget(index=USER_INDEX_NAME, doc_type=USER_INDEX_TYPE, body={"ids":un_uid_list}, _source=False, fields=['uname'])["docs"]
        for item in portrait_results:
            if item["found"]:
                portrait_list.append(item['_id'])    
                nick_name = item['fields']['uname'][0]
                if nick_name == 'unknown':
                    nick_name = item['_id']
                index = index_list[in_index]
                weibo_url = weiboinfo2url(results[index]['fields']['uid'][0], results[index]['_id'])
                text_results.extend([results[index]['fields']['uid'][0], results[index]['fields']['user_fansnum'][0], results[index]['fields']['text'][0], results[index]['fields']['message_type'][0], results[index]['fields']['sentiment'][0], ts2date(results[index]['fields']['timestamp'][0]), results[index]['fields']['geo'][0], results[index]['fields']['retweeted'][0], results[index]['fields']['comment'][0], nick_name, weibo_url])
                count += 1
                if count == number:
                    break
                print "portrait_len, ", len(portrait_list)
            in_index += 1
        if portrait_list:
            uid_list = in_sort_filter(time,sort_norm ,sort_scope ,None , portrait_list , True, number) # sort
            for iter_uid in uid_list:
                iter_index = portrait_list.index(iter_uid)
                sorted_text_results.append(text_results[i])

    elif un_uid_list:
        profile_result = es_user_profile.mget(index="weibo_user", doc_type="user", body={"ids":un_uid_list}, fields=['nick_name'])["docs"]
        for i in range(len(profile_result)):
            index = index_list[i]
            try:
                nick_name = profile_result[i]['fields']['nick_name'][0]
            except:
                nick_name = un_uid_list[i]
            item = results[index]
            weibo_url = weiboinfo2url(item['fields']['uid'][0], results[index]['_id'])
            text_results.append([item['fields']['uid'][0], item['fields']['user_fansnum'][0], item['fields']['text'][0], item['fields']['message_type'][0], item['fields']['sentiment'][0], ts2date(item['fields']['timestamp'][0]), results[index]['fields']['geo'][0], results[index]['fields']['retweeted'][0], results[index]['fields']['comment'][0], nick_name, weibo_url])
            if i == number:
                break
        uid_list = all_sort_filter(un_uid_list[:number] , sort_norm , time ,True, number)
        sorted_text_results = []
        f = open("small.txt", "wb")
        for iter_uid in uid_list:
            iter_index = un_uid_list.index(iter_uid)
            f.write(str(iter_uid)+"\n")
            sorted_text_results.append(text_results[iter_index])
        f.close()
    print "filter_uid_list: ", len(uid_list)
    if uid_list:
        results = make_up_user_info(uid_list,isall,time,sort_norm)
    else:
        results = []
    print "results: ", len(results)
    # 修改状态
    task_detail = es_user_portrait.get(index=USER_RANK_KEYWORD_TASK_INDEX , doc_type=USER_RANK_KEYWORD_TASK_TYPE, id=task_id)
    item = task_detail['_source']
    item['status'] = 1
    item['result'] = json.dumps(results)
    item['text_results'] = json.dumps(sorted_text_results)
    item['number'] = len(results)
    es_user_portrait.index(index = USER_RANK_KEYWORD_TASK_INDEX , doc_type=USER_RANK_KEYWORD_TASK_TYPE , id=task_id,  body=item)

    return "1"
            
            for item in result :
                in_set.add(item['_id'].encode("utf-8") )

            result_list = list(uid_set & in_set)
        except Exception,e:
            print e
            raise  Exception('user_list failed!')      

    else :
        result_list = list(uid_set)
    if not isall:
        uid_list = in_sort_filter(time,sort_norm ,sort_scope ,None , result_list , True)
    else:
        uid_list = all_sort_filter(result_list , sort_norm , time ,True)
    results = make_up_user_info(uid_list,isall,time,sort_norm)

    query = {"query":{"bool":{"must":[{"term":{"user_rank_task.user_ts":search_key}}],"must_not":[],"should":[]}},"from":0,"size":10,"sort":[],"facets":{}}

    if True:
        result = es_9200.search(index = USER_RANK_KEYWORD_TASK_INDEX , doc_type = USER_RANK_KEYWORD_TASK_TYPE , body = query)['hits']['hits']
        search_id = result[0]['_id']
        item = result[0]['_source']
        item['status'] = 1
        item['result'] = json.dumps(results)
        es_9200.index(index = USER_RANK_KEYWORD_TASK_INDEX , doc_type=USER_RANK_KEYWORD_TASK_TYPE , id=search_id,  body=item)
    return results
   
    
def scan_offlice_task():
    
            for item in result:
                in_set.add(item['_id'].encode("utf-8"))

            result_list = list(uid_set & in_set)
        except Exception, e:
            print e
            raise Exception('user_list failed!')

    else:
        result_list = list(uid_set)
    if not isall:
        uid_list = in_sort_filter(time, sort_norm, sort_scope, None,
                                  result_list, True)
    else:
        uid_list = all_sort_filter(result_list, sort_norm, time, True)
    results = make_up_user_info(uid_list, isall, time, sort_norm)

    query = {
        "query": {
            "bool": {
                "must": [{
                    "term": {
                        "user_rank_task.user_ts": search_key
                    }
                }],
                "must_not": [],
                "should": []
            }
        },
        "from": 0,
        "size": 10,