def repost_search(topic, startts, endts):
    repost_list = []
    ts_arr = []
    if topic and topic != '':
        # Fetch both original and repost weibos. This is capped by 'size';
        # would originals and reposts need to be fetched separately to fit?
        query_body = {
            'query': {
                'bool': {
                    'must': [
                        {'terms': {'message_type': [1, 3]}},
                        {'range': {
                            'timestamp': {'gte': startts, 'lt': endts}
                        }}
                    ]
                }
            },
            #'sort': {"message_type": {"order": "desc"}},
            'size': MAX_REPOST_SEARCH_SIZE
        }
        repost_search = weibo_es.search(index=topic, doc_type=weibo_index_type, body=query_body)['hits']['hits']
        for weibo in repost_search:
            location_dict = geo_list(weibo['_source'], topic)
            if location_dict:
                repost_list.append(location_dict)
                ts_arr.append(weibo['_source']['timestamp'])

        save_rt_results(topic, repost_list)

    return sorted(list(set(ts_arr))), repost_list
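# A minimal usage sketch for repost_search, assuming the weibo_es client and
# the helpers above are configured; the index name and timestamps below are
# illustrative, not from the original code.
def _example_repost_search():
    ts_arr, repost_list = repost_search('aoyunhui', 1470000000, 1470086400)
    for loc in repost_list[:5]:
        print loc['mid'], loc['origin_location'], loc['repost_location']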
Example #2
def compute_mtype_weibo(topic, begin_ts, end_ts, w_limit):
    mtypes = ['1', '2', '3']
    all_mtype_weibo = {}
    results = {}

    for mtype in mtypes:
        query_body = {
            'query': {
                'bool': {
                    'must': [
                        {'term': {'message_type': mtype}},
                        {'range': {
                            'timestamp': {'gte': begin_ts, 'lt': end_ts}
                        }}
                    ]
                }
            },
            'sort': {"retweeted": {"order": "desc"}},
            'size': w_limit
        }

        mtype_weibo = weibo_es.search(index=topic, doc_type=weibo_index_type, body=query_body)['hits']['hits']
        all_mtype_weibo[mtype] = [hit['_source'] for hit in mtype_weibo]

    # The original returned inside the loop, after only the first message
    # type; collect all three types before returning.
    results[end_ts] = all_mtype_weibo
    return results
def getEsIndexName(topic_name):
    # Look up the per-topic index name registered under this display name.
    query_body = {'query': {'match': {'name': topic_name}}}
    try:
        res = weibo_es.search(index='topics', body=query_body)['hits']['hits']
        return res[0]['_source']['index_name']
    except:
        return -1
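# getEsIndexName assumes each document in the 'topics' index carries at least
# 'name' and 'index_name' fields; a hypothetical registration document, with
# field names inferred from this lookup and from weibo_TopicNameTransfer below.
def _example_register_topic():
    weibo_es.index(index='topics', doc_type='text', body={
        'name': u'奥运会',         # display name matched by getEsIndexName
        'en_name': 'aoyunhui',     # pinyin name read by weibo_TopicNameTransfer
        'index_name': 'aoyunhui',  # per-topic weibo index to search
    })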
def compute_mtype_keywords(topic, begin_ts, end_ts, during, k_limit):
    all_keyword_dict = {}
    mtype_with_keyword = {}
    mtypes = ['1', '2', '3']  # the three weibo message types: original, repost, comment
    for mtype in mtypes:
        query_body = {
            'query': {
                'bool': {
                    'must': [{
                        'term': {
                            'message_type': mtype
                        }
                    }, {
                        'range': {
                            'timestamp': {
                                'gte': begin_ts,
                                'lt': end_ts
                            }
                        }
                    }]
                }
            },
            'aggs': {
                'all_interests': {
                    'terms': {
                        'field': 'keywords_string',
                        'size': k_limit
                    }
                }
            }
        }

        show_keywords_dict = weibo_es.search(index=topic, doc_type=weibo_index_type, body=query_body)\
            ['aggregations']['all_interests']['buckets']

        keyword_dict = {}
        for keyword in show_keywords_dict:
            key = keyword['key']
            count = keyword['doc_count']
            try:
                keyword_dict[key] += count
            except KeyError:
                keyword_dict[key] = count
        mtype_with_keyword[mtype] = sorted(keyword_dict.items(),
                                           key=lambda x: x[1],
                                           reverse=True)[:k_limit]

    return mtype_with_keyword
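# For reference, each bucket of the terms aggregation consumed above has this
# shape (the values here are illustrative):
example_bucket = {'key': u'里约', 'doc_count': 1024}  # keyword and its document count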
Example #5
def compute_sentiment_count(topic, begin_ts, end_ts, during):
    all_sentiment_dict = {}
    query_body = {
        'query': {
            'filtered': {
                'filter': {
                    'range': {
                        'timestamp': {
                            'gte': begin_ts,
                            'lt': end_ts
                        }
                    }
                }
            }
        },
        'aggs': {
            'all_interests': {
                'terms': {
                    'field': 'sentiment',
                    'size': SENTIMENT_TYPE_COUNT
                }
            }
        }
    }
    weibo_sentiment_count = weibo_es.search(index=topic,doc_type=weibo_index_type,body=query_body)\
                                ['aggregations']['all_interests']['buckets']
    iter_sentiment_dict = {}
    for sentiment_item in weibo_sentiment_count:
        sentiment = sentiment_item['key']
        sentiment_count = sentiment_item['doc_count']
        try:
            iter_sentiment_dict[sentiment] += sentiment_count  # e.g. {'1': 4}
        except KeyError:
            iter_sentiment_dict[sentiment] = sentiment_count
    return iter_sentiment_dict
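# Note: the 'filtered' query above (and in several functions below) is
# Elasticsearch 1.x/2.x syntax; it was removed in 5.0. On a newer cluster the
# same aggregation would use a bool filter, roughly as sketched here.
def compute_sentiment_count_es5(topic, begin_ts, end_ts):
    query_body = {
        'query': {
            'bool': {
                'filter': {
                    'range': {'timestamp': {'gte': begin_ts, 'lt': end_ts}}
                }
            }
        },
        'aggs': {
            'all_interests': {
                'terms': {'field': 'sentiment', 'size': SENTIMENT_TYPE_COUNT}
            }
        }
    }
    return weibo_es.search(index=topic, body=query_body)\
        ['aggregations']['all_interests']['buckets']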
def geo_list(r, topic):
    # For each weibo, build: original flag, mid, topic, timestamp, origin
    # location, repost location --
    # {original:xx, mid:xx, topic:xx, ts:xx, origin_location:xx, repost_location:xx}
    location_dict = {}
    message_type = r['message_type']
    if message_type == 3:  # repost
        geo = r['geo'].encode('utf8')
        try:
            repost_location = str(split_city(geo))  # the (province, city) tuple rendered as a string
        except:
            return None
        if r['root_mid']:
            query_body = {
                'query': {
                    'filtered': {
                        'filter': {
                            'term': {
                                'mid': r['root_mid']
                            }
                        }
                    }
                }
            }
            item = weibo_es.search(index=topic,
                                   doc_type=weibo_index_type,
                                   body=query_body)['hits']['hits']
            if item != []:
                try:
                    origin_location = str(
                        split_city(item[0]['_source']['geo'].encode('utf8')))
                except:
                    return None
                # str(('unknown', 'unknown')) has 'un' at positions 2:4, so
                # this skips locations that failed to parse
                if repost_location[2:4] != 'un' and origin_location[2:4] != 'un':
                    location_dict['original'] = 0
                    location_dict['mid'] = r['mid']
                    location_dict['topic'] = topic
                    location_dict['ts'] = r['timestamp']
                    location_dict['origin_location'] = origin_location
                    location_dict['repost_location'] = repost_location
                    return location_dict
    else:
        geo = r['geo'].encode('utf8')
        try:
            origin_location = str(split_city(geo))
        except:
            return None
        if origin_location[2:4] != 'un':
            location_dict['original'] = 1
            location_dict['mid'] = r['mid']
            location_dict['topic'] = topic
            location_dict['ts'] = r['timestamp']
            location_dict['origin_location'] = origin_location
            location_dict['repost_location'] = None
            return location_dict

    return None
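# geo_list relies on split_city returning a (province, city) tuple, with
# ('unknown', 'unknown') for strings it cannot parse -- that is what the
# [2:4] != 'un' checks test. A stub consistent with that contract (the real
# implementation is not part of this file):
def split_city_stub(geo):
    parts = geo.split()
    if len(parts) >= 2:
        return parts[0], parts[1]
    return 'unknown', 'unknown'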
Example #7
def get_topic_weibo(topic, en_name, start_ts, end_ts):
    # Probe the index; if it does not exist yet, create its mappings first.
    query_body = {'query': {'match_all': {}}, 'sort': 'timestamp', 'size': 1}
    try:
        weibo_es.search(index=en_name,
                        doc_type=topic_index_type,
                        body=query_body)['hits']['hits']
    except:
        get_mappings(en_name)
    find_flow_texts(start_ts, end_ts, topic, en_name)
Example #8
def save_trend_maker(topic, date, windowsize, trend_maker):
    makers = trend_maker
    rank = 0
    user_exist_list = []
    # (Disabled: delete any existing TrendMaker rows for this
    # topic/date/windowsize first, as save_trend_pusher does below.)
    for maker in makers:
        uid = maker[0]
        if uid in user_exist_list:
            continue
        user_exist_list.append(uid)
        if rank >= trend_maker_count:
            break
        rank += 1
        mid = maker[1]
        value = maker[2]  # content relevance: number of keyword hits
        key_item = maker[3]  # the matched keywords
        user_info = get_user_info(uid)
        query_body = {
            'query': {
                'bool': {
                    'must': {
                        'term': {
                            'mid': mid
                        }
                    }
                }
            },
            'size': 1000000  # result-count cap; marked for removal
        }
        weibo_info = weibo_es.search(index=topic,
                                     doc_type=weibo_index_type,
                                     body=query_body)['hits']['hits']
        print 'trend_maker weibo_info:', weibo_info
        domain = 'Unknown'
        timestamp = int(weibo_info[0]['_source']['timestamp'])
        # the TrendMaker model needs updating accordingly
        item = TrendMaker(topic, date, windowsize, uid, timestamp,
                          json.dumps(user_info),
                          json.dumps(weibo_info), domain, rank,
                          json.dumps(value), json.dumps(key_item))
        print item
        db.session.add(item)
    db.session.commit()
    print 'save_trend_maker success'
Example #9
def test_weibo():
    query_body = {
        'query': {
            'match_all': {}
        },
        'size': 1,
        'sort': {'retweeted': {'order': 'desc'}}
    }
    weibo = weibo_es.search(index='aoyunhui', doc_type=weibo_index_type, body=query_body)['hits']['hits'][0]['_source']
    mid = weibo['mid']
    ts = weibo['timestamp']
    print mid, ts
    find_tree(mid, ts)
Example #10
def getWeiboByNameStEt(topic, start_date, end_date):
    print weibo_es
    query_body = {
        'query': {
            'filtered': {
                'filter': {
                    'range': {'timestamp': {'gte': start_date, 'lte': end_date}}
                }
            }
        }
    }
    search_result = weibo_es.search(index=topic, doc_type=weibo_index_type, body=query_body)
    print search_result
    return search_result
Example #11
def test(topic, begin_ts, end_ts):
    limit = fu_tr_top_keyword
    if MYSQL_TOPIC_LEN == 0:
        topic = topic[:20]
    print topic, unit, limit  # `unit` is assumed to be a module-level constant

    index_name = index_event_analysis_results
    index_type = type_event_analysis_results

    query_body = {
        'query': {
            'bool': {
                'must': [{
                    'term': {
                        'en_name': topic
                    }
                }]
            }
        },
        'size': 1000000  # result-count cap; marked for removal
    }

    items = weibo_es.search(index=index_name,
                            doc_type=index_type,
                            body=query_body)['hits']['hits']

    for item in items:
        # time_results is stored as a JSON string
        time_results = json.loads(item['_source']['time_results'])
        print 'time_results::::::', type(time_results)
        kcount_items = time_results['kcount']
        during = time_results['during']
        k_limit = time_results['k_limit']

        print 'kcount_items::::::', kcount_items
        for kcount_item, value in kcount_items.iteritems():
            print 'kcount_item:::::::::::::;', kcount_item
            print 'value::::::::::::::::::', value
            for k in value:
                print 'k::::::::::::', k
Example #12
def subopinion_content(topic, start_ts, end_ts, weibo_limit):
    query_body = {
        'query': {
            'bool': {
                'must_not': [{
                    'wildcard': {
                        'text': '*【*】*'  # exclude news-style posts titled with 【...】 brackets
                    }
                }],
                'must': [{
                    'range': {
                        'timestamp': {
                            'lt': end_ts,
                            'gte': start_ts
                        }
                    }
                }]
            }
        },
        'size': weibo_limit
    }
    subopinion_results = weibo_es.search(
        index=topic, doc_type=weibo_index_type,
        body=query_body)['hits']['hits']
    normal_list = []
    for key_weibo in subopinion_results:
        text_weibo = key_weibo['_source']['text']
        mid_weibo = key_weibo['_source']['mid']
        timestamp = key_weibo['_source']['timestamp']
        comment = key_weibo['_source'].get('comment', 0)
        retweeted = key_weibo['_source'].get('retweeted', 0)
        uid = key_weibo['_source']['uid']
        normal_list.append({
            'news_id': 'weibo',
            'content': text_weibo,
            'id': mid_weibo,
            'datetime': ts2datetime_full(timestamp),
            'comment': comment,
            'retweeted': retweeted,
            'uid': uid
        })
    return normal_list
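# subopinion_content and news_content (below) split the corpus on the same
# wildcard: text containing a 【...】 title-bracket pair counts as news-style
# and is excluded here but selected there. The same predicate in plain Python:
def looks_like_news(text):
    start = text.find(u'【')
    return start != -1 and text.find(u'】', start + 1) != -1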
Example #13
def weibo_TopicNameTransfer(topicname, start_ts, end_ts):
    # start_ts / end_ts are test-only parameters, marked for removal
    query_body = {
        'query': {
            'bool': {
                'must': [
                    {'term': {'name': topicname}}
                ]
            }
        }
    }
    weibo_pinyin_name = weibo_es.search(index='topics', doc_type=weibo_index_type, body=query_body)['hits']['hits']
    print weibo_pinyin_name
    print weibo_pinyin_name[0]['_source']['en_name']
    return weibo_pinyin_name[0]['_source']['en_name']
def get_topicweibo_byid(uid, topic):
    query_body = {
        'query': {
            'bool': {
                'must': {
                    'term': {
                        'uid': uid
                    }
                }
            }
        },
        'size': 1000
    }
    es_search_weibos = weibo_es.search(index=topic,
                                       doc_type=weibo_index_type,
                                       body=query_body)['hits']['hits']
    return es_search_weibos
Example #15
def save_trend_pusher(topic, date, windowsize, trend_pusher):
    pushers = trend_pusher
    rank = 0
    user_exist_list = []
    items_exist = db.session.query(TrendPusher).filter(TrendPusher.topic==topic ,\
                                                       TrendPusher.date==date ,\
                                                       TrendPusher.windowsize==windowsize).all()
    for item_exist in items_exist:
        db.session.delete(item_exist)
    db.session.commit()
    for pusher in pushers:
        uid = pusher[0]
        if uid in user_exist_list:
            continue
        user_exist_list.append(uid)
        if rank >= trend_pusher_count:
            break
        rank += 1
        mid = pusher[1]
        user_info = get_user_info(uid)
        query_body = {
            'query': {
                'bool': {
                    'must': {
                        'term': {
                            'mid': mid
                        }
                    }
                }
            },
            'size': 1000000  # result-count cap; marked for removal
        }
        weibo_info = weibo_es.search(index=topic,
                                     doc_type=weibo_index_type,
                                     body=query_body)['hits']['hits']
        timestamp = int(weibo_info[0]['_source']['timestamp'])
        item = TrendPusher(topic, date, windowsize, uid, timestamp,
                           json.dumps(user_info), json.dumps(weibo_info),
                           'Unknown', rank)
        db.session.add(item)
    db.session.commit()
    print 'save_trend_pusher success'
Example #16
def read_long_gexf(topic, identifyDate, identifyWindow):
    name = str(identifyDate) + str(identifyWindow)
    query_body = {
        "query": {"match_phrase": {"name": name}}
    }
    index_name = topic + '_gexffile'
    try:
        res = es.search(index=index_name, body=query_body)['hits']['hits']
    except:
        return []
    print es, index_name, query_body
    if len(res) > 0:
        return res[0]['_source']['gexf']
    else:
        return []
def compute_mtype_count(topic, begin_ts, end_ts, during):
    query_body = {
        'query': {
            'filtered': {
                'filter': {
                    'range': {
                        'timestamp': {
                            'gte': begin_ts,
                            'lt': end_ts
                        }
                    }
                }
            }
        },
        'aggs': {
            'all_interests': {
                'terms': {
                    'field': 'message_type',
                    'size': MTYPE_COUNT
                }
            }
        }
    }

    weibo_mtype_count = weibo_es.search(index=topic, doc_type=weibo_index_type,body=query_body)\
         ['aggregations']['all_interests']['buckets']
    print 'weibo_mtype_count:::::::::::::::::', weibo_mtype_count
    print begin_ts, end_ts, len(weibo_mtype_count)
    iter_mtype_dict = {}
    for mtype_item in weibo_mtype_count:
        mtype = mtype_item['key']
        mtype_count = mtype_item['doc_count']
        try:
            iter_mtype_dict[mtype] += mtype_count
        except KeyError:
            iter_mtype_dict[mtype] = mtype_count
    return iter_mtype_dict
Example #19
def find_tree(mid, ts):
    query_body = {
        'query': {
            'bool': {
                'must': [
                    {'term': {'root_mid': mid}},  # all reposts under the given root weibo
                    {'range': {
                        'timestamp': {'gte': ts}
                    }}
                ]
            }
        },
        'size': 1000,
        'sort': {'timestamp': {'order': 'asc'}}
    }
    weibo = weibo_es.search(index='aoyunhui', doc_type=weibo_index_type, body=query_body)['hits']['hits']
    # The original body trailed off with the note "字典?" ("a dict?"); a
    # plausible completion maps each reposting uid to the uid it reposted from.
    tree = {}
    for content in weibo:
        source = content['_source']
        if source.get('directed_uid'):
            tree[source['uid']] = source['directed_uid']
    return tree
Example #20
def news_content(topic, start_ts, end_ts, news_limit=NEWS_LIMIT):
    query_body = {
        'query': {
            'bool': {
                'must': [{
                    'wildcard': {
                        'text': '*【*】*'  # select news-style posts titled with 【...】 brackets
                    }
                }, {
                    'range': {
                        'timestamp': {
                            'lt': end_ts,
                            'gte': start_ts
                        }
                    }
                }]
            }
        },
        'size': news_limit
    }
    news_results = weibo_es.search(
        index=topic, doc_type=weibo_index_type,
        body=query_body)['hits']['hits']
    news_list = []
    for key_weibo in news_results:
        text_weibo = key_weibo['_source']['text']
        mid_weibo = key_weibo['_source']['mid']
        timestamp = key_weibo['_source']['timestamp']
        comment = key_weibo['_source']['comment']
        retweeted = key_weibo['_source']['retweeted']
        uid = key_weibo['_source']['uid']
        news_list.append({
            'news_id': 'news',
            'content168': text_weibo,
            'id': mid_weibo,
            'datetime': ts2datetime_full(timestamp),
            'comment': comment,
            'retweeted': retweeted
        })
    return news_list
Example #21
def test(topic, start_ts, end_ts):
    print start_ts, end_ts
    query_body = {
        'query': {
            'filtered': {
                'filter': {
                    'range': {
                        'timestamp': {
                            'gte': start_ts,
                            'lt': end_ts
                        }
                    }
                }
            }
        }
    }
    weibo = weibo_es.search(index=topic,
                            doc_type=weibo_index_type,
                            body=query_body)['hits']['hits']  # list of hit dicts
    print weibo
def getEsWeiboByTopic(topic_index_name):
    # Fetch every weibo in the topic index (capped by the oversized 'size').
    res = weibo_es.search(index=topic_index_name, doc_type=weibo_index_type,
                          body={"query": {"match_all": {}}},
                          size=1000000)['hits']['hits']
    return res
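# Several functions here cap results with an oversized 'size' (the "marked
# for removal" notes); for genuinely unbounded exports, elasticsearch-py's
# scan helper streams hits via the scroll API instead. A sketch, assuming the
# same weibo_es client; the index name and callback are hypothetical.
def _example_scan_all(handle):
    from elasticsearch import helpers
    for hit in helpers.scan(weibo_es, index='aoyunhui',
                            query={'query': {'match_all': {}}}):
        handle(hit['_source'])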
def getWeiboByNameStEt(topic, start_date, end_date):
    print weibo_es
    query_body = {
        'query': {
            'filtered': {
                'filter': {
                    'range': {
                        'timestamp': {
                            'gte': start_date,
                            'lte': end_date
                        }
                    }
                }
            }
        }
    }
    search_result = weibo_es.search(index=topic,
                                    doc_type=weibo_index_type,
                                    body=query_body)
    print search_result
    return search_result
def counts_aggs(en_name, start_ts, end_ts):

    index_name = en_name
    index_type = 'text'

    query_body = {
        'query': {
            'bool': {
                'must': [{
                    'term': {
                        'pinyin_task_name': en_name
                    }
                }, {
                    'range': {
                        'timestamp': {
                            'gte': start_ts,
                            'lt': end_ts
                        }
                    }
                }]
            }
        },
        'aggs': {
            'diff_uids': {
                'cardinality': {
                    'field': 'uid'
                }
            }
        }
    }

    result = weibo_es.search(index=index_name,
                             doc_type=index_type,
                             body=query_body)

    weibo_counts = result['hits']['total']
    uid_counts = result['aggregations']['diff_uids']['value']

    return weibo_counts, uid_counts
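# counts_aggs reads the raw hit total plus a cardinality aggregation for
# distinct uids. Note that cardinality is approximate (HyperLogLog++-based),
# and hits.total is a plain integer only before Elasticsearch 7.x; on 7.x+
# it becomes a dict ({'value': ..., 'relation': ...}). Illustrative call:
def _example_counts_aggs():
    weibo_counts, uid_counts = counts_aggs('aoyunhui', 1470000000, 1470086400)
    print weibo_counts, uid_counts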
Example #25
def cul_key_weibo_time_count(topic, news_topics, start_ts, over_ts, during):
    key_weibo_time_count = {}
    start_ts = int(start_ts)
    over_ts = int(over_ts)
    over_ts = ts2HourlyTime(over_ts, during)
    interval = (over_ts - start_ts) / during

    # news_topics maps a cluster id to its keyword list, e.g.
    # {u'd2e97cf7-fc43-4982-8405-2d215b3e1fea': [u'\u77e5\u8bc6', u'\u5e7f\u5dde', u'\u9009\u624b']}
    for clusterid, keywords in news_topics.iteritems():
        # One fresh counter per cluster; the original shared a single
        # time_dict across clusters and returned inside the loop.
        time_dict = {}
        for i in range(interval, 0, -1):  # walk the range in `during`-second slices
            begin_ts = over_ts - during * i
            end_ts = begin_ts + during
            must_list = []
            must_list.append(
                {'range': {
                    'timestamp': {
                        'gte': begin_ts,
                        'lt': end_ts
                    }
                }})
            temp = []
            for word in keywords:
                sentence = {"wildcard": {"keywords_string": "*" + word + "*"}}
                temp.append(sentence)
            must_list.append({'bool': {'should': temp}})

            query_body = {"query": {"bool": {"must": must_list}}}
            key_weibo = weibo_es.search(index=topic,
                                        doc_type=weibo_index_type,
                                        body=query_body)
            key_weibo_count = key_weibo['hits']['total']  # per-slice count for this cluster
            time_dict[end_ts] = key_weibo_count
        key_weibo_time_count[clusterid] = time_dict
    return key_weibo_time_count
def get_first_node(topic, start_ts, end_ts, windowsize, date):
    '''
    Get the earliest-posting users by timestamp (the top 20, though there may
    be more than 20 weibos); look up user info for each weibo -- users can
    repeat, so only each user's earliest weibo is kept -- and save the result.
    '''
    if topic and topic != '':
        print topic
        query_body = {
            'query': {
                'bool': {
                    'should': [{
                        'term': {
                            'message_type': 1
                        }
                    }, {
                        'term': {
                            'message_type': 3
                        }
                    }],
                    'must':
                    # {'term':{'name': topic}},
                    {
                        'range': {
                            'timestamp': {
                                'gte': start_ts,
                                'lt': end_ts
                            }
                        }
                    }
                }
            },
            'size': 1000,  # result-count cap; marked for removal
            'sort': {
                "timestamp": {
                    "order": "asc"
                }
            }
        }
        es_search_weibos = weibo_es.search(index=topic,
                                           doc_type=weibo_index_type,
                                           body=query_body)['hits']['hits']
        #print es_search_weibos
        user_list = []
        time_top_nodes = es_search_weibos
        if not time_top_nodes:
            print 'search error'
        else:
            s = 0
            domain_count_list, domain_user_list = init_domain_list()
            print 'start_node:'

            uid_package = []
            for node in time_top_nodes:
                node = node['_source']
                uid = node['uid']
                timestamp = node['timestamp']
                if not uid in uid_package:
                    uid_package.append(uid)
                else:
                    continue

                user_info = get_user_info(uid)  # user info for this top-time weibo
                user_weibos = get_topicweibo_byid(uid, topic)
                save_first_nodes(topic, date, windowsize, uid, timestamp,
                                 user_info, user_weibos)
Example #27
def get_pushers(topic, new_peaks, new_bottom, ts_list):
    #unit = 900
    #p_during = Hour
    p_ts_list = []
    results = []
    end_ts = ts_list[new_peaks[0]]
    begin_ts = ts_list[new_bottom[0]]
    print 'pusher_start_ts:', ts2date(begin_ts)
    # start and end come from bottom and peak respectively; they arrive
    # reversed (reason unknown), hence the swap guard below
    print 'pusher_end_ts:', ts2date(end_ts)
    if begin_ts > end_ts:
        begin_ts = ts_list[0]
    interval = (end_ts - begin_ts) / p_during
    print end_ts - begin_ts
    print p_during
    print interval
    for i in range(interval, 0, -1):
        begin_ts = end_ts - p_during * i
        over_ts = begin_ts + p_during
        #print '383',begin_ts,over_ts
        p_ts_list.append(over_ts)
        items = db.session.query(PropagateCount).filter(PropagateCount.topic==topic ,\
                                                        PropagateCount.end<=over_ts ,\
                                                        PropagateCount.end>begin_ts ,\
                                                        PropagateCount.range==unit).all()

        if items:
            result = Merge_propagate(items)
        else:
            result = 0
        results.append(float(result))
    #print 'pusher_line:', results
    #try:
    print results
    print p_ts_list
    try:
        max_k_timestamp = get_max_k_timestamp(results, p_ts_list)  # the point of fastest growth
    except:
        max_k_timestamp = end_ts
    end = max_k_timestamp
    start = max_k_timestamp - p_during
    query_body = {
        'query': {
            'bool': {
                'must':
                # {'term':{'name': topic}},
                {
                    'range': {
                        'timestamp': {
                            'gte': end,
                            'lt': end + 3600
                        }  #3600
                    }
                }
            }
        },
        'size': 1000000,  # result-count cap; marked for removal
        'sort': {
            "timestamp": {
                "order": "asc"
            }
        }
    }
    es_search_weibos = weibo_es.search(index=topic,
                                       doc_type=weibo_index_type,
                                       body=query_body)['hits']['hits']
    # Among the weibos in the hour after the fastest-growth point, collect
    # the most-reposted users (previously done via a xapian search).
    results = es_search_weibos
    print 'pusher_search_count:', len(results)
    pusher_list = []
    count = 0
    for result in results:
        count += 1
        if count > 100:
            break
        wid = result['_source']['mid']
        uid = result['_source']['uid']
        value = result['_source']['retweeted']
        pusher_list.append((uid, wid, value))
    # sort by reposts_count
    # sort_by_rc(pusher_list)
    return pusher_list
Example #28
def sort_makers(keyword_data, begin_ts, end_ts, ts_list, topic):

    begin_ts = begin_ts - Hour
    #query_dict = {'timestamp':{'$gt': begin_ts, '$lt': end_ts}}
    print '323', begin_ts, end_ts, topic
    query_body = {
        'query': {
            'bool': {
                'should': [
                    {
                        'term': {
                            'message_type': 1
                        }
                    },
                    {
                        'term': {
                            'message_type': 3  # uncertain; marked for removal
                        }
                    }
                ],
                'must':
                # {'term':{'name': topic}},
                {
                    'range': {
                        'timestamp': {
                            'gte': begin_ts,
                            'lt': end_ts
                        }
                    }
                }
            }
        },
        'size': 1000000,  # result-count cap; marked for removal
        'sort': {
            "timestamp": {
                "order": "asc"
            }
        }
    }
    es_search_weibos = weibo_es.search(index=topic,
                                       doc_type=weibo_index_type,
                                       body=query_body)['hits']['hits']
    num = 0
    print 'len(es_search_weibos):', len(es_search_weibos)
    if len(es_search_weibos) == 0:
        return []
    weibo_term = {}
    #print es_search_weibos
    for weibo in es_search_weibos:
        #print weibo
        num += 1
        if num > fu_tr_top_keyword:
            break
        uid = weibo['_source']['uid']
        wid = weibo['_source']['mid']
        #terms_list = weibo['_source']['terms']
        terms_dict = json.loads(weibo['_source']['keywords_dict'])
        #print '****************', type(terms_dict)
        key_term_count = []
        key_term = []
        for item in terms_dict:
            key_term.append(item)
            key_term_count.append(terms_dict[item])
        weibo_term[uid] = [wid, key_term_count, key_term]
    sort_weibo_term = sorted(weibo_term.items(),
                             key=lambda x: x[1][1],
                             reverse=True)
    return sort_weibo_term[:fu_tr_top_keyword]
Example #29
def cityTopic(topic,
              start_ts,
              over_ts,
              during=Fifteenminutes,
              n_limit=TOP_WEIBOS_LIMIT):
    if topic and topic != '':
        #start_ts = int(start_ts)
        #over_ts = int(over_ts)

        #over_ts = ts2HourlyTime(over_ts, during)
        #interval = (over_ts - start_ts) / during

        geo_cityTopic_results = {}
        geo_cityTopic_results['geo_weibos'] = {}
        geo_cityTopic_results['geo_cityCount'] = {}

        for k, v in mtype_kv.iteritems():  # v is the numeric message_type (original / comment / repost)

            # One fresh counter per message type; the original created a single
            # province_dict outside this loop, so every type's entry in
            # geo_cityCount pointed at the same accumulated dict.
            province_dict = {}

            first_item = {}

            query_body = {  # fetch weibos of this message_type
                'query': {
                    'bool': {
                        'must': [{
                            'term': {
                                'message_type': v
                            }
                        }, {
                            'range': {
                                'timestamp': {
                                    'gte': start_ts,
                                    'lt': over_ts
                                }
                            }
                        }]
                    }
                },
                'sort': {
                    SORT_FIELD: {
                        "order": "desc"
                    }
                },
                'size': 10000000
            }
            mtype_weibo = weibo_es.search(index=topic,
                                          doc_type=weibo_index_type,
                                          body=query_body)['hits']['hits']
            if len(mtype_weibo) == 0:
                continue
            first_item = mtype_weibo[0]['_source']
            # count, per province/city, how many weibos of this type appeared
            count_i = 0
            for weibo in mtype_weibo:  # for each weibo
                count_i += 1
                try:
                    geo = weibo['_source']['geo'].encode('utf8')
                except:
                    continue
                #print geo,type(geo)
                province, city = split_city(geo)
                #print province,city
                if count_i <= n_limit:
                    try:
                        geo_cityTopic_results['geo_weibos'][v].append(
                            [province, city, weibo])
                    except:
                        geo_cityTopic_results['geo_weibos'][v] = [[
                            province, city, weibo
                        ]]

                if province != 'unknown':
                    try:
                        province_dict[province][city] += 1
                    except:
                        try:

                            province_dict[province][city] = 1
                        except:
                            province_dict[province] = {city: 1}

                    try:
                        province_dict[province]['total'] += 1
                    except:
                        try:
                            province_dict[province]['total'] = 1
                        except:
                            province_dict[province] = {'total': 1}

            geo_cityTopic_results['geo_cityCount'][v] = province_dict

        return geo_cityTopic_results
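# mtype_kv is not defined in this file; a plausible definition, assuming the
# message_type codes used elsewhere here (1 = original and 3 = repost per the
# comments in repost_search and geo_list, leaving 2 for comments):
mtype_kv = {'origin': 1, 'comment': 2, 'retweeted': 3}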
Example #30
def get_interval_count(topic, date, windowsize):

    index_name = index_event_analysis_results
    index_type = type_event_analysis_results
    results = []
    ts_list = []
    start_date = ts2datetime(datetime2ts(date) - windowsize * Day)
    unit = 900
    print 'start_date:', start_date
    start_ts = datetime2ts(start_date)
    ts_list = [start_ts]
    end_ts = datetime2ts(date)
    interval = (end_ts - start_ts) / during  # `during` is assumed to be a module-level constant
    print 'interval:', interval
    print topic
    for i in range(interval, 0, -1):
        begin_ts = long(end_ts) - during * i
        over_ts = begin_ts + during
        ts_list.append(over_ts)
        # (Disabled here: earlier variants read MySQL PropagateCount rows, or
        # queried ES with an end_ts range plus range==unit term filter,
        # instead of post-filtering the counts below.)
        query_body = {
            'query': {
                'bool': {
                    'must': [{
                        'term': {
                            'en_name': topic
                        }
                    }]
                }
            },
            'size': 1000000
        }

        es_results = weibo_es.search(index=index_name,
                                     doc_type=index_type,
                                     body=query_body)['hits']['hits']
        #print 'results::::::::::',results
        print 'len_results:::::::::::', len(es_results)
        count = 0
        for result in es_results:
            result = result['_source']
            time_results = json.loads(result['time_results'])
            count_results = time_results['count']
            print 'type_time_results:::::::', type(time_results)
            time_time = sorted(time_results.keys())
            print 'time_results.keys:::::', time_time

            if time_results['during'] == unit:
                print 'count_results.keys()::::;', count_results.keys()
                for end_ts_count in count_results.keys():

                    if end_ts_count > begin_ts and end_ts_count <= over_ts:
                        count += 1
        results.append(float(count))
        print 'results::::::::::', results
    print 'detect_peak_bottom_line::::::', results
    new_zeros = detect_peaks(results)  # indices of the intervals where peaks occur
    new_bottom = detect_bottom(results)  # get the first bottom
    print 'new_zeros:::::::::::::::::', new_zeros
    print 'new_bottom::::::::::::::::', new_bottom
    print 'ts_list:::::::::::::::::::', ts_list
    # save the trend time range
    # save_peak_bottom(new_zeros, new_bottom)
    trend_maker = get_makers(topic, new_zeros, new_bottom, ts_list)
    print 'trend_makers:', trend_maker
    trend_pusher = get_pushers(topic, new_zeros, new_bottom, ts_list)
    print 'trend_pushers:', trend_pusher
    #save_trend_maker(topic, date, windowsize, trend_maker)
    maker_results = save_trend_maker_es(topic, date, windowsize, trend_maker)
    #save_trend_pusher(topic, date, windowsize, trend_pusher)
    pusher_results = save_trend_pusher_es(topic, date, windowsize,
                                          trend_pusher)

    return maker_results, pusher_results
Example #31
def get_pushers(topic, new_peaks, new_bottom, ts_list):
    #unit = 900
    #p_during = Hour
    counts_dict = {}
    p_ts_list = []
    results = []
    end_ts = ts_list[new_peaks[0]]
    begin_ts = ts_list[new_bottom[0]]
    print 'pusher_start_ts:', ts2date(begin_ts)
    # start and end come from bottom and peak respectively; they arrive
    # reversed (reason unknown), hence the swap guard below
    print 'pusher_end_ts:', ts2date(end_ts)
    if begin_ts > end_ts:
        begin_ts = ts_list[0]
    interval = (end_ts - begin_ts) / p_during
    print end_ts - begin_ts
    print p_during
    print interval
    for i in range(interval, 0, -1):
        begin_ts = end_ts - p_during * i
        over_ts = begin_ts + p_during
        #print '383',begin_ts,over_ts
        p_ts_list.append(over_ts)
        index_name = index_event_analysis_results
        index_type = type_event_analysis_results
        # (Disabled here: earlier variants read MySQL PropagateCount rows, or
        # queried ES with an end_ts range plus range==unit term filter.)
        query_body = {
            'query': {
                'bool': {
                    'must': [{
                        'term': {
                            'en_name': topic
                        }
                    }]
                }
            },
            'size': 1000000  # result-count cap; marked for removal
        }

        items = weibo_es.search(index=index_name,
                                doc_type=index_type,
                                body=query_body)['hits']['hits']

        if items:
            for item in items:
                # time_results is stored as a JSON string (see test() above)
                time_results = json.loads(item['_source']['time_results'])
                count_items = time_results['count']

                for count_item in count_items:
                    for k, v in count_item.iteritems():
                        if k > begin_ts and k <= end_ts:
                            during = v['during']
                            if during == unit:
                                for key, value in v.iteritems():
                                    if key not in ['during']:
                                        try:
                                            counts_dict[key] += value
                                        except KeyError:
                                            counts_dict[key] = value
        # Sum the per-key counts accumulated for this slice; the original
        # never initialized `result`, which would raise a NameError.
        result = 0
        for k, v in counts_dict.iteritems():
            result += v

        results.append(float(result))
    #print 'pusher_line:', results
    #try:
    print results
    print p_ts_list
    try:
        max_k_timestamp = get_max_k_timestamp(results, p_ts_list)  # the point of fastest growth
    except:
        max_k_timestamp = end_ts
    end = max_k_timestamp
    start = max_k_timestamp - p_during
    query_body = {
        'query': {
            'bool': {
                'must':
                # {'term':{'name': topic}},
                {
                    'range': {
                        'timestamp': {
                            'gte': end,
                            'lt': end + 3600
                        }  #3600
                    }
                }
            }
        },
        'size': 1000000,  # result-count cap; marked for removal
        'sort': {
            "timestamp": {
                "order": "asc"
            }
        }
    }
    es_search_weibos = weibo_es.search(index=topic,
                                       doc_type=weibo_index_type,
                                       body=query_body)['hits']['hits']
    # Among the weibos in the hour after the fastest-growth point, collect
    # the most-reposted users (previously done via a xapian search).
    results = es_search_weibos
    print 'pusher_search_count:', len(results)
    pusher_list = []
    count = 0
    for result in results:
        count += 1
        if count > 100:
            break
        wid = result['_source']['mid']
        uid = result['_source']['uid']
        value = result['_source']['retweeted']
        pusher_list.append((uid, wid, value))
    # sort by reposts_count
    # sort_by_rc(pusher_list)
    return pusher_list
Example #32
def get_keyword(topic, begin_ts, end_ts, top):
    kcounts_dict = {}
    limit = fu_tr_top_keyword
    print topic, unit, limit  # `unit` is assumed to be a module-level constant (a disabled line set 900)

    index_name = index_event_analysis_results
    index_type = type_event_analysis_results

    # (Disabled here: earlier variants read MySQL PropagateKeywords rows, or
    # queried ES with an end_ts range plus range/limit term filters.)
    query_body = {
        'query': {
            'bool': {
                'must': [{
                    'term': {
                        'en_name': topic
                    }
                }]
            }
        },
        'size': 1000000  # result-count cap; marked for removal
    }

    items = weibo_es.search(index=index_name,
                            doc_type=index_type,
                            body=query_body)['hits']['hits']

    if items:
        for item in items:
            # time_results is stored as a JSON string
            time_results = json.loads(item['_source']['time_results'])
            kcount_items = time_results['kcount']
            during = time_results['during']
            k_limit = time_results['k_limit']

            for key, kcount_item in kcount_items.iteritems():
                if key > begin_ts and key <= end_ts:
                    if during == unit and k_limit == limit:
                        for k, v in kcount_item.iteritems():
                            try:
                                kcounts_dict[k] += v
                            except KeyError:
                                kcounts_dict[k] = v
        keyword_data = _top_keywords(kcounts_dict, top)
    else:
        keyword_data = []

    return keyword_data