Example #1
0
def read_flow_text(flow_text_index_name, current_date):
    """Sample non-sensitive posts per classifier label and index them by label.

    Pages through ``flow_text_index_name`` (1000 hits per request), matching
    only documents whose ``sensitive`` field is 0.  The ``text`` of every hit
    is classified with ``triple_classifier_new`` and at most 20 ``_source``
    documents are kept for each label.  Paging stops as soon as every label
    seen so far has 20 documents, or when a request returns no hits.

    For each label one document is then written through ``es_xnr`` to the
    index ``tw_daily_interest_index_name_pre + '_' + current_date``:
    the label is the document id, ``timestamp`` is
    ``datetime2ts(current_date)`` and ``content`` is the JSON dump of the
    posts kept for that label.

    Args:
        flow_text_index_name: Name of the index to read posts from.
        current_date: Date value used as the target index suffix and passed
            to ``datetime2ts``.

    Returns:
        None.  Results are written to Elasticsearch and progress is printed.

    NOTE(review): a second function with this same name is defined later in
    this file; at import time that later definition replaces this one.
    """
    per_label_limit = 20  # maximum number of posts kept for one label
    page_size = 1000      # hits requested per search call

    label_count_dict = {}  # label -> number of posts kept so far
    content_dict = {}      # label -> list of kept ``_source`` dicts

    # Start at page 0 so the first ``page_size`` hits are not skipped.
    page = 0

    while True:
        # Only posts with sensitive == 0.
        query_body = {
            'query': {
                'bool': {
                    'must': [{
                        'term': {
                            'sensitive': 0
                        }
                    }]
                }
            },
            'size': page_size,
            'from': page * page_size
        }

        # NOTE(review): from/size paging is subject to the index's
        # max_result_window setting; deep paging may be rejected by the
        # server -- confirm against the cluster configuration.
        search_results = es_xnr.search(
            index=flow_text_index_name,
            doc_type=twitter_flow_text_index_type,
            body=query_body)['hits']['hits']

        # No more data: stop instead of looping forever (or calling min()
        # on an empty dict when nothing matched at all).
        if not search_results:
            break

        weibo_list = [hit['_source']['text'].encode('utf-8')
                      for hit in search_results]

        # Assumes the classifier returns one label per input text, in order.
        label_list = triple_classifier_new(weibo_list)

        for hit, label in zip(search_results, label_list):
            kept = content_dict.setdefault(label, [])
            if len(kept) < per_label_limit:
                kept.append(hit['_source'])
                label_count_dict[label] = len(kept)

        page += 1

        if page % 1000 == 0:
            print('i... %s' % (page,))
            print('label_count_dict... %s' % (label_count_dict,))

        # Loop termination: every label seen so far has reached the limit.
        if label_count_dict and all(
                count >= per_label_limit
                for count in label_count_dict.values()):
            break

    print('label_count_dict:: %s' % (label_count_dict,))

    if not content_dict:
        return

    # These do not depend on the label, so compute them once.
    index_name = tw_daily_interest_index_name_pre + '_' + current_date
    tw_daily_inerests_flow_text_mappings(index_name)
    timestamp = datetime2ts(current_date)

    for content_label, content_weibo in content_dict.items():
        item_dict = {
            'timestamp': timestamp,
            'content': json.dumps(content_weibo),
        }
        print(es_xnr.index(index=index_name,
                           doc_type=tw_daily_interest_index_type,
                           id=content_label,
                           body=item_dict))

        print('%s ==== %s' % (content_label, len(content_weibo)))
def read_flow_text(flow_text_index_name, current_date):
    """Sample original, non-sensitive posts per classifier label and index them.

    Pages through ``flow_text_index_name`` via ``es_flow_text`` (1000 hits per
    request, sorted by ``user_fansnum`` descending), matching only documents
    with ``message_type == 1`` and ``sensitive == 0``.  The ``text`` of every
    hit is classified with ``triple_classifier_new``; the label is stored in
    the hit's ``_source`` under ``'label'`` and at most 20 posts are kept for
    each label.  Paging stops as soon as every label seen so far has 20
    posts, or when a request returns no hits.

    Every kept post is then written through ``es_xnr`` to the index
    ``daily_interest_index_name_pre + '_' + current_date`` using the post's
    ``mid`` field as the document id.

    Args:
        flow_text_index_name: Name of the index to read posts from.
        current_date: Date string appended to the target index name.

    Returns:
        None.  Results are written to Elasticsearch and progress is printed.

    NOTE(review): an earlier function with this same name is defined in this
    file; this definition replaces it at import time.
    """
    per_label_limit = 20  # maximum number of posts kept for one label
    page_size = 1000      # hits requested per search call

    label_count_dict = {}  # label -> number of posts kept so far
    content_dict = {}      # label -> list of kept ``_source`` dicts

    page = 0

    while True:
        # Original posts (message_type 1) with sensitive == 0.
        query_body = {
            'query': {
                'bool': {
                    'must': [
                        {'term': {'message_type': 1}},
                        {'term': {'sensitive': 0}}
                    ]
                }
            },
            'size': page_size,
            'from': page * page_size,
            'sort': {'user_fansnum': {'order': 'desc'}}
        }

        # NOTE(review): from/size paging is subject to the index's
        # max_result_window setting; deep paging may be rejected by the
        # server -- confirm against the cluster configuration.
        search_results = es_flow_text.search(
            index=flow_text_index_name,
            doc_type=flow_text_index_type,
            body=query_body)['hits']['hits']

        print('%s %s' % (es_flow_text, flow_text_index_name))
        print('len.. %s' % (len(search_results),))

        # No more data: stop instead of looping forever (or calling min()
        # on an empty dict when nothing matched at all).
        if not search_results:
            break

        weibo_list = [hit['_source']['text'].encode('utf-8')
                      for hit in search_results]

        # Assumes the classifier returns one label per input text, in order.
        label_list = triple_classifier_new(weibo_list)

        for hit, label in zip(search_results, label_list):
            source = hit['_source']
            source['label'] = label
            kept = content_dict.setdefault(label, [])
            if len(kept) < per_label_limit:
                kept.append(source)
                label_count_dict[label] = len(kept)

        page += 1

        print('i.. %s' % (page,))

        # Loop termination: every label seen so far has reached the limit.
        if label_count_dict and all(
                count >= per_label_limit
                for count in label_count_dict.values()):
            break

    print('label_count_dict:: %s' % (label_count_dict,))

    if not content_dict:
        return

    # The target index does not depend on the label, so prepare it once.
    index_name = daily_interest_index_name_pre + '_' + current_date
    daily_inerests_flow_text_mappings(index_name)

    for content_label, content_weibo in content_dict.items():
        for daily_weibo in content_weibo:
            print(es_xnr.index(index=index_name,
                               doc_type=daily_interest_index_type,
                               id=daily_weibo['mid'],
                               body=daily_weibo))

        print('%s ==== %s' % (content_label, len(content_weibo)))