def save_text2es():
    # Load weibo records from CSV, enrich them with sentiment and
    # sensitive-word attributes, and bulk-index them into Elasticsearch.
    count = 0
    bulk_action = []
    csvfile = open('./sensitive_uid_text_2.csv', 'rb')
    reader = csv.reader(csvfile)
    for line in reader:
        count += 1
        weibo = dict()
        user = line[0]
        weibo['text'] = line[1].decode('utf-8', 'ignore')
        weibo['mid'] = line[2]
        weibo['geo'] = ip2geo(line[3])
        weibo['timestamp'] = line[4]
        weibo['message_type'] = line[5]
        weibo['uid'] = user
        sentiment = attr_liwc([weibo['text']])
        weibo['sentiment'] = json.dumps(sentiment)
        # sensitive_words_extract expects a utf-8 encoded str
        text = weibo['text']
        if not isinstance(text, str):
            text = text.encode('utf-8', 'ignore')
        sw_dict = sensitive_words_extract(text)
        if sw_dict:
            weibo['sensitive_words'] = json.dumps(sw_dict)
            weibo['sensitive'] = 1
        else:
            weibo['sensitive'] = 0
        action = {'index': {'_id': weibo['mid']}}
        bulk_action.extend([action, weibo])
        if count % 1000 == 0:
            es.bulk(bulk_action, index='sensitive_user_text', doc_type='user', timeout=30)
            bulk_action = []
            print count
    # flush the remaining documents
    if bulk_action:
        es.bulk(bulk_action, index='sensitive_user_text', doc_type='user', timeout=30)
    csvfile.close()
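# A minimal sketch of the bulk payload format used by save_text2es() above:
# es.bulk() is fed a flat list alternating action metadata and document body,
# so each weibo contributes two entries. The sample values are hypothetical
# and the function name below is illustrative, not part of the original code.
def example_bulk_payload():
    weibo = {'mid': '3600000000000001', 'uid': '1234567890',
             'text': u'example text', 'sensitive': 0}
    action = {'index': {'_id': weibo['mid']}}
    return [action, weibo]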
def attri_sensitive_words(weibo_list):
    # Aggregate sensitive-word counts over a list of weibo documents.
    sw_results = {}
    for item in weibo_list:
        text = item['text']
        if not isinstance(text, str):
            text = text.encode('utf-8', 'ignore')
        sw_dict = sensitive_words_extract(text)
        if not sw_dict:
            continue
        for key in sw_dict:
            if key in sw_results:
                sw_results[key] += sw_dict[key]
            else:
                sw_results[key] = sw_dict[key]
    return sw_results
def attri_sensitive_hashtag(weibo_list):
    # Count hashtags appearing in weibo texts that contain sensitive words.
    sw_hashtag = {}
    # matches "#...#" hashtags made of latin characters and CJK ideographs
    RE = re.compile(u'#([a-zA-Z-_⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]+)#', re.UNICODE)
    for item in weibo_list:
        text = item['text']
        if not isinstance(text, str):
            text = text.encode('utf-8', 'ignore')
        sw_dict = sensitive_words_extract(text)
        if not sw_dict:
            continue
        text = text.decode('utf-8', 'ignore')
        hashtag_list = RE.findall(text)
        if not hashtag_list:
            continue
        for hashtag in hashtag_list:
            if hashtag in sw_hashtag:
                sw_hashtag[hashtag] += 1
            else:
                sw_hashtag[hashtag] = 1
    return sw_hashtag
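# A hedged usage sketch for the two aggregators above. The weibo dicts only
# need a 'text' field, which is all attri_sensitive_words() and
# attri_sensitive_hashtag() read; the sample texts and the wrapper name are
# hypothetical, and sensitive_words_extract() is assumed to be importable
# from elsewhere in this module.
def example_attribute_aggregation():
    weibo_list = [
        {'text': u'#示例话题# some tweet text'},
        {'text': u'another tweet without a hashtag'},
    ]
    word_counts = attri_sensitive_words(weibo_list)       # {word: count}
    hashtag_counts = attri_sensitive_hashtag(weibo_list)  # {hashtag: count}
    return word_counts, hashtag_counts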
partition = 100
number = int(math.ceil(len(temp_list) / float(partition)))
print number
count = 0
bulk_action = []
for i in range(number):
    print i
    uid_list = temp_list[i * partition:(i + 1) * partition]
    # rebuild the uid filter so it only covers the current batch
    query_body["query"]["filtered"]["filter"]["bool"]["should"] = []
    for uid in uid_list:
        query_body["query"]["filtered"]["filter"]["bool"]["should"].append({"term": {"uid": uid}})
    es_result = es_flow_text.search(index="flow_text_2013-09-01", doc_type="text",
                                    body=query_body)["hits"]["hits"]
    for item in es_result:
        source = item["_source"]
        text = source["text"].encode("utf-8", "ignore")
        sensitive_dict = sensitive_words_extract(text)
        if sensitive_dict:
            source["sensitive"] = 1
            source["sensitive_words"] = json.dumps(sensitive_dict)
        else:
            source["sensitive"] = 0
        action = {"index": {"_id": source["mid"]}}
        bulk_action.extend([action, source])
        count += 1
        if count % 1000 == 0:
            es_flow_text.bulk(bulk_action, index="sensitive_user_text_111", doc_type="user")
            bulk_action = []
            print count
# flush any remaining documents
if bulk_action:
    es_flow_text.bulk(bulk_action, index="sensitive_user_text_111", doc_type="user")
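# A hedged sketch of the query_body skeleton the partition loop above assumes:
# an ES 1.x "filtered" query whose bool filter carries one {"term": {"uid": ...}}
# clause per uid in the current batch. The match_all inner query, the size
# value, and the helper name are illustrative; any extra clauses in the
# original query_body are not shown in this file.
def build_uid_query(uid_list, size=10000):
    body = {
        "query": {
            "filtered": {
                "query": {"match_all": {}},
                "filter": {"bool": {"should": []}}
            }
        },
        "size": size
    }
    for uid in uid_list:
        body["query"]["filtered"]["filter"]["bool"]["should"].append({"term": {"uid": uid}})
    return body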