from global_utils_do import load_scws, cut, cut_filter, re_cut  # re_cut assumed to live alongside the other helpers


def separater(user_weibos):
    s = load_scws()
    words_dict = {}  # accumulate word counts across all of the user's weibos
    for user_weibo in user_weibos:
        content = user_weibo['_source']['text']
        content = cut_filter(content)
        content = re_cut(content)
        separated_words = cut(s, content)
        for word in separated_words:
            try:
                words_dict[word] += 1
            except KeyError:
                words_dict[word] = 1
    return words_dict
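# Hedged usage sketch for separater(): assumes Elasticsearch-style hits with
# a '_source' dict holding the weibo text; the sample records are hypothetical.
sample_hits = [
    {'_source': {'text': '第一条微博文本'}},
    {'_source': {'text': '第二条微博文本'}},
]
word_counts = separater(sample_hits)
for w, c in word_counts.items():
    print w, c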
def input_data():  # test input: read per-uid weibo texts from CSV
    sw = load_scws()
    uid_weibo = dict()
    uid_list = []
    reader = csv.reader(open('./weibo_data/uid_text_0728.csv', 'rb'))
    for mid, w_text in reader:
        if str(mid) in uid_weibo:
            uid_weibo[str(mid)] = uid_weibo[str(mid)] + '-' + w_text
        else:
            uid_weibo[str(mid)] = w_text
        if mid not in uid_list:
            uid_list.append(mid)
    uid_word = dict()
    for k, v in uid_weibo.items():
        words = sw.participle(v)
        word_list = dict()
        for word in words:
            # keep nouns, verbs and adjectives from the segmentation result,
            # drop blacklisted words and single-character words
            if (word[1] in cx_dict) and 3 < len(word[0]) < 30 \
                    and (word[0] not in black_word) and (word[0] not in single_word_whitelist):
                if word[0] in word_list:
                    word_list[word[0]] = word_list[word[0]] + 1
                else:
                    word_list[word[0]] = 1
        uid_word[k] = word_list
    return uid_list, uid_word
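# Hedged usage sketch: input_data() expects ./weibo_data/uid_text_0728.csv
# with (uid, text) rows; this only inspects the per-uid word-frequency dicts.
uid_list, uid_word = input_data()
for uid in uid_list[:5]:
    print uid, len(uid_word.get(str(uid), {}))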
# -*- coding: utf-8 -*-
import os
import sys
import json
from global_utils_do import STATUS_THRE, FOLLOWER_THRE, labels, outlist, \
    lawyerw, cut, load_scws, adminw, mediaw, businessw

s = load_scws()


def user_domain_classifier_v2(user):
    r = user
    label = labels[11]
    verified_type = r['verified_type']
    location = r['user_location']
    province = location.split(' ')[0]
    followers_count = r['fansnum']
    statuses_count = r['statusnum']
    name = r['nick_name']
    description = r['description']
    if verified_type == 4:
        label = labels[0]  # university Weibo account
    elif verified_type == 1:
        label = labels[7]  # government body or official
    elif verified_type == 8 or verified_type == 7 or verified_type == 2:
        if province not in outlist:
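# Hedged usage sketch (assumes the full classifier body, which is truncated
# above, ultimately returns `label`). The user record below is hypothetical;
# verified_type == 4 exercises the university branch shown in the fragment.
sample_user = {
    'verified_type': 4,
    'user_location': '北京 海淀区',
    'fansnum': 10000,
    'statusnum': 500,
    'nick_name': 'example_account',
    'description': '',
}
print user_domain_classifier_v2(sample_user)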
# -*- coding: utf-8 -*-
from __future__ import division
import re
import opencc
import os
from gensim import corpora
import cPickle as pickle
#from xapian_case.utils import load_scws, cut, load_emotion_words
from global_utils_do import load_scws, cut, load_emotion_words

AB_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
cut_str = load_scws()

# build the emoticon set in both simplified and traditional Chinese
cc = opencc.OpenCC('s2t', opencc_path='/usr/bin/opencc')
emotions_words = load_emotion_words()
emotions_words = [unicode(e, 'utf-8') for e in emotions_words]
t_emotions_words = [cc.convert(e) for e in emotions_words]
emotions_words.extend(t_emotions_words)
emotions_words = [w.encode('utf-8') for w in emotions_words]
emotions_words_set = set(emotions_words)
emotion_pattern = re.compile(r'\[(\S+?)\]')


def if_emoticoned_weibo(r):
    # whether the weibo text contains any emoticon from the reference set
    emotions = re.findall(emotion_pattern, r['text'])
    is_emoticoned = 1 if set(emotions) & emotions_words_set else 0
    return is_emoticoned
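# Hedged usage sketch: a weibo record is any dict with a 'text' field;
# '[哈哈]' stands in for a bracketed emoticon token and is only an assumption
# about what load_emotion_words() returns.
sample_weibo = {'text': '今天天气不错[哈哈]'}
print if_emoticoned_weibo(sample_weibo)  # 1 if '哈哈' is in the emotion set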
# -*- coding: utf-8 -*-
import os
import scws
import time
import csv
import re
from gensim import corpora
#from xapian_case.utils import load_scws, cut, cut_filter
from global_utils_do import load_scws, cut, cut_filter
from liblinearutil import svm_read_problem, load_model, predict, save_model, train

sw = load_scws()

AB_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), './')
FEATURE_WORD_PATH = os.path.join(AB_PATH, './svm/dictionary_20150124.txt')
SVM_MODEL_FILE = os.path.join(AB_PATH, './svm/train.model')
TRAIN_DATA_FILE = os.path.join(AB_PATH, './train20150124.csv')
TRAIN_INPUT_FILE = os.path.join(AB_PATH, './svm/train20150124.txt')

dictionary = corpora.Dictionary.load_from_text(FEATURE_WORD_PATH)


def prepare_svm_input_file(texts, dictionary=dictionary):
    """Write the SVM input vectors to a per-process file."""
    pid = os.getpid()
    svm_input_path = os.path.join(AB_PATH, './svm_test/%s.txt' % pid)
    fw = open(svm_input_path, 'w')
    for text in texts:
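# The loop body above is truncated; this is a hedged sketch of one common way
# to complete the per-text step: segment the text, map words through the gensim
# dictionary, and emit a sparse libsvm line (dummy 0 label for test data).
# The helper name and exact format are assumptions, not the original code.
def _text_to_svm_line(text, dictionary=dictionary):
    words = cut(sw, text)
    bow = dictionary.doc2bow(words)
    feats = ' '.join('%s:%s' % (idx + 1, val) for idx, val in sorted(bow))
    return '0 %s\n' % feats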
    return connection


def ts2datetime(timestamp):
    return time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(timestamp))


def ts2date(timestamp):
    return time.strftime('%Y%m%d', time.localtime(timestamp))


def datetime2ts(date):
    return int(time.mktime(time.strptime(date, '%Y-%m-%d %H:%M:%S')))


s = load_scws()

# part-of-speech tags accepted for keyword extraction
cx_dict = set(['Ag', 'a', 'an', 'Ng', 'n', 'nr', 'ns', 'nt', 'nz',
               'Vg', 'v', 'vd', 'vn', '@', 'j'])

EXTRA_BLACK_LIST_PATH = os.path.join(
    os.path.dirname(os.path.abspath(__file__)), 'black.txt')


def load_black_words():
    one_words = set(
        [line.strip('\r\n') for line in open(EXTRA_BLACK_LIST_PATH)])
    return one_words
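# Round-trip check of the timestamp helpers: datetime2ts() and ts2datetime()
# invert each other for a local-time string; ts2date() gives the compact date.
ts = datetime2ts('2015-01-24 00:00:00')
print ts2datetime(ts)  # '2015-01-24 00:00:00'
print ts2date(ts)      # '20150124'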