def separater(user_weibos):
    """Segment weibo texts and return a word-count dict.

    For every item, the text found at ``item['_source']['text']`` is passed
    through ``cut_filter`` and ``re_cut`` and then segmented with ``cut``
    using a segmenter obtained from ``load_scws()``.  The raw text and every
    segmented word are printed as debug output.

    Args:
        user_weibos: iterable of dicts exposing ``['_source']['text']``.

    Returns:
        dict mapping each word to its number of occurrences in the last
        weibo processed; an empty dict when ``user_weibos`` is empty.
    """
    segmenter = load_scws()
    # Bound before the loop so that an empty input returns {} instead of
    # raising NameError at the return statement.
    words_dict = {}
    for user_weibo in user_weibos:
        content = user_weibo['_source']['text']
        # Single-argument print(...) emits the same output under Python 2's
        # print statement and Python 3's print function.
        print(str(content))
        content = cut_filter(content)
        content = re_cut(content)
        separated_words = cut(segmenter, content)
        # NOTE(review): the counter restarts for every weibo, so only the
        # last weibo's counts reach the caller.  Kept as-is to preserve the
        # existing behavior -- confirm whether accumulation across all
        # weibos was intended.
        words_dict = {}
        for word in separated_words:
            print(str(word))
            # dict.get replaces the former bare ``except:`` around ``+= 1``.
            words_dict[word] = words_dict.get(word, 0) + 1
    return words_dict
def triple_classifier(tweet):
    """Return a binary sentiment flag (0 or 1) for ``tweet``.

    ``tweet['content168']`` is expected to be utf-8 encoded.

    Steps:
      1. Keep only the part of the text before the first ``'//@'``; when the
         text is empty at that point, fall back to ``remove_at`` applied to
         the full ``content168`` value.
      2. If ``emoticon(pe_set, ne_set, text)`` yields 1 or 2, return 1.
      3. If the text is empty, return 0.
      4. Otherwise segment the text, convert it to a bag of words with
         ``dictionary_1`` and multiply the two per-word ``step1_score``
         columns (each raised to the word count).  Return 1 when the first
         product does not exceed the second, else 0.
    """
    original_text = tweet['content168']
    text = original_text
    if '//@' in text:
        text = text[:text.index('//@')]
    if len(text) == 0:
        text = remove_at(original_text)

    if emoticon(pe_set, ne_set, text) in [1, 2]:
        return 1
    if text == '':
        return 0

    tokens = [piece.decode('utf-8') for piece in cut(sw, text)]
    first_product = 1
    second_product = 1
    for word_id, count in dictionary_1.doc2bow(tokens):
        word_scores = step1_score[word_id]
        first_product = first_product * (word_scores[0] ** count)
        second_product = second_product * (word_scores[1] ** count)

    # An empty bag of words leaves both products at 1, which yields 1.
    return 1 if first_product <= second_product else 0
def cut_words_noun(text):
    """Segment ``text`` and keep the noun terms that are not blacklisted.

    Args:
        text: the text to segment; must be a ``str`` (utf-8 per the original
            notes).

    Returns:
        list of terms whose part-of-speech tag is in ``cx_dict_noun`` and
        which do not appear in ``black_words``, in segmentation order.

    Raises:
        ValueError: if ``text`` is not a ``str``.
    """
    if not isinstance(text, str):
        raise ValueError("cut words input text must be string")

    nouns = []
    for term, tag in cut(s, text, cx=True):
        if tag not in cx_dict_noun:
            continue
        if term in black_words:
            continue
        nouns.append(term)
    return nouns
def prepare_svm_input(texts, y=None, dictionary=dictionary):
    """Build SVM input from raw texts.

    Args:
        texts: sequence of texts; each one is segmented with ``cut``.
        y: optional labels.  Any falsy value (``None``, empty list, ...) is
            replaced by a list holding ``1.0`` for every text.
        dictionary: object providing ``doc2bow``; defaults to the
            module-level ``dictionary``.

    Returns:
        tuple ``(y, x)`` where ``x`` holds one dict per text, built from the
        ``doc2bow`` output for that text.
    """
    if not y:
        y = [1.0] * len(texts)
    x = [dict(dictionary.doc2bow(cut(sw, text))) for text in texts]
    return y, x
def prepare_svm_input_file(texts, dictionary=dictionary):
    """Write SVM input for ``texts`` to a per-process file.

    The file is ``./svm_test/<pid>.txt`` joined onto ``AB_PATH``.  One line
    is written per text: the literal label ``1`` followed by
    space-separated ``index:count`` pairs, where ``index`` is the
    ``doc2bow`` word id plus one.

    Args:
        texts: iterable of texts; each one is segmented with ``cut``.
        dictionary: object providing ``doc2bow``; defaults to the
            module-level ``dictionary``.

    Returns:
        The path of the file that was written.
    """
    pid = os.getpid()
    svm_input_path = os.path.join(AB_PATH, './svm_test/%s.txt' % pid)

    # ``with`` guarantees the handle is closed even if segmentation or
    # doc2bow raises part-way through; the manual open/close leaked it.
    with open(svm_input_path, 'w') as fw:
        for text in texts:
            words = cut(sw, text)
            feature = dictionary.doc2bow(words)
            pairs = [
                str(wordid + 1) + ':' + str(wordcount)
                for wordid, wordcount in feature
            ]
            line = '1 ' + ' '.join(pairs)
            fw.write('%s\n' % line)

    return svm_input_path
def triple_classifier(tweet):
    """Return a sentiment code for ``tweet``.

    ``tweet['text']`` is expected to be utf-8 encoded.

    Steps:
      1. If ``emoticon(text)`` is non-zero, that value is returned directly.
      2. If the text is empty, return 0.
      3. Stage one: segment the text, vectorize it with ``dictionary_1`` and
         multiply the two ``step1_score`` columns (each raised to the word
         count).  If the first product is strictly larger, return 0.
      4. Stage two: vectorize with ``dictionary_2`` and multiply the three
         ``step2_score`` columns the same way.  Return ``HAPPY``, ``SAD`` or
         ``ANGRY`` for whichever product is strictly the largest; return 0
         when there is no strict winner.
    """
    text = tweet['text']

    emoticon_sentiment = emoticon(text)
    if emoticon_sentiment != 0:
        return emoticon_sentiment
    if text == '':
        return 0

    tokens = [piece.decode('utf-8', 'ignore') for piece in cut(cut_str, text)]

    # Stage one: vectorize with the first dictionary.
    first_product = 1
    second_product = 1
    for word_id, count in dictionary_1.doc2bow(tokens):
        word_scores = step1_score[word_id]
        first_product = first_product * (word_scores[0] ** count)
        second_product = second_product * (word_scores[1] ** count)
    if first_product > second_product:
        return 0

    # Stage two: vectorize with the second dictionary.
    happy_score = 1
    sad_score = 1
    angry_score = 1
    for word_id, count in dictionary_2.doc2bow(tokens):
        word_scores = step2_score[word_id]
        happy_score = happy_score * (word_scores[0] ** count)
        sad_score = sad_score * (word_scores[1] ** count)
        angry_score = angry_score * (word_scores[2] ** count)

    if happy_score > sad_score and happy_score > angry_score:
        return HAPPY
    if sad_score > happy_score and sad_score > angry_score:
        return SAD
    if angry_score > sad_score and angry_score > happy_score:
        return ANGRY
    # Ties for the maximum leave the default sentiment untouched.
    return 0
def user_domain_classifier_v2(user):
    """Pick a domain label for a user record.

    Reads ``verified_type``, ``user_location``, ``fansnum``, ``statusnum``,
    ``nick_name`` and ``description`` from ``user`` and returns one entry of
    the module-level ``labels`` sequence.  The decision is driven first by
    ``verified_type``; for type 0 and for unlisted types, keywords segmented
    from the nickname plus description are matched against the module-level
    keyword collections.
    """
    default_label = labels[11]
    verified_type = user['verified_type']
    location = user['user_location']
    province = location.split(' ')[0]
    followers_count = user['fansnum']
    statuses_count = user['statusnum']
    name = user['nick_name']
    description = user['description']

    if verified_type == 4:
        return labels[0]  # university weibo
    if verified_type == 1:
        return labels[7]  # government agencies and officials
    if verified_type in (8, 7, 2):
        # overseas organisation if the province is listed, else domestic
        return labels[2] if province in outlist else labels[1]
    if verified_type == 3:
        # overseas media if listed, else domestic media.
        # NOTE(review): this tests the full location string while the
        # organisation branch tests only the province -- confirm intended.
        return labels[4] if location in outlist else labels[3]
    if verified_type in (5, 6):
        return labels[5]  # civil organisation

    if verified_type == 0:
        keywords = cut(s, name + description)
        lawyer_hits = sum([1 for kw in keywords if kw in lawyerw])      # lawyer
        admin_hits = sum([1 for kw in keywords if kw in adminw])        # government official
        media_hits = sum([1 for kw in keywords if kw in mediaw])        # media person
        business_hits = sum([1 for kw in keywords if kw in businessw])  # business person

        label = default_label
        best = 0
        # Order matters: a later category wins only with strictly more hits.
        for hits, label_index in ((business_hits, 12),
                                  (admin_hits, 7),
                                  (media_hits, 8)):
            if best < hits:
                best = hits
                label = labels[label_index]
        if best == 0:
            label = labels[9]
        # Lawyer keywords do not compete on weight; any hit overrides.
        if lawyer_hits != 0:
            label = labels[6]
        return label

    if verified_type in (220, 200):
        return labels[9]
    if verified_type == 400:
        return labels[11]

    # Every other verified_type value.
    label = default_label
    if followers_count >= FOLLOWER_THRE and statuses_count >= STATUS_THRE:
        label = labels[10]  # grassroots
    # NOTE(review): the lawyer check is applied regardless of the threshold
    # test above -- confirm that this nesting is the intended one.
    keywords = cut(s, name + description)
    lawyer_hits = sum([1 for kw in keywords if kw in lawyerw])
    if lawyer_hits != 0:
        label = labels[6]
    return label
def user_domain_classifier_v2(user):
    """Classify a user record into one of the module-level ``labels``.

    The record must provide ``verified_type``, ``user_location``,
    ``fansnum``, ``statusnum``, ``nick_name`` and ``description``.
    ``verified_type`` selects the branch; type 0 and all unlisted types
    additionally match keywords segmented from nickname plus description
    against the module-level keyword collections.
    """
    fallback = labels[11]
    vtype = user['verified_type']
    location = user['user_location']
    province = location.split(' ')[0]
    fans = user['fansnum']
    posts = user['statusnum']
    nick = user['nick_name']
    bio = user['description']

    if vtype == 4:
        return labels[0]  # university weibo
    if vtype == 1:
        return labels[7]  # government agencies and officials
    if vtype in (8, 7, 2):
        if province in outlist:
            return labels[2]  # overseas organisation
        return labels[1]  # domestic organisation
    if vtype == 3:
        # NOTE(review): the whole location string is tested here, whereas
        # the organisation branch tests only the province -- confirm.
        if location in outlist:
            return labels[4]  # overseas media
        return labels[3]  # domestic media
    if vtype in (5, 6):
        return labels[5]  # civil organisation

    if vtype == 0:
        terms = cut(s, nick + bio)
        lawyer_weight = sum([1 for term in terms if term in lawyerw])      # lawyer
        adminw_weight = sum([1 for term in terms if term in adminw])       # government official
        mediaw_weight = sum([1 for term in terms if term in mediaw])       # media person
        businessw_weight = sum([1 for term in terms if term in businessw])  # business person

        result = fallback
        max_weight = 0
        # Candidates are tried in this fixed order; ties keep the earlier one.
        candidates = (
            (businessw_weight, 12),
            (adminw_weight, 7),
            (mediaw_weight, 8),
        )
        for weight, index in candidates:
            if max_weight < weight:
                max_weight = weight
                result = labels[index]
        if max_weight == 0:
            result = labels[9]
        # A lawyer keyword hit overrides whatever the weights selected.
        if lawyer_weight != 0:
            result = labels[6]
        return result

    if vtype in (220, 200):
        return labels[9]
    if vtype == 400:
        return labels[11]

    # Remaining verified_type values.
    result = fallback
    if fans >= FOLLOWER_THRE and posts >= STATUS_THRE:
        result = labels[10]  # grassroots
    # NOTE(review): the lawyer check runs whether or not the thresholds were
    # met -- confirm that this nesting matches the intended logic.
    terms = cut(s, nick + bio)
    lawyer_weight = sum([1 for term in terms if term in lawyerw])
    if lawyer_weight != 0:
        result = labels[6]
    return result