import codecs
import os

# `tp` (the project's text-processing helpers), `posdict` / `negdict` (the
# positive / negative sentiment lexicons), `match` and
# `transform_to_positive_num` are defined elsewhere in this project and are
# assumed to be importable here.


def fenci1(comments, storePath, filter=False):
    """Segment each comment and append the space-joined tokens to storePath."""
    stopwords = tp.loadTextWords(
        os.path.join(os.getcwd(), "Dictionary", "stopword", "stopwords.txt"))
    for comment in comments:
        seg_result = tp.segmentation(comment, 'str')
        seg_list = tp.segmentation(comment, 'list')
        if filter:
            # Drop stopwords and bare spaces, then rebuild the segmented string.
            seg_filter = [word for word in seg_list
                          if word not in stopwords and word != ' ']
            seg_result = " ".join(seg_filter)
        print(seg_result)
        with codecs.open(storePath, 'a+', 'utf-8') as f:
            f.writelines(seg_result + "\n")
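# A minimal usage sketch for fenci1 (illustrative only: the sample comments and
# the output file name below are made up, not part of the project's data).
def demo_fenci1():
    sample_comments = ["这家店的服务很好!", "等了一个小时, 太失望了。"]
    store_path = os.path.join(os.getcwd(), "seg_result_demo.txt")
    # Appends one line of space-separated, stopword-filtered tokens per comment.
    fenci1(sample_comments, store_path, filter=True)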
def cut_sentences_words(self, review):
    """Split a review into sentences and segment each sentence into words."""
    sent_words = []
    cuted_review = tp.cut_sentence_2(review)
    for sent in cuted_review:
        seg_sent = tp.segmentation(sent, 'list')
        # seg_sent = self.stop_word_filter(seg_sent)
        sent_words.append(seg_sent)
    return sent_words
def get_single_sent_count(cuted_sents):
    """Score each already-cut sentence and return a list of (pos, neg) pairs."""
    single_review_senti_score = []
    for sent in cuted_sents:
        seg_sent = tp.segmentation(sent, 'list')
        i = 0         # position of the word currently being scanned
        a = 0         # position just after the last sentiment word found
        poscount = 0  # positive score of this sentence
        negcount = 0  # negative score of this sentence
        # match() applies degree (intensity) weighting to the words between
        # two sentiment words.
        for word in seg_sent:
            if word in posdict:
                poscount += 1
                for w in seg_sent[a:i]:
                    poscount = match(w, poscount)
                a = i + 1
            elif word in negdict:
                negcount += 1
                for w in seg_sent[a:i]:
                    negcount = match(w, negcount)
                a = i + 1
            # Match "!" in the review: every "!" adds a weight of +2 to the
            # nearest sentiment word found when scanning backwards.
            elif word == "!" or word == "!":
                for w2 in seg_sent[::-1]:
                    if w2 in posdict:
                        poscount += 2
                        break
                    elif w2 in negdict:
                        negcount += 2
                        break
            i += 1
        single_review_senti_score.append(
            transform_to_positive_num(poscount, negcount))
    return single_review_senti_score
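# A minimal usage sketch for get_single_sent_count (illustrative only: the
# review text is made up; tp.cut_sentence_2 is the sentence splitter already
# used by cut_sentences_words above).
def demo_get_single_sent_count():
    review = "服务态度很好!就是上菜有点慢。"
    # Each element is the (positive, negative) pair produced by
    # transform_to_positive_num for one sub-sentence.
    return get_single_sent_count(tp.cut_sentence_2(review))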
def single_review_sentiment_score(weibo_sent):
    """Score one Weibo reply: return (positive total, negative total, tone, word count)."""
    single_review_senti_score = []
    all_word = []
    cuted_review = tp.cut_sentence(weibo_sent)  # split into sentences; analyse each one separately

    for sent in cuted_review:
        seg_sent = tp.segmentation(sent)          # word segmentation
        seg_sent = tp.del_stopwords(seg_sent)[:]  # stopword removal
        i = 0            # position of the word currently being scanned
        s = 0            # position just after the last sentiment word found
        poscount = 0     # positive score of this sub-sentence
        negcount = 0     # negative score of this sub-sentence
        mark1_count = 0  # number of question marks
        mark2_count = 0  # number of exclamation marks

        for word in seg_sent:  # analyse word by word
            all_word.append(word)
            if word in posdict:    # positive sentiment word
                poscount += 1
                # Apply degree-adverb weighting to the words between the
                # previous sentiment word and this one.
                for w in seg_sent[s:i]:
                    poscount = match(w, poscount)
                s = i + 1  # record the new sentiment-word position
            elif word in negdict:  # negative sentiment word
                negcount += 1
                for w in seg_sent[s:i]:
                    negcount = match(w, negcount)
                s = i + 1
            elif word == "?" or word == "?":
                mark1_count += 1
            # An exclamation mark signals the end of the sub-sentence.
            elif word == "!" or word == "!":
                mark2_count += 1
                # Scan backwards for the nearest sentiment word before the
                # exclamation mark, give it extra weight, then stop.
                for w2 in seg_sent[::-1]:
                    if w2 in posdict:
                        poscount += 1
                        break
                    elif w2 in negdict:
                        negcount += 1
                        break
            i += 1

        # Final normalisation of this sub-sentence's scores.
        single_review_senti_score.append(
            transform_to_positive_num(poscount, negcount))

    su = len(all_word)             # total number of words in the reply
    pos_result, neg_result = 0, 0  # running totals of positive / negative scores
    sentlength = len(single_review_senti_score)
    pos_score = []
    neg_score = []
    for res1, res2 in single_review_senti_score:  # accumulate over every sub-sentence
        pos_result += res1
        neg_result += res2
        pos_score.append(pos_result)
        neg_score.append(neg_result)

    result1 = pos_result - neg_result  # simple overall score of the reply
    result2 = pos_result + neg_result
    try:
        # Tone of the reply, computed with Lin Le's model.
        result = result1 / result2
        tone = round(result, 3)
    except Exception:
        # No sentiment words found (division by zero): treat the tone as neutral.
        tone = 0

    # res: 1 = positive tone, 2 = negative tone, 0 = neutral (currently unused).
    res = 0
    if tone > 0.0:
        res = 1
    elif tone < 0.0:
        res = 2

    return pos_result, neg_result, tone, su
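# A minimal usage sketch for single_review_sentiment_score (illustrative only:
# the Weibo text below is made up).
def demo_single_review_sentiment_score():
    weibo_sent = "今天天气真好!可是考试考砸了, 很难过。"
    pos, neg, tone, word_count = single_review_sentiment_score(weibo_sent)
    # tone > 0 means an overall positive reply, tone < 0 an overall negative one.
    print(pos, neg, tone, word_count)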