def word_by_word_review(filepath, sheetnum, colnum):
    # Read product review data from an Excel file and segment every review
    review_data = []
    for cell in tp.get_excel_data(filepath, sheetnum, colnum, 'data')[0:tp.get_excel_data(filepath, sheetnum, colnum, 'rownum')]:
        review_data.append(tp.segmentation(cell, 'list'))  # Segment every review

    # Read the txt file containing stopwords
    stopwords = tp.get_txt_data('/home/sooda/nlp/Review-Helpfulness-Prediction/data/stopword.txt', 'lines')

    # Filter stopwords from reviews
    seg_fil_result = []
    for review in review_data:
        fil = [word for word in review if word not in stopwords and word != ' ']
        seg_fil_result.append(fil)
        fil = []

    # Return the review set as a one-dimensional list
    review = list(itertools.chain(*seg_fil_result))
    return review
def seg_filter_txt(filepath, storepath):
    txtfile = open(filepath, 'r')
    txtdata = txtfile.readlines()
    txtfile.close()
    review_data = tp.segmentation(txtdata[0], 'list')

    #stopfile = open('D:/code/seg_fil_test/stopword.txt', 'r')
    stopfile = open('E:/GraduationProject/pythoncode/project/Prediction/main/PreprocessingModule/stopword.txt', 'r')
    stopdata1 = stopfile.readlines()
    stopdata2 = ''.join(stopdata1)
    stopwords = stopdata2.decode('utf8').split('\n')
    stopfile.close()

    seg_fil_result = []
    for review in review_data:
        fil = [word for word in review if word not in stopwords and word != ' ']
        seg_fil_result.append(fil)
        fil = []

    fil_file = open(storepath, 'w')
    for word in seg_fil_result:
        words = ''.join(word)
        fil_file.write(words.encode('utf8') + ' ')
        # fil_file.write(word.encode('utf8')+' ')
    fil_file.close()
def seg_filter_txt(filepath, storepath):
    txtfile = open(filepath, 'r')
    txtdata = txtfile.readlines()
    txtfile.close()
    review_data = tp.segmentation(txtdata[0], 'list')

    stopfile = open('D:/code/seg_fil_test/stopword.txt', 'r')
    stopdata1 = stopfile.readlines()
    stopdata2 = ''.join(stopdata1)
    stopwords = stopdata2.decode('utf8').split('\n')
    stopfile.close()

    seg_fil_result = []
    for review in review_data:
        fil = [word for word in review if word not in stopwords and word != ' ']
        seg_fil_result.append(fil)
        fil = []

    fil_file = open(storepath, 'w')
    for word in seg_fil_result:
        # each entry is a list of words, so join it before encoding
        fil_file.write(''.join(word).encode('utf8') + ' ')
    fil_file.close()
def word_by_word_review(filepath, sheetnum, colnum):
    # Read product review data from an Excel file and segment every review
    review_data = []
    for cell in tp.get_excel_data(filepath, sheetnum, colnum, 'data')[0:tp.get_excel_data(filepath, sheetnum, colnum, 'rownum')]:
        review_data.append(tp.segmentation(cell, 'list'))  # Segment every review

    # Read the txt file containing stopwords
    # stopwords = tp.get_txt_data('D:/code/stopword.txt', 'lines')
    stopwords = tp.get_txt_data('E:/GraduationProject/pythoncode/project/Prediction/main/PreprocessingModule/stopword.txt', 'lines')

    # Filter stopwords from reviews
    seg_fil_result = []
    for review in review_data:
        fil = [word for word in review if word not in stopwords and word != ' ']
        seg_fil_result.append(fil)
        fil = []

    # Return the review set as a one-dimensional list
    review = list(itertools.chain(*seg_fil_result))
    return review
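The word_by_word_review variants above (and the one further down) differ only in where the stopword list lives. A minimal usage sketch; the Excel path, sheet number, and column number here are illustrative, and tp/itertools are assumed to be imported at module level as in the examples:

import itertools
import textprocessing as tp

# Hypothetical workbook: sheet 1, column 1 holds the raw review text
words = word_by_word_review('data/reviews.xlsx', 1, 1)
print len(words)  # total word count after stopword filtering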
def cut_sentences_words(self, review):
    sent_words = []
    cuted_review = tp.cut_sentence_2(review)
    for sent in cuted_review:
        seg_sent = tp.segmentation(sent, 'list')
        #seg_sent = self.stopWordFilter(seg_sent)
        sent_words.append(seg_sent)
    return sent_words
def word_sent_count(dataset):
    word_sent_count = []
    for review in dataset:
        sents = tp.cut_sentence_2(review)  # split each review into sentences
        words = tp.segmentation(review, 'list')  # segment each review into words, kept as a list
        sent_num = len(sents)  # number of sentences in the review
        word_num = len(words)  # number of words after segmentation
        sent_word = float(word_num) / float(sent_num)  # review length = word count / sentence count
        word_sent_count.append([word_num, sent_num, sent_word])
    return word_sent_count
def word_sent_count(dataset):
    word_sent_count = []
    for review in dataset:
        sents = tp.cut_sentence_2(review)
        words = tp.segmentation(review, 'list')
        sent_num = len(sents)
        word_num = len(words)
        sent_word = float(word_num) / float(sent_num)  # review length = word number / sentence number
        word_sent_count.append([word_num, sent_num, sent_word])
    return word_sent_count
def word_sent_count(dataset):
    word_sent_count = []
    for review in dataset:
        sents = tp.cut_sentence_2(review)
        words = tp.segmentation(review, "list")
        sent_num = len(sents)
        word_num = len(words)
        sent_word = float(word_num) / float(sent_num)  # review length = word number / sentence number
        word_sent_count.append([word_num, sent_num, sent_word])
    return word_sent_count
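The three word_sent_count variants above share the same logic: for every review they record the word count, the sentence count, and their ratio. A minimal usage sketch; the sample reviews are illustrative, and tp.cut_sentence_2/tp.segmentation are assumed to come from the textprocessing module used throughout this page:

# Hypothetical usage on two short reviews
reviews = [u'手机很好,运行快!', u'屏幕一般。电池不错。']
for word_num, sent_num, ratio in word_sent_count(reviews):
    print 'words=%d sentences=%d words/sentence=%.2f' % (word_num, sent_num, ratio)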
def sentence_sentiment_score(dataset):
    cuted_review = []
    for cell in dataset:
        cuted_review.append(tp.cut_sentence_2(cell))

    single_review_count = []
    all_review_count = []
    for review in cuted_review:
        for sent in review:
            seg_sent = tp.segmentation(sent, "list")  # word segmentation
            i = 0  # word position counter
            a = 0  # sentiment word position
            poscount = 0  # count a pos word
            negcount = 0
            for word in seg_sent:
                if word in posdict:
                    poscount += 1
                    for w in seg_sent[a:i]:
                        poscount = match(w, poscount)
                    a = i + 1
                elif word in negdict:
                    negcount += 1
                    for w in seg_sent[a:i]:
                        negcount = match(w, negcount)
                    a = i + 1
                elif word == "!".decode("utf8") or word == "!".decode("utf8"):
                    for w2 in seg_sent[::-1]:
                        if w2 in posdict:
                            poscount += 2
                            break
                        elif w2 in negdict:
                            negcount += 2
                            break
                else:
                    # log adjectives, adverbs, verbs and nouns that matched no lexicon
                    pos = tp.postagger(word, "list")
                    for k in pos:
                        if k[1] in ("a", "d", "v", "n"):
                            fo.write(word.encode("utf8") + "\n")
                i += 1
            # positive/negative weight of each sentence of the review
            single_review_count.append(transform_to_positive_num(poscount, negcount))
        all_review_count.append(single_review_count)  # [[[s11_score], [s12_score], ...], [[s21_score], [s22_score], ...], ...]
        single_review_count = []
    return all_review_count
def single_review_sentiment_score(weibo_sent):
    single_review_senti_score = []
    cuted_review = tp.cut_sentence(weibo_sent)  # split into sentences and analyse each one separately

    for sent in cuted_review:
        seg_sent = tp.segmentation(sent)  # word segmentation
        seg_sent = tp.del_stopwords(seg_sent)[:]
        #for w in seg_sent:
        #    print w,
        i = 0  # position of the word being scanned
        s = 0  # position of the last sentiment word
        poscount = 0  # positive score of this clause
        negcount = 0  # negative score of this clause

        for word in seg_sent:  # analyse word by word
            #print word
            if word in posdict:  # positive sentiment word
                #print "posword:", word
                poscount += 1  # positive score +1
                for w in seg_sent[s:i]:
                    poscount = match(w, poscount)
                #print "poscount:", poscount
                s = i + 1  # update the sentiment word position
            elif word in negdict:  # negative sentiment word
                #print "negword:", word
                negcount += 1
                for w in seg_sent[s:i]:
                    negcount = match(w, negcount)
                #print "negcount:", negcount
                s = i + 1
            # an exclamation mark marks the end of the clause
            elif word == "!".decode("utf-8") or word == "!".decode('utf-8'):
                for w2 in seg_sent[::-1]:  # scan backwards for the sentiment word before the "!", add 2 to its weight, then stop
                    if w2 in posdict:
                        poscount += 2
                        break
                    elif w2 in negdict:
                        negcount += 2
                        break
            i += 1
        #print "poscount,negcount", poscount, negcount
        single_review_senti_score.append(transform_to_positive_num(poscount, negcount))

    # final processing of the score
    pos_result, neg_result = 0, 0  # total positive and total negative score
    for res1, res2 in single_review_senti_score:  # accumulate over the clauses
        pos_result += res1
        neg_result += res2
    #print pos_result, neg_result
    result = pos_result - neg_result  # final sentiment score of this weibo
    result = round(result, 1)
    return result
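transform_to_positive_num(poscount, negcount) is called in several examples but defined in none of them. Its result is unpacked as a (pos, neg) pair, and the sentiment_score_list examples at the end of this page contain inline logic that folds negative intermediate scores into the opposite polarity, so a plausible reconstruction is the following sketch (an assumption, not the original implementation):

def transform_to_positive_num(poscount, negcount):
    # Hypothetical reconstruction: fold negative intermediate scores into
    # the opposite polarity so both returned values are non-negative.
    if poscount < 0 and negcount >= 0:
        return [0, negcount - poscount]
    elif negcount < 0 and poscount >= 0:
        return [poscount - negcount, 0]
    elif poscount < 0 and negcount < 0:
        return [-negcount, -poscount]
    else:
        return [poscount, negcount]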
def get_blog_features():
    import tp_utility as tpu
    import textprocessing as tp
    import MySQLdb
    import pandas as pd

    # get blog df_content
    mysql_cn = MySQLdb.connect('127.0.0.1', 'root', '100811', 'mydb', charset='utf8')
    df_content = pd.read_sql("select * from weibo_bloginfor limit 5000;", con=mysql_cn)
    mysql_cn.close()

    # get features in each blog
    blogs = df_content['mc'].values
    moto = [tp.segmentation(blog) for blog in blogs]
    moto_features = extract_features(moto)
    return moto_features, df_content
def mysentiment_score_list(oneblog):
    cuted_data = []
    for sen in tp.cut_sentence(oneblog):
        cuted_data.append(sen)

    blog_score_list = []
    for sent in cuted_data:  # iterate over every clause of the blog
        segtmp = tp.segmentation(sent)  # segment the clause into a word list
        #print segtmp
        pos_count = 0
        neg_count = 0
        for word in segtmp:
            if word in posdict:
                pos_count += 1
            elif word in negdict:
                neg_count += 1
        blog_score_list.append([pos_count, neg_count])
    return blog_score_list
def mysentiment_score_list(oneblog):
    cuted_data = []
    for sen in tp.cut_sentence(oneblog):
        cuted_data.append(sen)

    blog_score_list = []
    for sent in cuted_data:  # iterate over every clause of the blog
        segtmp = tp.segmentation(sent)  # segment the clause into a word list
        #print segtmp
        pos_count = 0
        neg_count = 0
        for word in segtmp:
            if word in posdict:
                pos_count += 1
            elif word in negdict:
                neg_count += 1
        blog_score_list.append([pos_count, neg_count])
    return blog_score_list
def sentence_sentiment_score(dataset):
    cuted_review = []
    for cell in dataset:
        cuted_review.append(tp.cut_sentence_2(cell))

    single_review_count = []
    all_review_count = []
    for review in cuted_review:
        for sent in review:
            seg_sent = tp.segmentation(sent, 'list')
            i = 0  # word position counter
            a = 0  # sentiment word position
            poscount = 0  # count a pos word
            negcount = 0
            for word in seg_sent:
                if word in posdict:
                    poscount += 1
                    for w in seg_sent[a:i]:
                        poscount = match(w, poscount)
                    a = i + 1
                elif word in negdict:
                    negcount += 1
                    for w in seg_sent[a:i]:
                        negcount = match(w, negcount)
                    a = i + 1
                elif word == '!'.decode('utf8') or word == '!'.decode('utf8'):
                    for w2 in seg_sent[::-1]:
                        if w2 in posdict:
                            poscount += 2
                            break
                        elif w2 in negdict:
                            negcount += 2
                            break
                i += 1
            single_review_count.append(transform_to_positive_num(poscount, negcount))  # [[s1_score], [s2_score], ...]
        all_review_count.append(single_review_count)  # [[[s11_score], [s12_score], ...], [[s21_score], [s22_score], ...], ...]
        single_review_count = []
    return all_review_count
def sentence_sentiment_score(dataset):
    cuted_review = []
    for cell in dataset:
        cuted_review.append(tp.cut_sentence_2(cell))

    single_review_count = []
    all_review_count = []
    for review in cuted_review:
        for sent in review:
            seg_sent = tp.segmentation(sent, "list")
            i = 0  # word position counter
            a = 0  # sentiment word position
            poscount = 0  # count a pos word
            negcount = 0
            for word in seg_sent:
                if word in posdict:
                    poscount += 1
                    for w in seg_sent[a:i]:
                        poscount = match(w, poscount)
                    a = i + 1
                elif word in negdict:
                    negcount += 1
                    for w in seg_sent[a:i]:
                        negcount = match(w, negcount)
                    a = i + 1
                elif word == "!".decode("utf8") or word == "!".decode("utf8"):
                    for w2 in seg_sent[::-1]:
                        if w2 in posdict:
                            poscount += 2
                            break
                        elif w2 in negdict:
                            negcount += 2
                            break
                i += 1
            single_review_count.append(transform_to_positive_num(poscount, negcount))  # [[s1_score], [s2_score], ...]
        all_review_count.append(single_review_count)  # [[[s11_score], [s12_score], ...], [[s21_score], [s22_score], ...], ...]
        single_review_count = []
    return all_review_count
def word_by_word_review(filepath, sheetnum, colnum):
    # Read product review data from an Excel file and segment every review
    review_data = []
    for cell in tp.get_excel_data(filepath, sheetnum, colnum, 'data')[0:tp.get_excel_data(filepath, sheetnum, colnum, 'rownum')]:
        review_data.append(tp.segmentation(cell, 'list'))  # Segment every review

    # Read the txt file containing stopwords
    stopwords = tp.get_txt_data('/home/sooda/nlp/Review-Helpfulness-Prediction/data/stopword.txt', 'lines')

    # Filter stopwords from reviews
    seg_fil_result = []
    for review in review_data:
        fil = [word for word in review if word not in stopwords and word != ' ']
        seg_fil_result.append(fil)
        fil = []

    # Return the review set as a one-dimensional list
    review = list(itertools.chain(*seg_fil_result))
    return review
def single_review_sentiment_score(review):
    # Compute the positive and negative scores of a single review.
    single_review_senti_score = []
    cuted_review = tp.cut_sentence_2(review)  # cut sentence

    for sent in cuted_review:
        seg_sent = tp.segmentation(sent, "list")  # cut word
        i = 0  # word position counter
        s = 0  # sentiment word position
        poscount = 0  # count a positive word
        negcount = 0  # count a negative word

        for word in seg_sent:
            if word in posdict:
                poscount += 1
                for w in seg_sent[s:i]:
                    poscount = match(w, poscount)  # positive weight of the clause
                s = i + 1
            elif word in negdict:
                negcount += 1
                for w in seg_sent[s:i]:
                    negcount = match(w, negcount)  # negative weight of the clause
                s = i + 1
            # Match "!" in the review, every "!" has a weight of +2
            elif word == "!".decode("utf8") or word == "!".decode("utf8"):  # fullwidth or halfwidth "!"
                for w2 in seg_sent[::-1]:  # take the sentiment word right before the "!"
                    if w2 in posdict:
                        poscount += 2
                        break
                    elif w2 in negdict:
                        negcount += 2
                        break
            else:
                fo.write(word)
            i += 1
        # poscount/negcount are the positive and negative weights of this sentence
        single_review_senti_score.append(transform_to_positive_num(poscount, negcount))

    review_sentiment_score = sumup_sentence_sentiment_score(single_review_senti_score)
    return review_sentiment_score
def single_review_sentiment_score(review):
    single_review_senti_score = []
    cuted_review = tp.cut_sentence_2(review)

    for sent in cuted_review:
        seg_sent = tp.segmentation(sent, 'list')
        i = 0  # word position counter
        s = 0  # sentiment word position
        poscount = 0  # count a positive word
        negcount = 0  # count a negative word

        for word in seg_sent:
            if word in posdict:
                poscount += 1
                for w in seg_sent[s:i]:
                    poscount = match(w, poscount)
                s = i + 1
            elif word in negdict:
                negcount += 1
                for w in seg_sent[s:i]:
                    negcount = match(w, negcount)
                s = i + 1
            # Match "!" in the review, every "!" has a weight of +2
            elif word == "!".decode('utf8') or word == "!".decode('utf8'):
                for w2 in seg_sent[::-1]:
                    if w2 in posdict:
                        poscount += 2
                        break
                    elif w2 in negdict:
                        negcount += 2
                        break
            i += 1
        single_review_senti_score.append(transform_to_positive_num(poscount, negcount))

    review_sentiment_score = sumup_sentence_sentiment_score(single_review_senti_score)
    return review_sentiment_score
def single_review_sentiment_score(review):
    single_review_senti_score = []
    cuted_review = tp.cut_sentence_2(review)

    for sent in cuted_review:
        seg_sent = tp.segmentation(sent, 'list')
        i = 0  # word position counter
        s = 0  # sentiment word position
        poscount = 0  # count a positive word
        negcount = 0  # count a negative word

        for word in seg_sent:
            if word in posdict:
                poscount += 1
                for w in seg_sent[s:i]:
                    poscount = match(w, poscount)
                s = i + 1
            elif word in negdict:
                negcount += 1
                for w in seg_sent[s:i]:
                    negcount = match(w, negcount)
                s = i + 1
            # Match "!" in the review, every "!" has a weight of +2
            elif word == "!".decode('utf8') or word == "!".decode('utf8'):
                for w2 in seg_sent[::-1]:
                    if w2 in posdict:
                        poscount += 2
                        break
                    elif w2 in negdict:
                        negcount += 2
                        break
            i += 1
        single_review_senti_score.append(transform_to_positive_num(poscount, negcount))

    review_sentiment_score = sumup_sentence_sentiment_score(single_review_senti_score)
    return review_sentiment_score
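sumup_sentence_sentiment_score is likewise not shown on this page. The weibo variant above does the equivalent work inline: it totals the per-sentence [pos, neg] pairs and returns the rounded difference. A sketch under that assumption (whether the original returned the difference or the two totals is not confirmed by these examples):

def sumup_sentence_sentiment_score(score_list):
    # Hypothetical reconstruction: accumulate the per-sentence
    # [poscount, negcount] pairs into review-level totals.
    pos_result, neg_result = 0, 0
    for pos, neg in score_list:
        pos_result += pos
        neg_result += neg
    return round(pos_result - neg_result, 1)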
def seg_filter_txt(filepath, storepath):
    txtfile = open(filepath, "r")
    txtdata = txtfile.readlines()
    txtfile.close()
    review_data = tp.segmentation(txtdata[0], "list")

    stopfile = open("/home/sooda/nlp/Review-Helpfulness-Prediction/data/stopword.txt", "r")
    stopdata1 = stopfile.readlines()
    stopdata2 = "".join(stopdata1)
    stopwords = stopdata2.decode("utf8").split("\n")
    stopfile.close()

    seg_fil_result = []
    for review in review_data:
        fil = [word for word in review if word not in stopwords and word != " "]
        seg_fil_result.append(fil)
        fil = []

    fil_file = open(storepath, "w")
    for word in seg_fil_result:
        # each entry is a list of words, so join it before encoding
        fil_file.write("".join(word).encode("utf8") + " ")
    fil_file.close()
def seg_filter_txt(filepath, storepath):
    txtfile = open(filepath, 'r')
    txtdata = txtfile.readlines()
    txtfile.close()
    review_data = tp.segmentation(txtdata[0], 'list')

    stopfile = open('D:/code/seg_fil_test/stopword.txt', 'r')
    stopdata1 = stopfile.readlines()
    stopdata2 = ''.join(stopdata1)
    stopwords = stopdata2.decode('utf8').split('\n')
    stopfile.close()

    seg_fil_result = []
    for review in review_data:
        fil = [word for word in review if word not in stopwords and word != ' ']
        seg_fil_result.append(fil)
        fil = []

    fil_file = open(storepath, 'w')
    for word in seg_fil_result:
        # each entry is a list of words, so join it before encoding
        fil_file.write(''.join(word).encode('utf8') + ' ')
    fil_file.close()
def get_single_sent_count(cuted_sents):
    single_review_senti_score = []
    for sent in cuted_sents:
        seg_sent = tp.segmentation(sent, 'list')
        i = 0  # word position counter
        a = 0  # sentiment word position
        poscount = 0  # count a positive word
        negcount = 0  # count a negative word
        # match() applies degree-adverb weighting
        for word in seg_sent:
            if word in posdict:
                poscount += 1
                for w in seg_sent[a:i]:
                    poscount = match(w, poscount)
                a = i + 1
            elif word in negdict:
                negcount += 1
                for w in seg_sent[a:i]:
                    negcount = match(w, negcount)
                a = i + 1
            # Match "!" in the review, every "!" has a weight of +2
            elif word == "!".decode('utf8') or word == "!".decode('utf8'):
                for w2 in seg_sent[::-1]:
                    if w2 in posdict:
                        poscount += 2
                        break
                    elif w2 in negdict:
                        negcount += 2
                        break
            i += 1
        single_review_senti_score.append(transform_to_positive_num(poscount, negcount))
    return single_review_senti_score
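match(w, count) is the degree-adverb helper these scoring loops rely on, and it is never defined here either. The sentiment_score_list examples below apply exactly this weighting inline (mostdict/verydict/moredict/ishdict/insufficientdict/inversedict), so a likely reconstruction looks as follows; it assumes the six lexicons are already loaded as lists of unicode words (e.g. via tp.get_txt_data(..., 'lines') as in the other examples), and it applies the negation flip per word rather than by parity at the end, which is an approximation:

def match(word, count):
    # Hypothetical reconstruction: scale the running sentiment score by the
    # strength of a degree adverb, or flip its sign after a negation word.
    if word in mostdict:            # strongest intensifier
        count *= 4.0
    elif word in verydict:          # strong intensifier
        count *= 3.0
    elif word in moredict:          # mild intensifier
        count *= 2.0
    elif word in ishdict:           # weakener
        count /= 2.0
    elif word in insufficientdict:  # strong weakener
        count /= 4.0
    elif word in inversedict:       # negation word
        count *= -1.0
    return count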
posdict = tp.get_txt_data("/home/hadoop/coding/Sentiment features/Sentiment dictionary features/sentiment dictionary/positive and negative dictionary/posdict.txt", "lines")
negdict = tp.get_txt_data("/home/hadoop/coding/Sentiment features/Sentiment dictionary features/sentiment dictionary/positive and negative dictionary/negdict.txt", "lines")
stopwords = tp.get_txt_data('/home/hadoop/coding/stopword.txt', 'lines')
posdict.extend(negdict)  # merge both polarity lexicons into one subjectivity lexicon

i = 0
sen_cur = []
p_center = open("/home/hadoop/建国大业客观性.txt", 'w+')
for sig_re in sentiment_review:
    #sig_re = '挺棒'
    flag = False
    seg_list = tp.segmentation(sig_re, 'list')
    for w in seg_list:
        if w in posdict:
            sen_cur.append(sig_re)  # subjective sentence
            flag = True
            break
    if flag == False:
        seg_lists = Seg(str(sig_re))  # word segmentation
        for w in seg_lists:
            if w in posdict:
                i += 1
                sen_cur.append(sig_re)  # subjective sentence
                print w, '\t', sig_re
                flag = True
                break
def sentiment_score_list(dataset):
    cuted_data = []
    for cell in dataset:
        cuted_data.append(tp.cut_sentence_2(cell))

    count1 = []
    count2 = []
    for sents in cuted_data:  # iterate over every review
        for sent in sents:  # iterate over every clause of the review
            segtmp = tp.segmentation(sent, 'list')  # segment the clause, returned as a list
            i = 0
            a = 0
            poscount = 0
            poscount2 = 0
            poscount3 = 0
            negcount = 0
            negcount2 = 0
            negcount3 = 0
            for word in segtmp:
                if word in posdict:
                    poscount += 1
                    c = 0
                    for w in segtmp[a:i]:
                        if w in mostdict:
                            poscount *= 4.0
                        elif w in verydict:
                            poscount *= 3.0
                        elif w in moredict:
                            poscount *= 2.0
                        elif w in ishdict:
                            poscount /= 2.0
                        elif w in insufficientdict:
                            poscount /= 4.0
                        elif w in inversedict:
                            c += 1
                    if judgeodd(c) == 'odd':
                        poscount *= -1
                        poscount2 += poscount
                        poscount = 0
                        poscount3 = poscount + poscount2 + poscount3
                        poscount2 = 0
                    else:
                        poscount3 = poscount + poscount2 + poscount3
                        poscount = 0
                    a = i + 1
                elif word in negdict:
                    negcount += 1
                    d = 0
                    for w in segtmp[a:i]:
                        if w in mostdict:
                            negcount *= 4.0
                        elif w in verydict:
                            negcount *= 3.0
                        elif w in moredict:
                            negcount *= 2.0
                        elif w in ishdict:
                            negcount /= 2.0
                        elif w in insufficientdict:
                            negcount /= 4.0
                        elif w in inversedict:
                            d += 1
                    if judgeodd(d) == 'odd':
                        negcount *= -1.0
                        negcount2 += negcount
                        negcount = 0
                        negcount3 = negcount + negcount2 + negcount3
                        negcount2 = 0
                    else:
                        negcount3 = negcount + negcount2 + negcount3
                        negcount = 0
                    a = i + 1
                elif word == '!'.decode('utf-8') or word == '!'.decode('utf8'):
                    for w2 in segtmp[::-1]:
                        if w2 in posdict or w2 in negdict:
                            poscount3 += 2
                            negcount3 += 2
                            break
                i += 1
            # guard against negative final counts
            pos_count = 0
            neg_count = 0
            if poscount3 < 0 and negcount3 > 0:
                neg_count += negcount3 - poscount3
                pos_count = 0
            elif negcount3 < 0 and poscount3 > 0:
                pos_count = poscount3 - negcount3
                neg_count = 0
            elif negcount3 < 0 and poscount3 < 0:
                neg_count = -poscount3
                pos_count = -negcount3
            else:
                pos_count = poscount3
                neg_count = negcount3
            count1.append([pos_count, neg_count])
        count2.append(count1)
        count1 = []
    return count2
#neg_review = pickle.load(open(pos_neg_review_path+'/'+'neg_review'+'.pkl', 'r'))
"""
dimension = ['500','1000','1500','2000','2500','3000']
for d in dimension:
    word_scores = create_word_scores_bigram()
    best_words = find_best_words(word_scores, int(d))
    posFeatures = pos_features(best_word_features)
    negFeatures = neg_features(best_word_features)

    train = posFeatures[174:]+negFeatures[174:]
    devtest = posFeatures[124:174]+negFeatures[124:174]
    test = posFeatures[:124]+negFeatures[:124]
    dev, tag_dev = zip(*devtest)

    print 'Feature number %f' %d
    print 'BernoulliNB`s accuracy is %f' %score(BernoulliNB())
    print 'MultinomialNB`s accuracy is %f' %score(MultinomialNB())
    print 'LogisticRegression`s accuracy is %f' %score(LogisticRegression())
    print 'SVC`s accuracy is %f' %score(SVC())
    print 'LinearSVC`s accuracy is %f' %score(LinearSVC())
    print 'NuSVC`s accuracy is %f' %score(NuSVC())
"""

file = '/Users/genghaiyang/git/sina_weibo_crawler/data/data10/smallS_pos_neg.csv'
blog = pd.read_csv(file, header='infer', sep=',')
#pos_review = [tp.segmentation(blog) for blog in blog['mc'][blog.value=='p']]
neg_review = [tp.segmentation(blog) for blog in blog['mc'][blog.value == 'n']]

pos_neg_review_path = '/Users/genghaiyang/ghy_works/projects/weibo_crawler/textmining/sentiML/pos_neg_blog'
#pickle.dump(pos_review, open(pos_neg_review_path+'/'+'pos_review'+'.pkl','wb'))
pickle.dump(neg_review, open(pos_neg_review_path + '/' + 'neg_review' + '.pkl', 'wb'))
""" dimension = ['500','1000','1500','2000','2500','3000'] for d in dimension: word_scores = create_word_scores_bigram() best_words = find_best_words(word_scores, int(d)) posFeatures = pos_features(best_word_features) negFeatures = neg_features(best_word_features) train = posFeatures[174:]+negFeatures[174:] devtest = posFeatures[124:174]+negFeatures[124:174] test = posFeatures[:124]+negFeatures[:124] dev, tag_dev = zip(*devtest) print 'Feature number %f' %d print 'BernoulliNB`s accuracy is %f' %score(BernoulliNB()) print 'MultinomiaNB`s accuracy is %f' %score(MultinomialNB()) print 'LogisticRegression`s accuracy is %f' %score(LogisticRegression()) print 'SVC`s accuracy is %f' %score(SVC()) print 'LinearSVC`s accuracy is %f' %score(LinearSVC()) print 'NuSVC`s accuracy is %f' %score(NuSVC()) """ file = '/Users/genghaiyang/git/sina_weibo_crawler/data/data10/smallS_pos_neg.csv' blog = pd.read_csv(file, header='infer', sep=',') #pos_review = [tp.segmentation(blog) for blog in blog['mc'][blog.value=='p']] neg_review = [tp.segmentation(blog) for blog in blog['mc'][blog.value == 'n']] pos_neg_review_path = '/Users/genghaiyang/ghy_works/projects/weibo_crawler/textmining/sentiML/pos_neg_blog' #pickle.dump(pos_review, open(pos_neg_review_path+'/'+'pos_review'+'.pkl','wb')) pickle.dump(neg_review, open(pos_neg_review_path + '/' + 'neg_review' + '.pkl', 'wb'))
def sentiment_score_list(oneblog):
    cuted_data = []
    for sen in tp.cut_sentence(oneblog):
        #print sen
        cuted_data.append(sen)

    count1 = []
    count2 = []
    for sent in cuted_data:  # iterate over every clause of the blog
        segtmp = tp.segmentation(sent)  # segment the clause into a word list
        #segtmp = list(set(segtmp))  # dedupe; otherwise a repeated sentiment word is counted more than once
        #print segtmp
        i = 0  # position of the word being scanned
        a = 0  # position of the last sentiment word
        poscount = 0  # initial positive-word score
        poscount2 = 0  # positive score after negation handling
        poscount3 = 0  # final positive score (including "!" weight)
        negcount = 0
        negcount2 = 0
        negcount3 = 0
        for word in segtmp:
            if word in posdict:  # is it a positive sentiment word?
                poscount += 1
                c = 0
                for w in segtmp[a:i]:  # scan the degree adverbs before the sentiment word
                    if w in mostdict:
                        poscount *= 4.0
                    elif w in verydict:
                        poscount *= 3.0
                    elif w in moredict:
                        poscount *= 2.0
                    elif w in ishdict:
                        poscount /= 2.0
                    elif w in insufficientdict:
                        poscount /= 4.0
                    elif w in inversedict:
                        c += 1
                if judgeodd(c) == 'odd':  # an odd number of negation words flips the polarity
                    poscount *= -1.0
                    poscount2 += poscount
                    poscount = 0
                    poscount3 = poscount + poscount2 + poscount3
                    poscount2 = 0
                else:
                    poscount3 = poscount + poscount2 + poscount3
                    poscount = 0
                a = i + 1  # update the sentiment word position
            elif word in negdict:  # negative sentiment, handled like the positive case
                negcount += 1
                d = 0
                for w in segtmp[a:i]:
                    if w in mostdict:
                        negcount *= 4.0
                    elif w in verydict:
                        negcount *= 3.0
                    elif w in moredict:
                        negcount *= 2.0
                    elif w in ishdict:
                        negcount /= 2.0
                    elif w in insufficientdict:
                        negcount /= 4.0
                    elif w in inversedict:
                        d += 1
                if judgeodd(d) == 'odd':
                    negcount *= -1.0
                    negcount2 += negcount
                    negcount = 0
                    negcount3 = negcount + negcount2 + negcount3
                    negcount2 = 0
                else:
                    negcount3 = negcount + negcount2 + negcount3
                    negcount = 0
                a = i + 1
            elif word == '!'.decode('utf8') or word == '!'.decode('utf8'):  # does the clause end with "!"?
                for w2 in segtmp[::-1]:  # scan backwards for a sentiment word before the "!", add 2 to its weight, then stop
                    if w2 in posdict or w2 in negdict:
                        poscount3 += 2
                        negcount3 += 2
                        break
            i += 1  # advance the scan position
        # guard against negative final counts
        pos_count = 0
        neg_count = 0
        if poscount3 < 0 and negcount3 > 0:
            neg_count += negcount3 - poscount3
            pos_count = 0
        elif negcount3 < 0 and poscount3 > 0:
            pos_count = poscount3 - negcount3
            neg_count = 0
        elif poscount3 < 0 and negcount3 < 0:
            neg_count = -poscount3
            pos_count = -negcount3
        else:
            pos_count = poscount3
            neg_count = negcount3
        count1.append([pos_count, neg_count])
    count2.append(count1)
    count1 = []
    return count2
def sentiment_score_list(oneblog):
    cuted_data = []
    for sen in tp.cut_sentence(oneblog):
        #print sen
        cuted_data.append(sen)

    count1 = []
    count2 = []
    for sent in cuted_data:  # iterate over every clause of the blog
        segtmp = tp.segmentation(sent)  # segment the clause into a word list
        #segtmp = list(set(segtmp))  # dedupe; otherwise a repeated sentiment word is counted more than once
        #print segtmp
        i = 0  # position of the word being scanned
        a = 0  # position of the last sentiment word
        poscount = 0  # initial positive-word score
        poscount2 = 0  # positive score after negation handling
        poscount3 = 0  # final positive score (including "!" weight)
        negcount = 0
        negcount2 = 0
        negcount3 = 0
        for word in segtmp:
            if word in posdict:  # is it a positive sentiment word?
                poscount += 1
                c = 0
                for w in segtmp[a:i]:  # scan the degree adverbs before the sentiment word
                    if w in mostdict:
                        poscount *= 4.0
                    elif w in verydict:
                        poscount *= 3.0
                    elif w in moredict:
                        poscount *= 2.0
                    elif w in ishdict:
                        poscount /= 2.0
                    elif w in insufficientdict:
                        poscount /= 4.0
                    elif w in inversedict:
                        c += 1
                if judgeodd(c) == 'odd':  # an odd number of negation words flips the polarity
                    poscount *= -1.0
                    poscount2 += poscount
                    poscount = 0
                    poscount3 = poscount + poscount2 + poscount3
                    poscount2 = 0
                else:
                    poscount3 = poscount + poscount2 + poscount3
                    poscount = 0
                a = i + 1  # update the sentiment word position
            elif word in negdict:  # negative sentiment, handled like the positive case
                negcount += 1
                d = 0
                for w in segtmp[a:i]:
                    if w in mostdict:
                        negcount *= 4.0
                    elif w in verydict:
                        negcount *= 3.0
                    elif w in moredict:
                        negcount *= 2.0
                    elif w in ishdict:
                        negcount /= 2.0
                    elif w in insufficientdict:
                        negcount /= 4.0
                    elif w in inversedict:
                        d += 1
                if judgeodd(d) == 'odd':
                    negcount *= -1.0
                    negcount2 += negcount
                    negcount = 0
                    negcount3 = negcount + negcount2 + negcount3
                    negcount2 = 0
                else:
                    negcount3 = negcount + negcount2 + negcount3
                    negcount = 0
                a = i + 1
            elif word == '!'.decode('utf8') or word == '!'.decode('utf8'):  # does the clause end with "!"?
                for w2 in segtmp[::-1]:  # scan backwards for a sentiment word before the "!", add 2 to its weight, then stop
                    if w2 in posdict or w2 in negdict:
                        poscount3 += 2
                        negcount3 += 2
                        break
            i += 1  # advance the scan position
        # guard against negative final counts
        pos_count = 0
        neg_count = 0
        if poscount3 < 0 and negcount3 > 0:
            neg_count += negcount3 - poscount3
            pos_count = 0
        elif negcount3 < 0 and poscount3 > 0:
            pos_count = poscount3 - negcount3
            neg_count = 0
        elif poscount3 < 0 and negcount3 < 0:
            neg_count = -poscount3
            pos_count = -negcount3
        else:
            pos_count = poscount3
            neg_count = negcount3
        count1.append([pos_count, neg_count])
    count2.append(count1)
    count1 = []
    return count2
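judgeodd is the one remaining helper these two functions need that is not shown; the comparison judgeodd(c) == 'odd' pins down its contract. A minimal sketch, returning the string 'odd' to match that comparison:

def judgeodd(num):
    # Return 'odd' when num is odd, 'even' otherwise, so an odd number
    # of negation words flips the sentiment polarity above.
    if num % 2 == 0:
        return 'even'
    else:
        return 'odd'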