def unionFewLabelData(labelDataDir,speNameList,dstDataDir): begin=time.clock() dataTypeList=['subObjLabelData.xls','posNegLabelData.xls','eroNorLabelData.xls'] sheetNameList=[['subjective_data','objective_data'],['postive_data','negtive_data'],['erotic_data','normal_data']] posNegDataNum=[] for dataTypePos in range(len(dataTypeList)): posDataList=[] negDataList=[] for name in speNameList: labelDataPath=labelDataDir+'/'+name+dataTypeList[dataTypePos] print labelDataPath curPosData=tp.get_excel_data(labelDataPath,1,1,'data') curNegData=tp.get_excel_data(labelDataPath,2,1,'data') print len(curPosData),len(curNegData) for x in curPosData: posDataList.append(x) for x in curNegData: negDataList.append(x) workbook=xlwt.Workbook(encoding='utf-8') sheetNameOne=workbook.add_sheet(sheetNameList[dataTypePos][0]) sheetNameTwo=workbook.add_sheet(sheetNameList[dataTypePos][1]) print len(posDataList),len(negDataList) posNegDataNum.append(len(posDataList)) posNegDataNum.append(len(negDataList)) for rowPos in range(len(posDataList)): sheetNameOne.write(rowPos,0,posDataList[rowPos]) for rowPos in range(len(negDataList)): sheetNameTwo.write(rowPos,0,negDataList[rowPos]) workbook.save(dstDataDir+'/'+dataTypeList[dataTypePos]) end=time.clock() print 'union label data time is:',end-begin return posNegDataNum
def testLabelDataAcc(): begin = time.clock() '''获得原始数据路径''' reviewDataSetDir = 'D:/ReviewHelpfulnessPrediction\LabelReviewData' reviewDataSetName = 'posNegLabelData' reviewDataSetFileType = '.xls' dataSetPath = reviewDataSetDir + '/' + reviewDataSetName + reviewDataSetFileType '''获得目标数据路径''' dstSavePath = reviewDataSetDir + '/' + reviewDataSetName + 'BasedDictSentimentScore.txt' '''获得原始数据''' posreview = tp.get_excel_data(dataSetPath, 1, 1, "data") negreview = tp.get_excel_data(dataSetPath, 2, 1, "data") review = posreview + negreview '''得到每句评论[[PosSum, NegSum],[],]''' sentiment_score_list = get_review_set_sentiement_score(review) '''得到每句评论的整体得分''' sentiment_overall_score = get_sentiment_overall_score_to_txt( sentiment_score_list, review, dstSavePath) labelClass = [] for pos in range(len(posreview)): labelClass.append(1) for pos in range(len(negreview)): labelClass.append(0) # for pos in range(len(sentiment_overall_score)): # print sentiment_score_list[pos],sentiment_overall_score[pos],labelClass[pos] print 'sentiment Analyze Based Dictionary Accuracy:', getAccuracy( sentiment_overall_score, labelClass), 'data item num:', len(review)
def word_by_word_review(filepath, sheetnum, colnum):
    """Read reviews from an excel column, segment each one, drop stopwords.

    Returns all surviving words of every review flattened into a single
    one-dimensional list.
    """
    # Fetch the column and its row count once each; the original rebuilt the
    # data list twice inside a single slice expression.
    cells = tp.get_excel_data(filepath, sheetnum, colnum, 'data')
    rownum = tp.get_excel_data(filepath, sheetnum, colnum, 'rownum')
    # Segment every review into a word list.
    review_data = [tp.segmentation(cell, 'list') for cell in cells[0:rownum]]
    # Stopword list; a set makes each membership test O(1) instead of O(n).
    stopwords = set(tp.get_txt_data(
        'D:/ReviewHelpfulnessPrediction\PreprocessingModule/stopword.txt', 'lines'))
    # Filter stopwords (and bare spaces) out of every segmented review.
    # (The original also reset a dead 'fil = []' each iteration — removed.)
    seg_fil_result = [
        [word for word in review if word not in stopwords and word != ' ']
        for review in review_data
    ]
    # Return the review set as a one-dimensional list.
    return list(itertools.chain(*seg_fil_result))
def read_review_set_and_store_score(dataSetDir, dataSetName, dataSetFileType, sheetNum, colNum, dstDir):
    """Score a review set with the sentiment dictionary and persist it.

    Builds the source path from directory + name + file type, reads the
    review column from excel, and stores the dictionary-based scores as
    '<name>SentiDictFea.txt' under dstDir.

    Returns (result of store_sentiment_dictionary_score, elapsed seconds).
    """
    tic = time.clock()
    srcPath = dataSetDir + '/' + dataSetName + dataSetFileType
    feaPath = dstDir + '/' + dataSetName + 'SentiDictFea.txt'
    reviews = tp.get_excel_data(srcPath, sheetNum, colNum, "data")
    outcome = store_sentiment_dictionary_score(reviews, feaPath)
    toc = time.clock()
    return outcome, toc - tic
def predictDataSentimentPro(reviewDataSetDir, reviewDataSetName, reviewDataSetFileType, sheetNum, colNum, desDir):
    """Classify an excel review set with the best stored classifier.

    Writes four files into desDir, each prefixed with the data set name:
    'ClassTag.txt' (predicted class tag per review), 'ClassPro.txt'
    (P(pos) <tab> P(neg) per review), 'OriData.txt' (raw review text,
    utf-8) and 'OriFea.txt' (extracted feature words; bigram features are
    joined with '_').

    Returns (number of classified reviews, elapsed seconds).
    """
    reviewDataSetPath = reviewDataSetDir + '/' + reviewDataSetName + reviewDataSetFileType
    oriDataPath = desDir + '/' + reviewDataSetName + 'OriData.txt'
    oriDataFeaPath = desDir + '/' + reviewDataSetName + 'OriFea.txt'
    preResStorePath = desDir + '/' + reviewDataSetName + 'ClassPro.txt'
    preTagStorePath = desDir + '/' + reviewDataSetName + 'ClassTag.txt'
    start = time.clock()
    # Raw reviews to classify.
    review = tp.get_excel_data(reviewDataSetPath, sheetNum, colNum, "data")
    # Segment + stopword-filter the reviews.
    sentiment_review = tp.seg_fil_senti_excel(
        reviewDataSetPath, sheetNum, colNum,
        'D:/ReviewHelpfulnessPrediction/PreprocessingModule/sentiment_stopword.txt')
    # Extract classifier features.
    review_feature = extract_features(sentiment_review, best_words)
    classifierPath = 'D:/ReviewHelpfulnessPrediction\BuildedClassifier/' + str(best_classifier)[0:15] + '.pkl'
    # Load the pickled classifier; 'with' closes the handle even on error
    # (the original leaked every file handle opened below).
    with open(classifierPath) as clfFile:
        clf = pickle.load(clfFile)
    # Predicted class tags.
    data_tag = clf.batch_classify(review_feature)
    with open(preTagStorePath, 'w') as p_file:
        for tag in data_tag:
            p_file.write(str(tag) + '\n')
    # Predicted pos/neg probabilities.
    pred = clf.batch_prob_classify(review_feature)
    reviewCount = 0
    with open(preResStorePath, 'w') as p_file:
        for dist in pred:
            reviewCount += 1
            p_file.write(str(dist.prob('pos')) + '\t' + str(dist.prob('neg')) + '\n')
    # Raw data, one review per line.
    with open(oriDataPath, 'w') as p_file:
        for d in review:
            p_file.write(d.encode('utf-8') + '\n')
    # Feature words, one review per line; bigram (tuple) features are
    # written as their parts joined with '_'.
    with open(oriDataFeaPath, 'w') as p_file:
        for d in review_feature:
            for w, b in d.iteritems():
                if type(w) is not types.TupleType:
                    p_file.write(w.encode('utf-8') + '\t')
                else:
                    for x in w:
                        p_file.write(x.encode('utf-8') + '_')
            p_file.write('\n')
    end = time.clock()
    return reviewCount, end - start
def predictExcelDataSentTagProToExcel(reviewDataSetDir,reviewDataSetName,reviewDataSetFileType,sheetNum,colNum,desDir): reviewDataSetPath=reviewDataSetDir+'/'+reviewDataSetName+reviewDataSetFileType preDataResPath=desDir+'/'+reviewDataSetName+'RawDataTagProFea.xls' start=time.clock() review = tp.get_excel_data(reviewDataSetPath, sheetNum, colNum, "data")# 读取待分类数据 #将待分类数据进行分词以及去停用词处理 sentiment_review = tp.seg_fil_senti_excel(reviewDataSetPath, sheetNum, colNum, 'D:/ReviewHelpfulnessPrediction/PreprocessingModule/sentiment_stopword.txt') #提取待分类数据特征 review_feature = extract_features(sentiment_review, best_words) #classifierPath = 'D:/ReviewHelpfulnessPrediction\FeatureExtractionModule\SentimentFeature\MachineLearningFeature/sentiment_classifier.pkl' classifierPath = 'D:/ReviewHelpfulnessPrediction\BuildedClassifier/' + str(best_classifier)[0:15] + '.pkl' #装载分类器 clf = pickle.load(open(classifierPath)) dataItemCount=len(sentiment_review) #分类之预测数据类标签 data_tag=clf.batch_classify(review_feature) #分类之预测数据积极、消极可能性 res_pro = clf.batch_prob_classify(review_feature) # 记录分类结果 积极可能性 消极可能性 # 记录原始数据 # 记录原始数据特征提取结果 # for d in review_feature: # for w,b,in d.iteritems(): # p_file.write(w.encode('utf-8') + ' '+str(b)+'\t') # p_file.write('\n') # p_file.close() preResFile=xlwt.Workbook(encoding='utf-8') preResSheet=preResFile.add_sheet('RawDataTagProFea') posProbility=[] for rowPos in range(dataItemCount): preResSheet.write(rowPos,0,review[rowPos])#原始数据 preResSheet.write(rowPos,1,data_tag[rowPos])#类标签 preResSheet.write(rowPos,2,str(res_pro[rowPos].prob('pos')))#积极概率 posProbility.append(res_pro[rowPos].prob('pos')) preResSheet.write(rowPos, 3, str(res_pro[rowPos].prob('neg')))#消极概率 feature='' #feature='_'.join(review_feature[rowPos].keys()) # print type(review_feature[rowPos].keys()), # 特征里面可能出现二元词的情况 for x in review_feature[rowPos].keys(): if type(x) is not nltk.types.TupleType: feature+=x else: feature+='_'.join(x) feature+=' ' preResSheet.write(rowPos, 4, feature)#特征 
preResFile.save(preDataResPath) end=time.clock() print 'handle sentences num:', dataItemCount, ' classify time:', end-start return posProbility,preDataResPath,review
def extractFeaPreUnlabelExcelData(rawDataPath,sheetNum,colNum,preResPath): begin=time.clock() '''获取原始数据列表''' unlabedRawData = tp.get_excel_data(rawDataPath, sheetNum, colNum, 'data') '''获取经分词及去停用词处理后的数据列表''' unlabedSegFiltData = tp.seg_fil_excel(rawDataPath, sheetNum, colNum) '''提取数据特征''' dataAllFea = extractAllFea(unlabedRawData, unlabedSegFiltData) '''读取最佳分类器(最佳分类器名字位于D:/ReviewHelpfulnessPrediction\BuildedClassifier/bestClassifierAcc.txt里面)''' bestClassifier = read_best_classifier() print bestClassifier '''装载分类器,预测分类结果''' loadClassifierPreRes(bestClassifier, unlabedRawData, dataAllFea, preResPath) end=time.clock() print 'extract feature and predict data time is:',end-begin,'handle data item num is:',len(unlabedRawData)
def store_adj_adv_v_num_feature(dataSetDir, dataSetName, dataSetFileType, sheetNum, colNum, dstDir):
    """Count adjective/adverb/verb features per review and store them.

    Writes one tab-separated 'adjNum advNum vNum' line per review into
    '<name>AdjAdvVFea.txt' under dstDir.

    Returns (number of reviews written, elapsed seconds).
    """
    start = time.clock()
    filepath = dataSetDir + '/' + dataSetName + dataSetFileType
    storepath = dstDir + '/' + dataSetName + 'AdjAdvVFea.txt'
    data = tp.get_excel_data(filepath, sheetNum, colNum, 'data')
    adj_adv_num = count_adj_adv(data)
    reviewCount = 0
    # 'with' closes the output even if a write raises (the original leaked
    # the handle on error).
    with open(storepath, 'w') as f:
        for i in adj_adv_num:
            f.write(str(i[0]) + '\t' + str(i[1]) + '\t' + str(i[2]) + '\n')
            reviewCount += 1
    end = time.clock()
    return reviewCount, end - start
def store_word_sent_num_features(dataSetDir, dataSetName, dataSetFileType, sheetNum, colNum, dstDir):
    """Count word/sentence number features per review and store them.

    Writes one tab-separated line of three counts per review into
    '<name>WordSentNumFea.txt' under dstDir.

    Returns (number of reviews written, elapsed seconds).
    """
    start = time.clock()
    filepath = dataSetDir + '/' + dataSetName + dataSetFileType
    storepath = dstDir + '/' + dataSetName + 'WordSentNumFea.txt'
    data = tp.get_excel_data(filepath, sheetNum, colNum, 'data')
    word_sent_num = word_sent_count(data)  # Need initiallized
    reviewNum = 0
    # 'with' closes the output even if a write raises (the original leaked
    # the handle on error).
    with open(storepath, 'w') as f:
        for i in word_sent_num:
            f.write(str(i[0]) + '\t' + str(i[1]) + '\t' + str(i[2]) + '\n')
            reviewNum += 1
    end = time.clock()
    return reviewNum, end - start
def storeReviewSenValue(dataSetDir, dataSetName, dataSetFileType, sheetNum, colNum, dstDir):
    """Score each review with SnowNLP and store the sentiment values.

    Writes one SnowNLP sentiment score per line into
    '<name>SnowNLPSentiment.txt' under dstDir.

    Returns (number of reviews written, elapsed seconds).
    """
    start = time.clock()
    dataSetPath = dataSetDir + '/' + dataSetName + dataSetFileType
    dstPath = dstDir + '/' + dataSetName + 'SnowNLPSentiment.txt'
    reviewSet = tp.get_excel_data(dataSetPath, sheetNum, colNum, 'data')
    # One SnowNLP sentiment score per review (comprehension replaces the
    # original build-by-append loop).
    reviewSentiment = [SnowNLP(review).sentiments for review in reviewSet]
    reviewNum = 0
    # 'with' closes the output even if a write raises (the original leaked
    # the handle on error).
    with open(dstPath, 'w') as f:
        for score in reviewSentiment:
            f.write(str(score) + '\n')
            reviewNum += 1
    end = time.clock()
    return reviewNum, end - start
'lines') moredict = tp.get_txt_data( 'D:/ReviewHelpfulnessPrediction\FeatureExtractionModule\SentimentFeature\SentimentDictionaryFeatures\SentimentDictionary\AdverbsOfDegreeDictionary/more.txt', 'lines') ishdict = tp.get_txt_data( 'D:/ReviewHelpfulnessPrediction\FeatureExtractionModule\SentimentFeature\SentimentDictionaryFeatures\SentimentDictionary\AdverbsOfDegreeDictionary/ish.txt', 'lines') insufficientdict = tp.get_txt_data( 'D:/ReviewHelpfulnessPrediction\FeatureExtractionModule\SentimentFeature\SentimentDictionaryFeatures\SentimentDictionary\AdverbsOfDegreeDictionary/insufficiently.txt', 'lines') inversedict = tp.get_txt_data( 'D:/ReviewHelpfulnessPrediction\FeatureExtractionModule\SentimentFeature\SentimentDictionaryFeatures\SentimentDictionary\AdverbsOfDegreeDictionary/inverse.txt', 'lines') '''导入数据集''' review = tp.get_excel_data( "D:/ReviewHelpfulnessPrediction/ReviewSet/HTC_Z710t_review_2013.6.5.xlsx", 1, 4, "data") '''2 基于字典的情感分析 基本功能''' '''匹配程度词并设置权重''' '''parm:word 当前情感词的前面词语 sentiment_value 当前情感词的情感值''' def match(word, sentiment_value): if word in mostdict: sentiment_value *= 2.0 elif word in verydict: sentiment_value *= 1.5 elif word in moredict: sentiment_value *= 1.25 elif word in ishdict: sentiment_value *= 0.5
def read_review_set_and_store_score(dataSetPath, sheetNum, colNum, scoreStorePath):
    """Read a review column from excel and store its dictionary sentiment scores.

    dataSetPath: path of the source excel file.
    sheetNum, colNum: sheet/column of the review text inside that file.
    scoreStorePath: destination path for store_sentiment_dictionary_score.
    """
    # NOTE(review): this name collides with the 6-argument
    # read_review_set_and_store_score defined elsewhere in this source; if
    # both live in the same module, whichever is defined later silently
    # shadows the other — confirm they belong to different modules.
    review = tp.get_excel_data(dataSetPath, sheetNum, colNum, "data") store_sentiment_dictionary_score(review, scoreStorePath)
import itertools import sklearn import numpy import scipy from random import shuffle import nltk from nltk.collocations import BigramCollocationFinder from nltk.metrics import BigramAssocMeasures from nltk.probability import FreqDist, ConditionalFreqDist #import sklearn # 1. Load data reviewDataSetPath = 'D:/ReviewHelpfulnessPrediction\ReviewSet/HTC_Z710t_review_2013.6.5.xlsx' review = tp.get_excel_data(reviewDataSetPath, 1, 4, "data") sentiment_review = tp.seg_fil_senti_excel( reviewDataSetPath, 1, 4, 'D:/ReviewHelpfulnessPrediction/PreprocessingModule/sentiment_stopword.txt' ) # 2. Feature extraction method # Used for transform review to features, so it can calculate sentiment probability by classifier # 计算整个语料里面每个词和双词搭配的信息量 # 以单个词语和出现频率为前5000双词作为特征 # return : # 返回每个词以及得分 ''' return : 第五 1.64131573422 当是 4.8096346704
''' 训练分类器大致过程如下:''' '''1 装载标记数据,数据预处理(分词及去停用词)''' '''2 提取特征(程度词性个数特征、句子个数及词语数量特征、基于词典的情感得分特征、积极消极可能性特征)''' '''3 训练分类器 ''' '''装载数据模块''' posNegDir = 'D:/ReviewHelpfulnessPrediction\LabelReviewData' posdata = tp.seg_fil_senti_excel( posNegDir + '/posNegLabelData.xls', 1, 1, 'D:/ReviewHelpfulnessPrediction/PreprocessingModule/sentiment_stopword.txt' ) negdata = tp.seg_fil_senti_excel( posNegDir + '/posNegLabelData.xls', 2, 1, 'D:/ReviewHelpfulnessPrediction/PreprocessingModule/sentiment_stopword.txt' ) posRawData = tp.get_excel_data(posNegDir + '/posNegLabelData.xls', 1, 1, 'data') negRawData = tp.get_excel_data(posNegDir + '/posNegLabelData.xls', 2, 1, 'data') '''特征提取模块的函数''' '''a 提取形容词、副词、动词数量特征''' '''返回 形容词 副词 动词 特征列表[[adjNum,advNum,vNum],[],],其中参数rawData为原始数据列表(未经分词处理)''' '''在处理弹幕数据时,时间性能大致1s可以处理1000条数据(词性标注比较耗时 看看可否优化(tp.postagger(review, 'list')))''' def count_adj_adv_v(rawData): begin = time.clock() adj_adv_num = [] a = 0 d = 0 v = 0 for review in rawData: