def unionFewLabelData(labelDataDir,speNameList,dstDataDir):
	begin=time.clock()
	dataTypeList=['subObjLabelData.xls','posNegLabelData.xls','eroNorLabelData.xls']
	sheetNameList=[['subjective_data','objective_data'],['postive_data','negtive_data'],['erotic_data','normal_data']]
	posNegDataNum=[]
	for dataTypePos in range(len(dataTypeList)):
		posDataList=[]
		negDataList=[]
		for name in speNameList:
			labelDataPath=labelDataDir+'/'+name+dataTypeList[dataTypePos]
			print labelDataPath
			curPosData=tp.get_excel_data(labelDataPath,1,1,'data')
			curNegData=tp.get_excel_data(labelDataPath,2,1,'data')
			print len(curPosData),len(curNegData)
			for x in curPosData:
				posDataList.append(x)
			for x in curNegData:
				negDataList.append(x)
		workbook=xlwt.Workbook(encoding='utf-8')
		sheetNameOne=workbook.add_sheet(sheetNameList[dataTypePos][0])
		sheetNameTwo=workbook.add_sheet(sheetNameList[dataTypePos][1])
		print len(posDataList),len(negDataList)
		posNegDataNum.append(len(posDataList))
		posNegDataNum.append(len(negDataList))
		for rowPos in range(len(posDataList)):
			sheetNameOne.write(rowPos,0,posDataList[rowPos])
		for rowPos in range(len(negDataList)):
			sheetNameTwo.write(rowPos,0,negDataList[rowPos])
		workbook.save(dstDataDir+'/'+dataTypeList[dataTypePos])
	end=time.clock()
	print 'union label data time is:',end-begin
	return posNegDataNum
Exemple #2
0
def testLabelDataAcc():
    begin = time.clock()
    '''获得原始数据路径'''
    reviewDataSetDir = 'D:/ReviewHelpfulnessPrediction\LabelReviewData'
    reviewDataSetName = 'posNegLabelData'
    reviewDataSetFileType = '.xls'
    dataSetPath = reviewDataSetDir + '/' + reviewDataSetName + reviewDataSetFileType
    '''获得目标数据路径'''
    dstSavePath = reviewDataSetDir + '/' + reviewDataSetName + 'BasedDictSentimentScore.txt'
    '''获得原始数据'''
    posreview = tp.get_excel_data(dataSetPath, 1, 1, "data")
    negreview = tp.get_excel_data(dataSetPath, 2, 1, "data")
    review = posreview + negreview
    '''得到每句评论[[PosSum, NegSum],[],]'''
    sentiment_score_list = get_review_set_sentiement_score(review)
    '''得到每句评论的整体得分'''
    sentiment_overall_score = get_sentiment_overall_score_to_txt(
        sentiment_score_list, review, dstSavePath)
    labelClass = []
    for pos in range(len(posreview)):
        labelClass.append(1)
    for pos in range(len(negreview)):
        labelClass.append(0)
    # for pos in range(len(sentiment_overall_score)):
    # 	print sentiment_score_list[pos],sentiment_overall_score[pos],labelClass[pos]
    print 'sentiment Analyze Based Dictionary Accuracy:', getAccuracy(
        sentiment_overall_score, labelClass), 'data item num:', len(review)
def word_by_word_review(filepath, sheetnum, colnum):
    '''Read reviews from one excel column, segment each review, drop
    stopwords and return the remaining words as a single flat list.'''
    # Read the cells once and truncate to the reported row count
    # (the original re-read the workbook inside the same expression)
    cells = tp.get_excel_data(filepath, sheetnum, colnum, 'data')
    rownum = tp.get_excel_data(filepath, sheetnum, colnum, 'rownum')
    review_data = [tp.segmentation(cell, 'list') for cell in cells[0:rownum]]

    # Stopword list; a set gives O(1) membership tests in the filter below
    stopwords = set(tp.get_txt_data('D:/ReviewHelpfulnessPrediction\PreprocessingModule/stopword.txt', 'lines'))

    # Filter stopwords (and bare spaces) out of every segmented review
    seg_fil_result = [[word for word in review if word not in stopwords and word != ' ']
                      for review in review_data]

    # Return the review set as a one-dimensional word list
    return list(itertools.chain(*seg_fil_result))
def read_review_set_and_store_score(dataSetDir, dataSetName, dataSetFileType,
                                    sheetNum, colNum, dstDir):
    '''Score one excel review column with the sentiment dictionary and write
    the features to <dstDir>/<dataSetName>SentiDictFea.txt.

    Returns (result of store_sentiment_dictionary_score, elapsed seconds).
    NOTE(review): a 4-argument function of the same name appears later in
    this file and shadows this definition at import time.
    '''
    start = time.clock()
    srcPath = dataSetDir + '/' + dataSetName + dataSetFileType
    feaPath = dstDir + '/' + dataSetName + 'SentiDictFea.txt'
    reviews = tp.get_excel_data(srcPath, sheetNum, colNum, "data")
    res = store_sentiment_dictionary_score(reviews, feaPath)
    elapsed = time.clock() - start
    return res, elapsed
def predictDataSentimentPro(reviewDataSetDir,reviewDataSetName,reviewDataSetFileType,sheetNum,colNum,desDir):
    '''Classify unlabelled excel review data with the stored best classifier.

    Writes four text files under desDir (predicted class tags, pos/neg
    probabilities, the raw review text and the extracted features) and
    returns (reviewCount, elapsed seconds).
    '''
    reviewDataSetPath=reviewDataSetDir+'/'+reviewDataSetName+reviewDataSetFileType
    oriDataPath=desDir+'/'+reviewDataSetName+'OriData.txt'
    oriDataFeaPath = desDir + '/' + reviewDataSetName + 'OriFea.txt'
    preResStorePath=desDir+'/'+reviewDataSetName+'ClassPro.txt'
    preTagStorePath=desDir+'/'+reviewDataSetName+'ClassTag.txt'
    start=time.clock()
    # load the data to classify
    review = tp.get_excel_data(reviewDataSetPath, sheetNum, colNum, "data")
    # segment the data and remove stopwords
    sentiment_review = tp.seg_fil_senti_excel(reviewDataSetPath, sheetNum, colNum, 'D:/ReviewHelpfulnessPrediction/PreprocessingModule/sentiment_stopword.txt')
    # extract classification features
    review_feature = extract_features(sentiment_review, best_words)
    classifierPath='D:/ReviewHelpfulnessPrediction\BuildedClassifier/'+str(best_classifier)[0:15]+'.pkl'
    # load the pickled classifier (trusted local file; pickle is unsafe on
    # untrusted input)
    clf = pickle.load(open(classifierPath))
    # predicted class tag per review; "with" guarantees every output file is
    # closed even when a write fails (the original leaked on exceptions)
    data_tag=clf.batch_classify(review_feature)
    with open(preTagStorePath, 'w') as p_file:
        for tag in data_tag:
            p_file.write(str(tag) + '\n')
    # pos/neg probability distribution per review
    pred = clf.batch_prob_classify(review_feature)
    reviewCount = 0
    with open(preResStorePath, 'w') as p_file:
        for dist in pred:
            reviewCount += 1
            p_file.write(str(dist.prob('pos')) + '\t' + str(dist.prob('neg')) + '\n')
    # raw review text
    with open(oriDataPath, 'w') as p_file:
        for d in review:
            p_file.write(d.encode('utf-8')+'\n')
    # extracted features; bigram features are tuples and joined with '_'
    with open(oriDataFeaPath, 'w') as p_file:
        for d in review_feature:
            for w, b in d.iteritems():
                # isinstance replaces the fragile "type(w) is types.TupleType"
                if not isinstance(w, tuple):
                    p_file.write(w.encode('utf-8') +'\t')
                else:
                    for x in w:
                        p_file.write(x.encode('utf-8') + '_')
            p_file.write('\n')
    end=time.clock()
    return reviewCount,end-start
def predictExcelDataSentTagProToExcel(reviewDataSetDir,reviewDataSetName,reviewDataSetFileType,sheetNum,colNum,desDir):
    reviewDataSetPath=reviewDataSetDir+'/'+reviewDataSetName+reviewDataSetFileType
    preDataResPath=desDir+'/'+reviewDataSetName+'RawDataTagProFea.xls'
    start=time.clock()
    review = tp.get_excel_data(reviewDataSetPath, sheetNum, colNum, "data")# 读取待分类数据
    #将待分类数据进行分词以及去停用词处理
    sentiment_review = tp.seg_fil_senti_excel(reviewDataSetPath, sheetNum, colNum, 'D:/ReviewHelpfulnessPrediction/PreprocessingModule/sentiment_stopword.txt')
    #提取待分类数据特征
    review_feature = extract_features(sentiment_review, best_words)
    #classifierPath = 'D:/ReviewHelpfulnessPrediction\FeatureExtractionModule\SentimentFeature\MachineLearningFeature/sentiment_classifier.pkl'
    classifierPath = 'D:/ReviewHelpfulnessPrediction\BuildedClassifier/' + str(best_classifier)[0:15] + '.pkl'
    #装载分类器
    clf = pickle.load(open(classifierPath))
    dataItemCount=len(sentiment_review)
    #分类之预测数据类标签
    data_tag=clf.batch_classify(review_feature)
    #分类之预测数据积极、消极可能性
    res_pro = clf.batch_prob_classify(review_feature)
    # 记录分类结果 积极可能性 消极可能性
    # 记录原始数据
    # 记录原始数据特征提取结果
    # for d in review_feature:
    #     for w,b,in d.iteritems():
    #         p_file.write(w.encode('utf-8') + ' '+str(b)+'\t')
    #     p_file.write('\n')
    # p_file.close()
    preResFile=xlwt.Workbook(encoding='utf-8')
    preResSheet=preResFile.add_sheet('RawDataTagProFea')
    posProbility=[]
    for rowPos in range(dataItemCount):
        preResSheet.write(rowPos,0,review[rowPos])#原始数据
        preResSheet.write(rowPos,1,data_tag[rowPos])#类标签
        preResSheet.write(rowPos,2,str(res_pro[rowPos].prob('pos')))#积极概率
        posProbility.append(res_pro[rowPos].prob('pos'))
        preResSheet.write(rowPos, 3, str(res_pro[rowPos].prob('neg')))#消极概率
        feature=''
        #feature='_'.join(review_feature[rowPos].keys())
       # print type(review_feature[rowPos].keys()),
        # 特征里面可能出现二元词的情况
        for x in review_feature[rowPos].keys():
            if type(x) is not nltk.types.TupleType:
                feature+=x
            else:
                feature+='_'.join(x)
            feature+=' '
        preResSheet.write(rowPos, 4, feature)#特征
    preResFile.save(preDataResPath)
    end=time.clock()
    print 'handle sentences num:', dataItemCount, ' classify time:', end-start
    return posProbility,preDataResPath,review
def extractFeaPreUnlabelExcelData(rawDataPath,sheetNum,colNum,preResPath):
    begin=time.clock()
    '''获取原始数据列表'''
    unlabedRawData = tp.get_excel_data(rawDataPath, sheetNum, colNum, 'data')
    '''获取经分词及去停用词处理后的数据列表'''
    unlabedSegFiltData = tp.seg_fil_excel(rawDataPath, sheetNum, colNum)
    '''提取数据特征'''
    dataAllFea = extractAllFea(unlabedRawData, unlabedSegFiltData)
    '''读取最佳分类器(最佳分类器名字位于D:/ReviewHelpfulnessPrediction\BuildedClassifier/bestClassifierAcc.txt里面)'''
    bestClassifier = read_best_classifier()
    print bestClassifier
    '''装载分类器,预测分类结果'''
    loadClassifierPreRes(bestClassifier, unlabedRawData, dataAllFea, preResPath)
    end=time.clock()
    print 'extract feature and predict data time is:',end-begin,'handle data item num is:',len(unlabedRawData)
def store_adj_adv_v_num_feature(dataSetDir,dataSetName,dataSetFileType,sheetNum,colNum,dstDir):
    '''Count adjective/adverb features for every review and store them as
    tab-separated triples in <dstDir>/<dataSetName>AdjAdvVFea.txt.

    Returns (number of reviews written, elapsed seconds).
    '''
    start=time.clock()
    filepath=dataSetDir+'/'+dataSetName+dataSetFileType
    storepath=dstDir+'/'+dataSetName+'AdjAdvVFea.txt'
    data = tp.get_excel_data(filepath,sheetNum,colNum,'data')
    adj_adv_num = count_adj_adv(data)

    reviewCount=0
    # "with" guarantees the file is closed even if a write fails
    with open(storepath,'w') as f:
        for i in adj_adv_num:
            f.write(str(i[0])+'\t'+str(i[1])+'\t'+str(i[2])+'\n')
            reviewCount+=1
    end=time.clock()
    return reviewCount,end-start
def store_word_sent_num_features(dataSetDir, dataSetName, dataSetFileType,
                                 sheetNum, colNum, dstDir):
    '''Count word/sentence-number features for every review and store them as
    tab-separated triples in <dstDir>/<dataSetName>WordSentNumFea.txt.

    Returns (number of reviews written, elapsed seconds).
    '''
    start = time.clock()
    filepath = dataSetDir + '/' + dataSetName + dataSetFileType
    storepath = dstDir + '/' + dataSetName + 'WordSentNumFea.txt'
    data = tp.get_excel_data(filepath, sheetNum, colNum, 'data')
    word_sent_num = word_sent_count(data)  # Need initiallized

    reviewNum = 0
    # "with" guarantees the file is closed even if a write fails
    with open(storepath, 'w') as f:
        for i in word_sent_num:
            f.write(str(i[0]) + '\t' + str(i[1]) + '\t' + str(i[2]) + '\n')
            reviewNum += 1
    end = time.clock()
    return reviewNum, end - start
def storeReviewSenValue(dataSetDir,dataSetName,dataSetFileType,sheetNum,colNum,dstDir):
    '''Score every review with SnowNLP and write one sentiment value per
    line to <dstDir>/<dataSetName>SnowNLPSentiment.txt.

    Returns (number of reviews written, elapsed seconds).
    '''
    start=time.clock()
    dataSetPath = dataSetDir + '/' + dataSetName + dataSetFileType
    dstPath = dstDir + '/' + dataSetName + 'SnowNLPSentiment.txt'
    reviewSet=tp.get_excel_data(dataSetPath,sheetNum,colNum,'data')
    # SnowNLP .sentiments score per review (comprehension replaces the
    # original append loop)
    reviewSentiment=[SnowNLP(review).sentiments for review in reviewSet]
    reviewNum=0
    # "with" guarantees the file is closed even if a write fails
    with open(dstPath,'w') as f:
        for x in reviewSentiment:
            f.write(str(x)+'\n')
            reviewNum+=1
    end=time.clock()
    return reviewNum,end-start
    'lines')
# Degree-adverb dictionaries used to re-weight sentiment scores in match().
# NOTE(review): mostdict/verydict are referenced later but their loading
# statements are cut off above this block - confirm they exist upstream.
# Moderate intensifiers ("more" class)
moredict = tp.get_txt_data(
    'D:/ReviewHelpfulnessPrediction\FeatureExtractionModule\SentimentFeature\SentimentDictionaryFeatures\SentimentDictionary\AdverbsOfDegreeDictionary/more.txt',
    'lines')
# Mild/"-ish" degree adverbs
ishdict = tp.get_txt_data(
    'D:/ReviewHelpfulnessPrediction\FeatureExtractionModule\SentimentFeature\SentimentDictionaryFeatures\SentimentDictionary\AdverbsOfDegreeDictionary/ish.txt',
    'lines')
# Weakening ("insufficiently") degree adverbs
insufficientdict = tp.get_txt_data(
    'D:/ReviewHelpfulnessPrediction\FeatureExtractionModule\SentimentFeature\SentimentDictionaryFeatures\SentimentDictionary\AdverbsOfDegreeDictionary/insufficiently.txt',
    'lines')
# Negation words (polarity inverters)
inversedict = tp.get_txt_data(
    'D:/ReviewHelpfulnessPrediction\FeatureExtractionModule\SentimentFeature\SentimentDictionaryFeatures\SentimentDictionary\AdverbsOfDegreeDictionary/inverse.txt',
    'lines')
'''导入数据集'''
# Load the data set: full review column (sheet 1, column 4) of the workbook
review = tp.get_excel_data(
    "D:/ReviewHelpfulnessPrediction/ReviewSet/HTC_Z710t_review_2013.6.5.xlsx",
    1, 4, "data")
'''2 基于字典的情感分析 基本功能'''
'''匹配程度词并设置权重'''
'''parm:word  当前情感词的前面词语 sentiment_value 当前情感词的情感值'''


def match(word, sentiment_value):
    '''Re-weight a sentiment score by the degree adverb preceding it.

    word            -- the token immediately before the current sentiment word
    sentiment_value -- base score of the current sentiment word
    Returns the (possibly re-weighted) sentiment value; returned unchanged
    when the preceding word is not a degree adverb.
    '''
    if word in mostdict:
        sentiment_value *= 2.0
    elif word in verydict:
        sentiment_value *= 1.5
    elif word in moredict:
        sentiment_value *= 1.25
    elif word in ishdict:
        sentiment_value *= 0.5
    elif word in insufficientdict:
        # weakening adverbs reduce the score; insufficientdict was loaded at
        # module level but never consulted in the original - TODO confirm 0.25
        sentiment_value *= 0.25
    elif word in inversedict:
        # negation words flip the polarity; inversedict was likewise unused
        sentiment_value *= -1.0
    # BUGFIX: the original fell off the end and always returned None
    return sentiment_value
def read_review_set_and_store_score(dataSetPath, sheetNum, colNum,
                                    scoreStorePath):
    '''Score one excel review column with the sentiment dictionary and write
    the per-review scores to scoreStorePath.

    NOTE(review): this redefines - and therefore shadows - the earlier
    6-argument read_review_set_and_store_score in this file; callers of the
    old signature will break.
    '''
    review = tp.get_excel_data(dataSetPath, sheetNum, colNum, "data")
    store_sentiment_dictionary_score(review, scoreStorePath)
Exemple #13
0
import itertools
import sklearn
import numpy
import scipy
from random import shuffle

import nltk
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
from nltk.probability import FreqDist, ConditionalFreqDist

#import sklearn

# 1. Load data
reviewDataSetPath = 'D:/ReviewHelpfulnessPrediction\ReviewSet/HTC_Z710t_review_2013.6.5.xlsx'
# Raw review column (sheet 1, column 4) of the workbook
review = tp.get_excel_data(reviewDataSetPath, 1, 4, "data")
# The same reviews after segmentation and sentiment-stopword removal
sentiment_review = tp.seg_fil_senti_excel(
    reviewDataSetPath, 1, 4,
    'D:/ReviewHelpfulnessPrediction/PreprocessingModule/sentiment_stopword.txt'
)

# 2. Feature extraction method
# Used for transform review to features, so it can calculate sentiment probability by classifier
# 计算整个语料里面每个词和双词搭配的信息量
# 以单个词语和出现频率为前5000双词作为特征
# return :
# 返回每个词以及得分
'''
return :
第五 1.64131573422
当是 4.8096346704
'''
'''训练分类器大致过程如下:'''
'''1  装载标记数据,数据预处理(分词及去停用词)'''
'''2  提取特征(程度词性个数特征、句子个数及词语数量特征、基于词典的情感得分特征、积极消极可能性特征)'''
'''3  训练分类器 '''
'''装载数据模块'''

posNegDir = 'D:/ReviewHelpfulnessPrediction\LabelReviewData'
# Segmented + stopword-filtered labelled reviews:
# sheet 1 = positive class, sheet 2 = negative class
posdata = tp.seg_fil_senti_excel(
    posNegDir + '/posNegLabelData.xls', 1, 1,
    'D:/ReviewHelpfulnessPrediction/PreprocessingModule/sentiment_stopword.txt'
)
negdata = tp.seg_fil_senti_excel(
    posNegDir + '/posNegLabelData.xls', 2, 1,
    'D:/ReviewHelpfulnessPrediction/PreprocessingModule/sentiment_stopword.txt'
)
# The same two sheets as raw, unsegmented text
posRawData = tp.get_excel_data(posNegDir + '/posNegLabelData.xls', 1, 1,
                               'data')
negRawData = tp.get_excel_data(posNegDir + '/posNegLabelData.xls', 2, 1,
                               'data')
'''特征提取模块的函数'''
'''a 提取形容词、副词、动词数量特征'''
'''返回 形容词 副词 动词 特征列表[[adjNum,advNum,vNum],[],],其中参数rawData为原始数据列表(未经分词处理)'''
'''在处理弹幕数据时,时间性能大致1s可以处理1000条数据(词性标注比较耗时 看看可否优化(tp.postagger(review, 'list')))'''


def count_adj_adv_v(rawData):
    begin = time.clock()
    adj_adv_num = []
    a = 0
    d = 0
    v = 0
    for review in rawData: