Ejemplo n.º 1
0
def unionDict(dict1Path,dict2Path):
    dict1=tp.get_txt_data(dict1Path,'lines')
    dict2=tp.get_txt_data(dict2Path,'lines')
    dict={}
    for x in dict1:
        dict.setdefault(x,1)
    for x in dict2:
        if dict.has_key(x)==False:
            dict.setdefault(x,1)
        else:
            dict[x]+=1
    print len(dict1),len(dict2)
    print len(dict)
    return dict
def testLabelDataAcc():
    begin = time.clock()
    '''获得原始数据路径'''
    # reviewDataSetDir = 'D:/ReviewHelpfulnessPrediction\LabelReviewData'
    # reviewDataSetName = 'posNegLabelData'
    # reviewDataSetFileType = '.xls'
    #dataSetPath = reviewDataSetDir + '/' + reviewDataSetName + reviewDataSetFileType
    dataSetPath = tp.get_txt_data(
        'D:/ReviewHelpfulnessPrediction/LabelDataPath.txt', 'line')
    reviewDataSetDir, reviewDataSetName, reviewDataSetFileType = parseFilePath(
        dataSetPath)
    '''获得目标数据路径'''
    dstSavePath = reviewDataSetDir + '/' + reviewDataSetName + 'BasedDictSentimentScore.txt'
    '''获得原始数据'''
    posreview = tp.get_excel_data(dataSetPath, 1, 1, "data")
    negreview = tp.get_excel_data(dataSetPath, 2, 1, "data")
    review = posreview + negreview
    '''得到每句评论[[PosSum, NegSum],[],]'''
    sentiment_score_list = get_review_set_sentiement_score(review)
    '''得到每句评论的整体得分'''
    sentiment_overall_score = get_sentiment_overall_score_to_txt(
        sentiment_score_list, review, dstSavePath)
    labelClass = []
    for pos in range(len(posreview)):
        labelClass.append(1)
    for pos in range(len(negreview)):
        labelClass.append(0)
    # for pos in range(len(sentiment_overall_score)):
    # 	print sentiment_score_list[pos],sentiment_overall_score[pos],labelClass[pos]
    finalAcc = getAccuracy(sentiment_overall_score, labelClass)
    print 'sentiment Analyze Based Dictionary Accuracy:', finalAcc, 'data item num:', len(
        review)
    return finalAcc, len(review)
def remove_duplicate_comment(srcpath, para, excelpath):
    begin = time.clock()
    raw_data = tp.get_txt_data(srcpath, para)
    review_diff_set = {}
    pre_count = len(raw_data)
    cur_count = 0
    for x in raw_data:
        if review_diff_set.has_key(x) == False:
            review_diff_set[x] = 1
            cur_count += 1
        else:
            review_diff_set[x] += 1
    excel_file = xlwt.Workbook(encoding='utf-8')
    sheet_name = 'label_data'
    sheet_pos = 1
    excel_sheet = excel_file.add_sheet(sheet_name + str(sheet_pos))
    row_pos = 0
    excel_sheet.write(row_pos, 0, 'review_data')
    excel_sheet.write(row_pos, 1, 'review_count')
    row_pos += 1
    for w, c in review_diff_set.iteritems():
        if row_pos == 65536:
            sheet_pos += 1
            excel_sheet = excel_file.add_sheet(sheet_name + str(sheet_pos))
            row_pos = 0
            excel_sheet.write(row_pos, 0, 'review_data')
            excel_sheet.write(row_pos, 1, 'review_count')
            row_pos += 1
        excel_sheet.write(row_pos, 0, w)
        excel_sheet.write(row_pos, 1, str(c))
        row_pos += 1
    excel_file.save(excelpath)
    end = time.clock()
    print 'remove same reviews time:', end - begin, 'handle review num:', pre_count, 'different review num:', cur_count
    return pre_count, cur_count
def change_txt_to_excel(srcpath,para,excelpath):
	begin = time.clock()
	raw_data = tp.get_txt_data(srcpath, para)
	excel_file = xlwt.Workbook(encoding='utf-8')
	sheet_name = 'label_data'
	sheet_pos = 1
	excel_sheet = excel_file.add_sheet(sheet_name + str(sheet_pos))
	row_pos = 0
	excel_sheet.write(row_pos, 0, 'review_data')
	excel_sheet.write(row_pos, 1, 'review_count')
	excel_sheet.write(row_pos, 2, 'is_subjective')
	excel_sheet.write(row_pos, 3, 'sentiment_tendency')
	excel_sheet.write(row_pos, 4, 'is_erotic')
	excel_sheet.write(row_pos, 5, 'key_words')
	row_pos += 1
	for w in raw_data:
		if row_pos == 65536:
			sheet_pos += 1
			excel_sheet = excel_file.add_sheet(sheet_name + str(sheet_pos))
			row_pos = 0
			excel_sheet.write(row_pos, 0, 'review_data')
			excel_sheet.write(row_pos, 1, 'review_count')
			row_pos += 1
		excel_sheet.write(row_pos, 0, w)
		excel_sheet.write(row_pos, 1, str(1))
		row_pos += 1
	excel_file.save(excelpath)
	end = time.clock()
	print 'remove same reviews time:', end - begin, 'handle review num:', len(raw_data)
def read_txt_review_set_and_store_score(dataSetDir, dataSetName,
                                        dataSetFileType, dstDir):
    """Load a review text file and store its dictionary sentiment features.

    Reads <dataSetDir>/<dataSetName><dataSetFileType> line by line and writes
    the scores to <dstDir>/<dataSetName>SentiDictFea.txt.

    Returns (result of store_sentiment_dictionary_score, elapsed seconds).
    """
    start = time.clock()
    src_path = '%s/%s%s' % (dataSetDir, dataSetName, dataSetFileType)
    out_path = '%s/%sSentiDictFea.txt' % (dstDir, dataSetName)
    reviews = tp.get_txt_data(src_path, "lines")
    result = store_sentiment_dictionary_score(reviews, out_path)
    return result, time.clock() - start
def filt_objective_sentence(srcpath,para,dstpath):
	begin=time.clock()
	raw_data=tp.get_txt_data(srcpath,para)
	f = open(dstpath, 'w')
	count=0
	for x in raw_data:
		if is_single_review_sentiment(x)==True:
			f.write(x.encode('utf-8') + '\n')
			count+=1
	f.close()
	end=time.clock()
	print 'filt objective reviews time:',end-begin,'handle review num:',len(raw_data),'subjective review num:',count
	return count
Ejemplo n.º 7
0
def sentiAnalyzeBaseDict(reviewDataSetName,
                         reviewDataSetFileType,
                         windowSize,
                         posBounder,
                         negBounder,
                         sentScoreBounder,
                         timeInterval=20):
    """Dictionary-based sentiment pipeline for one bullet-data file.

    Scores every review of <BulletData dir>/<name><ext> with the sentiment
    dictionary, slides a ``windowSize`` window over the scores (bounded by
    ``posBounder``/``negBounder``/``sentScoreBounder``) to locate anomalous
    passages, then produces a score text file, a sentiment-curve PNG and a
    pos/neg ratio pie chart.  ``timeInterval`` is only used by the
    commented-out animated plot.
    """
    begin = time.clock()
    # fixed input/output directories used by this pipeline
    '''获得原始数据路径'''
    reviewDataSetDir = 'D:/ReviewHelpfulnessPrediction\BulletData'
    saveResPath = 'D:/ReviewHelpfulnessPrediction/PredictClassRes'
    dataSetPath = reviewDataSetDir + '/' + reviewDataSetName + reviewDataSetFileType
    figDir = 'D:/ReviewHelpfulnessPrediction\SentimentLineFig'
    # target path for the per-review score text file
    '''获得目标数据路径'''
    dstSavePath = saveResPath + '/' + reviewDataSetName + 'BasedDictSentimentScore.txt'
    # load the raw reviews, one per line
    '''获得原始数据'''
    review = tp.get_txt_data(dataSetPath, "lines")
    # per review: [PosSum, NegSum]
    '''得到每句评论[[PosSum, NegSum],[],]'''
    sentiment_score_list = get_review_set_sentiement_score(review)
    # overall sentiment score per review (helper also gets dstSavePath)
    '''得到每句评论的整体得分'''
    sentiment_overall_score = get_sentiment_overall_score_to_txt(
        sentiment_score_list, review, dstSavePath)
    # sliding-window analysis: sentiment values, pos/neg ratios and the
    # positions of anomalous passages
    '''分析评论情感得分数据 按照窗口迭代 获得 情感值 积极比率 消极比率 异常话语位置'''
    # posBounder=0.6
    # negBounder=0.4
    sentimentValueList, posRatioList, negRatioList, strangeWordPos = analyzeSentimentProList(
        sentiment_overall_score, windowSize, posBounder, negBounder,
        sentScoreBounder)
    # merge overlapping anomalous intervals
    '''合并重叠区间'''
    finalStrangeWordPos = unionStrangeWordPos(strangeWordPos)
    # mean sentiment value over all reviews
    '''获得平均情感值'''
    meanSentPosPro = getMeanSentimentValue(sentiment_overall_score)
    print 'mean sentiment postive probility', meanSentPosPro
    overallPosRatio = getOverallPosRatio(sentiment_overall_score, posBounder)
    overallNegRatio = getOverallNegRatio(sentiment_overall_score, negBounder)
    # record the anomalous-passage positions (dstSavePath is the target)
    '''输出异常话语位置'''
    outputStrangeWordPosInTxt(finalStrangeWordPos, dstSavePath)
    # sentiment curve and pos/neg ratio pie chart
    '''绘制情感曲线图'''
    drawSentimentLine(sentimentValueList,
                      figDir + '/' + reviewDataSetName + 'SentCurveDA.png')
    drawPosNegRatioPie(overallPosRatio, overallNegRatio,
                       figDir + '/' + reviewDataSetName + 'PosNegRatioDA.png')
    # output the anomalous passages themselves
    '''输出异常话语'''
    outputStrangeWords(finalStrangeWordPos, review)
    # animated sentiment plot (disabled)
    '''绘制情感波动动态图'''
    #drawSentimentChangeLine(sentimentValueList, timeInterval, windowSize, -30, 30)
    end = time.clock()
    print 'sentiment Analyze based dict running time:', end - begin, 'handle review num:', len(
        review)
def extractFeaPreUnlabelTxtData(rawDataPath,preResPath):
    begin=time.clock()
    '''获取原始数据列表'''
    unlabedRawData = tp.get_txt_data(rawDataPath, 'lines')
    '''获取经分词及去停用词处理后的数据列表'''
    unlabedSegFiltData = tp.seg_fil_txt(rawDataPath,'lines')
    '''提取数据特征'''
    dataAllFea = extractAllFea(unlabedRawData, unlabedSegFiltData)
    '''读取最佳分类器(最佳分类器名字位于D:/ReviewHelpfulnessPrediction\BuildedClassifier/bestClassifierAcc.txt里面)'''
    bestClassifier = read_best_classifier()
    print bestClassifier
    '''装载分类器,预测分类结果'''
    loadClassifierPreRes(bestClassifier, unlabedRawData, dataAllFea, preResPath)
    end=time.clock()
    print 'extract feature and predict data time is:',end-begin,'handle data item num is:',len(unlabedRawData)
def get_all_trainset(dimension):
    """Build the full, shuffled training set from the pos+neg labelled data.

    The ``dimension`` best words are extended with the hand-picked keywords
    from PosNegKeyWords.txt before feature extraction.
    """
    best_words = find_best_words(dimension)
    # fold the manually selected keywords into the feature vocabulary
    extra_keywords = tp.get_txt_data(
        'D:/ReviewHelpfulnessPrediction\KeyWords/PosNegKeyWords.txt', 'lines')
    for keyword in extra_keywords:
        best_words.add(keyword)
    # features extracted from the positive and negative corpora
    posFeatures = pos_features(best_word_features_com, best_words)
    negFeatures = neg_features(best_word_features_com, best_words)
    # randomise the example order inside each class
    shuffle(posFeatures)
    shuffle(negFeatures)
    return posFeatures + negFeatures
def predictTxtDataSentTagProToExcel(reviewDataSetDir,reviewDataSetName,reviewDataSetFileType,desDir):
    reviewDataSetPath = reviewDataSetDir + '/' + reviewDataSetName + reviewDataSetFileType
    preDataResPath = desDir + '/' + reviewDataSetName + 'RawDataTagProFea.xls'
    start = time.clock()
    review = tp.get_txt_data(reviewDataSetPath, "lines")  # 读取待分类数据
    # 将待分类数据进行分词以及去停用词处理
    sentiment_review = tp.seg_fil_txt(reviewDataSetPath,'lines')
    # 提取待分类数据特征
    review_feature = extract_features(sentiment_review, best_words)
    # classifierPath = 'D:/ReviewHelpfulnessPrediction\FeatureExtractionModule\SentimentFeature\MachineLearningFeature/sentiment_classifier.pkl'
    classifierPath = 'D:/ReviewHelpfulnessPrediction\BuildedClassifier/' + str(best_classifier)[0:15] + '.pkl'
    # 装载分类器
    clf = pickle.load(open(classifierPath))
    dataItemCount = len(sentiment_review)
    # 分类之预测数据类标签
    data_tag = clf.batch_classify(review_feature)
    # 分类之预测数据积极、消极可能性
    res_pro = clf.batch_prob_classify(review_feature)
    preResFile = xlwt.Workbook(encoding='utf-8')
    sheetName='RawDataTagProFea'
    sheetPos=0
    preResSheet = preResFile.add_sheet(sheetName+str(sheetPos))
    posProbility = []
    excelRowPos=0
    for rowPos in range(dataItemCount):
        if excelRowPos==65536:
            sheetPos+=1
            preResSheet=preResFile.add_sheet(sheetName+str(sheetPos))
            excelRowPos=0
        preResSheet.write(excelRowPos, 0, review[rowPos])  # 原始数据
        preResSheet.write(excelRowPos, 1, data_tag[rowPos])  # 类标签
        preResSheet.write(excelRowPos, 2, str(res_pro[rowPos].prob('pos')))  # 积极概率
        posProbility.append(res_pro[rowPos].prob('pos'))
        preResSheet.write(excelRowPos, 3, str(res_pro[rowPos].prob('neg')))  # 消极概率
        feature = ''
        # 特征里面可能出现二元词的情况
        for x in review_feature[rowPos].keys():
            if type(x) is not nltk.types.TupleType:
                feature += x
            else:
                feature += '_'.join(x)
            feature += ' '
        preResSheet.write(excelRowPos, 4, feature)  # 特征
        excelRowPos+=1
    preResFile.save(preDataResPath)
    end = time.clock()
    print 'handle sentences num:', dataItemCount, ' classify time:', end - start
    return posProbility,preDataResPath,review
def predTxtDataSentPro(reviewDataSetDir,reviewDataSetName,reviewDataSetFileType,desDir):
    """Classify a review text file and write four result files to desDir.

    <name>OriData.txt   raw reviews (utf-8)
    <name>OriFea.txt    extracted features, one review per line
    <name>ClassPro.txt  P(pos) <tab> P(neg) per review
    <name>ClassTag.txt  predicted class label per review

    Returns (number of reviews classified, elapsed seconds).
    """
    reviewDataSetPath = reviewDataSetDir + '/' + reviewDataSetName + reviewDataSetFileType
    oriDataPath = desDir + '/' + reviewDataSetName + 'OriData.txt'
    oriDataFeaPath = desDir + '/' + reviewDataSetName + 'OriFea.txt'
    preResStorePath = desDir + '/' + reviewDataSetName + 'ClassPro.txt'
    preTagStorePath = desDir + '/' + reviewDataSetName + 'ClassTag.txt'
    start = time.clock()
    review = tp.get_txt_data(reviewDataSetPath, "lines")  # raw reviews
    # segmentation + stopword filtering, then feature extraction
    sentiment_review = tp.seg_fil_txt(reviewDataSetPath, 'lines')
    review_feature = extract_features(sentiment_review, best_words)
    classifierPath = 'D:/ReviewHelpfulnessPrediction\BuildedClassifier/' + str(best_classifier)[0:15] + '.pkl'
    # load the pickled classifier
    clf = pickle.load(open(classifierPath))
    # predicted class labels
    data_tag = clf.batch_classify(review_feature)
    # `with` guarantees each result file is closed even if a write fails
    with open(preTagStorePath, 'w') as p_file:
        for tag in data_tag:
            p_file.write(str(tag) + '\n')
    # positive / negative probability per review
    pred = clf.batch_prob_classify(review_feature)
    reviewCount = 0
    with open(preResStorePath, 'w') as p_file:
        for dist in pred:
            reviewCount += 1
            p_file.write(str(dist.prob('pos')) + '\t' + str(dist.prob('neg')) + '\n')
    # raw data
    with open(oriDataPath, 'w') as p_file:
        for d in review:
            p_file.write(d.encode('utf-8') + '\n')
    # extracted features, tab-separated "word value" pairs per review
    with open(oriDataFeaPath, 'w') as p_file:
        for d in review_feature:
            for w, b in d.iteritems():
                p_file.write(w.encode('utf-8') + ' ' + str(b) + '\t')
            p_file.write('\n')
    end = time.clock()
    return reviewCount, end - start
def sentiAnalyzeBaseDictUI(reviewDataSetDir,
                           reviewDataSetName,
                           reviewDataSetFileType,
                           windowSize,
                           posBounder,
                           negBounder,
                           sentScoreBounder,
                           timeInterval=20):
    """UI-oriented variant of the dictionary-based sentiment pipeline.

    Same flow as sentiAnalyzeBaseDict, but the source directory is a
    parameter and the artifact paths are returned:
    (strange-words txt, sentiment-curve png, classification-result txt).
    ``timeInterval`` is currently unused in this variant.
    """
    begin = time.clock()

    # fixed output directories
    desDir = 'D:/ReviewHelpfulnessPrediction/PredictClassRes'
    figDir = 'D:/ReviewHelpfulnessPrediction\SentimentLineFig'
    strangeWordDir = 'D:/ReviewHelpfulnessPrediction\StrangeWords'

    # input path plus the four artifact paths derived from the data set name
    rawDataSetPath = reviewDataSetDir + '/' + reviewDataSetName + reviewDataSetFileType
    strangeWordPath = strangeWordDir + '/' + reviewDataSetName + 'DA.txt'
    classifyResPath = desDir + '/' + reviewDataSetName + 'DA.txt'
    sentimentLinePath = figDir + '/' + reviewDataSetName + 'SCDA.png'
    posNegRatioPath = figDir + '/' + reviewDataSetName + 'PNRDA.png'

    review = tp.get_txt_data(rawDataSetPath, "lines")
    # per review: [PosSum, NegSum]
    '''得到每句评论[[PosSum, NegSum],[],]'''
    sentiment_score_list = get_review_set_sentiement_score(review)
    # overall score per review (helper also gets classifyResPath)
    '''得到每句评论的整体得分'''
    sentiment_overall_score = get_sentiment_overall_score_to_txt(
        sentiment_score_list, review, classifyResPath)
    # sliding-window analysis: sentiment values, pos/neg ratios and the
    # positions of anomalous passages
    '''分析评论情感得分数据 按照窗口迭代 获得 情感值 积极比率 消极比率 异常话语位置'''
    sentimentValueList, posRatioList, negRatioList, strangeWordPos = analyzeSentimentProList(
        sentiment_overall_score, windowSize, posBounder, negBounder,
        sentScoreBounder)
    # merge overlapping anomalous intervals
    finalStrangeWordPos = unionStrangeWordPos(strangeWordPos)

    #meanSentPosPro = getMeanSentimentValue(sentiment_overall_score)
    overallPosRatio = getOverallPosRatio(sentiment_overall_score, posBounder)
    overallNegRatio = getOverallNegRatio(sentiment_overall_score, negBounder)

    # plots and the strange-words text file
    drawSentimentLine(sentimentValueList, sentimentLinePath)
    drawPosNegRatioPie(overallPosRatio, overallNegRatio, posNegRatioPath)
    saveStrangeWordsToTxt(finalStrangeWordPos, review, strangeWordPath)

    end = time.clock()
    print 'sentiment Analyze based dict running time:', end - begin, 'handle review num:', len(
        review)
    return strangeWordPath, sentimentLinePath, classifyResPath
Ejemplo n.º 13
0
def storeTxtReviewSenValue(dataSetDir, dataSetName, dataSetFileType, dstDir):
    """Score every non-empty review with SnowNLP, one score per output line.

    Reads <dataSetDir>/<dataSetName><dataSetFileType> and writes the
    sentiment values to <dstDir>/<dataSetName>SnowNLPSentiment.txt.

    Returns (number of scores written, elapsed seconds).
    """
    start = time.clock()
    src_path = dataSetDir + '/' + dataSetName + dataSetFileType
    out_path = dstDir + '/' + dataSetName + 'SnowNLPSentiment.txt'
    # sentiment value for every non-empty line
    scores = [SnowNLP(line).sentiments
              for line in tp.get_txt_data(src_path, 'lines') if line != '']
    out_file = open(out_path, 'w')
    written = 0
    for value in scores:
        out_file.write(str(value) + '\n')
        written += 1
    out_file.close()
    return written, time.clock() - start
def word_by_word_review(filepath, sheetnum, colnum):
    """Read reviews from one excel column, segment them, drop stopwords and
    return the remaining words as a single flat list.
    """
    # read the column data and row count once each (the original re-read the
    # workbook inside the slice expression)
    cells = tp.get_excel_data(filepath, sheetnum, colnum, 'data')
    row_count = tp.get_excel_data(filepath, sheetnum, colnum, 'rownum')
    review_data = [tp.segmentation(cell, 'list') for cell in cells[0:row_count]]

    # a set gives O(1) membership tests while filtering (the dead
    # per-iteration `fil = []` reset is gone as well)
    stopwords = set(tp.get_txt_data(
        'D:/ReviewHelpfulnessPrediction\PreprocessingModule/stopword.txt',
        'lines'))

    seg_fil_result = [
        [word for word in review if word not in stopwords and word != ' ']
        for review in review_data]

    # flatten to a one-dimensional word list
    return list(itertools.chain(*seg_fil_result))
def get_trainset_testset_testtag(dimension):
    """Split the pos/neg feature sets 80/20 into training and test portions.

    The ``dimension`` best words are extended with the hand-picked keywords
    before feature extraction.  Returns (positive training features,
    negative training features, test features, test tags).
    """
    best_words = find_best_words(dimension)
    # fold the manually selected keywords into the feature vocabulary
    for keyword in tp.get_txt_data(
            'D:/ReviewHelpfulnessPrediction\KeyWords/PosNegKeyWords.txt',
            'lines'):
        best_words.add(keyword)
    # features extracted from the positive and negative corpora
    posFeatures = pos_features(best_word_features_com, best_words)
    negFeatures = neg_features(best_word_features_com, best_words)
    # first 80% of each class trains; the remaining 20% forms the test set
    pos_cut = int(len(pos_review) * 0.8)
    neg_cut = int(len(neg_review) * 0.8)
    train_set_pos = posFeatures[:pos_cut]
    train_set_neg = negFeatures[:neg_cut]
    held_out = posFeatures[pos_cut:] + negFeatures[neg_cut:]
    test_fea, test_tag = zip(*held_out)  # separate features from labels
    return train_set_pos, train_set_neg, test_fea, test_tag
Ejemplo n.º 16
0
def storeTxtReviewSenValue(dataSetDir, dataSetName, dataSetFileType, dstDir):
    """Score non-empty reviews with SnowNLP and save "review<TAB>score" lines.

    Reads <dataSetDir>/<dataSetName><dataSetFileType>; the results go to
    <dstDir>/<dataSetName>SnowNLPSentiment.txt.

    Returns (number of lines written, elapsed seconds).
    """
    start = time.clock()
    src_path = dataSetDir + '/' + dataSetName + dataSetFileType
    out_path = dstDir + '/' + dataSetName + 'SnowNLPSentiment.txt'
    # (raw review, sentiment value) pairs; empty lines are skipped
    kept = []
    for line in tp.get_txt_data(src_path, 'lines'):
        if line == '':
            continue
        kept.append((line, SnowNLP(line).sentiments))
    out_file = open(out_path, 'w')
    written = 0
    for raw, score in kept:
        out_file.write(str(raw.encode('utf-8')) + '\t' + str(score) + '\n')
        written += 1
    out_file.close()
    return written, time.clock() - start
def predictTxtDataSentTagProToTxt(reviewDataSetDir,reviewDataSetName,reviewDataSetFileType,desDir):
    reviewDataSetPath = reviewDataSetDir + '/' + reviewDataSetName + reviewDataSetFileType
    preDataResPath = desDir + '/' + reviewDataSetName + 'RawDataTagProFea.txt'
    start = time.clock()
    review = tp.get_txt_data(reviewDataSetPath, "lines")  # 读取待分类数据
    # 将待分类数据进行分词以及去停用词处理
    sentiment_review = tp.seg_fil_txt(reviewDataSetPath,'lines')
    # 提取待分类数据特征
    review_feature = extract_features(sentiment_review, best_words)
    # classifierPath = 'D:/ReviewHelpfulnessPrediction\FeatureExtractionModule\SentimentFeature\MachineLearningFeature/sentiment_classifier.pkl'
    classifierPath = 'D:/ReviewHelpfulnessPrediction\BuildedClassifier/' + str(best_classifier)[0:15] + '.pkl'
    # 装载分类器
    clf = pickle.load(open(classifierPath))
    dataItemCount = len(sentiment_review)
    # 分类之预测数据类标签
    data_tag = clf.batch_classify(review_feature)
    # 分类之预测数据积极、消极可能性
    res_pro = clf.batch_prob_classify(review_feature)
    preResFile = open(preDataResPath,'w')
    posProbility = []
    for rowPos in range(dataItemCount):
        posProbility.append(res_pro[rowPos].prob('pos'))
        feature = ''
        # 特征里面可能出现二元词的情况
        for x in review_feature[rowPos].keys():
            if type(x) is not nltk.types.TupleType:
                feature += x
            else:
                feature += '_'.join(x)
            feature += ' '
        # preResFile.write(
        #         review[rowPos].encode('utf-8') + '\t' + str(data_tag[rowPos]))
        preResFile.write(
            review[rowPos].encode('utf-8')  + '\t' + str(data_tag[rowPos]) + '\t' + str(res_pro[rowPos].prob('pos')) + '\t' + str(
                res_pro[rowPos].prob('neg'))+'\t'+feature.encode('utf-8')+'\n')
    preResFile.close()
    end = time.clock()
    print 'handle sentences num:', dataItemCount, ' classify time:', end - start
    return posProbility,preDataResPath,review
积极消极标记数据 posNegLabelData.xls
主客观标记数据 subObjLabelData.xls
鉴黄标记数据 eroNorLabelData.xls
'''

import textProcessing as tp
import numpy as np
import time
import xlwt
import xlrd
import chardet
import os

# Load the positive and negative sentiment dictionaries used by the
# subjectivity filter below.
'''导入情感词典'''
dictDir='D:/ReviewHelpfulnessPrediction\SentimentDict'
posdict = tp.get_txt_data(dictDir+"/posdict.txt","lines")  # positive words
negdict = tp.get_txt_data(dictDir+"/negdict.txt","lines")  # negative words

'''过滤器 过滤掉不含主观情感的语句 客观语句'''

# Combined lookup list: a word counts as sentiment-bearing if it appears in
# either the positive or the negative dictionary.
'''构建情感词典 这里只简单地分为积极和消极'''
sentiment_dict=posdict+negdict

'''判断单条评论是否为具备情感倾向语句 如果评论里有一个词位于情感词典中,则可认为该句具备情感倾向'''
def is_single_review_sentiment(review):
    """Return True if the review contains at least one sentiment word.

    The review is split into sentences, each sentence is segmented into
    words, and any word found in the combined pos/neg dictionary marks the
    review as subjective.  The function previously fell off the end and
    returned None for the negative case; it now returns False explicitly
    (both are falsy, so truthiness and `== True` callers are unaffected).
    """
    cuted_review = tp.cut_sentence_2(review)  # split the review into sentences
    for sent in cuted_review:
        seg_sent = tp.segmentation(sent, 'list')  # segment into words
        for word in seg_sent:
            if word in sentiment_dict:
                return True
    return False
Ejemplo n.º 19
0
·计算文章的情感得分
·考虑到语句中的褒贬并非稳定分布,以上步骤对于积极和消极的情感词分开执行,最终的到两个分值,分别表示文本的正向情感值和负向情感值。
'''

import textProcessing as tp
import numpy as np
import time
import xlwt
import xlrd
from matplotlib import pyplot as plt
from matplotlib import animation
'''1 导入情感词典'''
'''导入情感词典'''
# Load the positive/negative sentiment dictionaries, timing the load.
begin = time.clock()
dictDir = 'D:/ReviewHelpfulnessPrediction\SentimentDict'
posdict = tp.get_txt_data(dictDir + "/posdict.txt", "lines")  # positive words
negdict = tp.get_txt_data(dictDir + "/negdict.txt", "lines")  # negative words
# Degree-adverb / negation word lists used to weight sentiment words.
'''导入形容词、副词、否定词等程度词字典'''
mostdict = tp.get_txt_data(dictDir + '/most.txt', 'lines')
verydict = tp.get_txt_data(dictDir + '/very.txt', 'lines')
moredict = tp.get_txt_data(dictDir + '/more.txt', 'lines')
ishdict = tp.get_txt_data(dictDir + '/ish.txt', 'lines')
insufficientdict = tp.get_txt_data(dictDir + '/insufficiently.txt', 'lines')
inversedict = tp.get_txt_data(dictDir + '/inverse.txt', 'lines')
end = time.clock()
print 'load dictionary time:', end - begin
# Section 2: dictionary-based sentiment-scoring primitives.  The next
# helper matches a degree word preceding a sentiment word and weights it;
# parm: word = token before the sentiment word, sentiment_value = its score.
'''2 基于字典的情感分析 基本功能'''
'''匹配程度词并设置权重'''
'''parm:word  当前情感词的前面词语 sentiment_value 当前情感词的情感值'''

from sklearn.linear_model import LogisticRegression
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import RadiusNeighborsClassifier
from sklearn.neighbors import KNeighborsRegressor, RadiusNeighborsRegressor
from sklearn.neural_network import MLPClassifier
from sklearn import cross_validation
from sklearn.metrics import f1_score, precision_score, recall_score
import numpy as np
'''1 导入数据模块'''

# posNegDir = 'D:/ReviewHelpfulnessPrediction\LabelReviewData'
# posNegPath=posNegDir+'/posNegLabelData.xls'
# The labelled-data workbook path is stored (one line) in LabelDataPath.txt.
# 标记数据所在路径保存在D:/ReviewHelpfulnessPrediction/LabelDataPath.txt文件中
posNegPath = tp.get_txt_data(
    'D:/ReviewHelpfulnessPrediction/LabelDataPath.txt', 'line')
print posNegPath
# Sheet 1 holds positive reviews, sheet 2 negative reviews; both are
# segmented and stopword-filtered with sentiment_stopword.txt.
pos_review = tp.seg_fil_senti_excel(
    posNegPath, 1, 1,
    'D:/ReviewHelpfulnessPrediction/PreprocessingModule/sentiment_stopword.txt'
)
neg_review = tp.seg_fil_senti_excel(
    posNegPath, 2, 1,
    'D:/ReviewHelpfulnessPrediction/PreprocessingModule/sentiment_stopword.txt'
)
print 'postive review num is:', len(pos_review), 'negtive review num is:', len(
    neg_review)

# Randomise the order of the labelled examples.
shuffle(pos_review)
shuffle(neg_review)
"""
'''
计算一条评论 积极、消极得分,平均得分,标准偏差
模块目标是提取一条评论的 positive/negative score, average score and standard deviation features (all 6 features)
情感分析依赖于情感词典
'''

import textProcessing as tp
import numpy as np
import time
import xlwt
import xlrd
'''1 导入情感词典以及数据集'''
'''导入情感词典'''
# Positive/negative sentiment dictionaries (feature-extraction module copy).
posdict = tp.get_txt_data(
    "D:/ReviewHelpfulnessPrediction\FeatureExtractionModule\SentimentFeature\SentimentDictionaryFeatures\SentimentDictionary\PositiveAndNegativeDictionary/posdict.txt",
    "lines")
negdict = tp.get_txt_data(
    "D:/ReviewHelpfulnessPrediction\FeatureExtractionModule\SentimentFeature\SentimentDictionaryFeatures\SentimentDictionary\PositiveAndNegativeDictionary/negdict.txt",
    "lines")
# Degree-adverb dictionaries used to weight sentiment words.
'''导入形容词、副词、否定词等程度词字典'''
mostdict = tp.get_txt_data(
    'D:/ReviewHelpfulnessPrediction\FeatureExtractionModule\SentimentFeature\SentimentDictionaryFeatures\SentimentDictionary\AdverbsOfDegreeDictionary/most.txt',
    'lines')
verydict = tp.get_txt_data(
    'D:/ReviewHelpfulnessPrediction\FeatureExtractionModule\SentimentFeature\SentimentDictionaryFeatures\SentimentDictionary\AdverbsOfDegreeDictionary/very.txt',
    'lines')
moredict = tp.get_txt_data(
    'D:/ReviewHelpfulnessPrediction\FeatureExtractionModule\SentimentFeature\SentimentDictionaryFeatures\SentimentDictionary\AdverbsOfDegreeDictionary/more.txt',
    'lines')
ishdict = tp.get_txt_data(