Beispiel #1
0
def word_by_word_review(filepath, sheetnum, colnum):
    """Read reviews from an excel sheet, segment each into words, remove
    stopwords, and return every remaining word as one flat list.

    Args:
        filepath: path of the excel workbook.
        sheetnum: sheet index forwarded to tp.get_excel_data.
        colnum: column index forwarded to tp.get_excel_data.

    Returns:
        One-dimensional list of words from all reviews, with stopwords and
        bare-space tokens filtered out.
    """
    # Read product review data from the excel file and segment every review.
    cells = tp.get_excel_data(filepath, sheetnum, colnum, 'data')
    rownum = tp.get_excel_data(filepath, sheetnum, colnum, 'rownum')
    review_data = [tp.segmentation(cell, 'list') for cell in cells[0:rownum]]

    # Read txt file containing the stopword list.
    stopwords = tp.get_txt_data(
        'E:/GraduationProject/pythoncode/project/Prediction/main/PreprocessingModule/stopword.txt',
        'lines')

    # Filter stopwords (and bare-space tokens) from every review.
    seg_fil_result = [
        [word for word in review if word not in stopwords and word != ' ']
        for review in review_data
    ]

    # Flatten the per-review word lists into one list.
    return list(itertools.chain(*seg_fil_result))
Beispiel #2
0
def word_by_word_review(filepath, sheetnum, colnum):
    """Read reviews from an excel sheet, segment each into words, remove
    stopwords, and return every remaining word as one flat list.

    Args:
        filepath: path of the excel workbook.
        sheetnum: sheet index forwarded to tp.get_excel_data.
        colnum: column index forwarded to tp.get_excel_data.

    Returns:
        One-dimensional list of words from all reviews, with stopwords and
        bare-space tokens filtered out.
    """
    # Read product review data from excel file and segment every review.
    # BUG FIX: the row count was previously fetched through a bare
    # `get_excel_data(...)` call, which raises NameError because only the
    # module alias `tp` is imported; it must be `tp.get_excel_data`.
    review_data = []
    for cell in tp.get_excel_data(
            filepath, sheetnum, colnum,
            'data')[0:tp.get_excel_data(filepath, sheetnum, colnum, 'rownum')]:
        review_data.append(tp.segmentation(cell, 'list'))  # Segment every review

    # Read txt file containing the stopword list.
    stopwords = tp.get_txt_data(
        '/home/sooda/nlp/Review-Helpfulness-Prediction/data/stopword.txt',
        'lines')

    # Filter stopwords (and bare-space tokens) from every review.
    seg_fil_result = []
    for review in review_data:
        seg_fil_result.append(
            [word for word in review if word not in stopwords and word != ' '])

    # Return the review set as a one-dimensional list.
    return list(itertools.chain(*seg_fil_result))
def store_word_sent_num_features(filepath, sheetnum, colnum, data, storepath):
    data = tp.get_excel_data(filepath, sheetnum, colnum, 'data')

    word_sent_num = word_sent_count(data)  #需要初始化
    print word_sent_num
    f = open(storepath, 'w')
    for i in word_sent_num:
        f.write(str(i[0]) + ' ' + str(i[1]) + ' ' + str(i[2]) + '\n')
    f.close()
def store_adj_adv_v_num_feature(filepath, sheetnum, colnum, data, storepath):
    """Count adjective/adverb statistics for each review in an excel column
    and write them to `storepath`, one space-separated triple per line.

    Args:
        filepath: path of the excel workbook to read reviews from.
        sheetnum: sheet index forwarded to tp.get_excel_data.
        colnum: column index forwarded to tp.get_excel_data.
        data: accepted for interface compatibility but ignored — it is
            immediately overwritten by the excel data read below.
        storepath: output text file path.
    """
    data = tp.get_excel_data(filepath, sheetnum, colnum, 'data')

    adj_adv_num = count_adj_adv(data)

    # `with` guarantees the output file is closed even if a write fails.
    with open(storepath, 'w') as f:
        for i in adj_adv_num:
            f.write(str(i[0]) + ' ' + str(i[1]) + ' ' + str(i[2]) + '\n')
def word_by_word_review(filepath, sheetnum, colnum):
    """Read reviews from an excel sheet, segment each into words, remove
    stopwords, and return every remaining word as one flat list.

    Args:
        filepath: path of the excel workbook.
        sheetnum: sheet index forwarded to tp.get_excel_data.
        colnum: column index forwarded to tp.get_excel_data.

    Returns:
        One-dimensional list of words from all reviews, with stopwords and
        bare-space tokens filtered out.
    """
    # BUG FIX: the row count was previously fetched through a bare
    # `get_excel_data(...)` call, which raises NameError because only the
    # module alias `tp` is imported; it must be `tp.get_excel_data`.
    cells = tp.get_excel_data(filepath, sheetnum, colnum, 'data')
    rownum = tp.get_excel_data(filepath, sheetnum, colnum, 'rownum')
    review_data = [tp.segmentation(cell, 'list') for cell in cells[0:rownum]]

    # Read txt file containing the stopword list.
    stopwords = tp.get_txt_data('/home/sooda/nlp/Review-Helpfulness-Prediction/data/stopword.txt', 'lines')

    # Filter stopwords (and bare-space tokens) from every review.
    seg_fil_result = [
        [word for word in review if word not in stopwords and word != ' ']
        for review in review_data
    ]

    # Return the review set as a one-dimensional list.
    return list(itertools.chain(*seg_fil_result))
Beispiel #6
0
import sklearn
from nltk.classify.scikitlearn import SklearnClassifier

# Output paths: extracted n-gram feature report and the pickled classifier.
filefeature = 'E:/GraduationProject/pythoncode/project/Prediction/main/result/feature_word_ngram.txt'
filename = 'E:/GraduationProject/pythoncode/project/Prediction/main/FeatureExtractionModule/SentimentFeatures/MachineLearningFeatures/senti_class_word_ngram.pkl'

# 1. Load data
# (The triple-quoted block below is an older, disabled load path kept as a
# bare string literal; it has no runtime effect.)
"""
review = tp.get_excel_data("D:/code/sentiment_test/review_set.xlsx", "1", "1", "data")
sentiment_review = tp.seg_fil_senti_excel("D:/code/sentiment_test/review_set.xlsx", "1", "1")
"""

# Raw review texts from sheet 1, column 12 of the Samsung workbook, plus the
# same reviews preprocessed by tp.seg_fil_senti_excel (presumably segmented
# and stopword-filtered — confirm against the tp module).
review = tp.get_excel_data(
    "E:/GraduationProject/pythoncode/project/Prediction/main/ReviewSet/Samsung.xlsx",
    1, 12, "data")

sentiment_review = tp.seg_fil_senti_excel(
    "E:/GraduationProject/pythoncode/project/Prediction/main/ReviewSet/Samsung.xlsx",
    1, 12)


# 2. Feature extraction method
# Used for transform review to features, so it can calculate sentiment probability by classifier
def create_words_bigrams_scores():
    posdata = tp.seg_fil_senti_excel(
        "E:/GraduationProject/pythoncode/project/Prediction/main/FeatureExtractionModule/SentimentFeatures/MachineLearningFeatures/SenimentReviewSet/pos_review.xlsx",
        1, 1)
    negdata = tp.seg_fil_senti_excel(
        "E:/GraduationProject/pythoncode/project/Prediction/main/FeatureExtractionModule/SentimentFeatures/MachineLearningFeatures/SenimentReviewSet/neg_review.xlsx",
import textprocessing as tp
import pickle
import itertools
from random import shuffle

import nltk
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
from nltk.probability import FreqDist, ConditionalFreqDist

import sklearn


# 1. Load data
review = tp.get_excel_data("D:/code/sentiment_test/review_set.xlsx", "1", "1")
sentiment_review = tp.seg_fil_senti_excel("D:/code/sentiment_test/review_set.xlsx", "1", "1")


# 2. Feature extraction method
# Used for transform review to features, so it can calculate sentiment probability by classifier
def create_words_bigrams_scores():
    """Load the segmented positive/negative review sets and collect their
    words and top bigram collocations for feature scoring.

    NOTE(review): this definition is truncated in this excerpt.
    """
    posdata = tp.seg_fil_senti_excel("D:/code/sentiment_test/pos_review.xlsx", "1", "1")
    negdata = tp.seg_fil_senti_excel("D:/code/sentiment_test/neg_review.xlsx", "1", "1")
    
    # Flatten the per-review word lists into one word stream per polarity.
    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))

    bigram_finder = BigramCollocationFinder.from_words(posWords)
    # NOTE(review): the posWords finder above is immediately overwritten by
    # the negWords finder below, so `posBigrams` is actually computed from
    # negWords. Almost certainly a bug — two distinct finder variables needed.
    bigram_finder = BigramCollocationFinder.from_words(negWords)
    posBigrams = bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
sys.path.append("./Preprocessing module/")
import textprocessing as tp
import pickle
import itertools
from random import shuffle

import nltk
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
from nltk.probability import FreqDist, ConditionalFreqDist

import sklearn


# 1. Load data
review = tp.get_excel_data("../../../Review set/review_set.xlsx", 1, 7, "data")
sentiment_review = tp.seg_fil_senti_excel("../../../Review set/review_set.xlsx", 1, 7)


# 2. Feature extraction method
# Used for transform review to features, so it can calculate sentiment probability by classifier
def create_words_bigrams_scores():
    """Load the segmented positive/negative review sets and collect their
    words and top bigram collocations for feature scoring.

    NOTE(review): this definition is truncated in this excerpt.
    """
    posdata = tp.seg_fil_senti_excel("pos_review.xlsx", 1, 1)
    negdata = tp.seg_fil_senti_excel("neg_review.xlsx", 1, 1)
    
    # Flatten the per-review word lists into one word stream per polarity.
    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))

    bigram_finder = BigramCollocationFinder.from_words(posWords)
    # NOTE(review): the posWords finder above is immediately overwritten by
    # the negWords finder below, so `posBigrams` is actually computed from
    # negWords. Almost certainly a bug — two distinct finder variables needed.
    bigram_finder = BigramCollocationFinder.from_words(negWords)
    posBigrams = bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
Beispiel #9
0
__author__ = 'anchengwu'
#coding=utf-8

import sys
sys.path.append("../../../Preprocessing module")

import pos_neg_senti_dict_feature as pn
import textprocessing as tp

# Load dataset
review = tp.get_excel_data(
    "../Machine learning features/seniment review set/pos_review.xlsx", 1, 1,
    "data")

#test single dataset
print pn.single_review_sentiment_score(
    '买过散装的粽子才来买礼盒的,礼盒很大气,比超市买的100多的还要好,配置也不错,肉的素的都有,刚煮了个蛋黄粽子很不错,米好蛋黄也黄很香,老板态度很好,还想买一份~'
    .decode('utf8'))

#test all dataset
for i in pn.all_review_sentiment_score(pn.sentence_sentiment_score(review)):
    print i
Beispiel #10
0
#! /usr/bin/env python2.7
#coding=utf-8
import textprocessing as tp

review = tp.get_excel_data("../data/review_set.xlsx", 1, 1, "data")
review_txt = open('reivew.txt', 'wb+')
for r in review:
    print r
    review_txt.write(r)
    review_txt.write('\n')

review_txt.close()

sys.path.append("./Preprocessing module/")
import textprocessing as tp
import cPickle as pickle
import itertools
from random import shuffle

import nltk
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
from nltk.probability import FreqDist, ConditionalFreqDist

import sklearn


# 1. Load data
review = tp.get_excel_data("review_set.xlsx", 1, 7, "data")
sentiment_review = tp.seg_fil_senti_excel("review_set.xlsx", 1, 7)


# 2. Feature extraction method
# Used for transform review to features, so it can calculate sentiment probability by classifier
def create_words_bigrams_scores():
    """Load the segmented positive/negative review sets and collect their
    words and top bigram collocations for feature scoring.

    NOTE(review): this definition is truncated in this excerpt.
    """
    posdata = tp.seg_fil_senti_excel("pos_review.xlsx", 1, 1)
    negdata = tp.seg_fil_senti_excel("neg_review.xlsx", 1, 1)
    
    # Flatten the per-review word lists into one word stream per polarity.
    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))

    bigram_finder = BigramCollocationFinder.from_words(posWords)
    # NOTE(review): the posWords finder above is immediately overwritten by
    # the negWords finder below, so `posBigrams` is actually computed from
    # negWords. Almost certainly a bug — two distinct finder variables needed.
    bigram_finder = BigramCollocationFinder.from_words(negWords)
    posBigrams = bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
import textprocessing as tp
import pickle
import itertools
from random import shuffle

import nltk
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
from nltk.probability import FreqDist, ConditionalFreqDist

import sklearn


# 1. Load data
review = tp.get_excel_data("/home/hadoop/coding/Review set/HTC Z710t_review_2013.6.5.xlsx",1,12, "data")
sentiment_review = tp.seg_fil_senti_excel("/home/hadoop/coding/Review set/Meizu MX_review_2013.6.7.xlsx", 1, 12)



# 2. Feature extraction method
# Used for transform review to features, so it can calculate sentiment probability by classifier
def create_words_bigrams_scores():
    """Load the segmented positive/negative review sets and collect their
    words and bigram collocations for feature scoring.

    NOTE(review): this definition is truncated in this excerpt.
    """
    posdata = tp.seg_fil_senti_excel("/home/hadoop/coding/Sentiment features/Machine learning features/seniment review set/pos_review.xlsx",1,1)
    negdata = tp.seg_fil_senti_excel("/home/hadoop/coding/Sentiment features/Machine learning features/seniment review set/neg_review.xlsx", 1, 1)
    
    # Flatten the per-review word lists into one word stream per polarity.
    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))

    bigram_finder = BigramCollocationFinder.from_words(posWords)#turn the text into two-word (bigram) collocations
    # NOTE(review): the posWords finder above is immediately overwritten —
    # any nbest() call after this point uses only the negWords finder.
    # Almost certainly a bug: two distinct finder variables are needed.
    bigram_finder = BigramCollocationFinder.from_words(negWords)
Beispiel #13
0
"""

import textprocessing as tp
import pickle
import itertools
from random import shuffle

import nltk
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
from nltk.probability import FreqDist, ConditionalFreqDist

import sklearn

# 1. Load data
review = tp.get_excel_data("D:/code/sentiment_test/review_set.xlsx", "1", "1",
                           "data")
sentiment_review = tp.seg_fil_senti_excel(
    "D:/code/sentiment_test/review_set.xlsx", "1", "1")


# 2. Feature extraction method
# Used for transform review to features, so it can calculate sentiment probability by classifier
def create_words_bigrams_scores():
    posdata = tp.seg_fil_senti_excel("D:/code/sentiment_test/pos_review.xlsx",
                                     "1", "1")
    negdata = tp.seg_fil_senti_excel("D:/code/sentiment_test/neg_review.xlsx",
                                     "1", "1")

    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))
Beispiel #14
0
__author__ = 'anchengwu'
#coding=utf-8

import sys
sys.path.append("../../../Preprocessing module")

import pos_neg_senti_dict_feature as pn
import textprocessing as tp

# Load dataset
review = tp.get_excel_data("../Machine learning features/seniment review set/pos_review.xlsx", 1, 1, "data")

#test single dataset
print pn.single_review_sentiment_score('买过散装的粽子才来买礼盒的,礼盒很大气,比超市买的100多的还要好,配置也不错,肉的素的都有,刚煮了个蛋黄粽子很不错,米好蛋黄也黄很香,老板态度很好,还想买一份~'.decode('utf8'))

#test all dataset
for i in pn.all_review_sentiment_score(pn.sentence_sentiment_score(review)):
	print i

# Load sentiment dictionary (positive / negative word lists).
posdict = tp.get_txt_data("D:/code/sentiment_dictionary/posdict.txt", "lines")
negdict = tp.get_txt_data("D:/code/sentiment_dictionary/negdict.txt", "lines")

# Load adverbs-of-degree dictionaries used to weight sentiment words
# (see the `match` weights: most > very > more > ish > insufficient).
mostdict = tp.get_txt_data('D:/code/sentiment_dictionary/most.txt', 'lines')
verydict = tp.get_txt_data('D:/code/sentiment_dictionary/very.txt', 'lines')
moredict = tp.get_txt_data('D:/code/sentiment_dictionary/more.txt', 'lines')
ishdict = tp.get_txt_data('D:/code/sentiment_dictionary/ish.txt', 'lines')
insufficientdict = tp.get_txt_data(
    'D:/code/sentiment_dictionary/insufficiently.txt', 'lines')
inversedict = tp.get_txt_data('D:/code/sentiment_dictionary/inverse.txt',
                              'lines')

# Load dataset
# NOTE(review): the extension ".xlxs" looks like a typo for ".xlsx" — verify
# against the file actually on disk before changing it.
review = tp.get_excel_data("D:/code/review_set.xlxs", "1", "1", "data")


# 2. Sentiment dictionary analysis basic function
# Function of matching adverbs of degree and set weights
def match(word, sentiment_value):
    """Scale `sentiment_value` by the weight of the degree-adverb class that
    `word` belongs to (most 2.0 > very 1.5 > more 1.25 > ish 0.5 >
    insufficient 0.25).

    NOTE(review): this definition is truncated in this excerpt — it continues
    beyond the last branch shown here.
    """
    if word in mostdict:
        sentiment_value *= 2.0
    elif word in verydict:
        sentiment_value *= 1.5
    elif word in moredict:
        sentiment_value *= 1.25
    elif word in ishdict:
        sentiment_value *= 0.5
    elif word in insufficientdict:
        sentiment_value *= 0.25
    'E:/GraduationProject/pythoncode/project/Prediction/main/FeatureExtractionModule/SentimentFeatures/SentimentDictionaryFeatures/SentimentDictionary/AdverbsOfDegreeDictionary/ish.txt',
    'lines')
insufficientdict = tp.get_txt_data(
    'E:/GraduationProject/pythoncode/project/Prediction/main/FeatureExtractionModule/SentimentFeatures/SentimentDictionaryFeatures/SentimentDictionary/AdverbsOfDegreeDictionary/insufficiently.txt',
    'lines')
inversedict = tp.get_txt_data(
    'E:/GraduationProject/pythoncode/project/Prediction/main/FeatureExtractionModule/SentimentFeatures/SentimentDictionaryFeatures/SentimentDictionary/AdverbsOfDegreeDictionary/inverse.txt',
    'lines')

# Load dataset — alternate review sets kept commented out for quick switching.
#review = tp.get_excel_data("E:/GraduationProject/pythoncode/project/Prediction/main/ReviewSet/HTC.xlsx", 1, 12, "data")
#review = tp.get_excel_data("E:/GraduationProject/pythoncode/project/Prediction/main/ReviewSet/OPPO.xlsx", 1, 12, "data")
#review = tp.get_excel_data("E:/GraduationProject/pythoncode/project/Prediction/main/ReviewSet/MeiZuMX.xlsx", 1, 12, "data")
#review = tp.get_excel_data("E:/GraduationProject/pythoncode/project/Prediction/main/ReviewSet/Samsung.xlsx", 1, 12, "data")
review = tp.get_excel_data(
    "E:/GraduationProject/pythoncode/project/Prediction/main/ReviewSet/Motorala.xlsx",
    1, 11, "data")

# Get the values of the first column of the first sheet of the excel file.
#print review[1]


# 2. Sentiment dictionary analysis basic function
# Function of matching adverbs of degree and set weights
def match(word, sentiment_value):
    if word in mostdict:
        sentiment_value *= 2.0
    elif word in verydict:
        sentiment_value *= 1.5
    elif word in moredict:
        sentiment_value *= 1.25
# 1. Load dictionary and dataset
# Load sentiment dictionary (positive / negative word lists).
posdict = tp.get_txt_data("D:/code/sentiment_dictionary/posdict.txt","lines")
negdict = tp.get_txt_data("D:/code/sentiment_dictionary/negdict.txt","lines")

# Load adverbs-of-degree dictionaries used to weight sentiment words
# (see the `match` weights: most > very > more > ish > insufficient).
mostdict = tp.get_txt_data('D:/code/sentiment_dictionary/most.txt', 'lines')
verydict = tp.get_txt_data('D:/code/sentiment_dictionary/very.txt', 'lines')
moredict = tp.get_txt_data('D:/code/sentiment_dictionary/more.txt', 'lines')
ishdict = tp.get_txt_data('D:/code/sentiment_dictionary/ish.txt', 'lines')
insufficientdict = tp.get_txt_data('D:/code/sentiment_dictionary/insufficiently.txt', 'lines')
inversedict = tp.get_txt_data('D:/code/sentiment_dictionary/inverse.txt', 'lines')

# Load dataset
# NOTE(review): the extension ".xlxs" looks like a typo for ".xlsx" — verify
# against the file actually on disk before changing it.
review = tp.get_excel_data("D:/code/review_set.xlxs", "1", "1", "data")


# 2. Sentiment dictionary analysis basic function
# Function of matching adverbs of degree and set weights
def match(word, sentiment_value):
	if word in mostdict:
		sentiment_value *= 2.0
	elif word in verydict:
	    sentiment_value *= 1.5
	elif word in moredict:
	    sentiment_value *= 1.25
	elif word in ishdict:
	    sentiment_value *= 0.5
	elif word in insufficientdict:
	    sentiment_value *= 0.25
Beispiel #18
0
#! /usr/bin/env python2.7
# coding=utf-8
import textprocessing as tp

review = tp.get_excel_data("../data/review_set.xlsx", 1, 1, "data")
review_txt = open("reivew.txt", "wb+")
for r in review:
    print r
    review_txt.write(r)
    review_txt.write("\n")

review_txt.close()
Beispiel #19
0
    '/home/sooda/nlp/Review-Helpfulness-Prediction/data/sentiment_dictionary/more.txt',
    'lines')
ishdict = tp.get_txt_data(
    '/home/sooda/nlp/Review-Helpfulness-Prediction/data/sentiment_dictionary/ish.txt',
    'lines')
insufficientdict = tp.get_txt_data(
    '/home/sooda/nlp/Review-Helpfulness-Prediction/data/sentiment_dictionary/insufficiently.txt',
    'lines')
inversedict = tp.get_txt_data(
    '/home/sooda/nlp/Review-Helpfulness-Prediction/data/sentiment_dictionary/inverse.txt',
    'lines')

# Load dataset
#review = tp.get_excel_data("/home/sooda/nlp/Review-Helpfulness-Prediction/data/review_set.xlxs", "1", "1", "data")
review = tp.get_excel_data(
    "/home/sooda/nlp/Review-Helpfulness-Prediction/data/sentiment_test/review_set.xlsx",
    1, 1, "data")


# 2. Sentiment dictionary analysis basic function
# Function of matching adverbs of degree and set weights
def match(word, sentiment_value):
    if word in mostdict:
        sentiment_value *= 2.0
    elif word in verydict:
        sentiment_value *= 1.5
    elif word in moredict:
        sentiment_value *= 1.25
    elif word in ishdict:
        sentiment_value *= 0.5
    elif word in insufficientdict:
import textprocessing as tp
import pickle
import itertools
from random import shuffle

import nltk
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
from nltk.probability import FreqDist, ConditionalFreqDist

import sklearn


# 1. Load data
review = tp.get_excel_data("/home/sooda/nlp/Review-Helpfulness-Prediction/data/sentiment_test/review_set.xlsx", 1, 1, "data")
sentiment_review = tp.seg_fil_senti_excel("/home/sooda/nlp/Review-Helpfulness-Prediction/data/sentiment_test/review_set.xlsx", 1, 1)


# 2. Feature extraction method
# Used for transform review to features, so it can calculate sentiment probability by classifier
def create_words_bigrams_scores():
    """Load the segmented positive/negative review sets and collect their
    words and top bigram collocations for feature scoring.

    NOTE(review): this definition is truncated in this excerpt.
    """
    posdata = tp.seg_fil_senti_excel("/home/sooda/nlp/Review-Helpfulness-Prediction/data/sentiment_test/pos_review.xlsx", 1, 1)
    negdata = tp.seg_fil_senti_excel("/home/sooda/nlp/Review-Helpfulness-Prediction/data/sentiment_test/neg_review.xlsx", 1, 1)

    # Flatten the per-review word lists into one word stream per polarity.
    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))

    bigram_finder = BigramCollocationFinder.from_words(posWords)
    # NOTE(review): the posWords finder above is immediately overwritten by
    # the negWords finder below, so `posBigrams` is actually computed from
    # negWords. Almost certainly a bug — two distinct finder variables needed.
    bigram_finder = BigramCollocationFinder.from_words(negWords)
    posBigrams = bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)