Example #1
def word_by_word_review(filepath, sheetnum, colnum):
    # Read product review data from excel file and segment every review
    review_data = []
    for cell in tp.get_excel_data(
            filepath, sheetnum, colnum,
            'data')[0:tp.get_excel_data(filepath, sheetnum, colnum, 'rownum')]:
        review_data.append(tp.segmentation(cell, 'list'))  # Segment every review

    # Read the txt file containing stopwords
    stopwords = tp.get_txt_data(
        '/home/sooda/nlp/Review-Helpfulness-Prediction/data/stopword.txt',
        'lines')

    # Filter stopwords from reviews
    seg_fil_result = []
    for review in review_data:
        fil = [
            word for word in review if word not in stopwords and word != ' '
        ]
        seg_fil_result.append(fil)
        fil = []

    # Return the review set as a one-dimensional list
    review = list(itertools.chain(*seg_fil_result))
    return review
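
A usage sketch (the file name and sheet/column indices are hypothetical; tp is the textprocessing module these snippets import):

all_words = word_by_word_review('review_set.xls', 1, 1)  # reviews in sheet 1, column 1
print len(all_words)  # total number of stopword-filtered tokens across all reviews
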
def seg_filter_txt(filepath, storepath):
    txtfile = open(filepath, 'r')
    txtdata = txtfile.readlines()
    txtfile.close()

    review_data = tp.segmentation(txtdata[0], 'list')

    #stopfile = open('D:/code/seg_fil_test/stopword.txt', 'r')

    stopfile = open(
        'E:/GraduationProject/pythoncode/project/Prediction/main/PreprocessingModule/stopword.txt',
        'r')

    stopdata1 = stopfile.readlines()
    stopdata2 = ''.join(stopdata1)
    stopwords = stopdata2.decode('utf8').split('\n')
    stopfile.close()

    seg_fil_result = []
    for review in review_data:
        fil = [
            word for word in review if word not in stopwords and word != ' '
        ]
        seg_fil_result.append(fil)
        fil = []

    fil_file = open(storepath, 'w')
    for word in seg_fil_result:
        words = ''.join(word)
        fil_file.write(words.encode('utf8') + ' ')
        # fil_file.write(word.encode('utf8')+' ')
    fil_file.close()
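
A minimal invocation sketch (paths are placeholders; the function segments the first line of the input file and writes the stopword-filtered result to the output file):

seg_filter_txt('raw_review.txt', 'seg_fil_review.txt')
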
def seg_filter_txt(filepath, storepath):
    txtfile = open(filepath, 'r')
    txtdata = txtfile.readlines()
    txtfile.close()

    review_data = tp.segmentation(txtdata[0], 'list')

    stopfile = open('D:/code/seg_fil_test/stopword.txt', 'r')
    stopdata1 = stopfile.readlines()
    stopdata2 = ''.join(stopdata1)
    stopwords = stopdata2.decode('utf8').split('\n')
    stopfile.close()

    seg_fil_result = []
    for review in review_data:
        fil = [
            word for word in review if word not in stopwords and word != ' '
        ]
        seg_fil_result.append(fil)
        fil = []

    fil_file = open(storepath, 'w')
    for word in seg_fil_result:
        fil_file.write(''.join(word).encode('utf8') + ' ')  # each entry is a list of characters; join before encoding
    fil_file.close()
Example #4
def word_by_word_review(filepath, sheetnum, colnum):
    # Read product review data from excel file and segment every review
    review_data = []
    for cell in tp.get_excel_data(
            filepath, sheetnum, colnum,
            'data')[0:tp.get_excel_data(filepath, sheetnum, colnum, 'rownum')]:
        review_data.append(tp.segmentation(cell, 'list'))  # Segment every review

    # Read the txt file containing stopwords
    """
    stopwords = tp.get_txt_data('D:/code/stopword.txt', 'lines')
    """
    stopwords = tp.get_txt_data(
        'E:/GraduationProject/pythoncode/project/Prediction/main/PreprocessingModule/stopword.txt',
        'lines')

    # Filter stopwords from reviews
    seg_fil_result = []
    for review in review_data:
        fil = [
            word for word in review if word not in stopwords and word != ' '
        ]
        seg_fil_result.append(fil)
        fil = []

    # Return the review set as a one-dimensional list
    review = list(itertools.chain(*seg_fil_result))
    return review
def cut_sentences_words(self, review):
    sent_words = []
    cuted_review = tp.cut_sentence_2(review)
    for sent in cuted_review:
        seg_sent = tp.segmentation(sent, 'list')
        #seg_sent = self.stopWordFilter(seg_sent)
        sent_words.append(seg_sent)
    return sent_words
def word_sent_count(dataset):
    word_sent_count = []
    for review in dataset:
        sents = tp.cut_sentence_2(review)  # split the review into sentences
        words = tp.segmentation(review, 'list')  # segment the review into a word list
        sent_num = len(sents)  # number of sentences in the review
        word_num = len(words)  # number of words in the review
        sent_word = float(word_num) / float(sent_num)  # review length = word count / sentence count
        word_sent_count.append([word_num, sent_num, sent_word])
    return word_sent_count
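
A worked example of the three numbers each review contributes, with plain lists standing in for the output of tp.cut_sentence_2 and tp.segmentation (hypothetical data):

def word_sent_count_demo():
    sents = ['the screen is sharp', 'battery life is poor']  # stand-in for tp.cut_sentence_2(review)
    words = ['the', 'screen', 'is', 'sharp', 'battery', 'life', 'is', 'poor']  # stand-in for tp.segmentation(review, 'list')
    sent_num = len(sents)                          # 2
    word_num = len(words)                          # 8
    sent_word = float(word_num) / float(sent_num)  # 8 / 2 = 4.0 words per sentence
    return [word_num, sent_num, sent_word]         # [8, 2, 4.0]
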
def sentence_sentiment_score(dataset):
    cuted_review = []
    for cell in dataset:
        cuted_review.append(tp.cut_sentence_2(cell))

    single_review_count = []
    all_review_count = []
    for review in cuted_review:
        for sent in review:
            seg_sent = tp.segmentation(sent, "list")  # 分词
            i = 0  # word position counter
            a = 0  # sentiment word position
            poscount = 0  # count a pos word
            negcount = 0
            for word in seg_sent:
                if word in posdict:
                    poscount += 1
                    for w in seg_sent[a:i]:
                        poscount = match(w, poscount)
                    a = i + 1

                elif word in negdict:
                    negcount += 1
                    for w in seg_sent[a:i]:
                        negcount = match(w, negcount)
                    a = i + 1

                elif word == "!".decode("utf8") or word == "!".decode("utf8"):
                    for w2 in seg_sent[::-1]:
                        if w2 in posdict:
                            poscount += 2
                            break
                        elif w2 in negdict:
                            negcount += 2
                            break
                else:
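                    # fo: a writable log file object defined elsewhere; collects a/d/v/n words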
                    pos = tp.postagger(word, "list")
                    for k in pos:
                        if k[1] == "a":
                            fo.write(word.encode("utf8") + "\n")
                        elif k[1] == "d":
                            fo.write(word.encode("utf8") + "\n")
                        elif k[1] == "v":
                            fo.write(word.encode("utf8") + "\n")
                        elif k[1] == "n":
                            fo.write(word.encode("utf8") + "\n")
                i += 1

            single_review_count.append(transform_to_positive_num(poscount, negcount))  # pos/neg weight of each clause after splitting the review
        all_review_count.append(
            single_review_count
        )  # [[[s11_score], [s12_score], ...], [[s21_score], [s22_score], ...], ...]
        single_review_count = []

    return all_review_count
def single_review_sentiment_score(weibo_sent):
    single_review_senti_score = []
    cuted_review = tp.cut_sentence(weibo_sent)  # split into sentences; each is analysed on its own

    for sent in cuted_review:
        seg_sent = tp.segmentation(sent)  # word segmentation
        seg_sent = tp.del_stopwords(seg_sent)[:]
        #for w in seg_sent:
        #   print w,
        i = 0  # position of the word being scanned
        s = 0  # position of the last sentiment word
        poscount = 0  # positive score of this clause
        negcount = 0  # negative score of this clause

        for word in seg_sent:  # analyse word by word
            #print word
            if word in posdict:  # positive sentiment word
                #print "posword:", word
                poscount += 1  # positive score +1
                for w in seg_sent[s:i]:
                    poscount = match(w, poscount)
                #print "poscount:", poscount
                s = i + 1  # update the sentiment word position

            elif word in negdict:  # negative sentiment word
                #print "negword:", word
                negcount += 1
                for w in seg_sent[s:i]:
                    negcount = match(w, negcount)
                #print "negcount:", negcount
                s = i + 1

            # an exclamation mark marks the end of the sentence
            elif word == "!".decode("utf-8") or word == "!".decode('utf-8'):
                for w2 in seg_sent[::-1]:  # scan backwards; add 2 to the weight of the first sentiment word found, then stop
                    if w2 in posdict:
                        poscount += 2
                        break
                    elif w2 in negdict:
                        negcount += 2
                        break
            i += 1
        #print "poscount,negcount", poscount, negcount
        single_review_senti_score.append(
            transform_to_positive_num(poscount, negcount))  # final processing of the clause scores
    pos_result, neg_result = 0, 0  # totals of the positive and negative scores
    for res1, res2 in single_review_senti_score:  # accumulate over clauses
        pos_result += res1
        neg_result += res2
    #print pos_result, neg_result
    result = pos_result - neg_result  # final sentiment score of this weibo post
    result = round(result, 1)
    return result
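
Wired up, the function returns one rounded number per post: total positive minus total negative over its clauses. A hedged sketch (dictionary paths are placeholders; posdict/negdict and the helpers are assumed loaded as in the surrounding snippets):

posdict = tp.get_txt_data('posdict.txt', 'lines')  # placeholder path
negdict = tp.get_txt_data('negdict.txt', 'lines')  # placeholder path
score = single_review_sentiment_score(u'这部电影太好看了!')
print score  # e.g. 3.0; a positive sign means overall positive sentiment
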
Example #11
def get_blog_features():
    import tp_utility as tpu
    import textprocessing as tp
    import MySQLdb
    import pandas as pd  # pd.read_sql is used below
    # get blog df_content
    mysql_cn = MySQLdb.connect('127.0.0.1', 'root', '100811', 'mydb', charset='utf8')
    df_content = pd.read_sql("select * from weibo_bloginfor limit 5000;", con=mysql_cn)
    mysql_cn.close()
    # get features of each blog
    blogs = df_content['mc'].values
    moto = [tp.segmentation(blog) for blog in blogs]
    moto_features = extract_features(moto)
    return moto_features, df_content
Example #12
def mysentiment_score_list(oneblog):
    cuted_data = []
    for sen in tp.cut_sentence(oneblog):
        cuted_data.append(sen)
    blog_score_list = []
    for sent in cuted_data:  # iterate over every clause of the post
        segtmp = tp.segmentation(sent)
        #print segtmp
        pos_count = 0
        neg_count = 0
        for word in segtmp:
            if word in posdict:
                pos_count += 1
            elif word in negdict:
                neg_count += 1
        blog_score_list.append([pos_count, neg_count])
    return blog_score_list
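
Each inner pair is a raw count of dictionary hits per clause, with no degree weighting. A usage sketch (assumes posdict/negdict are loaded and that the sentence cutter yields two clauses here):

scores = mysentiment_score_list(u'手机很好!就是电池不行。')
print scores  # e.g. [[1, 0], [0, 1]]: one positive hit in clause 1, one negative hit in clause 2
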
def sentence_sentiment_score(dataset):
    cuted_review = []
    for cell in dataset:
        cuted_review.append(tp.cut_sentence_2(cell))

    single_review_count = []
    all_review_count = []
    for review in cuted_review:
        for sent in review:
            seg_sent = tp.segmentation(sent, 'list')
            i = 0  #word position counter
            a = 0  #sentiment word position
            poscount = 0  #count a pos word
            negcount = 0
            for word in seg_sent:
                if word in posdict:
                    poscount += 1
                    for w in seg_sent[a:i]:
                        poscount = match(w, poscount)
                    a = i + 1

                elif word in negdict:
                    negcount += 1
                    for w in seg_sent[a:i]:
                        negcount = match(w, negcount)
                    a = i + 1

                elif word == '!'.decode('utf8') or word == '!'.decode('utf8'):
                    for w2 in seg_sent[::-1]:
                        if w2 in posdict:
                            poscount += 2
                            break
                        elif w2 in negdict:
                            negcount += 2
                            break
                i += 1

            single_review_count.append(
                transform_to_positive_num(
                    poscount, negcount))  #[[s1_score], [s2_score], ...]
        all_review_count.append(
            single_review_count
        )  # [[[s11_score], [s12_score], ...], [[s21_score], [s22_score], ...], ...]
        single_review_count = []

    return all_review_count
def single_review_sentiment_score(review):  # computes the positive/negative indices of one review
    single_review_senti_score = []
    cuted_review = tp.cut_sentence_2(review)  # cut sentence

    for sent in cuted_review:
        seg_sent = tp.segmentation(sent, "list")  # cut word
        i = 0  # word position counter
        s = 0  # sentiment word position
        poscount = 0  # count a positive word
        negcount = 0  # count a negative word

        for word in seg_sent:
            if word in posdict:
                poscount += 1
                for w in seg_sent[s:i]:
                    poscount = match(w, poscount)  # positive weight within the clause
                s = i + 1

            elif word in negdict:
                negcount += 1
                for w in seg_sent[s:i]:
                    negcount = match(w, negcount)  # negative weight within the clause
                s = i + 1

            # Match "!" in the review, every "!" has a weight of +2
            elif word == "!".decode("utf8") or word == "!".decode("utf8"):  # full-width or ASCII "!"
                for w2 in seg_sent[::-1]:  # scan backwards for the sentiment word before the "!"
                    if w2 in posdict:
                        poscount += 2
                        break
                    elif w2 in negdict:
                        negcount += 2
                        break
            else:
                fo.write(word)  # fo: a writable file object defined elsewhere
            i += 1
            # poscount, negcount: pos/neg weights of each clause in the review
        single_review_senti_score.append(transform_to_positive_num(poscount, negcount))

    review_sentiment_score = sumup_sentence_sentiment_score(single_review_senti_score)
    return review_sentiment_score
def single_review_sentiment_score(review):
    single_review_senti_score = []
    cuted_review = tp.cut_sentence_2(review)

    for sent in cuted_review:
        seg_sent = tp.segmentation(sent, 'list')
        i = 0  # word position counter
        s = 0  # sentiment word position
        poscount = 0  # count a positive word
        negcount = 0  # count a negative word

        for word in seg_sent:
            if word in posdict:
                poscount += 1
                for w in seg_sent[s:i]:
                    poscount = match(w, poscount)
                s = i + 1

            elif word in negdict:
                negcount += 1
                for w in seg_sent[s:i]:
                    negcount = match(w, negcount)
                s = i + 1

            # Match "!" in the review, every "!" has a weight of +2
            elif word == "!".decode('utf8') or word == "!".decode('utf8'):
                for w2 in seg_sent[::-1]:
                    if w2 in posdict:
                        poscount += 2
                        break
                    elif w2 in negdict:
                        negcount += 2
                        break
            i += 1

        single_review_senti_score.append(
            transform_to_positive_num(poscount, negcount))

    review_sentiment_score = sumup_sentence_sentiment_score(
        single_review_senti_score)
    return review_sentiment_score
def seg_filter_txt(filepath, storepath):
    txtfile = open(filepath, "r")
    txtdata = txtfile.readlines()
    txtfile.close()

    review_data = tp.segmentation(txtdata[0], "list")

    stopfile = open("/home/sooda/nlp/Review-Helpfulness-Prediction/data/stopword.txt", "r")
    stopdata1 = stopfile.readlines()
    stopdata2 = "".join(stopdata1)
    stopwords = stopdata2.decode("utf8").split("\n")
    stopfile.close()

    seg_fil_result = []
    for review in review_data:
        fil = [word for word in review if word not in stopwords and word != " "]
        seg_fil_result.append(fil)
        fil = []

    fil_file = open(storepath, "w")
    for word in seg_fil_result:
        fil_file.write("".join(word).encode("utf8") + " ")  # each entry is a list of characters; join before encoding
    fil_file.close()
Example #22
def get_single_sent_count(cuted_sents):
    single_review_senti_score = []
    for sent in cuted_sents:
        seg_sent = tp.segmentation(sent, 'list')
        i = 0  # word position counter
        a = 0  # sentiment word position
        poscount = 0  # count a positive word
        negcount = 0  # count a negative word

        # match applies degree-adverb weighting to the running score
        for word in seg_sent:
            if word in posdict:
                poscount += 1
                for w in seg_sent[a:i]:
                    poscount = match(w, poscount)
                a = i + 1

            elif word in negdict:
                negcount += 1
                for w in seg_sent[a:i]:
                    negcount = match(w, negcount)
                a = i + 1

            # Match "!" in the review, every "!" has a weight of +2
            elif word == "!".decode('utf8') or word == "!".decode('utf8'):
                for w2 in seg_sent[::-1]:
                    if w2 in posdict:
                        poscount += 2
                        break
                    elif w2 in negdict:
                        negcount += 2
                        break
            i += 1

        single_review_senti_score.append(
            transform_to_positive_num(poscount, negcount))
    return single_review_senti_score
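
match is imported from elsewhere in these projects and never shown here; judging from the inlined degree-adverb logic in sentiment_score_list further down, a plausible minimal sketch looks like this (mostdict, verydict, moredict, ishdict and insufficientdict are the degree dictionaries those snippets load at module level):

def match(word, count):
    # scale the running sentiment score by the strength of the degree adverb
    if word in mostdict:            # e.g. "extremely"
        count *= 4.0
    elif word in verydict:          # e.g. "very"
        count *= 3.0
    elif word in moredict:          # e.g. "more"
        count *= 2.0
    elif word in ishdict:           # e.g. "slightly"
        count /= 2.0
    elif word in insufficientdict:  # e.g. "insufficiently"
        count /= 4.0
    return count
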
posdict = tp.get_txt_data(
    "/home/hadoop/coding/Sentiment features/Sentiment dictionary features/sentiment dictionary/positive and negative dictionary/posdict.txt",
    "lines")
negdict = tp.get_txt_data(
    "/home/hadoop/coding/Sentiment features/Sentiment dictionary features/sentiment dictionary/positive and negative dictionary/negdict.txt",
    "lines")
stopwords = tp.get_txt_data('/home/hadoop/coding/stopword.txt', 'lines')
posdict.extend(negdict)

i = 0
sen_cur = []
p_center = open("/home/hadoop/建国大业客观性.txt", 'w+')
for sig_re in sentiment_review:
    #sig_re='挺棒'
    flag = False
    seg_list = tp.segmentation(sig_re, 'list')
    for w in seg_list:
        if w in posdict:
            sen_cur.append(sig_re)  # subjective sentence
            flag = True
            break
    if not flag:
        seg_lists = Seg(str(sig_re))  # word segmentation (Seg is defined elsewhere)
        for w in seg_lists:
            if w in posdict:
                i += 1
                sen_cur.append(sig_re)  # subjective sentence
                print w, '\t', sig_re
                flag = True
                break
Example #24
def sentiment_score_list(dataset):
    cuted_data = []
    for cell in dataset:
        cuted_data.append(tp.cut_sentence_2(cell))
    count1 = []
    count2 = []
    for sents in cuted_data:  # iterate over every review
        for sent in sents:  # iterate over every clause of the review
            segtmp = tp.segmentation(sent, 'list')  # segment the clause into a word list
            i = 0
            a = 0
            poscount = 0
            poscount2 = 0
            poscount3 = 0
            negcount = 0
            negcount2 = 0
            negcount3 = 0
            for word in segtmp:
                if word in posdict:
                    poscount += 1
                    c = 0
                    for w in segtmp[a:i]:
                        if w in mostdict:
                            poscount *= 4.0
                        elif w in verydict:
                            poscount *= 3.0
                        elif w in moredict:
                            poscount *= 2.0
                        elif w in ishdict:
                            poscount /= 2.0
                        elif w in insufficientdict:
                            poscount /= 4.0
                        elif w in inversedict:
                            c += 1
                    if judgeodd(c) == 'odd':
                        poscount *= -1.0
                        poscount2 += poscount
                        poscount = 0
                        poscount3 = poscount + poscount2 + poscount3
                        poscount2 = 0
                    else:
                        poscount3 = poscount + poscount2 + poscount3
                        poscount = 0
                    a = i + 1
                elif word in negdict:
                    negcount += 1
                    d = 0
                    for w in segtmp[a:i]:
                        if w in mostdict:
                            negcount *= 4.0
                        elif w in verydict:
                            negcount *= 3.0
                        elif w in moredict:
                            negcount *= 2.0
                        elif w in ishdict:
                            negcount /= 2.0
                        elif w in insufficientdict:
                            negcount /= 4.0
                        elif w in inversedict:
                            d += 1
                    if judgeodd(d) == 'odd':
                        negcount *= -1.0
                        negcount2 += negcount
                        negcount = 0
                        negcount3 = negcount + negcount2 + negcount3
                        negcount2 = 0
                    else:
                        negcount3 = negcount + negcount2 + negcount3
                        negcount = 0
                    a = i + 1
                elif word == '!'.decode('utf-8') or word == '!'.decode('utf8'):  # full-width or ASCII "!"
                    for w2 in segtmp[::-1]:
                        if w2 in posdict or w2 in negdict:
                            poscount3 += 2
                            negcount3 += 2
                            break
                i += 1
            pos_count = 0
            neg_count = 0
            if poscount3 < 0 and negcount3 > 0:
                neg_count += negcount3 - poscount3
                pos_count = 0
            elif negcount3 < 0 and poscount3 > 0:
                pos_count = poscount3 - negcount3
                neg_count = 0
            elif negcount3 < 0 and poscount3 < 0:
                neg_count = -poscount3
                pos_count = -negcount3
            else:
                pos_count = poscount3
                neg_count = negcount3
            count1.append([pos_count, neg_count])
        count2.append(count1)
        count1 = []
    return count2
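
A worked trace of the weighting, assuming the segmenter yields [手机, 很, 好, !] with 好 in posdict and 很 in verydict: 好 sets poscount = 1, the preceding 很 triples it to 3.0, and the "!" branch then adds 2 to both poscount3 and negcount3, so the clause scores [5.0, 2]:

scores = sentiment_score_list([u'手机很好!'])
print scores  # [[[5.0, 2]]]: one review, one clause, pos 5.0 / neg 2
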
#neg_review=pickle.load(open(pos_neg_review_path+'/'+'neg_review'+'.pkl','r')) 
"""
dimension = ['500','1000','1500','2000','2500','3000']
for d in dimension:
    word_scores = create_word_scores_bigram()
    best_words = find_best_words(word_scores, int(d))

    posFeatures = pos_features(best_word_features)
    negFeatures = neg_features(best_word_features)


    train = posFeatures[174:]+negFeatures[174:]
    devtest = posFeatures[124:174]+negFeatures[124:174]
    test = posFeatures[:124]+negFeatures[:124]
    dev, tag_dev = zip(*devtest)

    print 'Feature number %f' %d
    print 'BernoulliNB`s accuracy is %f' %score(BernoulliNB())
    print 'MultinomiaNB`s accuracy is %f' %score(MultinomialNB())
    print 'LogisticRegression`s accuracy is %f' %score(LogisticRegression())
    print 'SVC`s accuracy is %f' %score(SVC())
    print 'LinearSVC`s accuracy is %f' %score(LinearSVC())
    print 'NuSVC`s accuracy is %f' %score(NuSVC())
"""  
file = '/Users/genghaiyang/git/sina_weibo_crawler/data/data10/smallS_pos_neg.csv'
blog = pd.read_csv(file, header='infer', sep=',')
#pos_review =  [tp.segmentation(blog) for blog in blog['mc'][blog.value=='p']]
neg_review = [tp.segmentation(text) for text in blog['mc'][blog.value == 'n']]
pos_neg_review_path = '/Users/genghaiyang/ghy_works/projects/weibo_crawler/textmining/sentiML/pos_neg_blog'
#pickle.dump(pos_review, open(pos_neg_review_path+'/'+'pos_review'+'.pkl','wb'))
pickle.dump(neg_review, open(pos_neg_review_path+'/'+'neg_review'+'.pkl','wb'))
"""
dimension = ['500','1000','1500','2000','2500','3000']
for d in dimension:
    word_scores = create_word_scores_bigram()
    best_words = find_best_words(word_scores, int(d))

    posFeatures = pos_features(best_word_features)
    negFeatures = neg_features(best_word_features)


    train = posFeatures[174:]+negFeatures[174:]
    devtest = posFeatures[124:174]+negFeatures[124:174]
    test = posFeatures[:124]+negFeatures[:124]
    dev, tag_dev = zip(*devtest)

    print 'Feature number %f' %d
    print 'BernoulliNB`s accuracy is %f' %score(BernoulliNB())
    print 'MultinomiaNB`s accuracy is %f' %score(MultinomialNB())
    print 'LogisticRegression`s accuracy is %f' %score(LogisticRegression())
    print 'SVC`s accuracy is %f' %score(SVC())
    print 'LinearSVC`s accuracy is %f' %score(LinearSVC())
    print 'NuSVC`s accuracy is %f' %score(NuSVC())
"""
file = '/Users/genghaiyang/git/sina_weibo_crawler/data/data10/smallS_pos_neg.csv'
blog = pd.read_csv(file, header='infer', sep=',')
#pos_review =  [tp.segmentation(blog) for blog in blog['mc'][blog.value=='p']]
neg_review = [tp.segmentation(blog) for blog in blog['mc'][blog.value == 'n']]
pos_neg_review_path = '/Users/genghaiyang/ghy_works/projects/weibo_crawler/textmining/sentiML/pos_neg_blog'
#pickle.dump(pos_review, open(pos_neg_review_path+'/'+'pos_review'+'.pkl','wb'))
pickle.dump(neg_review,
            open(pos_neg_review_path + '/' + 'neg_review' + '.pkl', 'wb'))
Example #27
def sentiment_score_list(oneblog):
    cuted_data = []
    for sen in tp.cut_sentence(oneblog):
        #print sen
        cuted_data.append(sen)
    count1 = []
    count2 = []
    for sent in cuted_data:  # iterate over every clause of the post
        segtmp = tp.segmentation(sent)  # segment the clause into a word list
        #segtmp = list(set(segtmp))  # dedupe, so a repeated sentiment word is not counted twice
        #print segtmp
        i = 0  # position of the word being scanned
        a = 0  # position of the last sentiment word
        poscount = 0  # initial positive score
        poscount2 = 0  # positive score after negation flips
        poscount3 = 0  # final positive score (including "!" bonuses)
        negcount = 0
        negcount2 = 0
        negcount3 = 0
        for word in segtmp:
            if word in posdict:  # is this word a positive sentiment word?
                poscount += 1
                c = 0
                for w in segtmp[a:i]:  # scan the degree words before the sentiment word
                    if w in mostdict:
                        poscount *= 4.0
                    elif w in verydict:
                        poscount *= 3.0
                    elif w in moredict:
                        poscount *= 2.0
                    elif w in ishdict:
                        poscount /= 2.0
                    elif w in insufficientdict:
                        poscount /= 4.0
                    elif w in inversedict:
                        c += 1
                if judgeodd(c) == 'odd':  # an odd number of negation words flips the polarity
                    poscount *= -1.0
                    poscount2 += poscount
                    poscount = 0
                    poscount3 = poscount + poscount2 + poscount3
                    poscount2 = 0
                else:
                    poscount3 = poscount + poscount2 + poscount3
                    poscount = 0
                a = i + 1  # update the sentiment word position
            elif word in negdict:  # negative sentiment, handled symmetrically
                negcount += 1
                d = 0
                for w in segtmp[a:i]:
                    if w in mostdict:
                        negcount *= 4.0
                    elif w in verydict:
                        negcount *= 3.0
                    elif w in moredict:
                        negcount *= 2.0
                    elif w in ishdict:
                        negcount /= 2.0
                    elif w in insufficientdict:
                        negcount /= 4.0
                    elif w in inversedict:
                        d += 1
                if judgeodd(d) == 'odd':
                    negcount *= -1.0
                    negcount2 += negcount
                    negcount = 0
                    negcount3 = negcount + negcount2 + negcount3
                    negcount2 = 0
                else:
                    negcount3 = negcount + negcount2 + negcount3
                    negcount = 0
                a = i + 1
            elif word == '!'.decode('utf8') or word == '!'.decode('utf8'):  # does the clause end with "!"?
                for w2 in segtmp[::-1]:  # scan backwards for a sentiment word; add 2 to the weight, then stop
                    if w2 in posdict or w2 in negdict:
                        poscount3 += 2
                        negcount3 += 2
                        break
            i += 1  # advance the scan position

        # guard against negative totals
        pos_count = 0
        neg_count = 0
        if poscount3 < 0 and negcount3 > 0:
            neg_count += negcount3 - poscount3
            pos_count = 0
        elif negcount3 < 0 and poscount3 > 0:
            pos_count = poscount3 - negcount3
            neg_count = 0
        elif poscount3 < 0 and negcount3 < 0:
            neg_count = -poscount3
            pos_count = -negcount3
        else:
            pos_count = poscount3
            neg_count = negcount3

        count1.append([pos_count, neg_count])
    count2.append(count1)
    count1 = []
    return count2