Ejemplo n.º 1
0
def doVectoringComments():
    """Vectorize every labelled comment and store the vectors in the database.

    Steps:
    1. Read all labelled comments with ``SignedCommentsDbHandler.queryAll()``.
    2. Turn each comment into a feature vector with
       ``Vectorizer.vectoringOneComment``.
    3. Insert each vector with
       ``VectorizedCommentsDbHandler.insertVectorizedComment``.

    The vectors are stored exactly as the vectorizer returns them; no
    normalization is applied here.  A running count is printed for every
    comment and the elapsed wall-clock time is printed at the end.
    """
    fr = time.time()
    # Handler for the table of labelled comments.
    signedCommentsHandler = SignedCommentsDbHandler()
    # Handler for the table of vectorized comments.
    vectorizedCommentsDbHandler = VectorizedCommentsDbHandler()
    # All labelled comment rows.
    signedComments = signedCommentsHandler.queryAll()
    vectorizer = Vectorizer()
    # Vectorize and persist one comment at a time; the vectors are not kept
    # in memory because nothing downstream reads them.
    for count, signedComment in enumerate(signedComments, 1):
        print(count)
        # Columns 1, 2, 3, 6, 8, 9 are presumably comment id, title, content,
        # rating, user name and app id -- TODO confirm against the table schema.
        vectorizedComment = vectorizer.vectoringOneComment(
            signedComment[1], signedComment[2], signedComment[3],
            signedComment[6], signedComment[8], signedComment[9])
        vectorizedCommentsDbHandler.insertVectorizedComment(vectorizedComment)
    # NOTE: feature normalization is currently disabled.  To re-enable it,
    # collect the vectors in a list, pass np.array(vectors, float) through
    # normalizing(), and insert the rows of the result instead of the raw
    # vectors above.
    to = time.time()
    print("用时:%f" % (to - fr))
Ejemplo n.º 2
0
def load_useful_data():
    """Load a class-balanced, shuffled sample of labelled comments.

    Spam and non-spam comments are fetched separately and shuffled, then both
    lists are cut to the length of the shorter one so that each class
    contributes the same number of rows.  The combined sample is shuffled
    once more.

    Returns:
        A ``UsefulData`` object whose ``content_array`` holds column 3 of
        every selected row and whose ``isSpam_array`` holds column 10, both
        as NumPy arrays in matching order.
    """
    signedCommentDbHandler = SignedCommentsDbHandler()

    spamCommentsList = list(signedCommentDbHandler.querySpam())
    notSpamCommentsList = list(signedCommentDbHandler.queryNotSpam())
    random.shuffle(spamCommentsList)
    random.shuffle(notSpamCommentsList)

    # min() instead of the `a <= b and a or b` idiom: with an empty spam list
    # the idiom evaluated to len(notSpamCommentsList) (0 is falsy), which
    # produced an unbalanced sample and a wrong count in the message below.
    minLen = min(len(spamCommentsList), len(notSpamCommentsList))
    usedDataList = spamCommentsList[:minLen] + notSpamCommentsList[:minLen]
    random.shuffle(usedDataList)

    result = UsefulData()
    # Column 3 is presumably the comment text and column 10 the spam label
    # -- TODO confirm against the table schema.
    result.content_array = np.array([usedData[3] for usedData in usedDataList])
    result.isSpam_array = np.array([usedData[10] for usedData in usedDataList])

    print(result.content_array.shape, result.isSpam_array.shape)
    print("load %d useful data." % (minLen * 2))
    return result
Ejemplo n.º 3
0
    def __init__(self):
        """Load the jieba user dictionaries and build in-memory lookup tables.

        Three private dictionaries are built from the labelled comments
        (comment id -> row, user name -> list of comment ids, content ->
        number of occurrences) and one from the app table
        (column 0 -> column 2 of each app row).
        """
        for dictFile in ("user_defined_dict.txt", "sogoupinyin_dict.txt"):
            jieba.load_userdict(os.path.join(config.DICT_PATH, dictFile))

        commentsDb = SignedCommentsDbHandler()
        appsDb = AppleAppDbHandler()

        # Column 0 / column 2 of an app row are presumably the app id and its
        # mean rating (going by the attribute name) -- TODO confirm.
        ratingByApp = {}
        for appRow in appsDb.queryAll():
            ratingByApp[appRow[0]] = appRow[2]
        self.__appId_rating_mean_dict = ratingByApp

        rowById = {}          # {comment_id: signedComment}
        idsByUser = {}        # {user_name: [comment_id, comment_id, ...]}
        timesSeen = {}        # {content: count}
        for row in commentsDb.queryAll():
            commentId = row[1]
            rowById[commentId] = row
            # setdefault creates the per-user list on first sight of the user.
            idsByUser.setdefault(row[8], []).append(commentId)
            timesSeen[row[3]] = timesSeen.get(row[3], 0) + 1
        self.__commentId_comment_dict = rowById
        self.__userName_commentIds_dict = idsByUser
        self.__content_count_dict = timesSeen
Ejemplo n.º 4
0
def doVectoringContent():
    """Vectorize the labelled comments' content and store every result row.

    All rows returned by ``SignedCommentsDbHandler.queryAll()`` are loaded
    into a NumPy array, columns 1 and 3 are selected and printed together
    with their shape, and the selection is passed to ``vectoringContent``
    with 25 as its first argument.  Each item of the result is inserted via
    ``VectorizedContentsDbHandler.insertVectorizedContent``.
    """
    commentsDb = SignedCommentsDbHandler()
    contentsDb = VectorizedContentsDbHandler()

    allRows = np.array(commentsDb.queryAll())
    # Columns 1 and 3 are presumably comment id and content -- TODO confirm.
    selectedColumns = allRows[:, (1, 3)]
    print(selectedColumns, selectedColumns.shape)

    for vectorRow in vectoringContent(25, selectedColumns):
        contentsDb.insertVectorizedContent(vectorRow)
Ejemplo n.º 5
0
def showDistributionOfRating():
    """Plot the share of each rating among spam and non-spam comments.

    Reads all rows from ``SignedCommentsDbHandler().queryAll()``.  A row is
    counted as spam when its last column equals 1, otherwise as non-spam;
    column 6 is used as a 1-based index into five rating buckets.  For each
    class the bucket counts are converted to percentages of that class and
    drawn as side-by-side bars (red for spam, green for non-spam).

    A class that has no comments is drawn as all-zero bars instead of
    raising ``ZeroDivisionError``.
    """
    commentsList = SignedCommentsDbHandler().queryAll()
    countRatingSpam, countRatingNotSpam = [0] * 5, [0] * 5
    for comment in commentsList:
        if comment[-1] == 1:
            countRatingSpam[comment[6] - 1] += 1
        else:
            countRatingNotSpam[comment[6] - 1] += 1

    # Convert counts to percentages per class.  The y axis is labelled in
    # percent, so the values are scaled to 0..100 rather than left as
    # fractions in 0..1.
    for ratingCounts in (countRatingSpam, countRatingNotSpam):
        classTotal = sum(ratingCounts)
        if classTotal == 0:
            # No comment in this class: keep the zeros, avoid dividing by 0.
            continue
        for i in range(5):
            ratingCounts[i] = ratingCounts[i] * 100.0 / classTotal

    plt.figure(1)  # draw on figure number 1
    # Spam bars sit slightly left of each integer rating, non-spam slightly
    # right, so the two series do not overlap.
    x1 = [0.8, 1.8, 2.8, 3.8, 4.8]
    plt.bar(x1, countRatingSpam, 0.4, color="#FF4040", label='spam comment')

    x2 = [1.2, 2.2, 3.2, 4.2, 5.2]
    plt.bar(x2,
            countRatingNotSpam,
            0.4,
            color="#43CD80",
            label='not spam comment')
    plt.legend(loc='upper left')
    plt.title("rating distribution")
    plt.xlabel("rating")
    plt.ylabel('persent/%')
    plt.grid(color='b', linewidth='0.2', linestyle='--', axis='y')
    plt.show()
Ejemplo n.º 6
0
def getMaxLengthOfContentLength():
    """Return the largest ``len()`` of column 3 over all labelled comments.

    Rows come from ``SignedCommentsDbHandler().queryAll()``.  Returns 0 when
    there are no rows.
    """
    allRows = SignedCommentsDbHandler().queryAll()
    # default=0 reproduces the "no rows" result of a running maximum that
    # starts at 0.
    return max((len(row[3]) for row in allRows), default=0)
Ejemplo n.º 7
0
def showPersentOfSpamAndNotSpam():
    """Plot the percentage of non-spam and spam comments as two bars.

    Reads all rows from ``SignedCommentsDbHandler().queryAll()``; a row is
    counted as spam when its last column equals 1, otherwise as non-spam.
    The non-spam bar is drawn at x=0 and the spam bar at x=1, each annotated
    with its percentage.  The two percentages are also printed.

    With no comments at all both percentages are 0 instead of raising
    ``ZeroDivisionError``.
    """
    commentsList = SignedCommentsDbHandler().queryAll()
    countSpam, countNotSpam = 0., 0.
    for comment in commentsList:
        if comment[-1] == 1:
            countSpam += 1
        else:
            countNotSpam += 1
    total = countNotSpam + countSpam
    x = [0]
    xx = [1]
    y = [countNotSpam / total * 100 if total else 0.]
    yy = [countSpam / total * 100 if total else 0.]
    print(y, yy)
    plt.bar(x, y, 0.4, color="#43CD80", label='not spam comment')
    plt.bar(xx, yy, 0.4, color="#FF4040", label='spam comment')
    plt.legend(loc='upper right')
    # Tick labels must follow the bar positions: x=0 is the non-spam bar and
    # x=1 the spam bar (they were previously swapped).
    plt.xticks([0, 1], ['notSpam', 'spam'])
    plt.ylabel("persent/%")
    plt.xlabel("isOrNotSpam")
    plt.title("persent of spam and notspam comments")
    plt.axis([-1, 2, 0, 100])
    # Write each bar's value just above it.
    for a, b in zip(x, y):
        plt.text(a, b + 1, '%.2f' % b, ha='center', va='bottom', fontsize=11)
    for a, b in zip(xx, yy):
        plt.text(a, b + 1, '%.2f' % b, ha='center', va='bottom', fontsize=11)
    plt.grid(color='b', linewidth='0.2', linestyle='--', axis='y')
    plt.show()
Ejemplo n.º 8
0
class XlsxToDb(object):
    """Import labelled comments from per-app .xlsx workbooks into the database."""

    def __init__(self):
        self.__signedCommentsDbHandler = SignedCommentsDbHandler()
        self.__appleAppHandler = AppleAppDbHandler()

    def executeOneApp(self, fileName):
        """Import the comments of one workbook into the database.

        The workbook is read from
        ``<config.RESOURCES_PATH>/signedComments/<fileName>`` and only its
        first worksheet is used.  Cell A2 supplies the app id.  For every row
        from 2 to the last row, columns 2..11 are read, the app id is
        inserted at index 9 of that list, and the list is passed to
        ``insertSignedComment``.  The number of imported rows is printed.

        If the workbook cannot be loaded, a message is printed and the method
        returns without importing anything (best-effort: one bad file does
        not stop a batch import).

        Args:
            fileName: File name of the workbook, relative to the
                ``signedComments`` resource directory.
        """
        print(fileName)
        try:
            filePath = os.path.join(config.RESOURCES_PATH, 'signedComments',
                                    fileName)
            wb = load_workbook(filename=filePath)
            # First worksheet; replaces the deprecated
            # get_sheet_by_name(get_sheet_names()[0]).
            ws = wb.worksheets[0]
        except FileNotFoundError:
            print("未找到" + fileName + "文件")
            return
        except Exception as err:
            # Still skip the file, but do not report a corrupt or empty
            # workbook as "not found".
            print("failed to load %s: %s" % (fileName, err))
            return
        appId = ws.cell(row=2, column=1).value

        importedCount = 0
        for row in range(2, ws.max_row + 1):
            rowValues = [ws.cell(row=row, column=col).value
                         for col in range(2, 12)]
            # The ten cell values occupy indices 0..9; the app id goes in at
            # index 9, shifting the last cell value to index 10.
            rowValues.insert(9, appId)
            self.__signedCommentsDbHandler.insertSignedComment(rowValues)
            importedCount += 1

        # Number of rows imported from this workbook.
        print('Total:%d' % importedCount)

    def executeAllApp(self):
        """Import the workbook of every app returned by the app table.

        The workbook name of an app is ``<app[0]>_<app[1]>.xlsx``.  The number
        of apps is printed first.
        """
        appList = list(self.__appleAppHandler.queryAll())
        print(len(appList))
        for app in appList:
            self.executeOneApp(app[0] + "_" + app[1] + ".xlsx")
Ejemplo n.º 9
0
def doUpdate():
    """Recompute the mean rating of every app and write it back.

    For each ``(appId, appName)`` pair from ``AppleAppDbHandler.queryAll()``
    the pair is printed, the app's comments are fetched with
    ``queryCommentsByAppId`` and the mean of column 7 over those comments is
    stored through ``updateRatingMean``.  An app without comments gets a mean
    of 0.
    """
    sHandler = SignedCommentsDbHandler()
    aHandler = AppleAppDbHandler()

    for appId, appName in aHandler.queryAll():
        print(appId, appName)
        appComments = sHandler.queryCommentsByAppId(appId)
        commentsCount = len(appComments)
        if not commentsCount:
            # No comments: store 0 rather than dividing by zero.
            aHandler.updateRatingMean(appId, 0.)
            continue
        # NOTE(review): the rating is read from column 7 here -- confirm this
        # matches the column layout returned by queryCommentsByAppId.
        ratingSum = 0.
        for row in appComments:
            ratingSum += row[7]
        aHandler.updateRatingMean(appId, float(ratingSum / commentsCount))
Ejemplo n.º 10
0
def showDistributionOfContentLength():
    """Plot how many spam and non-spam comments have each content length.

    Reads all rows from ``SignedCommentsDbHandler().queryAll()``.  The length
    of column 3 selects one of ``MaxLen`` buckets (lengths of ``MaxLen - 1``
    or more share the last bucket); a row is counted as spam when its last
    column equals 1, otherwise as non-spam.  The two histograms are drawn as
    lines (red for spam, green for non-spam).
    """
    commentsList = SignedCommentsDbHandler().queryAll()
    MaxLen = 6100
    lengthList = range(0, MaxLen)
    contentLengthOfSpam, contentLengthOfNotSpam = [0] * MaxLen, [0] * MaxLen
    for comment in commentsList:
        # Clamp with min() instead of the `cond and a or b` idiom: the idiom
        # sent zero-length content to the last bucket because 0 is falsy.
        tempLength = min(len(comment[3]), MaxLen - 1)
        if comment[-1] == 1:
            contentLengthOfSpam[tempLength] += 1
        else:
            contentLengthOfNotSpam[tempLength] += 1
    plt.figure(1)  # draw on figure number 1
    plt.plot(lengthList, contentLengthOfSpam, 'r-', label='spam comment')
    plt.plot(lengthList,
             contentLengthOfNotSpam,
             'g-',
             label='not spam comment')
    plt.legend(loc='upper right')
    plt.title("length of comment")
    plt.xlabel("length")
    plt.ylabel('count')
    plt.grid(color='b', linewidth='0.2', linestyle='--')
    plt.show()
Ejemplo n.º 11
0
 def __init__(self):
     """Create the two handler objects this instance works with.

     A ``SignedCommentsDbHandler`` and an ``AppleAppDbHandler`` are
     constructed and kept in private (name-mangled) attributes.
     """
     # Presumably the accessor for the labelled-comments table -- confirm.
     self.__signedCommentsDbHandler = SignedCommentsDbHandler()
     # Presumably the accessor for the app table -- confirm.
     self.__appleAppHandler = AppleAppDbHandler()
Ejemplo n.º 12
0
        to = time.time()
        print('init SpamDetecter using {}s'.format(to-fr))

    def doDetection(self, comment_id, title, content, rating, user_name, app_id):
        """Classify a single comment.

        The arguments are forwarded unchanged to the vectorizer's
        ``doVectoring``; the resulting feature vector is converted to a
        float32 array of shape ``(1, n_features)`` and given to the
        classifier.

        Returns:
            A tuple ``(r, p)`` where ``r`` is the result of the classifier's
            ``predict`` and ``p`` the result of its ``predict_proba`` for
            that one-row input.
        """
        featuresVector = self.__vectorizer.doVectoring(
            comment_id, title, content, rating, user_name, app_id
        )
        # Build the one-row feature matrix once and reuse it for both calls
        # instead of converting the vector twice.
        features = np.array(featuresVector, dtype=np.float32).reshape(1, -1)
        r = self.__classifier.predict(features)
        p = self.__classifier.predict_proba(features)
        return r, p

if __name__ == '__main__':
    # Demo run: classify up to 100 spam and 100 non-spam labelled comments in
    # random order and print each comment's fields, its stored label, the
    # predicted class and the predicted probability taken from
    # predict_proba[0][1].
    # Single-comment usage:
    #   spamDetecter.doDetection(comment_id, title, content, rating,
    #                            user_name, app_id)
    spamDetecter = SpamDetecter()
    dbHandler = SignedCommentsDbHandler()

    spamRows, notSpamRows = dbHandler.querySpam(), dbHandler.queryNotSpam()
    comments = list(spamRows[:100] + notSpamRows[:100])
    random.shuffle(comments)

    commentTemplate = '\n1.评论标题:{}\n2.评论内容:{}\n3.评分等级:{}\n4.评论者昵称:{}\n5.是否是垃圾评论:{}'
    predictionTemplate = '6.预测是否是垃圾评论:{}\n7.是垃圾评论的概率{}'
    for comment in comments:
        # Columns used: 1 id, 2 title, 3 content, 6 rating, 8 user name,
        # 9 app id, 10 stored label (per the doDetection parameter order and
        # the printed field names).
        print('*'*60, commentTemplate.format(
            comment[2], comment[3], comment[6], comment[8], comment[10]))
        predict_class, predict_proba = spamDetecter.doDetection(
            comment[1], comment[2], comment[3],
            comment[6], comment[8], comment[9])
        print(predictionTemplate.format(predict_class, predict_proba[0][1]))
Ejemplo n.º 13
0
 def __init__(self):
     """Create the three handler objects this instance works with.

     An ``AppCommentsDbHandler``, an ``AppleAppDbHandler`` and a
     ``SignedCommentsDbHandler`` are constructed and kept in private
     (name-mangled) attributes.
     """
     # Connect to the database through one handler per table (presumably
     # raw app comments, apps, and labelled comments -- confirm).
     self.__appCommentsHandler = AppCommentsDbHandler()
     self.__appleAppHandler = AppleAppDbHandler()
     self.__signedCommentsDbHandler = SignedCommentsDbHandler()