def __init__(self,
                 h,
                 d,
                 u,
                 p,
                 c,
                 train_per=0.8,
                 spammer_per=0.1,
                 reset_dataset=False,
                 dump=True,
                 add_unknown_into_model=False,
                 file_name_appendix=''):
        self.host = h
        self.db = d
        self.user = u
        self.passwd = p
        self.charset = c
        self.sqlhelper = SqlHelper(host=self.host,
                                   db=self.db,
                                   user=self.user,
                                   passwd=self.passwd,
                                   charset=self.charset)

        self.registerDay = MongoClient().userFeature.registerDay
        self.followCnt = MongoClient().userFeature.followCnt
        self.oriThirdFre = MongoClient().userFeature.oriThirdFre
        self.retweetFre = MongoClient().userFeature.retweetFre
        self.rvp = MongoClient().userFeature.rvp

        self.train_per = train_per
        self.spammer_per = spammer_per
        self.reset_dataset = reset_dataset
        self.dump = dump
        self.add_unknown_into_model = add_unknown_into_model
        self.file_name_appendix = file_name_appendix
Example #2
    def time_distribution():
        """
        统计时间周期分布
        :return:
        """
        sqlhelper = SqlHelper()
        res = {0: {},
               1: {},
               2: {},
               3: {},
               4: {},
               5: {},
               6: {},
               }
        for key in res.keys():
            for i in range(24):
                res[key][i] = 0

        for t in sqlhelper.select_sql('SELECT created_at FROM wblog'):
            timestamp = t[0]
            res[timestamp.weekday()][timestamp.hour] += 1
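        # The write-out below flattens each (weekday, hour) pair into a
        # single hour-of-week index in [0, 167] via weekday * 24 + hour.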

        with open('data/timestamp.txt', 'w') as my_file:
            for key in res.keys():
                for k in res[key].keys():
                    my_file.write(str(key * 24 + k) + ' ' + str(res[key][k]) + '\n')
Example #3
    def __enter__(self):
        self.sqlhelper = SqlHelper(host=self.host,
                                   db=self.db,
                                   user=self.user,
                                   passwd=self.passwd,
                                   charset=self.charset)
        self.mdb = MongoClient().wblogFeature

        self.swblog = self.sqlhelper.select_sql_one(
            'SELECT wblogId FROM swblog')
        self.wblog = self.sqlhelper.select_sql_one(
            'SELECT wblogId FROM final_wblog WHERE spammer="no"')
        self.unknown = self.sqlhelper.select_sql_one(
            'SELECT wblogId FROM wblog')
        final_wblog = self.sqlhelper.select_sql_one(
            'SELECT wblogId FROM final_wblog WHERE spammer="yes"')
        for wblogId in final_wblog:
            if wblogId not in self.swblog:
                self.swblog.append(wblogId)

        # For unknown reasons the spammer and normal sets overlap,
        # so the overlapping entries are simply treated as spammers here.
        for uid in self.swblog:
            if uid in self.wblog:
                self.wblog.remove(uid)
        # print(len(swblog))

        for uid in self.swblog:
            if uid in self.unknown:
                self.unknown.remove(uid)
        for uid in self.wblog:
            if uid in self.unknown:
                self.unknown.remove(uid)
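        # Performance note (sketch, not in the original): list.remove() inside
        # these loops is O(n) per call. An equivalent set-based filter:
        #     known = set(self.swblog) | set(self.wblog)
        #     self.unknown = [w for w in self.unknown if w not in known]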
        return self
Example #4
    def __enter__(self):
        self.sqlhelper = SqlHelper(host=self.host,
                                   db=self.db,
                                   user=self.user,
                                   passwd=self.passwd,
                                   charset=self.charset)
        self.mdb = MongoClient().userFeature
        return self
Example #5
    def profile_complete():
        """
        统计用户的主页信息完整程度
        :return:
        """
        sqlhelper = SqlHelper()
        spammer = sqlhelper.select_sql_one('SELECT uid FROM final_user WHERE spammer="yes"')
        normal = sqlhelper.select_sql_one('SELECT uid FROM final_user WHERE spammer="no"')

        cnt_dict = {}
        profile = MongoClient().profile.json_text
        for json_text in profile.find():
            uid = json_text['uid']
            if uid not in spammer and uid not in normal:
                continue
            cnt = 0
            try:
                for card in json_text['json_text']['cards']:
                    try:
                        cnt += len(card['card_group'])
                    except Exception as e:
                        pass
            except Exception as e:
                print('no cards %s' % uid)
            cnt_dict[uid] = cnt

        spammer_dict = {}
        spammer_cnt = 0
        normal_dict = {}
        normal_cnt = 0

        for key in cnt_dict.keys():
            if key in spammer:

                if cnt_dict[key] not in spammer_dict.keys():
                    spammer_dict[cnt_dict[key]] = 0
                spammer_dict[cnt_dict[key]] += 1
                spammer_cnt += 1
            else:
                if cnt_dict[key] not in normal_dict.keys():
                    normal_dict[cnt_dict[key]] = 0
                normal_dict[cnt_dict[key]] += 1
                normal_cnt += 1
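        # The two blocks below write the empirical CDF of profile completeness
        # per class: entries are sorted by completeness value and the running
        # count is divided by the class total.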
        spammer_dict = sorted(spammer_dict.items(), key=lambda x: x[0])
        with open('data/profile_complete_spammer.txt', 'w') as my_file:
            cnt = 0
            for itm in spammer_dict:
                cnt += itm[1]
                my_file.write('%s %s\n' % (str(float(itm[0])), str(float(cnt) / spammer_cnt)))

        normal_dict = sorted(normal_dict.items(), key=lambda x: x[0])
        with open('data/profile_complete_normal.txt', 'w') as my_file:
            cnt = 0
            for itm in normal_dict:
                cnt += itm[1]
                my_file.write('%s %s\n' % (str(float(itm[0])), str(float(cnt) / normal_cnt)))
Example #6
    def interact():
        """
        统计微博评论的互动情况
        :return:
        """
        sqlhelper = SqlHelper()
        swblog = sqlhelper.select_sql_one('SELECT wblogId FROM swblog')
        wblog = sqlhelper.select_sql_one('SELECT wblogId FROM wblog_choose')

        final_wblog = sqlhelper.select_sql_one('SELECT wblogId FROM final_wblog WHERE spammer="yes"')
        for wblogId in final_wblog:
            if wblogId not in swblog:
                swblog.append(wblogId)

        for wblogId in swblog:
            if wblogId in wblog:
                wblog.remove(wblogId)
        print(len(swblog) + len(wblog))

        hot = 0
        interact = 0
        hotCommentRatio = MongoClient().wblogFeature.hotCommentRatio
        commentInteractRatio = MongoClient().wblogFeature.commentInteractRatio
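        # `hot` counts wblogs whose hot-comment ratio or comment-interact
        # ratio is nonzero; it is computed first for normal wblogs, then
        # (below) for swblogs.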
        for wblogId in wblog:
            try:
                a = hotCommentRatio.find_one({'wblogId': str(wblogId)})['hot_ratio']
                b = commentInteractRatio.find_one({'wblogId': str(wblogId)})['interact_ratio']
                # if float(a) != 0:
                #     hot += 1
                # if float(b) != 0:
                #     interact += 1
                if float(a) != 0 or float(b) != 0:
                    hot += 1
            except Exception as e:
                print('%s---- %s' % (str(e), str(wblogId)))
        print()
        print(hot)
        print(len(wblog))
        print(float(hot) / len(wblog))
        print()
        hot = 0
        for wblogId in swblog:
            try:
                a = hotCommentRatio.find_one({'wblogId': str(wblogId)})['hot_ratio']
                b = commentInteractRatio.find_one({'wblogId': str(wblogId)})['interact_ratio']
                if float(a) != 0 or float(b) != 0:
                    hot += 1
            except Exception as e:
                print('%s---- %s' % (str(e), str(wblogId)))

        print(hot)
        print(len(swblog))
        print(float(hot) / len(swblog))
Example #7
    def __init__(self, h, d, u, p, c, file_name_appendix=''):
        """
        在init中将读取msca必要的数据
        """
        self.host = h
        self.db = d
        self.user = u
        self.passwd = p
        self.charset = c
        self.sqlhelper = SqlHelper(host=self.host,
                                   db=self.db,
                                   user=self.user,
                                   passwd=self.passwd,
                                   charset=self.charset)
        self.file_name_appendix = file_name_appendix

        # Read the training set and the prior labels obtained on the test set
        # user_train_dict: training set, with labels
        # user_train_list: training set, user ids only
        # user_prior_dict: test set, with ground-truth labels plus prior labels
        # user_prior_list: test set, user ids only
        self.user_train_dict, self.user_train_list, self.user_prior_dict, self.user_prior_list = \
            Alkit.read_prior('../main/prior/user_train' + self.file_name_appendix + '.txt',
                             '../main/prior/user_prior' + self.file_name_appendix + '.txt')
        self.wblog_train_dict, self.wblog_train_list, self.wblog_prior_dict, self.wblog_prior_list = \
            Alkit.read_prior('../main/prior/wblog_train' + self.file_name_appendix + '.txt',
                             '../main/prior/wblog_prior' + self.file_name_appendix + '.txt')
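        # Observed shape (inferred from usage elsewhere in this file, not
        # documented in the original):
        #   *_train_dict[id] -> {'label': '1' | '-1', ...}
        #   *_prior_dict[id] -> {'label': ..., 'prior_label': ..., 'prior': score}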

        # self.user_train_dict, self.user_train_list, self.user_prior_dict, self.user_prior_list = \
        #     Alkit.read_prior('prior_bak/user_train.txt', 'prior_bak/user_prior.txt')
        # self.wblog_train_dict, self.wblog_train_list, self.wblog_prior_dict, self.wblog_prior_list = \
        #     Alkit.read_prior('prior_bak/wblog_train.txt', 'prior_bak/wblog_prior.txt')

        # spammer: ground-truth spammer users
        # spammer_prior: users judged spammers by the prior classifier
        # normal: ground-truth normal users
        # normal_prior: users judged normal by the prior classifier
        # swblog, swblog_prior, wblog, wblog_prior: likewise for wblogs
        self.spammer, self.spammer_prior, self.normal, self.normal_prior = Alkit.setSN(
            self.user_train_dict, self.user_prior_dict)
        self.swblog, self.swblog_prior, self.nwblog, self.nwblog_prior = Alkit.setSN(
            self.wblog_train_dict, self.wblog_prior_dict)
        self.all_user = self.user_train_list + self.user_prior_list
        self.all_wblog = self.wblog_train_list + self.wblog_prior_list

        self.follow_edge = {}  # {'uid': ['followeeUid']}
        self.follow_cnt = {}  # {'uid': follow count}
        self.retweet_edge = {}  # {'uid': ['wblogId']}
        self.wblog_retweet_cnt = {}  # {wblogId: retweet count}
        self.user_retweet_cnt = {}  # {uid: retweet count}
Example #8
def _set_follow_edge(user_list, all_user):
    follow_edge = {}
    sqlhelper = SqlHelper(host='localhost',
                          db='sdh',
                          user='******',
                          passwd='root',
                          charset='utf8')
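    # Note: the query below interpolates uid directly into the SQL string;
    # if SqlHelper wraps a DB-API cursor (its internals are not shown here),
    # a parameterized call such as
    #     cursor.execute('SELECT uid, followeeUid FROM edge WHERE uid=%s', (uid,))
    # would be the safer form.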
    for uid in user_list:
        follow_edge[uid] = []
        for result in sqlhelper.select_sql(
                'SELECT uid, followeeUid FROM edge WHERE uid=%s' % uid):
            uid = str(result[0])
            followeeUid = str(result[1])
            if followeeUid not in all_user:
                continue
            follow_edge[uid].append(followeeUid)
    return follow_edge
Example #9
    def count_edge():
        sqlhelper = SqlHelper()

        cnt = 0

        spammer = sqlhelper.select_sql_one('SELECT uid FROM spammer')
        normal = sqlhelper.select_sql_one('SELECT uid FROM normal WHERE choose="yes"')
        for uid in spammer:
            if uid in normal:
                normal.remove(uid)
        all_user = spammer + normal
        print(len(all_user))
        for uid in all_user:
            for u in sqlhelper.select_sql('SELECT followeeUid FROM edge WHERE uid=%s' % str(uid)):
                if str(u[0]) in all_user:
                    cnt += 1
        print(cnt)
Example #10
def _set_retweet_edge(user_list, all_wblog):
    retweet_edge = {}
    sqlhelper = SqlHelper(host='localhost',
                          db='sdh',
                          user='******',
                          passwd='root',
                          charset='utf8')
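    # paMid / orMid appear to be the parent and original wblog ids of a
    # retweet chain (an assumption based on naming); each one that falls
    # inside all_wblog is recorded as a retweet edge for this uid.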
    for uid in user_list:
        retweet_edge[uid] = []
        for res in sqlhelper.select_sql(
                'SELECT paMid, orMid FROM wblog WHERE uid=%s' % uid):
            paMid = str(res[0])
            orMid = str(res[1])
            if paMid in all_wblog:
                retweet_edge[uid].append(paMid)
            if orMid in all_wblog:
                retweet_edge[uid].append(orMid)
    return retweet_edge
Example #11
    def __init__(self, h, d, u, p, c, file_name_appendix=''):
        """
        在init中将读取CrowdTarget必要的数据
        """
        self.host = h
        self.db = d
        self.user = u
        self.passwd = p
        self.charset = c
        self.sqlhelper = SqlHelper(host=self.host,
                                   db=self.db,
                                   user=self.user,
                                   passwd=self.passwd,
                                   charset=self.charset)
        self.file_name_appendix = file_name_appendix
        # Read the training set and the prior labels obtained on the test set
        # user_train_dict: training set, with labels
        # user_train_list: training set, user ids only
        # user_prior_dict: test set, with ground-truth labels plus prior labels
        # user_prior_list: test set, user ids only

        self.user_train_dict, self.user_train_list, self.user_prior_dict, self.user_prior_list = \
            Alkit.read_prior('../main/prior/user_train' + self.file_name_appendix + '.txt',
                             '../main/prior/user_prior' + self.file_name_appendix + '.txt')
        self.wblog_train_dict, self.wblog_train_list, self.wblog_prior_dict, self.wblog_prior_list = \
            Alkit.read_prior('../main/prior/wblog_train' + self.file_name_appendix + '.txt',
                             '../main/prior/wblog_prior' + self.file_name_appendix + '.txt')

        # spammer: ground-truth spammer users
        # spammer_prior: users judged spammers by the prior classifier
        # normal: ground-truth normal users
        # normal_prior: users judged normal by the prior classifier
        # swblog, swblog_prior, wblog, wblog_prior: likewise for wblogs
        self.spammer, self.spammer_prior, self.normal, self.normal_prior = Alkit.setSN(
            self.user_train_dict, self.user_prior_dict)
        self.swblog, self.swblog_prior, self.nwblog, self.nwblog_prior = Alkit.setSN(
            self.wblog_train_dict, self.wblog_prior_dict)
        self.all_user = self.user_prior_list
        self.all_wblog = self.wblog_train_list + self.wblog_prior_list

        self.mdb = MongoClient().crowd_target  # originally crowd_target; I misspelled my database name, hence crow_target
        self.sqlhelper = SqlHelper()
Example #12
    def tongi():
        """
        各种统计
        :return:
        """
        sqlhelper = SqlHelper()
        spammer = sqlhelper.select_sql_one('SELECT uid FROM spammer')
        normal = sqlhelper.select_sql_one('SELECT uid FROM normal WHERE choose="yes"')

        final_user = sqlhelper.select_sql_one('SELECT uid FROM final_user WHERE spammer="yes"')
        for uid in final_user:
            if uid not in spammer:
                spammer.append(uid)

        for uid in spammer:
            if uid in normal:
                normal.remove(uid)
        print(len(spammer))
        print(len(normal))
Example #13
    def count_comment():
        sqlhelper = SqlHelper()

        comment = {}
        col = MongoClient().wblog.wblog
        # i = 0
        for wblogId in sqlhelper.select_sql('SELECT wblogId FROM wblog'):
            wblogId = wblogId[0]
            cnt = 0
            try:
                wblog = col.find_one({'wblogId': str(wblogId)})['json_text']
                cnt = int(wblog['comments_count'])
                # print(cnt)
            except Exception as e:
                print(e)

            if cnt not in comment.keys():
                comment[cnt] = 1
            else:
                comment[cnt] += 1
            # i += 1
            # if i == 100:
            #     break
        # cnt = []
        # for i in range(10000):
        #     cnt.append(i)
        # comment_cnt = init_dict(cnt, 0)
        #
        # calculate_cnt(comment_cnt, comment)

        write_dict_cnt_to_txt(comment, 'data\\comment_cnt.txt')
        """
        0 615501
        1 120480
        2 74059
        3 47064
        4 37356
        5 29747
        6 25166
        """

        sqlhelper.close()
Example #14
def _set_tweet_edge(user_list_split, all_wblog):
    tweet_edge = {}
    sqlhelper = SqlHelper(host='localhost',
                          db='sdh',
                          user='******',
                          passwd='root',
                          charset='utf8')
    for uid in user_list_split:
        tweet_edge[uid] = []
        for res in sqlhelper.select_sql(
                'SELECT wblogId FROM wblog WHERE uid=%s' % uid):
            wblogId = str(res[0])
            if wblogId in all_wblog:
                tweet_edge[uid].append(wblogId)
        for res in sqlhelper.select_sql(
                'SELECT wblogId FROM swblog WHERE uid=%s' % uid):
            wblogId = str(res[0])
            if wblogId in all_wblog:
                tweet_edge[uid].append(wblogId)
    return tweet_edge
Example #15
    def __init__(self, h, d, u, p, c, train_per=0.8, spam_per=0.1, reset_dataset=False, dump=True,
                 add_unknown_into_model=False,file_name_appendix=''):
        self.host = h
        self.db = d
        self.user = u
        self.passwd = p
        self.charset = c
        self.sqlhelper = SqlHelper(host=self.host, db=self.db, user=self.user, passwd=self.passwd, charset=self.charset)

        self.commentSimilarity = MongoClient().wblogFeature.commentSimilarity
        self.sentimentSimilarity = MongoClient().wblogFeature.sentimentSimilarity
        self.commentInteractRatio = MongoClient().wblogFeature.commentInteractRatio
        self.hotCommentRatio = MongoClient().wblogFeature.hotCommentRatio

        self.train_per = train_per
        self.spam_per = spam_per
        self.reset_dataset = reset_dataset
        self.dump = dump
        self.add_unknown_into_model = add_unknown_into_model

        self.file_name_appendix = file_name_appendix
Example #16
    def setFF(self):
        """
        :return: none
        """
        col = self.mdb.followCnt
        sqlhelper = SqlHelper()
        # spammer = sqlhelper.select_sql_one('SELECT uid FROM final_user WHERE spammer="yes"')
        # normal = sqlhelper.select_sql_one('SELECT uid FROM final_user WHERE spammer="no"')

        # cnt_dict = {}
        # profile = MongoClient().profile.json_text
        # for json_text in profile.find():
        #     uid = json_text['uid']
        #     if uid not in spammer and uid not in normal:
        #         continue
        #     cnt = 0
        #     try:
        #         for card in json_text['json_text']['cards']:
        #             try:
        #                 cnt += len(card['card_group'])
        #             except Exception as e:
        #                 pass
        #     except Exception as e:
        #         print('no cards %s' % uid)
        #     cnt_dict[uid] = cnt
        # for key in cnt_dict.keys():
        #     col.update({'uid': str(key)}, {'$set': {'profile': cnt_dict[key]}})
        #
        # followCnt = MongoClient().userFeature.followCnt
        # for user in followCnt.find():
        #     uid = user['uid']
        #     try:
        #         followee_cnt = followCnt.find_one({'uid': str(uid)})['followee_cnt']
        #         follower_cnt = followCnt.find_one({'uid': str(uid)})['follower_cnt']
        #         res = float(followee_cnt) / follower_cnt
        #         col.update({'uid': str(uid)}, {'$set': {'ff': res}})
        #     except Exception as e:
        #         print('no cards %s' % uid)

        uu = MongoClient().profile.user
        for user in uu.find():
            uid = user['uid']
            # if uid in spammer
            try:
                if uu.find_one({'uid': str(uid)
                                })['json_text']['description'] != '':
                    col.update({'uid': str(uid)}, {'$set': {'description': 1}})
                else:
                    col.update({'uid': str(uid)}, {'$set': {'description': 0}})
            except Exception as e:
                print('no description %s' % uid)
Example #17
    def __init__(self, h, d, u, p, c, file_name_appendix=''):
        self.host = h
        self.db = d
        self.user = u
        self.passwd = p
        self.charset = c
        self.sqlhelper = SqlHelper(host=self.host, db=self.db, user=self.user, passwd=self.passwd, charset=self.charset)
        self.file_name_appendix = file_name_appendix

        self.user_train_dict, self.user_train_list, self.user_prior_dict, self.user_prior_list = \
            Alkit.read_prior('../main/prior/user_train' + self.file_name_appendix + '.txt',
                             '../main/prior/user_prior' + self.file_name_appendix + '.txt')
        self.spammer, self.spammer_prior, self.normal, self.normal_prior = Alkit.setSN(self.user_train_dict,
                                                                                       self.user_prior_dict)
        self.seed_worker = []
        for uid in self.user_train_dict.keys():
            if self.user_train_dict[uid]['label'] == '1':
                self.seed_worker.append(uid)
        self.other_worker = []
        for uid in self.user_prior_dict.keys():
            if self.user_prior_dict[uid]['label'] == '1':
                self.other_worker.append(uid)
        self.normal = []
        for uid in self.user_prior_dict.keys():
            if self.user_prior_dict[uid]['label'] == '-1':
                self.normal.append(uid)

        self.all_user = self.seed_worker + self.other_worker + self.normal

        self.follow_edge = []

        for uid in self.all_user:
            for result in self.sqlhelper.select_sql('SELECT uid, followeeUid FROM edge WHERE uid=%s' % uid):
                uid = str(result[0])
                followeeUid = str(result[1])
                if followeeUid not in self.all_user:
                    continue
                self.follow_edge.append((uid, followeeUid))
Example #18
    def works():
        """
        Count how many tasks each crowdturfing worker took part in
        :return:
        """
        sqlhelper = SqlHelper()
        w = {}
        for res in sqlhelper.select_sql('SELECT woUid FROM works1516'):
            woUid = res[0]
            if woUid not in w:
                w[woUid] = 0
            w[woUid] += 1
        w_cnt = {}
        for woUid in w.keys():
            cnt = w[woUid]
            if cnt not in w_cnt:
                w_cnt[cnt] = 0
            w_cnt[cnt] += 1
        w_cnt = sorted(w_cnt.items(), key=lambda x: x[0])
        with open('data/works.txt', 'w') as my_file:
            my_file.write('cnt workers\n')
            for itm in w_cnt:
                my_file.write('%s %s\n' % (str(itm[0]), str(itm[1])))
Example #19
    def __init__(self, h, d, u, p, c, file_name_appendix=''):
        """
        在init中将读取S3MCD必要的数据
        """
        self.host = h
        self.db = d
        self.user = u
        self.passwd = p
        self.charset = c
        self.sqlhelper = SqlHelper(host=self.host,
                                   db=self.db,
                                   user=self.user,
                                   passwd=self.passwd,
                                   charset=self.charset)
        self.file_name_appendix = file_name_appendix
        # Read the training set and the prior labels obtained on the test set
        # user_train_dict: training set, with labels
        # user_train_list: training set, user ids only
        # user_prior_dict: test set, with ground-truth labels plus prior labels
        # user_prior_list: test set, user ids only
        self.user_train_dict, self.user_train_list, self.user_prior_dict, self.user_prior_list = \
            Alkit.read_prior('../main/prior/user_train' + self.file_name_appendix + '.txt',
                             '../main/prior/user_prior' + self.file_name_appendix + '.txt')
        self.wblog_train_dict, self.wblog_train_list, self.wblog_prior_dict, self.wblog_prior_list = \
            Alkit.read_prior('../main/prior/wblog_train' + self.file_name_appendix + '.txt',
                             '../main/prior/wblog_prior' + self.file_name_appendix + '.txt')

        # spammer: ground-truth spammer users
        # spammer_prior: users judged spammers by the prior classifier
        # normal: ground-truth normal users
        # normal_prior: users judged normal by the prior classifier
        # swblog, swblog_prior, wblog, wblog_prior: likewise for wblogs
        self.spammer, self.spammer_prior, self.normal, self.normal_prior = Alkit.setSN(
            self.user_train_dict, self.user_prior_dict)
        self.swblog, self.swblog_prior, self.nwblog, self.nwblog_prior = Alkit.setSN(
            self.wblog_train_dict, self.wblog_prior_dict)
        self.all_user = self.user_prior_list
        self.all_wblog = self.wblog_prior_list

        self.follow_edge = {}  # {'uid': ['followeeUid']}
        self.tweet_edge = {}  # {'uid': ['wblogId']}
        self.wblog_content = {}  # {'wblogId': [content]}

        self.pattern_html = re.compile(r'<[^>]+>', re.S)
        self.pattern_tag = re.compile(r'#.+#', re.S)
Example #20
    def count_wblog():
        sqlhelper = SqlHelper()

        wblog = {}

        for user in sqlhelper.select_sql_one('SELECT uid FROM user'):
            wblog[str(user)] = 0
            tmp = sqlhelper.select_cnt('SELECT count(*) FROM swblog WHERE uid=%s' % (str(user)))
            # print(tmp)
            if tmp:
                wblog[str(user)] += int(tmp)
            tmp = sqlhelper.select_cnt('SELECT count(*) FROM wblog WHERE uid=%s' % (str(user)))
            # print(tmp)
            if tmp:
                wblog[str(user)] += int(tmp)

        write_dict_to_txt(wblog, 'data\\wblog.txt')
        """
        1751565235 42
        5136420870 0
        3106192681 24
        3203825104 0
        2126474562 8
        2324752481 57
        """

        cnt = []
        for i in range(10000):
            cnt.append(i)
        wblog_cnt = init_dict(cnt, 0)

        calculate_cnt(wblog_cnt, wblog)

        write_dict_cnt_to_txt(wblog_cnt, 'data\\wblog_cnt.txt')
        """
        0 7938
        1 532
        2 336
        3 249
        4 189
        5 169
        6 151
        """

        sqlhelper.close()
Example #21
class UserClassify(object):
    def __init__(self,
                 h,
                 d,
                 u,
                 p,
                 c,
                 train_per=0.8,
                 spammer_per=0.1,
                 reset_dataset=False,
                 dump=True,
                 add_unknown_into_model=False,
                 file_name_appendix=''):
        self.host = h
        self.db = d
        self.user = u
        self.passwd = p
        self.charset = c
        self.sqlhelper = SqlHelper(host=self.host,
                                   db=self.db,
                                   user=self.user,
                                   passwd=self.passwd,
                                   charset=self.charset)

        self.registerDay = MongoClient().userFeature.registerDay
        self.followCnt = MongoClient().userFeature.followCnt
        self.oriThirdFre = MongoClient().userFeature.oriThirdFre
        self.retweetFre = MongoClient().userFeature.retweetFre
        self.rvp = MongoClient().userFeature.rvp
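        # These five collections hold per-user features keyed by uid
        # (registration age, follow counts, third-party-client frequency,
        # retweet frequency, rvp ratio); load_data below reads them back
        # through Alkit.load_data_help.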

        self.train_per = train_per
        self.spammer_per = spammer_per
        self.reset_dataset = reset_dataset
        self.dump = dump
        self.add_unknown_into_model = add_unknown_into_model
        self.file_name_appendix = file_name_appendix

    def run(self):
        """
        从数据库中读取特征数据,并使用svm和lr分类

        水军占比例(max): 0.2325521503991759
        spammer_per <= 0.2325521503991759



        :return:
        """

        if not self.add_unknown_into_model:
            # First split users into training-set and test-set users
            spammer = self.sqlhelper.select_sql_one('SELECT uid FROM spammer')
            normal = self.sqlhelper.select_sql_one(
                'SELECT uid FROM normal WHERE choose="yes"')
            # unknown = self.sqlhelper.select_sql_one('SELECT uid FROM normal WHERE choose="not"')

            final_user = self.sqlhelper.select_sql_one(
                'SELECT uid FROM final_user WHERE spammer="yes"')
            """
            final_user: 3843个用户, 水军903, 非水军2940
            normal: 13906个用户, 水军和非水军未知,为此我们通过人工的方法从从这些用户中挑选了一些正常的用户,标记为choose='yes'
            spammer: 892个水军用户
    
            """
            for uid in final_user:
                if uid not in spammer:
                    spammer.append(uid)
            """
            到这为止, 代码中spammer相当于数据表里spammer U final_user.spammer一共有903
            """

            # For unknown reasons the spammer and normal sets overlap,
            # so the overlapping users are simply treated as spammers here.
            for uid in spammer:
                if uid in normal:
                    normal.remove(uid)
                # if uid in unknown:
                #     unknown.remove(uid)
            """
            到目前为止,我们得到了下面几个有用的东西
            spammer: 水军  
            normal: 正常用户
            unkonwn:还没来得及标注的未知类型用户
            """
            logging.info('spammer share in raw data (max): %s' %
                         (len(spammer) * 1.0 / (len(normal) + len(spammer))))
            if self.spammer_per > len(spammer) * 1.0 / (len(normal) +
                                                        len(spammer)):
                logging.info(
                    'we don\'t have so much spammers in our datasets, we will keep original percentage'
                )
            else:
                expected_spammer_number = int(self.spammer_per * len(normal) *
                                              1.0 / (1 - self.spammer_per))
                spammer = random.sample(spammer, expected_spammer_number)
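                # Derivation (sketch, not in the original): keeping all
                # N = len(normal) users and requiring spammers to form a
                # fraction p = spammer_per of the final set gives
                # S / (S + N) = p, i.e. S = p * N / (1 - p), which is the
                # expected_spammer_number computed above.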

            # print(len(spammer))
            if self.reset_dataset:
                train_user_set = random.sample(
                    spammer, int(
                        len(spammer) * self.train_per)) + random.sample(
                            normal, int(len(normal) * self.train_per))
                test_user_set = list(
                    set(spammer + normal).difference(train_user_set))
                # # phase-2 improvement code
                # train_user_set_without_unknown = random.sample(spammer, int(len(spammer) * train_per)) + random.sample(normal, int(len(normal) * train_per))
                # train_user_set_with_unknown = random.sample(spammer, int(len(spammer) * train_per)) + random.sample(normal, int(
                #     len(normal) * train_per))+random.sample(unknown, len(unknown))
                # test_user_set = list(set(spammer + normal).difference(train_user_set_without_unknown))
                # train_user_set=train_user_set_with_unknown+train_user_set_with_unknown
            else:
                train_user_set, test_user_set = Alkit.read_dataset(
                    '../main/prior/user_train' + self.file_name_appendix +
                    '.txt', '../main/prior/user_prior' +
                    self.file_name_appendix + '.txt')

            # Log some information about the training and test sets
            logging.info('total dataset size: %s' %
                         (len(train_user_set) + len(test_user_set)))
            logging.info('training set size: %s' % len(train_user_set))
            logging.info(
                'positives (spammer) in training set: %s' %
                len(list(set(train_user_set).intersection(set(spammer)))))
            logging.info(
                'negatives (normal) in training set: %s' %
                len(list(set(train_user_set).intersection(set(normal)))))
            # logging.info('unknown-label users in training set: %s' % len(list(set(unknown))))
            logging.info('test set size: %s' % len(test_user_set))
            logging.info(
                'positives (spammer) in test set: %s' %
                len(list(set(test_user_set).intersection(set(spammer)))))
            logging.info(
                'negatives (normal) in test set: %s' %
                len(list(set(test_user_set).intersection(set(normal)))))
            logging.info('spammer share: %s' % (len(spammer) * 1.0 /
                                                (len(normal) + len(spammer))))
            """
            测试集参与训练,但是测试集在模型训练期间标签将按照unknown处理
            """
        else:
            raise NotImplementedError('we will implement this later.')

        # Read the training and test sets from the database into ordered dicts
        # (values() returns items in insertion order)
        feature_dict_data, result_dict_data = self.load_data(
            train_user_set, spammer, normal)
        train_feature, train_result = Alkit.process_data(
            feature_dict_data, result_dict_data)
        logging.info('training set data processed')
        feature_dict_data, result_dict_data = self.load_data(
            test_user_set, spammer, normal)
        test_feature, test_result = Alkit.process_data(feature_dict_data,
                                                       result_dict_data)
        logging.info('test set data processed')
        # print(metrics.mutual_info_score(train_result, train_feature))
        # Train with SVM and report results
        # logging.info('\nSVM training started')
        # model = SVC(class_weight='balanced')
        # model.fit(train_feature, train_result)
        # logging.info('training finished')
        # predict_result = model.predict(test_feature)
        # logging.info('precision: %s' % metrics.precision_score(test_result, predict_result))
        # logging.info('recall: %s' % metrics.recall_score(test_result, predict_result))
        # logging.info('F1: %s' % metrics.f1_score(test_result, predict_result))

        # import minepy
        # m = minepy.MINE()
        # for i in range(7):
        #     m.compute_score(train_feature[:,i], train_result)
        #     print(m.mic())

        # Train with LR and report results
        logging.info('LR training started')
        model = LogisticRegression(class_weight='balanced')
        model.fit(train_feature, train_result)
        logging.info('training finished')
        predict_result = model.predict(test_feature)
        logging.info('precision: %s' %
                     metrics.precision_score(test_result, predict_result))
        logging.info('recall: %s' %
                     metrics.recall_score(test_result, predict_result))
        logging.info('F1: %s' % metrics.f1_score(test_result, predict_result))

        # Use LR to output results in probability form
        predict_result_proba = model.predict_proba(test_feature)
        prp = []
        for prob in predict_result_proba:
            prp.append(float(prob[0]) * -1 + float(prob[1]) * 1)
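        # prp maps the two class probabilities to a signed score in [-1, 1]:
        # P(spammer) - P(normal), relying on sklearn's sorted class order
        # (model.classes_ == [-1, 1] for these labels).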

        # Save both LR outputs for use by the next stage
        if self.dump:
            logging.info('saving results to ' + '../main/prior/user_train' +
                         self.file_name_appendix + '.txt and ' +
                         '../main/prior/user_prior' + self.file_name_appendix +
                         '.txt')
            Alkit.write_prior(
                '../main/prior/user_train' + self.file_name_appendix + '.txt',
                '../main/prior/user_prior' + self.file_name_appendix + '.txt',
                train_user_set, train_result, test_user_set, test_result,
                predict_result, prp)

        # Train with Random Forest and report results
        # logging.info('\nRandom Forest training started')
        # model = RandomForestClassifier(n_estimators=100, class_weight='balanced')
        # model.fit(train_feature, train_result)
        # logging.info('training finished')
        #
        # importances = model.feature_importances_
        # print(importances)
        #
        # predict_result = model.predict(test_feature)
        # logging.info('precision: %s' % metrics.precision_score(test_result, predict_result))
        # logging.info('recall: %s' % metrics.recall_score(test_result, predict_result))
        # logging.info('F1: %s' % metrics.f1_score(test_result, predict_result))

        # Use RF to output results in probability form
        # predict_result_proba = model.predict_proba(test_feature)
        # prp = []
        # for prob in predict_result_proba:
        #     prp.append(float(prob[0]) * -1 + float(prob[1]) * 1)
        # # Save both RF outputs for use by the next stage
        # Alkit.write_prior('prior/user_train.txt', 'prior/user_prior.txt',
        #                   train_user_set, train_result, test_user_set, test_result, predict_result, prp)
        # return float(metrics.f1_score(test_result, predict_result))

        # feature_name = ['log_time', 'log_follower', 'log_followee', 'fre-re', 'fre', 'follow_fre', 'onehop_fre', 'rvp_ratio']
        # df = DataFrame(numpy.hstack((test_feature, test_result[:, None])),
        #                columns=feature_name + ["class"])
        # _ = seaborn.pairplot(df, vars=feature_name, hue="class", size=1.5)
        # plt.show()

        # feature_dict_data, result_dict_data = self.load_data(train_user_set + test_user_set, spammer, normal)
        # test_feature, test_result = Alkit.process_data(feature_dict_data, result_dict_data)
        # logging.info('data processed')
        #
        # logging.info('\nSVM training started - cross-validation')
        # model = SVC(class_weight='balanced')
        # res = cross_val_score(model, test_feature, test_result, cv=5, scoring='f1')
        # logging.info('training finished')
        # logging.info(res)
        #
        # logging.info('\nLR training started - cross-validation')
        # model = LogisticRegression(class_weight='balanced')
        # res = cross_val_score(model, test_feature, test_result, cv=5, scoring='f1')
        # logging.info('training finished')
        # logging.info(res)

    def evaluation(self):
        """
        Evaluate the results
        :return:
        """
        user_train_dict, user_train_list, user_prior_dict, user_prior_list = \
            Alkit.read_prior('../main/prior/user_train' + self.file_name_appendix + '.txt',
                             '../main/prior/user_prior' + self.file_name_appendix + '.txt')

        spammer, spammer_prior, normal, normal_prior = Alkit.setSN(
            user_train_dict, user_prior_dict)
        scores = []
        test_result = []
        predict_result = []
        for uid in user_prior_list:
            test_result.append(float(user_prior_dict[uid]['label']))
            predict_result.append(float(user_prior_dict[uid]['prior_label']))
            scores.append(float(user_prior_dict[uid]['prior']))
        # print(float(metrics.f1_score(test_result, predict_result)))
        Evaluation.evaluation_self(scores, test_result)

        # ap
        p, r, thresholds = metrics.precision_recall_curve(test_result, scores)
        ap = metrics.average_precision_score(test_result, scores)
        logging.info('user AP:%s' % str(ap))
        with open('../main/lr/user_ap' + self.file_name_appendix + '.txt',
                  'w') as my_file:
            my_file.write('p r\n')
            for i in range(len(p)):
                my_file.write('%s %s\n' % (str(p[i]), str(r[i])))

        # roc
        fpr, tpr, thresholds = metrics.roc_curve(test_result, scores)
        logging.info('user AUC:%s' % str(metrics.auc(fpr, tpr)))
        with open('../main/lr/user_roc' + self.file_name_appendix + '.txt',
                  'w') as my_file:
            my_file.write('fpr tpr\n')
            for i in range(len(fpr)):
                my_file.write('%s %s\n' % (str(fpr[i]), str(tpr[i])))

        # top k precision
        worker_score = {}
        for i in range(len(scores)):
            worker_score[user_prior_list[i]] = scores[i]
        worker_score = sorted(worker_score.items(),
                              key=lambda im: float(im[1]),
                              reverse=True)
        with open('../main/lr/res_user_top' + self.file_name_appendix + '.txt',
                  'w') as my_file:
            my_file.write('type uid score precision top_k\n')
            worker_count_now = 0
            top_k = 0
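            # Walk the ranked list; at rank top_k, precision@k is the number
            # of true spammers seen so far divided by top_k.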
            for itm in worker_score:
                uid = itm[0]
                score = itm[1]
                if uid in spammer:
                    u_type = 'w'
                    worker_count_now += 1
                else:
                    u_type = 'n'
                top_k += 1
                precision = str(float(worker_count_now) / top_k)
                my_file.write(u_type + ' ' + str(uid) + ' ' + str(score) +
                              ' ' + precision + ' ' + str(top_k) + '\n')

    def load_data(self, total_set, spammer, normal, unknown=None):
        """
        从数据库读取数据,因为训练集和测试集读取的操作一样,所以单独写一个方法
        :return: 特征字典数据,类别字典数据
        """
        feature_dict_data = OrderedDict()
        result_dict_data = OrderedDict()

        for uid in total_set:
            feature_dict_data[uid] = [
                Alkit.load_data_help(self.registerDay, uid, 'log_time'),
                Alkit.load_data_help(self.followCnt, uid, 'log_follower'),
                Alkit.load_data_help(self.followCnt, uid, 'log_followee'),
                Alkit.load_data_help(self.oriThirdFre, uid, 'fre'),
                Alkit.load_data_help(self.retweetFre, uid, 'follow_fre'),
                Alkit.load_data_help(self.retweetFre, uid, 'onehop_fre'),
                Alkit.load_data_help(self.rvp, uid, 'rvp_ratio')
            ]
            """
            现在我需要检查一下, 看看mongodb里这些json数据表是不是仅仅包含了normal和spammer而没有把unknown放进来?
            
             self.registerDay = MongoClient().userFeature.registerDay
                self.followCnt = MongoClient().userFeature.followCnt
                self.oriThirdFre = MongoClient().userFeature.oriThirdFre
                self.retweetFre = MongoClient().userFeature.retweetFre
                self.rvp = MongoClient().userFeature.rvp
        
            """

            # feature_dict_data[uid] = [Alkit.load_data_help(self.followCnt, uid, 'follower_cnt'),
            #                           Alkit.load_data_help(self.followCnt, uid, 'followee_cnt'),
            #                           Alkit.load_data_help(self.followCnt, uid, 'ff'),
            #                           Alkit.load_data_help(self.followCnt, uid, 'profile'),
            #                           Alkit.load_data_help(self.rvp, uid, 'discription')]

            # if uid in spammer:
            #     result_dict_data[uid] = 1
            # else:
            #     result_dict_data[uid] = -1

            # phase-2 improvement code
            if uid in spammer:
                result_dict_data[uid] = 1
            elif uid in normal:
                result_dict_data[uid] = -1
            elif unknown is not None and uid in unknown:
                result_dict_data[uid] = 0  # added by me: users with unknown labels get label 0

        return feature_dict_data, result_dict_data
Example #22
class CrowdTarget():
    def __init__(self, h, d, u, p, c, file_name_appendix=''):
        """
        在init中将读取CrowdTarget必要的数据
        """
        self.host = h
        self.db = d
        self.user = u
        self.passwd = p
        self.charset = c
        self.sqlhelper = SqlHelper(host=self.host,
                                   db=self.db,
                                   user=self.user,
                                   passwd=self.passwd,
                                   charset=self.charset)
        self.file_name_appendix = file_name_appendix
        # Read the training set and the prior labels obtained on the test set
        # user_train_dict: training set, with labels
        # user_train_list: training set, user ids only
        # user_prior_dict: test set, with ground-truth labels plus prior labels
        # user_prior_list: test set, user ids only

        self.user_train_dict, self.user_train_list, self.user_prior_dict, self.user_prior_list = \
            Alkit.read_prior('../main/prior/user_train' + self.file_name_appendix + '.txt',
                             '../main/prior/user_prior' + self.file_name_appendix + '.txt')
        self.wblog_train_dict, self.wblog_train_list, self.wblog_prior_dict, self.wblog_prior_list = \
            Alkit.read_prior('../main/prior/wblog_train' + self.file_name_appendix + '.txt',
                             '../main/prior/wblog_prior' + self.file_name_appendix + '.txt')

        # spammer: ground-truth spammer users
        # spammer_prior: users judged spammers by the prior classifier
        # normal: ground-truth normal users
        # normal_prior: users judged normal by the prior classifier
        # swblog, swblog_prior, wblog, wblog_prior: likewise for wblogs
        self.spammer, self.spammer_prior, self.normal, self.normal_prior = Alkit.setSN(
            self.user_train_dict, self.user_prior_dict)
        self.swblog, self.swblog_prior, self.nwblog, self.nwblog_prior = Alkit.setSN(
            self.wblog_train_dict, self.wblog_prior_dict)
        self.all_user = self.user_prior_list
        self.all_wblog = self.wblog_train_list + self.wblog_prior_list

        self.mdb = MongoClient().crowd_target  # originally crowd_target; I misspelled my database name, hence crow_target
        self.sqlhelper = SqlHelper()

    def feature_retweet_time(self):
        col = self.mdb.time
        if not col.find_one():
            logging.info('retweet_time is empty; creating a unique index on wblogId')
            col.create_index([('wblogId', pymongo.DESCENDING)], unique=True)

        cc = MongoClient().comment.comment
        for wblogId in self.all_wblog:

            if wblogId in self.swblog:
                col.insert_one({'wblogId': wblogId, 'spammer': 'true'})
            else:
                col.insert_one({'wblogId': wblogId, 'spammer': 'false'})

            t = self.sqlhelper.select_sql_one(
                'SELECT created_at FROM wblog WHERE wblogId=%s' % str(wblogId))
            if not t:
                t = self.sqlhelper.select_sql_one(
                    'SELECT created_at FROM swblog WHERE wblogId=%s' %
                    str(wblogId))
            a = time.mktime(time.strptime(t[0], '%Y/%m/%d %H:%M:%S'))
            res = 0.0
            cnt = 0
            time_list = []
            try:
                for comment in cc.find({'wblogId': str(wblogId)}):
                    created_at = comment['json_text']['created_at'] + ':00'
                    if len(created_at.split('-')[0]) != 4:
                        created_at = '2017-' + created_at
                    b = time.mktime(
                        time.strptime(created_at, '%Y-%m-%d %H:%M:%S'))
                    res += b - a
                    cnt += 1
                    time_list.append(res)
            except Exception as e:
                logging.error('%s. The wblogId is %s' % (e, str(wblogId)))
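            # Note: time_list collects the running sum `res` rather than the
            # individual delay b - a, so the std/skewness/kurtosis below are
            # computed over cumulative sums; appending (b - a) may have been
            # the intent.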

            if cnt != 0:
                col.update({'wblogId': wblogId},
                           {'$set': {
                               'mean': str(res / cnt)
                           }})

            if cnt > 3:
                col.update({'wblogId': wblogId}, {
                    '$set': {
                        'std': str(numpy.std(numpy.array(time_list), ddof=1))
                    }
                })
                col.update({'wblogId': wblogId}, {
                    '$set': {
                        'skewness': str(stats.skew(numpy.array(time_list)))
                    }
                })
                col.update({'wblogId': wblogId}, {
                    '$set': {
                        'kurtosis': str(stats.kurtosis(numpy.array(time_list)))
                    }
                })

        logging.info('feature_time finished')

    def feature_third(self):
        col = self.mdb.third
        if not col.find_one():
            logging.info('retweet_third is empty; creating a unique index on wblogId')
            col.create_index([('wblogId', pymongo.DESCENDING)], unique=True)

        third_party = ('推兔', '好保姆', '互粉派对 ', '优推推互粉', '未通过审核应用', '互粉加加',
                       '互粉小助手', '孔明社交管理', '互粉赏金榜', '推米互粉', '多推', '互粉一族',
                       '推兔手机版', '推啊')
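        # Client `source` strings of apparent follow-exchange apps (judging by
        # their names); comments posted through these clients count toward the
        # third-party ratio computed below.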

        cc = MongoClient().comment.comment

        for wblogId in self.all_wblog:
            cnt = 0
            third_cnt = 0
            if wblogId in self.swblog:
                col.insert_one({'wblogId': wblogId, 'spammer': 'true'})
            else:
                col.insert_one({'wblogId': wblogId, 'spammer': 'false'})

            try:
                for comment in cc.find({'wblogId': str(wblogId)}):
                    source = comment['json_text']['source']
                    if source in third_party:
                        third_cnt += 1
                    cnt += 1
            except Exception as e:
                logging.error('%s. The wblogId is %s' % (e, str(wblogId)))

            if cnt > 1:
                # if third_cnt != 0:
                #     print(wblogId)
                #     print(float(third_cnt) / cnt)
                col.update(
                    {'wblogId': wblogId},
                    {'$set': {
                        'third': str(float(third_cnt) / cnt)
                    }})
        # for wblogId in self.all_wblog:
        #     retweet_list = []
        #     cnt = 0
        #     try:
        #         for wid in self.sqlhelper.select_sql('SELECT wblogId FROM wblog WHERE paMid=%s' % str(wblogId)):
        #             retweet_list.append(wid[0])
        #         for wid in self.sqlhelper.select_sql('SELECT wblogId FROM wblog WHERE orMid=%s' % str(wblogId)):
        #             if wid[0] not in retweet_list:
        #                 retweet_list.append(wid[0])
        #         # print(retweet_list)
        #         # print(len(retweet_list))
        #         for wid in retweet_list:
        #             res = self.sqlhelper.select_sql_one('SELECT source FROM wblog WHERE wblogId=%s' % str(wid))
        #             if len(res) == 0:
        #                 continue
        #             source = res[0]
        #             if source in third_party:
        #                 cnt += 1
        #     except Exception as e:
        #         logging.error('%s. The wblogId is %s' % (e, str(wblogId)))
        #
        #     if len(retweet_list) > 1:
        #         if cnt != 0:
        #             print(wblogId)
        #             print(float(cnt) / len(retweet_list))

    def feature_ur(self):
        col = self.mdb.ur
        if not col.find_one():
            logging.info('retweet_ur is empty; creating a unique index on wblogId')
            col.create_index([('wblogId', pymongo.DESCENDING)], unique=True)

        total_user = []
        for uid in self.sqlhelper.select_sql('SELECT uid FROM spammer'):
            total_user.append(str(uid[0]))
        for uid in self.sqlhelper.select_sql('SELECT uid FROM normal'):
            if str(uid[0]) not in total_user:
                total_user.append(str(uid[0]))

        cc = MongoClient().comment.comment
        process_cnt = 0.0
        for wblogId in self.all_wblog:
            cnt = 0
            follow_cnt = 0
            if wblogId in self.swblog:
                col.insert_one({'wblogId': wblogId, 'spammer': 'true'})
            else:
                col.insert_one({'wblogId': wblogId, 'spammer': 'false'})

            poster_uid = self.sqlhelper.select_sql_first(
                'SELECT uid FROM swblog WHERE wblogId=%s' % str(wblogId))
            if poster_uid == -1:
                poster_uid = self.sqlhelper.select_sql_first(
                    'SELECT uid FROM wblog WHERE wblogId=%s' % str(wblogId))

            try:
                for comment in cc.find({'wblogId': str(wblogId)}):
                    uid = comment['json_text']['user']['id']
                    if str(uid) in total_user:
                        cnt += 1
                        for followeeUid in self.sqlhelper.select_sql(
                                'SELECT followeeUid FROM edge1516 WHERE uid=%s'
                                % str(uid)):
                            if str(followeeUid[0]) == str(poster_uid):
                                follow_cnt += 1
                                break
            except Exception as e:
                logging.error('%s. The wblogId is %s' % (e, str(wblogId)))

            process_cnt += 1.0
            print('processing:%s' % str(process_cnt / len(self.all_wblog)))

            if cnt > 1:
                # if follow_cnt != 0:
                #     print(wblogId)
                #     print(float(follow_cnt) / cnt)
                col.update({'wblogId': wblogId},
                           {'$set': {
                               'ur': str(float(follow_cnt) / cnt)
                           }})

    def feature_click(self):
        col = self.mdb.click
        if not col.find_one():
            logging.info('click is empty; creating a unique index on wblogId')
            col.create_index([('wblogId', pymongo.DESCENDING)], unique=True)

        ws = MongoClient().wblog.swblog
        ww = MongoClient().wblog.wblog
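        # Scans wblog text for embedded ttarticle links and reconstructs the
        # full https URL by splitting around the 'ttarticle' substring.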
        for wblogId in self.all_wblog:
            if wblogId in self.swblog:
                pass
            else:
                wblog = ww.find_one({'wblogId': str(wblogId)})
                content = wblog['json_text']['text']
                if 'ttarticle' in content:
                    print('https:' +
                          content.split('ttarticle')[0].split(':')[-1] +
                          'ttarticle' +
                          content.split('ttarticle')[1].split('&')[0])

        for wblog in ws.find():
            content = wblog['json_text']['text']
            if 'ttarticle' in content:
                print('https:' + content.split('ttarticle')[0].split(':')[-1] +
                      'ttarticle' +
                      content.split('ttarticle')[1].split('&')[0])

    def run(self, train_per=0.8, reset_dataset=False):
        """
        从数据库中读取特征数据,并使用adaboost分类
        :return:
        """
        # 首先划分训练集微博和测试集微博
        swblog = self.sqlhelper.select_sql_one('SELECT wblogId FROM swblog')
        wblog = self.sqlhelper.select_sql_one(
            'SELECT wblogId FROM wblog_choose')

        final_wblog = self.sqlhelper.select_sql_one(
            'SELECT wblogId FROM final_wblog WHERE spammer="yes"')
        for wblogId in final_wblog:
            if wblogId not in swblog:
                swblog.append(wblogId)

        for uid in swblog:
            if uid in wblog:
                wblog.remove(uid)

        train_wblog_set, test_wblog_set = Alkit.read_dataset(
            '../main/prior/wblog_train' + self.file_name_appendix + '.txt',
            '../main/prior/wblog_prior' + self.file_name_appendix + '.txt')

        # Log some information about the training and test sets
        logging.info('training set size: %s' % len(train_wblog_set))
        logging.info('positives (swblog) in training set: %s' %
                     len(list(set(train_wblog_set).intersection(set(swblog)))))
        logging.info('negatives (wblog) in training set: %s' %
                     len(list(set(train_wblog_set).intersection(set(wblog)))))
        logging.info('test set size: %s' % len(test_wblog_set))
        logging.info('positives (swblog) in test set: %s' %
                     len(list(set(test_wblog_set).intersection(set(swblog)))))
        logging.info('negatives (wblog) in test set: %s' %
                     len(list(set(test_wblog_set).intersection(set(wblog)))))

        # print('279 train_wblog_set \n', train_wblog_set)
        # print('279 swblog \n', swblog)
        # print('279 wblog \n', wblog)

        # Read the training and test sets from the database into ordered dicts
        # (values() returns items in insertion order)
        feature_dict_data, result_dict_data = self.load_data(
            train_wblog_set, swblog, wblog)
        # print('281 feature_dict_data ', feature_dict_data)  # [('4033482998743585', [nan, nan, nan, nan, nan]),
        # print('282 result_dict_data', result_dict_data)  # [('4033482998743585', 1), ('3914608449995325', 1),

        train_feature, train_result = Alkit.process_data(
            feature_dict_data, result_dict_data)
        logging.info('training set data processed')
        feature_dict_data, result_dict_data = self.load_data(
            test_wblog_set, swblog, wblog)
        test_feature, test_result = Alkit.process_data(feature_dict_data,
                                                       result_dict_data)
        logging.info('test set data processed')

        # Train with AdaBoost and report results
        logging.info('\nAdaBoost training started')
        model = AdaBoostClassifier(DecisionTreeClassifier(max_depth=2,
                                                          min_samples_split=20,
                                                          min_samples_leaf=5),
                                   algorithm="SAMME",
                                   n_estimators=100,
                                   learning_rate=0.5)
        model.fit(train_feature, train_result)
        logging.info('training finished')
        predict_result = model.predict(test_feature)
        logging.info('precision: %s' %
                     metrics.precision_score(test_result, predict_result))
        logging.info('recall: %s' %
                     metrics.recall_score(test_result, predict_result))
        logging.info('F1: %s' % metrics.f1_score(test_result, predict_result))
        predict_result_proba = model.predict_proba(test_feature)
        prp = []
        for prob in predict_result_proba:
            prp.append(float(prob[0]) * -1 + float(prob[1]) * 1)
        Alkit.write_prior(
            '../main/crowd_target/wblog_train' + self.file_name_appendix +
            '.txt', '../main/crowd_target/wblog_prior' +
            self.file_name_appendix + '.txt', train_wblog_set, train_result,
            test_wblog_set, test_result, predict_result, prp)

    def evaluation(self):
        """
        Evaluate the results
        :return:
        """
        wblog_train_dict, wblog_train_list, wblog_prior_dict, wblog_prior_list = \
            Alkit.read_prior('../main/crowd_target/wblog_train' + self.file_name_appendix + '.txt',
                             '../main/crowd_target/wblog_prior' + self.file_name_appendix + '.txt')
        swblog, swblog_prior, nwblog, nwblog_prior = Alkit.setSN(
            wblog_train_dict, wblog_prior_dict)
        scores = []
        test_result = []
        predict_result = []
        for uid in wblog_prior_list:
            test_result.append(float(wblog_prior_dict[uid]['label']))
            predict_result.append(float(wblog_prior_dict[uid]['prior_label']))
            scores.append(float(wblog_prior_dict[uid]['prior']))
        Evaluation.evaluation_self(scores, test_result)

        # ap
        p, r, thresholds = metrics.precision_recall_curve(test_result, scores)
        ap = metrics.average_precision_score(test_result, scores)
        logging.info('wblog AP:%s' % str(ap))
        with open(
                '../main/crowd_target/wblog_ap' + self.file_name_appendix +
                '.txt', 'w') as my_file:
            my_file.write('p r\n')
            for i in range(len(p)):
                my_file.write('%s %s\n' % (str(p[i]), str(r[i])))

        # roc
        fpr, tpr, thresholds = metrics.roc_curve(test_result, scores)
        logging.info('wblog AUC:%s' % str(metrics.auc(fpr, tpr)))
        with open(
                '../main/crowd_target/wblog_roc' + self.file_name_appendix +
                '.txt', 'w') as my_file:
            my_file.write('fpr tpr\n')
            for i in range(len(fpr)):
                my_file.write('%s %s\n' % (str(fpr[i]), str(tpr[i])))

        # top k precision
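        # precision@k = (number of true swblog entries among the k highest
        # scores) / k, computed incrementally while walking down the ranking.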
        wblog_score = {}
        for i in range(len(scores)):
            wblog_score[wblog_prior_list[i]] = scores[i]
        wblog_score = sorted(wblog_score.items(),
                             key=lambda im: float(im[1]),
                             reverse=True)
        with open(
                '../main/crowd_target/res_wblog_top' +
                self.file_name_appendix + '.txt', 'w') as my_file:
            my_file.write('type wblogId score precision top_k\n')
            wblog_count_now = 0
            top_k = 0
            for itm in wblog_score:
                uid = itm[0]
                score = itm[1]
                if uid in swblog:
                    u_type = 's'
                    wblog_count_now += 1
                else:
                    u_type = 'n'
                top_k += 1
                precision = str(float(wblog_count_now) / top_k)
                my_file.write(u_type + ' ' + str(uid) + ' ' + str(score) +
                              ' ' + precision + ' ' + str(top_k) + '\n')

    def load_data(self, total_set, swblog, wblog):
        """
        Read data from the database. The training and test sets are loaded
        the same way, so the logic is factored into this one method.
        :return: feature dict data, label dict data
        total_set=train_wblog_set, ['4033482998743585', '3914608449995325',
        swblog=swblog, ['4045047554826553', '4039829169862097',
        wblog=wblog, ['4032096583879003', '4054839190956692',
        """
        feature_dict_data = OrderedDict()
        result_dict_data = OrderedDict()

        for wblogId in total_set:
            feature_dict_data[wblogId] = [
                Alkit.load_data_help_w(self.mdb.time, wblogId, 'mean'),
                Alkit.load_data_help_w(self.mdb.time, wblogId, 'std'),
                Alkit.load_data_help_w(self.mdb.time, wblogId, 'skewness'),
                Alkit.load_data_help_w(self.mdb.time, wblogId, 'kurtosis'),
                Alkit.load_data_help_w(self.mdb.third, wblogId, 'third')
            ]

            if wblogId in swblog:
                result_dict_data[wblogId] = 1
            else:
                result_dict_data[wblogId] = -1

        # print("388 feature_dict_data\n", feature_dict_data)

        return feature_dict_data, result_dict_data
Exemple #23
    def sentiment():
        """
        Compute the sentiment-polarity distributions (for plotting).
        :return:
        """
        sqlhelper = SqlHelper()
        swblog = sqlhelper.select_sql_one('SELECT wblogId FROM swblog')
        wblog = sqlhelper.select_sql_one('SELECT wblogId FROM wblog_choose')

        final_wblog = sqlhelper.select_sql_one('SELECT wblogId FROM final_wblog WHERE spammer="yes"')
        for wblogId in final_wblog:
            if wblogId not in swblog:
                swblog.append(wblogId)

        for wblogId in swblog:
            if wblogId in wblog:
                wblog.remove(wblogId)

        all_wblog = swblog + wblog
        swblog_sentiment_dict = {}
        swblog_comment_cnt = 0
        wblog_sentiment_dict = {}
        wblog_comment_cnt = 0

        # Some comments are very short or contain no text at all.
        # Such comments are excluded from the sentiment-polarity computation.
        # Filter rule: tokenize, and drop any text that has no words left
        # after stop-word removal.
        stop_words = WblogFeature.get_stop_words('stop_words.txt')

        cc = MongoClient().comment.comment

        for wblogId in all_wblog:
            corpus = []
            try:
                for comment in cc.find({'wblogId': str(wblogId)}):
                    text = WblogFeature.remove_html(comment['json_text']['text'])
                    text = WblogFeature.remove_tag(text)
                    fenci = list(jieba.cut_for_search(text))
                    if len(fenci) == 0:
                        continue
                    # jieba provides no stop-word filtering interface, so
                    # remove stop words manually
                    stop_cnt = 0
                    for word in fenci:
                        if word in stop_words:
                            stop_cnt += 1
                    if stop_cnt == len(fenci):
                        continue
                    corpus.append(text)
            except Exception as e:
                logging.error('%s. The wblogId is %s' % (e, str(wblogId)))
            if wblogId in swblog:
                swblog_comment_cnt += len(corpus)
                for text in corpus:
                    sen = round(float(snownlp.SnowNLP(text).sentiments), 1)
                    if sen not in swblog_sentiment_dict.keys():
                        swblog_sentiment_dict[sen] = 0
                    swblog_sentiment_dict[sen] += 1
            else:
                wblog_comment_cnt += len(corpus)
                for text in corpus:
                    sen = round(float(snownlp.SnowNLP(text).sentiments), 1)
                    if sen not in wblog_sentiment_dict.keys():
                        wblog_sentiment_dict[sen] = 0
                    wblog_sentiment_dict[sen] += 1

        with open('swblog_sentiment.txt', 'w') as my_file:
            for key in swblog_sentiment_dict.keys():
                my_file.write(str(key) + ' ' + str(float(swblog_sentiment_dict[key]) / swblog_comment_cnt) + '\n')
        with open('wblog_sentiment.txt', 'w') as my_file:
            for key in wblog_sentiment_dict.keys():
                my_file.write(str(key) + ' ' + str(float(wblog_sentiment_dict[key]) / wblog_comment_cnt) + '\n')
Exemple #24
class WblogFeature:

    pattern_html = re.compile(r'<[^>]+>', re.S)
    pattern_tag = re.compile(r'#.+#', re.S)

    def __init__(self, h, d, u, p, c):
        self.host = h
        self.db = d
        self.user = u
        self.passwd = p
        self.charset = c

    def __enter__(self):
        self.sqlhelper = SqlHelper(host=self.host,
                                   db=self.db,
                                   user=self.user,
                                   passwd=self.passwd,
                                   charset=self.charset)
        self.mdb = MongoClient().wblogFeature

        self.swblog = self.sqlhelper.select_sql_one(
            'SELECT wblogId FROM swblog')
        self.wblog = self.sqlhelper.select_sql_one(
            'SELECT wblogId FROM final_wblog WHERE spammer="no"')
        self.unknown = self.sqlhelper.select_sql_one(
            'SELECT wblogId FROM wblog')
        final_wblog = self.sqlhelper.select_sql_one(
            'SELECT wblogId FROM final_wblog WHERE spammer="yes"')
        for wblogId in final_wblog:
            if wblogId not in self.swblog:
                self.swblog.append(wblogId)

        # For some unknown reason the spammer and normal sets overlap,
        # so overlapping entries are simply treated as spammers here.
        for uid in self.swblog:
            if uid in self.wblog:
                self.wblog.remove(uid)
        # print(len(swblog))

        for uid in self.swblog:
            if uid in self.unknown:
                self.unknown.remove(uid)
        for uid in self.wblog:
            if uid in self.unknown:
                self.unknown.remove(uid)
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.sqlhelper.close()

    def setCommentSimilarity(self):
        """
        Compute the textual similarity of a wblog's comments.
        All comments under a given wblog form the corpus; compute the
        tf-idf-based cosine similarity between every pair of comments.
        :return: none
        """
        col = self.mdb.commentSimilarity
        if not col.find_one():
            logging.info('commentSimilarity is empty; creating a unique index on wblogId')
            col.create_index([('wblogId', pymongo.DESCENDING)], unique=True)

        # swblog = self.sqlhelper.select_sql_one('SELECT wblogId FROM final_wblog WHERE spammer="yes"')
        # wblog = self.sqlhelper.select_sql_one('SELECT wblogId FROM final_wblog WHERE spammer="no"')
        # all_wblog = swblog + wblog

        swblog = self.swblog
        wblog = self.wblog
        unknown = self.unknown
        all_wblog = swblog + wblog + unknown

        # 将“转发微博”这四个字加入了停用词表
        stop_words = WblogFeature.get_stop_words(
            os.path.dirname(os.getcwd()) + '/microblog/stop_words.txt')

        vectorizer = CountVectorizer(
            stop_words=stop_words
        )  # 该类会将文本中的词语转换为词频矩阵,矩阵元素a[i][j] 表示j词在i类文本下的词频

        cc = MongoClient().comment.comment

        for wblogId in all_wblog:
            corpus = []
            try:
                for comment in cc.find({'wblogId': str(wblogId)}):
                    text = self.remove_html(comment['json_text']['text'])
                    # Very short texts are likely to have nothing meaningful
                    # left after stop-word removal, so skip them entirely.
                    if len(text) <= 4:
                        continue
                    if wblogId in wblog:
                        text = self.remove_tag(text)
                    corpus.append(' '.join(jieba.cut_for_search(text)))
            except Exception as e:
                logging.error('%s. The wblogId is %s' % (e, str(wblogId)))

            cos_sum = 0.0
            cos_cnt = 0
            try:
                # The first fit_transform computes the tf-idf weights; the
                # second converts the texts into a term-frequency matrix.
                tfidf = TfidfTransformer().fit_transform(
                    vectorizer.fit_transform(corpus))
                # Extract the tf-idf matrix; element a[i][j] is the tf-idf
                # weight of word j in document i.
                weight = tfidf.toarray()
                # Compute the cosine similarity between every pair of comments
                for i in range(len(weight)):
                    for j in range(len(weight)):
                        if i == j:
                            continue
                        cos_sum += WblogFeature.cos(weight[i], weight[j])
                        cos_cnt += 1
                cos_avg = cos_sum / float(cos_cnt)
            except Exception as e:
                logging.error('%s. The wblogId is %s' % (e, str(wblogId)))
                cos_avg = 0.0

            try:
                if wblogId in swblog:
                    col.insert_one({
                        'wblogId': wblogId,
                        'swblog': 'true',
                        'comment_similarity': cos_avg
                    })
                elif wblogId in wblog:
                    col.insert_one({
                        'wblogId': wblogId,
                        'swblog': 'false',
                        'comment_similarity': cos_avg
                    })
                elif wblogId in unknown:
                    col.insert_one({
                        'wblogId': wblogId,
                        'swblog': 'unknown',
                        'comment_similarity': cos_avg
                    })

            except Exception as e:
                logging.error('%s. The wblogId is %s' % (e, str(wblogId)))
        logging.info('setCommentSimilarity finished')
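
    @staticmethod
    def avg_pairwise_similarity_sketch(corpus):
        """
        A sketch, not part of the original snippet: the helper name and the
        use of sklearn's vectorized cosine_similarity are assumptions. It
        computes the same average pairwise tf-idf cosine similarity as the
        loop in setCommentSimilarity, with one matrix product instead of an
        O(n^2) Python loop.
        """
        from sklearn.feature_extraction.text import TfidfVectorizer
        from sklearn.metrics.pairwise import cosine_similarity

        if len(corpus) < 2:
            return 0.0
        tfidf = TfidfVectorizer().fit_transform(corpus)  # docs x terms
        sim = cosine_similarity(tfidf)  # dense pairwise cosine matrix
        n = sim.shape[0]
        # Average over ordered pairs i != j (the diagonal is excluded).
        return (sim.sum() - numpy.trace(sim)) / float(n * (n - 1))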

    def setSentimentSimilarity(self):
        """
        Compute the sentiment similarity of a wblog's comments.
        snownlp (naive Bayes under the hood) scores each comment's sentiment
        on a 0 (negative) to 1 (positive) scale; the standard deviation of
        those scores is stored as the feature.
        Possible improvement: classification accuracy. Current sentiment
        tools handle complex sentence patterns poorly; a custom model might
        do better.
        :return: none
        """
        col = self.mdb.sentimentSimilarity
        if not col.find_one():
            logging.info('sentimentSimilarity is empty; creating a unique index on wblogId')
            col.create_index([('wblogId', pymongo.DESCENDING)], unique=True)

        # swblog = self.sqlhelper.select_sql_one('SELECT wblogId FROM final_wblog WHERE spammer="yes"')
        # wblog = self.sqlhelper.select_sql_one('SELECT wblogId FROM final_wblog WHERE spammer="no"')
        # all_wblog = swblog + wblog

        swblog = self.swblog
        wblog = self.wblog
        unknown = self.unknown
        all_wblog = swblog + wblog + unknown

        # Some comments are very short or contain no text at all.
        # Such comments are excluded from the sentiment-polarity computation.
        # Filter rule: tokenize, and drop any text that has no words left
        # after stop-word removal.
        stop_words = WblogFeature.get_stop_words(
            os.path.dirname(os.getcwd()) + '/microblog/stop_words.txt')

        cc = MongoClient().comment.comment

        for wblogId in all_wblog:
            corpus = []
            try:
                for comment in cc.find({'wblogId': str(wblogId)}):
                    text = self.remove_html(comment['json_text']['text'])
                    text = self.remove_tag(text)
                    fenci = list(jieba.cut_for_search(text))
                    if len(fenci) == 0:
                        continue
                    # jieba provides no stop-word filtering interface, so
                    # remove stop words manually
                    stop_cnt = 0
                    for word in fenci:
                        if word in stop_words:
                            stop_cnt += 1
                    if stop_cnt == len(fenci):
                        continue
                    corpus.append(text)
            except Exception as e:
                logging.error('%s. The wblogId is %s' % (e, str(wblogId)))

            std = 0.0
            if len(corpus) > 3:
                sentiment_list = []
                for text in corpus:
                    sentiment_list.append(snownlp.SnowNLP(text).sentiments)
                std = numpy.std(numpy.array(sentiment_list), ddof=1)
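                # ddof=1 gives the sample (unbiased) standard deviation;
                # wblogs with three or fewer usable comments keep std = 0.0.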

            try:
                if wblogId in swblog:
                    col.insert_one({
                        'wblogId': wblogId,
                        'swblog': 'true',
                        'sentiment_similarity': std
                    })
                elif wblogId in wblog:
                    col.insert_one({
                        'wblogId': wblogId,
                        'swblog': 'false',
                        'sentiment_similarity': std
                    })
                elif wblogId in unknown:
                    col.insert_one({
                        'wblogId': wblogId,
                        'swblog': 'unknown',
                        'sentiment_similarity': std
                    })

            except Exception as e:
                logging.error('%s. The wblogId is %s' % (e, str(wblogId)))
        logging.info('setSentimentSimilarity finished')

    def setSpamWords(self):
        """
        Extract keywords from the comments under crowdturfing wblogs, i.e.
        the ten words ranked highest by tf-idf. Each wblog then gets a
        ten-dimensional feature vector, where each dimension is computed as

        :return:
        """
        col = self.mdb.spamWords
        if not col.find_one():
            logging.info('spamWords is empty; creating a unique index on wblogId')
            col.create_index([('wblogId', pymongo.DESCENDING)], unique=True)

        # swblog = self.sqlhelper.select_sql_one('SELECT wblogId FROM final_wblog WHERE spammer="yes"')
        # wblog = self.sqlhelper.select_sql_one('SELECT wblogId FROM final_wblog WHERE spammer="no"')
        # all_wblog = swblog + wblog

        swblog = self.swblog
        wblog = self.wblog
        unknown = self.unknown
        all_wblog = swblog + wblog + unknown

        # Some comments are very short or contain no text at all.
        # Such comments are excluded from the computation.
        # Filter rule: tokenize, and drop any text that has no words left
        # after stop-word removal.
        stop_words = WblogFeature.get_stop_words(
            os.path.dirname(os.getcwd()) + '/microblog/stop_words.txt')

        cc = MongoClient().comment.comment
        pass
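
        # A sketch, not part of the original snippet, of what the docstring
        # describes, assuming the keywords are the ten terms with the highest
        # summed tf-idf over the crowdturfing-comment corpus; spam_corpus and
        # get_feature_names_out (sklearn >= 1.0) are assumptions here:
        #
        #     from sklearn.feature_extraction.text import TfidfVectorizer
        #     spam_corpus = [...]  # segmented comments under swblog entries
        #     tv = TfidfVectorizer(stop_words=stop_words)
        #     weights = tv.fit_transform(spam_corpus).sum(axis=0).A1
        #     terms = tv.get_feature_names_out()
        #     spam_words = [terms[i] for i in weights.argsort()[::-1][:10]]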

    def setCommentInteractRatio(self):
        """
        Compute the interaction ratio among the comments under a given wblog
        = (number of comments interacting with other users) / (total comments).
        A comment counts as interactive if it carries a reply_id field (or an @).
        :return: none
        """
        col = self.mdb.commentInteractRatio
        if not col.find_one():
            logging.info('commentInteractRatio is empty; creating a unique index on wblogId')
            col.create_index([('wblogId', pymongo.DESCENDING)], unique=True)

        # swblog = self.sqlhelper.select_sql_one('SELECT wblogId FROM final_wblog WHERE spammer="yes"')
        # wblog = self.sqlhelper.select_sql_one('SELECT wblogId FROM final_wblog WHERE spammer="no"')
        # all_wblog = swblog + wblog

        swblog = self.swblog
        wblog = self.wblog
        unknown = self.unknown
        all_wblog = swblog + wblog + unknown

        cc = MongoClient().comment.comment

        for wblogId in all_wblog:
            comment_cnt = 0
            interact_cnt = 0
            try:
                for comment in cc.find({'wblogId': str(wblogId)}):
                    # Every comment counts toward the total; those carrying
                    # a reply_id also count as interactive.
                    comment_cnt += 1
                    if 'reply_id' in comment['json_text'].keys():
                        interact_cnt += 1
                    # text = comment['json_text']['text']
                    # if '>@' in text:
                    #     interact_cnt += 1
            except Exception as e:
                logging.error('%s. The wblogId is %s' % (e, str(wblogId)))

            if comment_cnt == 0:
                interact_ratio = 0.0
            else:
                interact_ratio = float(interact_cnt) / float(comment_cnt)

            try:
                if wblogId in swblog:
                    col.insert_one({
                        'wblogId': wblogId,
                        'swblog': 'true',
                        'interact_ratio': interact_ratio
                    })
                elif wblogId in wblog:
                    col.insert_one({
                        'wblogId': wblogId,
                        'swblog': 'false',
                        'interact_ratio': interact_ratio
                    })
                elif wblogId in unknown:
                    col.insert_one({
                        'wblogId': wblogId,
                        'swblog': 'unknown',
                        'interact_ratio': interact_ratio
                    })
            except Exception as e:
                logging.error('%s. The wblogId is %s' % (e, str(wblogId)))

    def setHotCommentRatio(self):
        """
        Compute the ratio of like counts to comment counts among a given
        wblog's comments.
        :return: none
        """
        col = self.mdb.hotCommentRatio
        if not col.find_one():
            logging.info('hotCommentRatio is empty; creating a unique index on wblogId')
            col.create_index([('wblogId', pymongo.DESCENDING)], unique=True)

        # swblog = self.sqlhelper.select_sql_one('SELECT wblogId FROM final_wblog WHERE spammer="yes"')
        # wblog = self.sqlhelper.select_sql_one('SELECT wblogId FROM final_wblog WHERE spammer="no"')
        # all_wblog = swblog + wblog

        swblog = self.swblog
        wblog = self.wblog
        unknown = self.unknown
        all_wblog = swblog + wblog + unknown

        cc = MongoClient().comment.comment

        for wblogId in all_wblog:
            comment_cnt = 0
            hot_cnt = 0
            try:
                for comment in cc.find({'wblogId': str(wblogId)}):
                    if comment['json_text']['like_counts'] == '':
                        comment_cnt += 1
                    else:
                        hot_cnt += int(comment['json_text']['like_counts'])
                        comment_cnt += 1
            except Exception as e:
                logging.error('%s. The wblogId is %s' % (e, str(wblogId)))

            if comment_cnt == 0:
                hot_ratio = 0.0
            else:
                hot_ratio = float(hot_cnt) / float(comment_cnt)

            try:
                if wblogId in swblog:
                    col.insert_one({
                        'wblogId': wblogId,
                        'swblog': 'true',
                        'hot_ratio': hot_ratio
                    })
                elif wblogId in wblog:
                    col.insert_one({
                        'wblogId': wblogId,
                        'swblog': 'false',
                        'hot_ratio': hot_ratio
                    })
                elif wblogId in unknown:
                    col.insert_one({
                        'wblogId': wblogId,
                        'swblog': 'unknown',
                        'hot_ratio': hot_ratio
                    })
            except Exception as e:
                logging.error('%s. The wblogId is %s' % (e, str(wblogId)))

    @staticmethod
    def remove_html(text):
        """
        去除文本中的html
        :return: 去除html后的文本
        """
        return WblogFeature.pattern_html.sub('', text)

    @staticmethod
    def remove_tag(text):
        """
        去除文本中的标签文本
        :return: 去除标签文本后的文本
        """
        return WblogFeature.pattern_tag.sub('', text)

    @staticmethod
    def remove_html_complete(text):
        """
        去除文本中的html,并提取其中的表情符号
        :return: list[去除html后的文本,表情1,表情2...]
        """
        pass

    @staticmethod
    def get_stop_words(file_path):
        """
        读取停用词文件
        :return: 停用词list
        """
        stop_words = []
        with open(file_path, 'r', encoding='utf-8') as my_file:
            for line in my_file:
                stop_words.append(line.split('\n')[0])
        return stop_words

    @staticmethod
    def cos(vector1, vector2):
        """
        计算余弦相似度
        :param vector1:
        :param vector2:
        :return: 余弦相似度
        """
        dot_product = 0.0
        norm_a = 0.0
        norm_b = 0.0
        for a, b in zip(vector1, vector2):
            dot_product += a * b
            norm_a += a**2
            norm_b += b**2
        if norm_a == 0.0 or norm_b == 0.0:
            return 0.0
        else:
            return dot_product / ((norm_a * norm_b)**0.5)
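
    # Equivalent vectorized form, assuming numpy arrays v1 and v2 (a sketch,
    # not part of the original snippet):
    #     denom = numpy.linalg.norm(v1) * numpy.linalg.norm(v2)
    #     return numpy.dot(v1, v2) / denom if denom != 0.0 else 0.0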
Exemple #25
class WblogClassify(object):

    def __init__(self, h, d, u, p, c, train_per=0.8, spam_per=0.1, reset_dataset=False, dump=True,
                 add_unknown_into_model=False, file_name_appendix=''):
        self.host = h
        self.db = d
        self.user = u
        self.passwd = p
        self.charset = c
        self.sqlhelper = SqlHelper(host=self.host, db=self.db, user=self.user, passwd=self.passwd, charset=self.charset)

        self.commentSimilarity = MongoClient().wblogFeature.commentSimilarity
        self.sentimentSimilarity = MongoClient().wblogFeature.sentimentSimilarity
        self.commentInteractRatio = MongoClient().wblogFeature.commentInteractRatio
        self.hotCommentRatio = MongoClient().wblogFeature.hotCommentRatio

        self.train_per = train_per
        self.spam_per = spam_per
        self.reset_dataset = reset_dataset
        self.dump = dump
        self.add_unknown_into_model = add_unknown_into_model

        self.file_name_appendix = file_name_appendix

    def run(self):
        """
        从数据库中读取特征数据,并使用svm和lr分类
        :return:
        """
        if not self.add_unknown_into_model:
            swblog = self.sqlhelper.select_sql_one('SELECT wblogId FROM swblog')
            wblog = self.sqlhelper.select_sql_one('SELECT wblogId FROM wblog_choose')

            final_wblog = self.sqlhelper.select_sql_one('SELECT wblogId FROM final_wblog WHERE spammer="yes"')
            for wblogId in final_wblog:
                if wblogId not in swblog:
                    swblog.append(wblogId)

            # For some unknown reason the spammer and normal sets overlap,
            # so overlapping entries are simply treated as spammers here.
            for uid in swblog:
                if uid in wblog:
                    wblog.remove(uid)

            """
            到目前为止,我们得到了下面几个有用的东西
            swblog: 水军  
            wblog: 正常用户
            unkonwn:还没来得及标注的未知类型微博
            """

            logging.info('Spam share of the raw data (max): %s' % (len(swblog) * 1.0 / (len(wblog) + len(swblog))))
            if self.spam_per > len(swblog) * 1.0 / (len(wblog) + len(swblog)):
                logging.info('not enough spam examples in the dataset; keeping the original percentage')
            else:
                expected_spam_number = int(self.spam_per * len(wblog) * 1.0 / (1 - self.spam_per))
                swblog = random.sample(swblog, expected_spam_number)

            if self.reset_dataset:
                train_wblog_set = random.sample(swblog, int(len(swblog) * self.train_per)) + random.sample(wblog, int(
                    len(wblog) * self.train_per))
                test_wblog_set = list(set(swblog + wblog).difference(train_wblog_set))
                # # Phase-2 improvement code
                # train_user_set_without_unknown = random.sample(spammer, int(len(spammer) * train_per)) + random.sample(normal, int(len(normal) * train_per))
                # train_user_set_with_unknown = random.sample(spammer, int(len(spammer) * train_per)) + random.sample(normal, int(
                #     len(normal) * train_per))+random.sample(unknown, len(unknown))
                # test_user_set = list(set(spammer + normal).difference(train_user_set_without_unknown))
                # train_user_set=train_user_set_with_unknown+train_user_set_with_unknown
            else:
                train_wblog_set, test_wblog_set = Alkit.read_dataset(
                    '../main/prior/wblog_train' + self.file_name_appendix + '.txt',
                    '../main/prior/wblog_prior' + self.file_name_appendix + '.txt')

            # Log some information about the training and test sets
            logging.info('Total dataset size: %s' % (len(train_wblog_set) + len(test_wblog_set)))
            logging.info('Training set size: %s' % len(train_wblog_set))
            logging.info('Training set positives (swblog): %s' % len(list(set(train_wblog_set).intersection(set(swblog)))))
            logging.info('Training set negatives (wblog): %s' % len(list(set(train_wblog_set).intersection(set(wblog)))))
            logging.info('Test set size: %s' % len(test_wblog_set))
            logging.info('Test set positives (swblog): %s' % len(list(set(test_wblog_set).intersection(set(swblog)))))
            logging.info('Test set negatives (wblog): %s' % len(list(set(test_wblog_set).intersection(set(wblog)))))
        else:
            raise NotImplementedError('we will implement this later.')

        # Read the training and test sets from the database into ordered
        # dicts (calling values() yields the list in insertion order)
        feature_dict_data, result_dict_data = self.load_data(train_wblog_set, swblog, wblog)
        train_feature, train_result = Alkit.process_data(feature_dict_data, result_dict_data)
        logging.info('Training set data processed')
        feature_dict_data, result_dict_data = self.load_data(test_wblog_set, swblog, wblog)
        test_feature, test_result = Alkit.process_data(feature_dict_data, result_dict_data)
        logging.info('Test set data processed')

        # Train with SVM and report results
        # logging.info('\nSVM training started')
        # model = SVC(class_weight='balanced')
        # model.fit(train_feature, train_result)
        # logging.info('Training finished')
        # predict_result = model.predict(test_feature)
        # logging.info('Precision: %s' % metrics.precision_score(test_result, predict_result))
        # logging.info('Recall: %s' % metrics.recall_score(test_result, predict_result))
        # logging.info('F1: %s' % metrics.f1_score(test_result, predict_result))

        # Train with LR and report results
        logging.info('LR training started')
        model = LogisticRegression(class_weight='balanced')
        model.fit(train_feature, train_result)
        logging.info('Training finished')
        predict_result = model.predict(test_feature)
        logging.info('Precision: %s' % metrics.precision_score(test_result, predict_result))
        logging.info('Recall: %s' % metrics.recall_score(test_result, predict_result))
        logging.info('F1: %s' % metrics.f1_score(test_result, predict_result))

        # Output the LR results in probability form
        predict_result_proba = model.predict_proba(test_feature)
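        # As in the AdaBoost snippet: the prob columns follow model.classes_
        # = [-1, 1], so prp = P(+1) - P(-1) is a score in [-1, 1].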
        prp = []
        for prob in predict_result_proba:
            prp.append(float(prob[0]) * -1 + float(prob[1]) * 1)

        # Save both LR outputs for the next stage
        if self.dump:
            logging.info('Saving results to ' + '../main/prior/wblog_train' + self.file_name_appendix + '.txt'
                         + ' and ' + '../main/prior/wblog_prior' + self.file_name_appendix + '.txt')
            Alkit.write_prior('../main/prior/wblog_train' + self.file_name_appendix + '.txt',
                              '../main/prior/wblog_prior' + self.file_name_appendix + '.txt',
                              train_wblog_set, train_result, test_wblog_set, test_result, predict_result, prp)

        # Train with Random Forest and report results
        # logging.info('\nRandom Forest training started')
        # model = RandomForestClassifier(n_estimators=100, class_weight='balanced')
        # model.fit(train_feature, train_result)
        # logging.info('Training finished')
        #
        # importances = model.feature_importances_
        # print(importances)
        #
        # predict_result = model.predict(test_feature)
        # logging.info('Precision: %s' % metrics.precision_score(test_result, predict_result))
        # logging.info('Recall: %s' % metrics.recall_score(test_result, predict_result))
        # logging.info('F1: %s' % metrics.f1_score(test_result, predict_result))
        # Output the RF results in probability form
        # predict_result_proba = model.predict_proba(test_feature)
        # prp = []
        # for prob in predict_result_proba:
        #     prp.append(float(prob[0]) * -1 + float(prob[1]) * 1)
        # # Save both RF outputs for the next stage
        # Alkit.write_prior('prior/wblog_train.txt', 'prior/wblog_prior.txt',
        #                   train_wblog_set, train_result, test_wblog_set, test_result, predict_result, prp)
        # return float(metrics.f1_score(test_result, predict_result))

        # feature_name = ['log_time', 'log_follower', 'log_followee', 'fre-re', 'fre', 'follow_fre', 'onehop_fre', 'rvp_ratio']
        # df = DataFrame(numpy.hstack((test_feature, test_result[:, None])),
        #                columns=feature_name + ["class"])
        # _ = seaborn.pairplot(df, vars=feature_name, hue="class", size=1.5)
        # plt.show()

        # feature_dict_data, result_dict_data = self.load_data(train_wblog_set + test_wblog_set, swblog, wblog)
        # test_feature, test_result = Alkit.process_data(feature_dict_data, result_dict_data)
        # logging.info('Data processed')
        #
        # logging.info('\nSVM training started - cross validation')
        # model = SVC(class_weight='balanced')
        # res = cross_val_score(model, test_feature, test_result, cv=5, scoring='f1')
        # logging.info('Training finished')
        # logging.info(res)
        #
        # logging.info('\nLR training started - cross validation')
        # model = LogisticRegression(class_weight='balanced')
        # res = cross_val_score(model, test_feature, test_result, cv=5, scoring='f1')
        # logging.info('Training finished')
        # logging.info(res)

    def evaluation(self):
        """
        Evaluate the results.
        :return:
        """
        wblog_train_dict, wblog_train_list, wblog_prior_dict, wblog_prior_list = \
            Alkit.read_prior('../main/prior/wblog_train' + self.file_name_appendix + '.txt', '../main/prior/wblog_prior' + self.file_name_appendix + '.txt')
        swblog, swblog_prior, nwblog, nwblog_prior = Alkit.setSN(wblog_train_dict, wblog_prior_dict)
        scores = []
        test_result = []
        predict_result = []
        for uid in wblog_prior_list:
            test_result.append(float(wblog_prior_dict[uid]['label']))
            predict_result.append(float(wblog_prior_dict[uid]['prior_label']))
            scores.append(float(wblog_prior_dict[uid]['prior']))
        # print(float(metrics.f1_score(test_result, predict_result)))
        Evaluation.evaluation_self(scores, test_result)

        # ap
        p, r, thresholds = metrics.precision_recall_curve(test_result, scores)
        ap = metrics.average_precision_score(test_result, scores)
        logging.info('wblog AP:%s' % str(ap))
        with open('../main/lr/wblog_ap'+self.file_name_appendix+'.txt', 'w') as my_file:
            my_file.write('p r\n')
            for i in range(len(p)):
                my_file.write('%s %s\n' % (str(p[i]), str(r[i])))

        # roc
        fpr, tpr, thresholds = metrics.roc_curve(test_result, scores)
        logging.info('wblog AUC:%s' % str(metrics.auc(fpr, tpr)))
        with open('../main/lr/wblog_roc'+self.file_name_appendix+'.txt', 'w') as my_file:
            my_file.write('fpr tpr\n')
            for i in range(len(fpr)):
                my_file.write('%s %s\n' % (str(fpr[i]), str(tpr[i])))

        # top k precision
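        # Same incremental precision@k as in the crowd_target snippet above:
        # (true swblog entries among the top k) / k.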
        wblog_score = {}
        for i in range(len(scores)):
            wblog_score[wblog_prior_list[i]] = scores[i]
        wblog_score = sorted(wblog_score.items(), key=lambda im: float(im[1]), reverse=True)
        with open('../main/lr/res_wblog_top'+self.file_name_appendix+'.txt', 'w') as my_file:
            my_file.write('type wblogId score precision top_k\n')
            wblog_count_now = 0
            top_k = 0
            for itm in wblog_score:
                uid = itm[0]
                score = itm[1]
                if uid in swblog:
                    u_type = 's'
                    wblog_count_now += 1
                else:
                    u_type = 'n'
                top_k += 1
                precision = str(float(wblog_count_now) / top_k)
                my_file.write(u_type + ' ' + str(uid) + ' ' + str(score) + ' ' + precision + ' ' + str(top_k) + '\n')

    def load_data(self, total_set, swblog, wblog, unknown=None):
        """
        从数据库读取数据,因为训练集和测试集读取的操作一样,所以单独写一个方法
        :return: 特征字典数据,类别字典数据
        """
        feature_dict_data = OrderedDict()
        result_dict_data = OrderedDict()

        for wblogId in total_set:
            feature_dict_data[wblogId] = [Alkit.load_data_help_w(self.commentSimilarity, wblogId, 'comment_similarity'),
                                          Alkit.load_data_help_w(self.sentimentSimilarity, wblogId,
                                                                 'sentiment_similarity'),
                                          Alkit.load_data_help_w(self.commentInteractRatio, wblogId, 'interact_ratio'),
                                          Alkit.load_data_help_w(self.hotCommentRatio, wblogId, 'hot_ratio')]

            # feature_dict_data[wblogId] = [Alkit.load_data_help_w(self.commentSimilarity, wblogId, 'comment_similarity'),
            #                               Alkit.load_data_help_w(self.commentInteractRatio, wblogId, 'interact_ratio'),
            #                               Alkit.load_data_help_w(self.hotCommentRatio, wblogId, 'hot_ratio')]

            if wblogId in swblog:
                result_dict_data[wblogId] = 1
            elif wblogId in wblog:
                result_dict_data[wblogId] = -1
            elif unknown is not None and wblogId in unknown:
                result_dict_data[wblogId] = 0

        return feature_dict_data, result_dict_data
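
# A hedged usage sketch, not part of the original snippet; the connection
# parameters are placeholders, and SqlHelper/Alkit must be importable:
#
#     wc = WblogClassify('localhost', 'weibo', 'root', 'passwd', 'utf8',
#                        train_per=0.8, spam_per=0.1, reset_dataset=True)
#     wc.run()          # train LR and dump the prior files
#     wc.evaluation()   # AP / AUC / top-k precision reports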
Exemple #26
class MSCA(object):
    def __init__(self, h, d, u, p, c, file_name_appendix=''):
        """
        在init中将读取msca必要的数据
        """
        self.host = h
        self.db = d
        self.user = u
        self.passwd = p
        self.charset = c
        self.sqlhelper = SqlHelper(host=self.host,
                                   db=self.db,
                                   user=self.user,
                                   passwd=self.passwd,
                                   charset=self.charset)
        self.file_name_appendix = file_name_appendix

        # Load the training set and the prior labels obtained on the test set
        # user_train_dict: training set, with labels
        # user_train_list: training set, user ids only
        # user_prior_dict: test set, with ground-truth labels and the prior labels
        # user_prior_list: test set, user ids only
        self.user_train_dict, self.user_train_list, self.user_prior_dict, self.user_prior_list = \
            Alkit.read_prior('../main/prior/user_train' + self.file_name_appendix + '.txt',
                             '../main/prior/user_prior' + self.file_name_appendix + '.txt')
        self.wblog_train_dict, self.wblog_train_list, self.wblog_prior_dict, self.wblog_prior_list = \
            Alkit.read_prior('../main/prior/wblog_train' + self.file_name_appendix + '.txt',
                             '../main/prior/wblog_prior' + self.file_name_appendix + '.txt')

        # self.user_train_dict, self.user_train_list, self.user_prior_dict, self.user_prior_list = \
        #     Alkit.read_prior('prior_bak/user_train.txt', 'prior_bak/user_prior.txt')
        # self.wblog_train_dict, self.wblog_train_list, self.wblog_prior_dict, self.wblog_prior_list = \
        #     Alkit.read_prior('prior_bak/wblog_train.txt', 'prior_bak/wblog_prior.txt')

        # spammer: ground-truth spammers
        # spammer_prior: users the prior classifier labels as spammers
        # normal: ground-truth normal users
        # normal_prior: users the prior classifier labels as normal
        # swblog, swblog_prior, wblog, wblog_prior: likewise for wblogs
        self.spammer, self.spammer_prior, self.normal, self.normal_prior = Alkit.setSN(
            self.user_train_dict, self.user_prior_dict)
        self.swblog, self.swblog_prior, self.nwblog, self.nwblog_prior = Alkit.setSN(
            self.wblog_train_dict, self.wblog_prior_dict)
        self.all_user = self.user_train_list + self.user_prior_list
        self.all_wblog = self.wblog_train_list + self.wblog_prior_list

        self.follow_edge = {}  # {'uid': ['followeeUid']}
        self.follow_cnt = {}  # {'uid': follow count}
        self.retweet_edge = {}  # {'uid': ['wblogId']}
        self.wblog_retweet_cnt = {}  # {wblogId: retweet count}
        self.user_retweet_cnt = {}  # {uid: retweet count}

    def loadFollowRelationship(self, workers=8):
        """
        读取用户间的关注关系
        :return: none
        """
        # 读取用户间关注关系
        # 注意spammer关注normal的边需要去除
        # 去除包括user_train里面的这种边 以及 user_prior里面的这种边(user_prior里面根据prior_label来确定)
        logging.info('多进程读取关注关系')
        self.follow_edge = set_follow_edge(self.all_user,
                                           self.all_user,
                                           self.spammer_prior,
                                           self.normal_prior,
                                           workers=workers)

        print("注意啦!!!!!")
        len(list(self.follow_edge.keys()))
        len(self.all_user)
        import operator
        print(operator.eq(list(self.follow_edge.keys()), self.all_user))
        """
        下面一段的注视是原来的代码,因为速度太慢, 我将其改造成了上面的多进程形式
        """
        # logging.info('loading FollowRelationship')
        # for uid in self.all_user[0:8]:
        #     self.follow_edge[uid] = []
        #     for result in self.sqlhelper.select_sql('SELECT uid, followeeUid FROM edge WHERE uid=%s' % uid):
        #         uid = str(result[0])
        #         followeeUid = str(result[1])
        #         if followeeUid not in self.all_user:
        #             continue
        #         if uid in self.spammer_prior and followeeUid in self.normal_prior:
        #             continue
        #         self.follow_edge[uid].append(followeeUid)
        # print('180 ', self.follow_edge)
        #
        # import operator
        # print(operator.eq(follow_edge,self.follow_edge))
        # print(follow_edge)
        # print(len(follow_edge))
        # print(self.follow_edge)
        # print(len(self.follow_edge))

        # Count each user's follows, for convenience in later computations.
        # Only follows among these three-thousand-odd users are counted,
        # not the total follower count.
        for uid in self.all_user:
            self.follow_cnt[uid] = 0
        for uid in self.follow_edge.keys():
            self.follow_cnt[uid] += len(self.follow_edge[uid])

        logging.info('Finished loading follow relationships with multiprocessing!')

    def loadRetweetRelationship(self, workers=8):
        """
        读取用户与微博间的转发关系 以及 微博的转发数 和 用户的转发数
        :return: none
        """
        # 读取转发关系
        # 注意除了wblog表中三个月的微博数据外,还需要考虑spammer对于swblog的转发
        # 本来想根据提交的众包任务来确定spammer与swblog的转发关系的,但是刚发现不行,不行的原因有两点:
        # 1.mission表中没有wblogId,只有微博短id,无法匹配,好像我之前确定swblog的wblogId的时候是一条条人工记录的
        # 2.有一些水军提交任务的时候是浑水摸鱼的,可能啥都没干,也可能贴的错误的回复
        # 所以换一种方法
        # 之前爬评论的时候专门爬取了swblog的评论,就将评论了swblog的用户全部当做转发了
        logging.info('多进程读取转发关系')  # 3884个用户全部处理完大概需要30min
        self.retweet_edge = set_retweet_edge(self.all_user,
                                             self.all_wblog,
                                             workers=workers)
        """
        下面一段的注视是原来的代码,因为速度太慢, 我将其改造成了上面的多进程形式
        """
        # logging.info('non-process!')
        # uid_count = 0
        # for uid in self.all_user[0:80]:
        #     # for uid in all_user_sample:
        #     uid_count = uid_count + 1
        #     if uid_count % 500 == 0:
        #         logging.info("outerloop: {}/{}={}%".format(str(uid_count), str(len(self.all_user)),
        #                                                    str(100.0 * uid_count / len(self.all_user))))
        #     self.retweet_edge[uid] = []
        #     for res in self.sqlhelper.select_sql('SELECT paMid, orMid FROM wblog WHERE uid=%s' % uid):
        #         paMid = str(res[0])
        #         orMid = str(res[1])
        #         if paMid in self.all_wblog:
        #             self.retweet_edge[uid].append(paMid)
        #         if orMid in self.all_wblog:
        #             self.retweet_edge[uid].append(orMid)
        # import operator
        #
        # print(operator.eq(retweet_edge, self.retweet_edge))

        logging.info("retweet_edge...")
        mdb = MongoClient().comment.comment
        for wblogId in self.swblog:
            # for wblogId in sw_sample:
            for res in mdb.find({'wblogId': wblogId}):
                try:
                    uid = res['json_text']['user']['id']
                    if uid in self.retweet_edge.keys():
                        if wblogId not in self.retweet_edge[uid]:
                            self.retweet_edge[uid].append(wblogId)
                except Exception as e:
                    logging.error('%s. The wblogId is %s' % (e, str(wblogId)))

        logging.info('Loading wblog retweet counts')
        # Record each wblog's retweet count, needed later to compute the tie
        # strength between users.
        # print(len(self.all_wblog))
        for wblogId in self.all_wblog:
            self.wblog_retweet_cnt[wblogId] = 0
        for uid in self.retweet_edge.keys():
            for wblogId in self.retweet_edge[uid]:
                self.wblog_retweet_cnt[wblogId] += 1

        # # The block below counts each wblog's total retweets (those counts can be very large)
        #
        # mdb1 = MongoClient().wblog.wblog
        # mdb2 = MongoClient().wblog.swblog

        # suc=0
        # fail=0
        # logging.info('checkpoint!')
        # for wblogId in self.all_wblog:
        #     try:
        #         wblog = mdb1.find_one({'wblogId': wblogId})
        #         self.wblog_retweet_cnt[wblogId] = int(wblog['json_text']['reposts_count'])
        #         wblog = mdb2.find_one({'wblogId': wblogId})
        #         self.wblog_retweet_cnt[wblogId] = int(wblog['json_text']['reposts_count'])
        #         suc = suc + 1
        #         print("LINE:172 | suc: ", suc, "fail: ", fail)
        #     except Exception as e:
        #         fail=fail+1
        #         # logging.error('%s. The wblogId is %s' % (e, str(wblogId)))
        # logging.error('success %s, fail %s' %(str(suc),str(fail)))

        # mdb = MongoClient().wblog.wblog
        #
        # suc = 0
        # fail = 0
        # for wblogId in self.nwblog:
        #     try:
        #         wblog = mdb.find_one({'wblogId': wblogId})
        #         self.wblog_retweet_cnt[wblogId] = int(wblog['json_text']['reposts_count'])
        #         suc = suc + 1
        #     except Exception as e:
        #         fail = fail + 1
        #         # print("LINE:187 | suc: ", suc, "fail: ", fail)
        #         logging.error('%s. The wblogId is %s' % (e, str(wblogId)))
        # logging.error('for wblogId in self.nwblog... success %s, fail %s' % (str(suc), str(fail)))

        # mdb = MongoClient().wblog.swblog
        #
        # suc = 0
        # fail = 0
        # for wblog in mdb.find():
        #     try:
        #         self.wblog_retweet_cnt[wblog['json_text']['id']] = wblog['json_text']['reposts_count']
        #         suc = suc + 1
        #     except Exception as e:
        #         fail = fail + 1
        #         # print("LINE:199 | suc: ", suc, "fail: ", fail)
        #         # logging.error('%s.' % e)
        # logging.error('or wblog in mdb.find():... success %s, fail %s' % (str(suc), str(fail)))

        logging.info('Loading user retweet counts')
        # Likewise, record each user's retweet count for the later wblog tie
        # strength computation. Wblog retweet counts come free in the json
        # while user counts do not, so only these three months are counted.
        for uid in self.all_user:
            self.user_retweet_cnt[uid] = len(self.retweet_edge[uid])
        logging.info('loadRetweetRelationship finished')

    def setRelationIntensity_new(self, type, target, workers=4):
        """
        计算用户间的联系强度 以及 微博间的联系强度,然后将其记录下来
        值得注意的是,不知道只统计了三个月的转发够不够,但再统计两年的转发就有点费时间了
        :return:
        """
        if type == 'user':
            # First build a retweet-edge dict keyed by wblogId,
            # {wblogId: [uid]}, to simplify the computation below.
            retweet_edge = {}
            for uid in self.retweet_edge.keys():
                for wblogId in self.retweet_edge[uid]:
                    if wblogId not in retweet_edge:
                        retweet_edge[wblogId] = []
                    if uid not in retweet_edge[wblogId]:
                        retweet_edge[wblogId].append(uid)
            retweet_cnt = self.wblog_retweet_cnt  # per-wblog retweet counts
            retweet_cnt2 = self.user_retweet_cnt  # per-user retweet counts
        else:
            # For wblog tie strength, the original retweet edges can be used directly.
            retweet_edge = self.retweet_edge
            retweet_cnt = self.user_retweet_cnt
            retweet_cnt2 = self.wblog_retweet_cnt

        # Then compute the pairwise tie strength, conceptually a square
        # matrix S (users x users). Since the matrix is written straight to
        # a file, S is never actually materialized.
        # The same applies to wblogs.

        compute_relation_intensity(type,
                                   target,
                                   retweet_cnt,
                                   retweet_cnt2,
                                   retweet_edge,
                                   self.file_name_appendix,
                                   workers=workers)

        # nc = 0  # number of nonzero entries in the square matrix S
        #
        # with open('../main/relation_intensity/%s' % type + self.file_name_appendix + '.txt', 'w') as my_file:
        #     for i in range(len(target)):
        #         """
        #         This part is far too slow; think about a multiprocessing implementation.
        #         """
        #         id1 = target[i]
        #         for j in range(i + 1, len(target)):
        #             id2 = target[j]
        #             # Compute the tie strength between id1 and id2
        #             s = 0.0
        #             if retweet_cnt2[id1] == 0 or retweet_cnt2[id2] == 0:
        #                 s = 0.0
        #             else:
        #                 for key in retweet_edge.keys():
        #                     if len(retweet_edge[key]) == 0:
        #                         continue
        #                     if id1 in retweet_edge[key] and id2 in retweet_edge[key]:
        #                         s += 1.0 / float(retweet_cnt[key])
        #             if s != 0.0:
        #                 nc += 1
        #                 my_file.write('%s %s %s\n' % (id1, id2, str(s)))
        # logging.info('%s, nc=%s' % (type, str(nc)))

    def setRelationIntensity_old(self, type, target):
        """
        读取记录下来的用户间的联系强度 以及 微博间的联系强度
        首先保存为稀疏矩阵A,然后计算A^T*A后保存为正常矩阵,再记录到文件中
        :return:
        """
        with open(
                '../main/relation_intensity/%s' % type +
                self.file_name_appendix + '.txt', 'r') as my_file:
            row_and_column = len(my_file.readlines())
        A = sparse.lil_matrix((row_and_column, len(target)))
        with open(
                '../main/relation_intensity/%s' % type +
                self.file_name_appendix + '.txt', 'r') as my_file:
            cnt = 0
            if type == 'user':
                retweet_cnt = self.user_retweet_cnt
            else:
                retweet_cnt = self.wblog_retweet_cnt
            for line in my_file:
                line = line.split('\n')[0]
                id1 = line.split(' ')[0]
                id2 = line.split(' ')[1]
                index1 = target.index(id1)
                index2 = target.index(id2)
                ri = line.split(' ')[2]

                A[cnt, index1] = pow(float(ri) / float(retweet_cnt[id1]), 0.5)
                A[cnt, index2] = 0.0 - pow(
                    float(ri) / float(retweet_cnt[id2]), 0.5)
                cnt += 1

        logging.info('setRelationIntensity_old read file finished')
        if type == 'user':
            sparse.save_npz(
                '../main/relation_intensity/A' + self.file_name_appendix,
                A.tocoo())
            logging.info('save A finished')
        else:
            sparse.save_npz(
                '../main/relation_intensity/B' + self.file_name_appendix,
                A.tocoo())
            logging.info('save B finished')
        ATA = A.T.dot(A).tocoo()
        logging.info('setRelationIntensity_old ATA finished')
        if type == 'user':
            sparse.save_npz(
                '../main/relation_intensity/ATA' + self.file_name_appendix,
                ATA)
            logging.info('save ATA finished')
        else:
            sparse.save_npz(
                '../main/relation_intensity/BTB' + self.file_name_appendix,
                ATA)
            logging.info('save BTB finished')

    def setRelationIntensity(self, reset_dataset=False, workers=4):
        """
        reset_dataset为True的时候
        调用setRelationIntensity_new(user)和setRelationIntensity_new(wblog)

        reset_dataset为False的时候
        调用setRelationIntensity_old(user)和setRelationIntensity_old(wblog)
        :return:

        A
        B
        ATA
        BTB
        '../main/relation_intensity/user.txt'
        '../main/relation_intensity/wblog.txt'
        """
        # self.loadRetweetRelationship()
        """
        上面这个被我单独调用了, 见主程序
        """
        if reset_dataset:
            logging.info('setRelationIntensity_new------user')
            self.setRelationIntensity_new('user',
                                          self.all_user,
                                          workers=workers)
            logging.info('setRelationIntensity_new------wblog')
            self.setRelationIntensity_new('wblog',
                                          self.all_wblog,
                                          workers=workers)
            logging.info('setRelationIntensity_new------finished')

            logging.info('setRelationIntensity_old------user')
            self.setRelationIntensity_old('user', self.all_user)
            logging.info('setRelationIntensity_old------wblog')
            self.setRelationIntensity_old('wblog', self.all_wblog)
            logging.info('setRelationIntensity_old------finished')

    def setLaplacian(self):
        """
        计算拉普拉斯矩阵L,并保存进文件中
        :return: none
        """
        # 首先要计算用户的pagerank值
        # self.loadFollowRelationship()
        """
        上面这个被我单独调用了, 见主程序
        """
        logging.info('计算pagerank值')

        print("572注意啦啦啦啦!!!!!")
        import operator
        print('572', list(self.follow_edge.keys()))
        print('572', self.all_user)
        print('572', operator.eq(list(self.follow_edge.keys()), self.all_user))

        page_ranks = PRMapReduce(nodes=self.all_user,
                                 edge=self.follow_edge).page_rank()
        # Build the diagonal PageRank matrix PI
        PI = sparse.lil_matrix((len(self.all_user), len(self.all_user)))
        for i in range(len(self.all_user)):
            PI[i, i] = float(page_ranks[self.all_user[i]][0])
        # Build the transition-probability matrix P
        P = sparse.lil_matrix((len(self.all_user), len(self.all_user)))
        for uid in self.follow_edge.keys():
            for followeeUid in self.follow_edge[uid]:
                P[self.all_user.index(uid),
                  self.all_user.index(followeeUid)] = 1.0 / float(
                      self.follow_cnt[uid]) * 0.85
        for i in range(len(self.all_user)):
            for j in range(len(self.all_user)):
                P[i, j] += 0.15 * 1.0 / len(self.all_user)
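        # Note: P above is the PageRank transition matrix (damping 0.85 plus
        # a uniform teleport term 0.15/n). The code below builds
        # L = PI - (PI*P + P^T*PI)/2, an unnormalized directed-graph
        # Laplacian weighted by the stationary distribution PI; the
        # commented-out formula is its normalized variant.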
        # Compute the Laplacian matrix L
        I = sparse.identity(len(self.all_user))
        # L = I - (PI.power(0.5) * P * PI.power(-0.5) + PI.power(-0.5) * P.T * PI.power(0.5)).dot(0.5)
        L = PI - (PI.dot(P) + P.T.dot(PI)).dot(0.5)
        L = L.tocoo()
        # Write to file
        sparse.save_npz(
            '../main/relation_intensity/L' + self.file_name_appendix, L)
        logging.info('setLaplacian finished')

    def setRetweetMatrix(self):
        """
        Build the retweet matrix R and save it to a file.
        :return: none
        """
        # self.loadRetweetRelationship()
        # The call above is made separately; see the main program.
        # Build the retweet matrix R
        R = sparse.lil_matrix((len(self.all_user), len(self.all_wblog)))
        for uid in self.retweet_edge.keys():
            for wblogId in self.retweet_edge[uid]:
                R[self.all_user.index(uid),
                  self.all_wblog.index(wblogId)] = 1.0
        R = R.tocoo()
        # Write to file
        sparse.save_npz(
            '../main/relation_intensity/R' + self.file_name_appendix, R)
        logging.info('setRetweetMatrix finished')

    def run(self, lenda1, lenda2, alpha, beta, gamma, theta, iteration_limit,
            change_limit):
        """
        跑MSCA算法
        :return:
        """
        # 首先确定x和y向量
        li = []
        for uid in self.user_train_list:
            li.append(float(self.user_train_dict[uid]['label']))
        for uid in self.user_prior_list:
            li.append(float(self.user_prior_dict[uid]['prior']))
            # li.append(-1)
        self.x_p = numpy.array(li)
        logging.info('user num: %s' % str(len(li)))
        li = []
        for wblogId in self.wblog_train_list:
            li.append(float(self.wblog_train_dict[wblogId]['label']))
        for wblogId in self.wblog_prior_list:
            li.append(float(self.wblog_prior_dict[wblogId]['prior']))
            # li.append(-1)
        self.y_p = numpy.array(li)
        logging.info('wblog num: %s' % str(len(li)))

        # Load the retweet matrix
        self.R = sparse.load_npz('../main/relation_intensity/R' +
                                 self.file_name_appendix + '.npz')
        # Next, compute the inverse matrices used in the x and y updates
        logging.info('Computing the inverse matrix for the x update')
        self.I1 = sparse.identity(len(self.all_user))
        self.ATA = sparse.load_npz('../main/relation_intensity/ATA' +
                                   self.file_name_appendix + '.npz')
        self.L = sparse.load_npz('../main/relation_intensity/L' +
                                 self.file_name_appendix + '.npz')
        logging.info('Computing the inverse matrix for the y update')
        self.I2 = sparse.identity(len(self.all_wblog))
        self.BTB = sparse.load_npz('../main/relation_intensity/BTB' +
                                   self.file_name_appendix + '.npz')
        self.A = sparse.load_npz('../main/relation_intensity/A' +
                                 self.file_name_appendix + '.npz')
        self.B = sparse.load_npz('../main/relation_intensity/B' +
                                 self.file_name_appendix + '.npz')

        # # First build the x and y prior vectors
        # li = []
        # for uid in self.user_train_list:
        #     li.append(float(self.user_train_dict[uid]['label']))
        # for uid in self.user_prior_list:
        #     li.append(float(self.user_prior_dict[uid]['prior_label']))
        # x_p = numpy.array(li)
        # logging.info('user num: %s' % str(len(li)))
        # li = []
        # for wblogId in self.wblog_train_list:
        #     li.append(float(self.wblog_train_dict[wblogId]['label']))
        # for wblogId in self.wblog_prior_list:
        #     li.append(float(self.wblog_prior_dict[wblogId]['prior_label']))
        # y_p = numpy.array(li)
        # logging.info('wblog num: %s' % str(len(li)))
        #
        # # Load the retweet matrix
        # R = sparse.load_npz('relation_intensity\\R.npz')
        #
        # # Then compute the inverse matrices used in the x and y updates
        # logging.info('Computing the inverse matrix for the x update')
        # I1 = sparse.identity(len(self.all_user))
        # ATA = sparse.load_npz('relation_intensity\\ATA.npz')
        # L = sparse.load_npz('relation_intensity\\L.npz')
        # xm = I1.dot(2.0 * lenda1) + ATA.dot(2.0 * alpha) + L.dot(2.0 * theta)
        # xm = linalg.inv(xm.toarray())
        # logging.info('Computing the inverse matrix for the y update')
        # I2 = sparse.identity(len(self.all_wblog))
        # BTB = sparse.load_npz('relation_intensity\\BTB.npz')
        # ym = I2.dot(2.0 * lenda2) + BTB.dot(2.0 * beta)
        # ym = linalg.inv(ym.toarray())
        #
        # A = sparse.load_npz('relation_intensity\\A.npz')
        # B = sparse.load_npz('relation_intensity\\B.npz')

        li = []
        for uid in self.all_user:
            li.append(0.0)
        w_o = numpy.array(li)
        C = sparse.lil_matrix((len(self.all_user), len(self.all_user)))
        for i in range(len(self.user_train_list)):
            C[i, i] = float(1.0)
        li = []
        for uid in self.user_train_list:
            li.append(float(self.user_train_dict[uid]['label']))
        for uid in self.user_prior_list:
            li.append(0.0)
        u = numpy.array(li)
        luo_x = 20.05
        xm = self.I1.dot(2.0 * lenda1) + self.ATA.dot(
            2.0 * alpha) + self.L.dot(2.0 * theta) + C.T.dot(C).dot(luo_x)

        # xm = self.I1.dot(2.0 * lenda1) + self.ATA.dot(2.0 * alpha) + self.L.dot(2.0 * theta)
        xm = linalg.inv(xm.toarray())

        li = []
        for wblogId in self.all_wblog:
            li.append(0.0)
        m_o = numpy.array(li)
        D = sparse.lil_matrix((len(self.all_wblog), len(self.all_wblog)))
        for i in range(len(self.wblog_train_list)):
            D[i, i] = float(1.0)
        li = []
        for wblogId in self.wblog_train_list:
            li.append(float(self.wblog_train_dict[wblogId]['label']))
        for wblogId in self.wblog_prior_list:
            li.append(0.0)
        m = numpy.array(li)
        luo_y = 5.05749
        ym = self.I2.dot(2.0 * lenda2) + self.BTB.dot(
            2.0 * beta) + D.T.dot(D).dot(luo_y)

        # ym = self.I2.dot(2.0 * lenda2) + self.BTB.dot(2.0 * beta)
        ym = linalg.inv(ym.toarray())
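        # Both subproblems are quadratic, so each update solves a fixed linear
        # system. Setting the gradient of the x-subproblem to zero gives
        #   (2*lenda1*I1 + 2*alpha*A^T A + 2*theta*L + luo_x*C^T C) x
        #       = 2*lenda1*x_p + gamma*R y + luo_x*C^T u - C^T w,
        # and similarly for y with I2, B^T B, D, and luo_y. The coefficient
        # matrices stay constant across iterations, so their inverses (xm, ym)
        # are computed once here and each inner step below reduces to a single
        # matrix-vector product.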

        # start iterating
        logging.info('start iterating')
        iteration = 0
        x = self.x_p
        y = self.y_p
        cnt1 = 0
        cnt2 = 0
        while True:
            iteration += 1
            logging.info('iteration: %s' % str(iteration))
            if iteration > iteration_limit:
                break
            self.getFun(lenda1, lenda2, alpha, beta, gamma, theta, x, self.x_p,
                        y, self.y_p, self.A, self.B, self.R, self.L)

            iteration_x = 0
            w = w_o
            tmp = x
            while True:
                iteration_x += 1
                if iteration_x > 1000:
                    break
                x_next = xm.dot(
                    self.x_p.dot(2 * lenda1) + self.R.dot(gamma).dot(y) +
                    C.T.dot(u).dot(luo_x) - C.T.dot(w))
                w_next = w + C.dot(x_next) - u
                change = self.getChange(tmp, x_next, w, w_next)
                tmp = x_next
                w = w_next
                # print(change)
                if change <= change_limit:
                    break
                cnt1 += 1
            # x_next = xm.dot(self.x_p.dot(2 * lenda1) + self.R.dot(gamma).dot(y))

            iteration_y = 0
            w = m_o
            tmp = y
            while True:
                iteration_y += 1
                if iteration_y > 100:
                    break
                y_next = ym.dot(
                    self.y_p.dot(2 * lenda2) +
                    self.R.T.dot(gamma).dot(x_next) + D.T.dot(m).dot(luo_y) -
                    D.T.dot(w))
                w_next = w + D.dot(y_next) - m
                change = self.getChange(tmp, y_next, w, w_next)
                tmp = y_next
                w = w_next
                if change <= change_limit:
                    break
                cnt2 += 1

            # y_next = ym.dot(self.y_p.dot(2 * lenda2) + self.R.T.dot(gamma).dot(x_next))

            change = self.getChange(x, x_next, y, y_next)
            logging.info('change: %s' % str(change))
            if change <= change_limit:
                break
            x = x_next
            y = y_next

            # for i in range(len(self.user_train_list)):
            #     x[i] = float(self.user_train_dict[self.user_train_list[i]]['label'])
            # for i in range(len(self.wblog_train_list)):
            #     y[i] = float(self.wblog_train_dict[self.wblog_train_list[i]]['label'])

        logging.info('iteration finished')
        print(cnt1)
        print(cnt2)
        # write the results to file
        numpy.savetxt('res_user' + self.file_name_appendix + '.txt', x)
        numpy.savetxt('res_wblog' + self.file_name_appendix + '.txt', y)

    def getChange(self, x, x_next, y, y_next):
        """
        Compute the change between consecutive iterates (sum of L1 distances)
        :param x:
        :param x_next:
        :param y:
        :param y_next:
        :return: change
        """
        return linalg.norm(x - x_next, 1) + linalg.norm(y - y_next, 1)
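    # Note: getChange also serves as the stopping criterion of the inner ADMM
    # loops above, where it is called as getChange(tmp, x_next, w, w_next) and
    # therefore measures the movement of the dual variable w as well.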

    def getFun(self, lenda1, lenda2, alpha, beta, gamma, theta, x, x_p, y, y_p,
               A, B, R, L):
        """
        Compute and log the value of the loss function
        :return: none (the loss is only logged)
        """
        # print(pow(lenda1 * linalg.norm(x - x_p, 2), 2))
        # print(pow(lenda2 * linalg.norm(y - y_p, 2), 2))
        # print(pow(alpha * linalg.norm(A.dot(x), 2), 2))
        # print(pow(beta * linalg.norm(B.dot(y), 2), 2))
        # print(0.0 - gamma * R.T.dot(x).dot(y))
        # print(theta * L.T.dot(x).dot(x))

        res = pow(lenda1 * linalg.norm(x - x_p, 2), 2)
        res += pow(lenda2 * linalg.norm(y - y_p, 2), 2)
        res += pow(alpha * linalg.norm(A.dot(x), 2), 2)
        res += pow(beta * linalg.norm(B.dot(y), 2), 2)
        res -= gamma * R.T.dot(x).dot(y)
        res += theta * L.T.dot(x).dot(x)
        logging.info('Function loss: %s' % str(res))

    def evaluation_bak(self):
        """
        Evaluate the results of the MSCA algorithm (user side only)
        :return:
        """
        logging.info('user results')
        scores = []
        cnt = 0
        with open('../main/res_user' + self.file_name_appendix + '.txt',
                  'r') as my_file:
            for line in my_file:
                score = float(line.split('\n')[0])
                if self.all_user[cnt] in self.user_prior_list:
                    scores.append(score)
                cnt += 1
        logging.info('min_score: %s, max_score: %s, len(user):%s' %
                     (str(min(scores)), str(max(scores)), str(len(scores))))
        test_result = []
        for uid in self.user_prior_list:
            test_result.append(int(self.user_prior_dict[uid]['label']))
        user_res = Evaluation.evaluation_self(scores, test_result)

        # roc
        fpr, tpr, thresholds = metrics.roc_curve(test_result, scores)
        logging.info('user AUC:%s' % str(metrics.auc(fpr, tpr)))
        with open('../main/user_roc' + self.file_name_appendix + '.txt',
                  'w') as my_file:
            my_file.write('fpr tpr\n')
            for i in range(len(fpr)):
                my_file.write('%s %s\n' % (str(fpr[i]), str(tpr[i])))

    def evaluation(self):
        """
        Evaluate the results of the MSCA algorithm
        :return:
        """
        logging.info('user results')
        scores = []
        cnt = 0
        with open('../main/res_user' + self.file_name_appendix + '.txt',
                  'r') as my_file:
            for line in my_file:
                score = float(line.split('\n')[0])
                if self.all_user[cnt] in self.user_prior_list:
                    scores.append(score)
                cnt += 1
        logging.info('min_score: %s, max_score: %s, len(user):%s' %
                     (str(min(scores)), str(max(scores)), str(len(scores))))
        test_result = []
        for uid in self.user_prior_list:
            test_result.append(int(self.user_prior_dict[uid]['label']))
        user_res = Evaluation.evaluation_self(scores, test_result)

        # ap
        p, r, thresholds = metrics.precision_recall_curve(test_result, scores)
        ap = metrics.average_precision_score(test_result, scores)
        logging.info('user AP:%s' % str(ap))
        with open('../main/user_ap' + self.file_name_appendix + '.txt',
                  'w') as my_file:
            my_file.write('p r\n')
            for i in range(len(p)):
                my_file.write('%s %s\n' % (str(p[i]), str(r[i])))

        # roc
        fpr, tpr, thresholds = metrics.roc_curve(test_result, scores)
        logging.info('user AUC:%s' % str(metrics.auc(fpr, tpr)))
        with open('../main/user_roc' + self.file_name_appendix + '.txt',
                  'w') as my_file:
            my_file.write('fpr tpr\n')
            for i in range(len(fpr)):
                my_file.write('%s %s\n' % (str(fpr[i]), str(tpr[i])))

        # top k precision
        worker_score = {}
        for i in range(len(scores)):
            worker_score[self.user_prior_list[i]] = scores[i]
        worker_score = sorted(worker_score.items(),
                              key=lambda im: float(im[1]),
                              reverse=True)
        with open('../main/res_user_top' + self.file_name_appendix + '.txt',
                  'w') as my_file:
            my_file.write('type uid score precision top_k\n')
            worker_count_now = 0
            top_k = 0
            for itm in worker_score:
                uid = itm[0]
                score = itm[1]
                if uid in self.spammer:
                    u_type = 'w'
                    worker_count_now += 1
                else:
                    u_type = 'n'
                top_k += 1
                precision = str(float(worker_count_now) / top_k)
                my_file.write(u_type + ' ' + str(uid) + ' ' + str(score) +
                              ' ' + precision + ' ' + str(top_k) + '\n')
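        # 'precision' above is precision@k: the share of labeled spammers among
        # the k highest-scoring users. For example, if 8 of the top 10 users are
        # labeled 'w', the row for top_k=10 records a precision of 0.8.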

        logging.info('wblog results')
        scores = []
        cnt = 0
        with open('../main/res_wblog' + self.file_name_appendix + '.txt',
                  'r') as my_file:
            for line in my_file:
                score = float(line.split('\n')[0])
                if self.all_wblog[cnt] in self.wblog_prior_list:
                    scores.append(score)
                cnt += 1
        logging.info('min_score: %s, max_score: %s, len(wblog):%s' %
                     (str(min(scores)), str(max(scores)), str(len(scores))))
        test_result = []
        for wblogId in self.wblog_prior_list:
            test_result.append(int(self.wblog_prior_dict[wblogId]['label']))
        wblog_res = Evaluation.evaluation_self(scores, test_result)

        # top k precision
        wblog_score = {}
        for i in range(len(scores)):
            wblog_score[self.wblog_prior_list[i]] = scores[i]
        wblog_score = sorted(wblog_score.items(),
                             key=lambda im: float(im[1]),
                             reverse=True)
        with open('../main/res_wblog_top' + self.file_name_appendix + '.txt',
                  'w') as my_file:
            my_file.write('type wblogId score precision top_k\n')
            wblog_count_now = 0
            top_k = 0
            for itm in wblog_score:
                wblogId = itm[0]
                score = itm[1]
                if wblogId in self.swblog:
                    u_type = 's'
                    wblog_count_now += 1
                else:
                    u_type = 'n'
                top_k += 1
                precision = str(float(wblog_count_now) / top_k)
                my_file.write(u_type + ' ' + str(wblogId) + ' ' + str(score) +
                              ' ' + precision + ' ' + str(top_k) + '\n')

        # ap
        p, r, thresholds = metrics.precision_recall_curve(test_result, scores)
        ap = metrics.average_precision_score(test_result, scores)
        logging.info('wblog AP:%s' % str(ap))
        with open('../main/wblog_ap' + self.file_name_appendix + '.txt',
                  'w') as my_file:
            my_file.write('p r\n')
            for i in range(len(p)):
                my_file.write('%s %s\n' % (str(p[i]), str(r[i])))

        # roc
        fpr, tpr, thresholds = metrics.roc_curve(test_result, scores)
        logging.info('wblog AUC:%s' % str(metrics.auc(fpr, tpr)))
        with open('../main/wblog_roc' + self.file_name_appendix + '.txt',
                  'w') as my_file:
            my_file.write('fpr tpr\n')
            for i in range(len(fpr)):
                my_file.write('%s %s\n' % (str(fpr[i]), str(tpr[i])))

        return user_res, wblog_res

    def show(self):
        """
        For UI display: sample a small subgraph and collect its retweet edges
        :return:
        """
        self.all_user = random.sample(self.all_user, 500)
        self.all_wblog = random.sample(self.all_wblog, 500)
        for uid in self.all_user:
            self.retweet_edge[uid] = []
            for res in self.sqlhelper.select_sql(
                    'SELECT paMid, orMid FROM wblog WHERE uid=%s' % uid):
                paMid = str(res[0])
                orMid = str(res[1])
                if paMid in self.all_wblog:
                    self.retweet_edge[uid].append(paMid)
                if orMid in self.all_wblog:
                    self.retweet_edge[uid].append(orMid)
        mdb = MongoClient().comment.comment
        for wblogId in self.swblog:
            for res in mdb.find({'wblogId': wblogId}):
                try:
                    uid = res['json_text']['user']['id']
                    if uid in self.retweet_edge.keys():
                        if wblogId not in self.retweet_edge[uid]:
                            self.retweet_edge[uid].append(wblogId)
                except Exception as e:
                    logging.error('%s. The wblogId is %s' % (e, str(wblogId)))
Exemple #27
class UserFeature:
    def __init__(self, h, d, u, p,
                 c):  # 'localhost', 'sdh', 'root', 'root', 'utf8'
        self.host = h
        self.db = d
        self.user = u
        self.passwd = p
        self.charset = c

    def __enter__(self):
        self.sqlhelper = SqlHelper(host=self.host,
                                   db=self.db,
                                   user=self.user,
                                   passwd=self.passwd,
                                   charset=self.charset)
        self.mdb = MongoClient().userFeature
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.sqlhelper.close()

    def arrangeFeatures(self):
        """
        Merge the separate feature tables into one table, making later pandas operations easier
        :return:
        """
        col = self.mdb.features
        if not col.find_one():
            logging.info('features is empty')
            col.create_index([('uid', pymongo.DESCENDING)], unique=True)

    def setRegisterDay(self):
        """
        Set each user's registration age in days, and its log10 value
        :return: none
        """
        col = self.mdb.registerDay
        if not col.find_one():
            logging.info('registerDay is empty; setting uid as the unique key')
            col.create_index([('uid', pymongo.DESCENDING)], unique=True)

        # spammers = self.sqlhelper.select_sql_one('SELECT uid FROM spammer')
        # normal = self.sqlhelper.select_sql_one('SELECT uid FROM normal WHERE choose="yes"')
        """我的修改:
        事实上,如果把choose='yes'去掉, 那么mongodb里存储的就是所有的14774个账号的了。
        """
        spammers = self.sqlhelper.select_sql_one('SELECT uid FROM spammer')
        normal = self.sqlhelper.select_sql_one(
            'SELECT uid FROM normal WHERE choose="yes"')
        unknown = self.sqlhelper.select_sql_one(
            'SELECT uid FROM normal WHERE choose="not"')
        final_user = self.sqlhelper.select_sql_one(
            'SELECT uid FROM final_user WHERE spammer="yes"')

        for uid in final_user:
            if uid not in spammers:
                spammers.append(uid)
        """
        到这为止, 代码中spammer相当于数据表里spammer U final_user.spammer一共有903
        """
        # for some unknown reason the spammer and normal sets overlap,
        # so the overlapping users are simply treated as spammers here
        for uid in spammers:
            if uid in normal:
                normal.remove(uid)
            if uid in unknown:
                unknown.remove(uid)
        """
        到目前为止,我们得到了下面几个有用的东西
        spammer: 水军  
        normal: 正常用户
        unkonwn:还没来得及标注的未知类型用户
        """
        all_user = spammers + normal + unknown

        for uid in all_user:
            try:
                for card in MongoClient().profile.json_text.find_one(
                    {'uid': str(uid)})['json_text']['cards']:
                    if 'card_group' not in card:
                        continue
                    for elem in card['card_group']:
                        if 'item_name' in elem and elem['item_name'] == u'注册时间':
                            t = float(
                                (datetime.datetime(2017, 11, 25) -
                                 datetime.datetime.strptime(
                                     elem['item_content'], '%Y-%m-%d')).days)
                            if uid in spammers:
                                col.insert_one({
                                    'uid': uid,
                                    'spammer': 'true',
                                    'register_day': t,
                                    'log_time': math.log10(t)
                                })
                            elif uid in normal:
                                col.insert_one({
                                    'uid': uid,
                                    'spammer': 'false',
                                    'register_day': t,
                                    'log_time': math.log10(t)
                                })
                            elif uid in unknown:
                                col.insert_one({
                                    'uid': uid,
                                    'spammer': 'unknown',
                                    'register_day': t,
                                    'log_time': math.log10(t)
                                })
                            break
            except Exception as e:
                logging.error('%s. The user is %s' % (e, str(uid)))
        logging.info('setRegisterDay finished')

    def setFollowCnt(self):
        """
        Set each user's followee count and follower count, and their log10 values
        :return: none
        """
        col = self.mdb.followCnt
        if not col.find_one():
            logging.info('followCnt is empty; setting uid as the unique key')
            col.create_index([('uid', pymongo.DESCENDING)], unique=True)

        # spammers = self.sqlhelper.select_sql_one('SELECT uid FROM spammer')
        # normal = self.sqlhelper.select_sql_one('SELECT uid FROM normal WHERE choose="yes"')

        spammers = self.sqlhelper.select_sql_one('SELECT uid FROM spammer')
        normal = self.sqlhelper.select_sql_one(
            'SELECT uid FROM normal WHERE choose="yes"')
        unknown = self.sqlhelper.select_sql_one(
            'SELECT uid FROM normal WHERE choose="not"')
        final_user = self.sqlhelper.select_sql_one(
            'SELECT uid FROM final_user WHERE spammer="yes"')

        for uid in final_user:
            if uid not in spammers:
                spammers.append(uid)
        """
        到这为止, 代码中spammer相当于数据表里spammer U final_user.spammer一共有903
        """
        # for some unknown reason the spammer and normal sets overlap,
        # so the overlapping users are simply treated as spammers here
        for uid in spammers:
            if uid in normal:
                normal.remove(uid)
            if uid in unknown:
                unknown.remove(uid)
        """
        到目前为止,我们得到了下面几个有用的东西
        spammer: 水军  
        normal: 正常用户
        unkonwn:还没来得及标注的未知类型用户
        """

        for user in MongoClient().profile.user.find():
            uid = user['uid']
            try:
                if uid in spammers:
                    col.insert_one({
                        'uid':
                        uid,
                        'spammer':
                        'true',
                        'followee_cnt':
                        user['json_text']['follow_count'],
                        'log_followee':
                        math.log10(int(user['json_text']['follow_count'] +
                                       1.0)),
                        'follower_cnt':
                        user['json_text']['followers_count'],
                        'log_follower':
                        math.log10(
                            int(user['json_text']['followers_count'] + 1.0))
                    })
                elif uid in normal:
                    col.insert_one({
                        'uid':
                        uid,
                        'spammer':
                        'false',
                        'followee_cnt':
                        user['json_text']['follow_count'],
                        'log_followee':
                        math.log10(int(user['json_text']['follow_count'] +
                                       1.0)),
                        'follower_cnt':
                        user['json_text']['followers_count'],
                        'log_follower':
                        math.log10(
                            int(user['json_text']['followers_count'] + 1.0))
                    })
                elif uid in unknown:
                    col.insert_one({
                        'uid':
                        uid,
                        'spammer':
                        'unknown',
                        'followee_cnt':
                        user['json_text']['follow_count'],
                        'log_followee':
                        math.log10(int(user['json_text']['follow_count'] +
                                       1.0)),
                        'follower_cnt':
                        user['json_text']['followers_count'],
                        'log_follower':
                        math.log10(
                            int(user['json_text']['followers_count'] + 1.0))
                    })
            except Exception as e:
                logging.error('%s. The user is %s' % (e, str(uid)))
        logging.info('setFollowCnt finished')

    def setRVP(self):
        """
        Set each user's reciprocal (mutual) follow ratio
        :return: none
        """
        col = self.mdb.rvp
        if not col.find_one():
            logging.info('rvp is empty; setting uid as the unique key')
            col.create_index([('uid', pymongo.DESCENDING)], unique=True)

        # spammers = self.sqlhelper.select_sql_one('SELECT uid FROM spammer')
        # normal = self.sqlhelper.select_sql_one('SELECT uid FROM normal WHERE choose="yes"')
        # all_user = spammers + normal
        """我的修改:
                事实上,如果把choose='yes'去掉, 那么mongodb里存储的就是所有的14774个账号的了。
                """
        spammers = self.sqlhelper.select_sql_one('SELECT uid FROM spammer')
        normal = self.sqlhelper.select_sql_one(
            'SELECT uid FROM normal WHERE choose="yes"')
        unknown = self.sqlhelper.select_sql_one(
            'SELECT uid FROM normal WHERE choose="not"')
        final_user = self.sqlhelper.select_sql_one(
            'SELECT uid FROM final_user WHERE spammer="yes"')

        for uid in final_user:
            if uid not in spammers:
                spammers.append(uid)
        """
        到这为止, 代码中spammer相当于数据表里spammer U final_user.spammer一共有903
        """
        # for some unknown reason the spammer and normal sets overlap,
        # so the overlapping users are simply treated as spammers here
        for uid in spammers:
            if uid in normal:
                normal.remove(uid)
            if uid in unknown:
                unknown.remove(uid)
        """
        到目前为止,我们得到了下面几个有用的东西
        spammer: 水军  
        normal: 正常用户
        unkonwn:还没来得及标注的未知类型用户
        """
        all_user = spammers + normal + unknown

        edge = {}
        for uid in all_user:
            for result in self.sqlhelper.select_sql(
                    'SELECT uid, followeeUid FROM edge WHERE uid=%s' % uid):
                if result[0] in edge.keys():
                    edge[result[0]].append(result[1])
                else:
                    edge[result[0]] = [result[1]]
        edge_reverse = {}
        for uid in all_user:
            for result in self.sqlhelper.select_sql(
                    'SELECT uid, followeeUid FROM edge WHERE followeeUid=%s' %
                    uid):
                if result[1] in edge_reverse.keys():
                    edge_reverse[result[1]].append(result[0])
                else:
                    edge_reverse[result[1]] = [result[0]]

        for uid in all_user:
            res = UserFeature.caculate_rvp_ratio(int(uid), edge, edge_reverse)
            try:
                if uid in spammers:
                    col.insert_one({
                        'uid': uid,
                        'spammer': 'true',
                        'rvp_ratio': str(res)
                    })
                elif uid in normal:
                    col.insert_one({
                        'uid': uid,
                        'spammer': 'false',
                        'rvp_ratio': str(res)
                    })
                elif uid in unknown:
                    col.insert_one({
                        'uid': uid,
                        'spammer': 'unknown',
                        'rvp_ratio': str(res)
                    })
            except Exception as e:
                logging.error('%s. The user is %s' % (e, str(uid)))
        logging.info('setRVP finished')

    def setOriThirdFre(self):
        """
        Set how frequently each user posts wblogs from third-party apps
        :return: none
        """
        third_party = ('推兔', '好保姆', '互粉派对 ', '优推推互粉', '未通过审核应用', '互粉加加',
                       '互粉小助手', '孔明社交管理', '互粉赏金榜', '推米互粉', '多推', '互粉一族',
                       '推兔手机版', '推啊')

        col = self.mdb.oriThirdFre
        if not col.find_one():
            logging.info('oriThirdFre is empty; setting uid as the unique key')
            col.create_index([('uid', pymongo.DESCENDING)], unique=True)

        ori_cnt = 0
        thi_cnt = 0
        ori_cnt_re = 0
        thi_cnt_re = 0
        # spammers = self.sqlhelper.select_sql_one('SELECT uid FROM spammer')
        # normal = self.sqlhelper.select_sql_one('SELECT uid FROM normal WHERE choose="yes"')
        spammers = self.sqlhelper.select_sql_one('SELECT uid FROM spammer')
        normal = self.sqlhelper.select_sql_one(
            'SELECT uid FROM normal WHERE choose="yes"')
        unknown = self.sqlhelper.select_sql_one(
            'SELECT uid FROM normal WHERE choose="not"')
        final_user = self.sqlhelper.select_sql_one(
            'SELECT uid FROM final_user WHERE spammer="yes"')

        for uid in final_user:
            if uid not in spammers:
                spammers.append(uid)
        """
        到这为止, 代码中spammer相当于数据表里spammer U final_user.spammer一共有903
        """
        # for some unknown reason the spammer and normal sets overlap,
        # so the overlapping users are simply treated as spammers here
        for uid in spammers:
            if uid in normal:
                normal.remove(uid)
            if uid in unknown:
                unknown.remove(uid)
        """
        到目前为止,我们得到了下面几个有用的东西
        spammer: 水军  
        normal: 正常用户
        unkonwn:还没来得及标注的未知类型用户
        """

        for user in MongoClient().profile.user.find():
            uid = user['uid']
            tmp_ori_cnt = 0  # number of wblogs
            tmp_thi_cnt = 0  # number of third-party wblogs
            tmp_ori_cnt_re = 0  # number of wblogs (retweets excluded)
            tmp_thi_cnt_re = 0  # number of third-party wblogs (retweets excluded)
            for res in self.sqlhelper.select_sql(
                    'SELECT source, retweet_flag FROM wblog WHERE uid=%s' %
                    uid):
                source = res[0]
                retweet_flag = res[1]
                # the check below keeps only original (non-retweeted) wblogs
                if str(retweet_flag) == '0':
                    tmp_ori_cnt_re += 1
                    ori_cnt_re += 1
                    if source in third_party:
                        tmp_thi_cnt_re += 1
                        thi_cnt_re += 1
                tmp_ori_cnt += 1
                ori_cnt += 1
                if source in third_party:
                    tmp_thi_cnt += 1
                    thi_cnt += 1
            try:
                if uid in spammers:
                    col.insert_one({
                        'uid': uid,
                        'spammer': 'true',
                        'ori_cnt-re': tmp_ori_cnt_re,
                        'thi_cnt-re': tmp_thi_cnt_re,
                        'ori_cnt': tmp_ori_cnt,
                        'thi_cnt': tmp_thi_cnt
                    })
                elif uid in normal:
                    col.insert_one({
                        'uid': uid,
                        'spammer': 'false',
                        'ori_cnt-re': tmp_ori_cnt_re,
                        'thi_cnt-re': tmp_thi_cnt_re,
                        'ori_cnt': tmp_ori_cnt,
                        'thi_cnt': tmp_thi_cnt
                    })
                elif uid in unknown:
                    col.insert_one({
                        'uid': uid,
                        'spammer': 'unknown',
                        'ori_cnt-re': tmp_ori_cnt_re,
                        'thi_cnt-re': tmp_thi_cnt_re,
                        'ori_cnt': tmp_ori_cnt,
                        'thi_cnt': tmp_thi_cnt
                    })
            except Exception as e:
                logging.error('%s. The user is %s' % (e, str(uid)))

        self.updateOriThirdFre(ori_cnt, thi_cnt, ori_cnt_re, thi_cnt_re)

    def updateOriThirdFre(self, ori_cnt, thi_cnt, ori_cnt_re, thi_cnt_re):
        """
        setOriThirdFre only collects the raw counts,
        so here we compute the final feature values and update them in MongoDB
        :return: none
        """
        col = self.mdb.oriThirdFre
        # ori_cnt = 1525387
        # thi_cnt = 47284
        # ori_cnt_re = 971792
        # thi_cnt_re = 10407

        max_ori = 0
        max_ori_re = 0
        for user in col.find():
            if user['ori_cnt'] > max_ori:
                max_ori = user['ori_cnt']
            if int(user['ori_cnt-re']) > max_ori_re:
                max_ori_re = user['ori_cnt-re']
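        # The smoothed frequency below shrinks each user's own third-party rate
        # toward the global rate: with c = log10(ori_cnt + 1) / log10(max_ori),
        #   fre = c * (thi_cnt / ori_cnt) + (1 - c) * (thi_cnt_all / ori_cnt_all),
        # so users with few wblogs fall back toward the population average.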

        for user in col.find():
            if user['ori_cnt'] == 0:
                fre = float(thi_cnt) / ori_cnt
            else:
                coefficient = math.log10(user['ori_cnt'] +
                                         1.0) / math.log10(max_ori)
                fre = coefficient * (float(user['thi_cnt']) /
                                     user['ori_cnt']) + (1 - coefficient) * (
                                         float(thi_cnt) / ori_cnt)
            col.update_one({'uid': user['uid']}, {'$set': {'fre': fre}})

            if user['ori_cnt'] == 0:
                fre = 0
            else:
                fre = float(user['thi_cnt']) / user['ori_cnt']
            col.update_one({'uid': user['uid']}, {'$set': {'fre_new': fre}})

        for user in col.find():
            if user['ori_cnt-re'] == 0:
                fre_re = float(thi_cnt_re) / ori_cnt_re
            else:
                coefficient = math.log10(user['ori_cnt-re'] +
                                         1.0) / math.log10(max_ori_re)
                fre_re = coefficient * (
                    float(user['thi_cnt-re']) / user['ori_cnt-re']) + (
                        1 - coefficient) * (float(thi_cnt_re) / ori_cnt_re)
            col.update_one({'uid': user['uid']}, {'$set': {'fre-re': fre_re}})

    def setRetweetFre(self):
        """
        Set the follow ratio of each user's retweets
        :return: none
        """
        col = self.mdb.retweetFre
        if not col.find_one():
            logging.info('retweetFre is empty; setting uid as the unique key')
            col.create_index([('uid', pymongo.DESCENDING)], unique=True)

        # spammers = self.sqlhelper.select_sql_one('SELECT uid FROM spammer')
        # normal = self.sqlhelper.select_sql_one('SELECT uid FROM normal WHERE choose="yes"')
        """我的修改:
                      事实上,如果把choose='yes'去掉, 那么mongodb里存储的就是所有的14774个账号的了。
                      """
        spammers = self.sqlhelper.select_sql_one('SELECT uid FROM spammer')
        normal = self.sqlhelper.select_sql_one(
            'SELECT uid FROM normal WHERE choose="yes"')
        unknown = self.sqlhelper.select_sql_one(
            'SELECT uid FROM normal WHERE choose="not"')
        final_user = self.sqlhelper.select_sql_one(
            'SELECT uid FROM final_user WHERE spammer="yes"')

        for uid in final_user:
            if uid not in spammers:
                spammers.append(uid)
        """
        到这为止, 代码中spammer相当于数据表里spammer U final_user.spammer一共有903
        """
        # for some unknown reason the spammer and normal sets overlap,
        # so the overlapping users are simply treated as spammers here
        for uid in spammers:
            if uid in normal:
                normal.remove(uid)
            if uid in unknown:
                unknown.remove(uid)
        """
        到目前为止,我们得到了下面几个有用的东西
        spammer: 水军  
        normal: 正常用户
        unkonwn:还没来得及标注的未知类型用户
        """

        retweet_cnt = 0
        follow_cnt = 0
        onehop_cnt = 0
        for user in MongoClient().profile.user.find():
            uid = user['uid']
            tmp_retweet_cnt = 0
            tmp_follow_cnt = 0
            tmp_onehop_cnt = 0
            for res in self.sqlhelper.select_sql(
                    'SELECT retweet_flag, follow_flag, paMid FROM wblog WHERE uid=%s'
                    % uid):
                retweet_flag = res[0]
                follow_flag = res[1]
                paMid = res[2]
                # the check below keeps only retweets
                if str(retweet_flag) == '0':
                    continue

                tmp_retweet_cnt += 1
                retweet_cnt += 1
                if str(follow_flag) == '1':
                    tmp_follow_cnt += 1
                    follow_cnt += 1
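                # paMid == '0' means no parent wblog is recorded, i.e. the post
                # retweets the original directly (a one-hop retweet)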
                if str(paMid) == '0':
                    tmp_onehop_cnt += 1
                    onehop_cnt += 1
            try:
                if uid in spammers:
                    col.insert_one({
                        'uid': uid,
                        'spammer': 'true',
                        'retweet_cnt': tmp_retweet_cnt,
                        'follow_cnt': tmp_follow_cnt,
                        'onehop_cnt': tmp_onehop_cnt
                    })
                elif uid in normal:
                    col.insert_one({
                        'uid': uid,
                        'spammer': 'false',
                        'retweet_cnt': tmp_retweet_cnt,
                        'follow_cnt': tmp_follow_cnt,
                        'onehop_cnt': tmp_onehop_cnt
                    })
                elif uid in unknown:
                    col.insert_one({
                        'uid': uid,
                        'spammer': 'unknown',
                        'retweet_cnt': tmp_retweet_cnt,
                        'follow_cnt': tmp_follow_cnt,
                        'onehop_cnt': tmp_onehop_cnt
                    })

            except Exception as e:
                logging.error('%s. The user is %s' % (e, str(uid)))
        self.updateRetweetFre(retweet_cnt, follow_cnt, onehop_cnt)

    def updateRetweetFre(self, retweet_cnt, follow_cnt, onehop_cnt):
        """
        setRetweetFre only collects the raw counts,
        so here we compute the final feature values and update them in MongoDB
        :return: none
        """
        col = self.mdb.retweetFre

        # max_retweet_cnt = 0
        # max_follow_cnt = 0
        # max_onehop_cnt = 0
        # for user in col.find():
        #     if int(user['retweet_cnt']) > max_retweet_cnt:
        #         max_retweet_cnt = user['retweet_cnt']
        #     if int(user['follow_cnt']) > max_follow_cnt:
        #         max_follow_cnt = user['follow_cnt']
        #     if int(user['onehop_cnt']) > max_onehop_cnt:
        #         max_onehop_cnt = user['onehop_cnt']
        # spammer = self.sqlhelper.select_sql_one('SELECT uid FROM final_user WHERE spammer="yes"')
        # first compute the follow ratio among retweets
        for user in col.find():
            fre = 0.0
            # if user['retweet_cnt'] == 0:
            #     fre = float(follow_cnt) / retweet_cnt
            # else:
            # coefficient = math.log10(user['retweet_cnt'] + 1.0) / math.log10(max_retweet_cnt)
            # fre = coefficient * (float(user['follow_cnt']) / user['retweet_cnt']) + (1 - coefficient) * (
            # float(follow_cnt) / retweet_cnt)
            if float(user['retweet_cnt']) != 0:
                fre = float(user['follow_cnt']) / float(user['retweet_cnt'])
            # the original `if int(fre) == 0: pass` was dead code and would
            # raise ValueError once fre became a non-integer string, so it is
            # dropped and fre is kept as a float
            col.update_one({'uid': user['uid']}, {'$set': {'follow_fre': fre}})
        # then compute the share of one-hop retweets among all retweets
        for user in col.find():
            fre = 0.0
            # if user['retweet_cnt'] == 0:
            #     fre = float(onehop_cnt) / retweet_cnt
            # else:
            #     coefficient = math.log10(user['retweet_cnt'] + 1.0) / math.log10(max_retweet_cnt)
            #     fre = coefficient * (float(user['onehop_cnt']) / user['retweet_cnt']) + (1 - coefficient) * (
            #         float(onehop_cnt) / retweet_cnt)
            if float(user['retweet_cnt']) != 0:
                fre = float(user['onehop_cnt']) / float(user['retweet_cnt'])
            col.update_one({'uid': user['uid']}, {'$set': {'onehop_fre': fre}})

    @staticmethod
    def caculate_rvp_ratio(user, edge, edge_reverse):
        # edge[u] lists u's followees; edge_reverse[u] lists u's followers.
        # An edge counts as reciprocated when a followee also follows the user
        # back. (The original check `user in edge_reverse[followee]` only
        # re-tested that the user follows the followee, which holds by
        # construction, so the direction is corrected here.)
        reciprocated_edge = 0
        edge_total_count = 0
        if user in edge.keys():
            edge_total_count += len(edge[user])
            for followee in edge[user]:
                if user in edge_reverse.keys():
                    if followee in edge_reverse[user]:
                        reciprocated_edge += 1
        if user in edge_reverse.keys():
            edge_total_count += len(edge_reverse[user])

        if edge_total_count == 0:
            return 0.0
        return float(reciprocated_edge) / float(edge_total_count)
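    # A minimal sketch of caculate_rvp_ratio on toy data (the ids are made up):
    # user 1 follows 2 and 3, and is followed by 2 and 4; only the pair (1, 2)
    # is mutual, so the ratio is 1 / (2 + 2) = 0.25.
    @staticmethod
    def _demo_rvp_ratio():
        edge = {1: [2, 3], 2: [1], 4: [1]}
        edge_reverse = {1: [2, 4], 2: [1], 3: [1]}
        ratio = UserFeature.caculate_rvp_ratio(1, edge, edge_reverse)
        assert abs(ratio - 0.25) < 1e-9
        return ratio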

    def setFF(self):
        """
        Set auxiliary profile features; the active code records whether each user has a non-empty description
        :return: none
        """
        col = self.mdb.followCnt
        sqlhelper = SqlHelper()
        # spammer = sqlhelper.select_sql_one('SELECT uid FROM final_user WHERE spammer="yes"')
        # normal = sqlhelper.select_sql_one('SELECT uid FROM final_user WHERE spammer="no"')

        # cnt_dict = {}
        # profile = MongoClient().profile.json_text
        # for json_text in profile.find():
        #     uid = json_text['uid']
        #     if uid not in spammer and uid not in normal:
        #         continue
        #     cnt = 0
        #     try:
        #         for card in json_text['json_text']['cards']:
        #             try:
        #                 cnt += len(card['card_group'])
        #             except Exception as e:
        #                 pass
        #     except Exception as e:
        #         print('no cards %s' % uid)
        #     cnt_dict[uid] = cnt
        # for key in cnt_dict.keys():
        #     col.update({'uid': str(key)}, {'$set': {'profile': cnt_dict[key]}})
        #
        # followCnt = MongoClient().userFeature.followCnt
        # for user in followCnt.find():
        #     uid = user['uid']
        #     try:
        #         followee_cnt = followCnt.find_one({'uid': str(uid)})['followee_cnt']
        #         follower_cnt = followCnt.find_one({'uid': str(uid)})['follower_cnt']
        #         res = float(followee_cnt) / follower_cnt
        #         col.update({'uid': str(uid)}, {'$set': {'ff': res}})
        #     except Exception as e:
        #         print('no cards %s' % uid)

        uu = MongoClient().profile.user
        for user in uu.find():
            uid = user['uid']
            # if uid in spammer
            try:
                # the user document is already in hand; the original re-fetched
                # it with an extra find_one per user
                if user['json_text']['description'] != '':
                    col.update_one({'uid': str(uid)}, {'$set': {'description': 1}})
                else:
                    col.update_one({'uid': str(uid)}, {'$set': {'description': 0}})
            except Exception as e:
                logging.error('no description field for user %s (%s)' % (uid, e))
Exemple #28
class DetectVC(object):

    def __init__(self, h, d, u, p, c, file_name_appendix=''):
        self.host = h
        self.db = d
        self.user = u
        self.passwd = p
        self.charset = c
        self.sqlhelper = SqlHelper(host=self.host, db=self.db, user=self.user, passwd=self.passwd, charset=self.charset)
        self.file_name_appendix = file_name_appendix

        self.user_train_dict, self.user_train_list, self.user_prior_dict, self.user_prior_list = \
            Alkit.read_prior('../main/prior/user_train' + self.file_name_appendix + '.txt',
                             '../main/prior/user_prior' + self.file_name_appendix + '.txt')
        self.spammer, self.spammer_prior, self.normal, self.normal_prior = Alkit.setSN(self.user_train_dict,
                                                                                       self.user_prior_dict)
        self.seed_worker = []
        for uid in self.user_train_dict.keys():
            if self.user_train_dict[uid]['label'] == '1':
                self.seed_worker.append(uid)
        self.other_worker = []
        for uid in self.user_prior_dict.keys():
            if self.user_prior_dict[uid]['label'] == '1':
                self.other_worker.append(uid)
        self.normal = []
        for uid in self.user_prior_dict.keys():
            if self.user_prior_dict[uid]['label'] == '-1':
                self.normal.append(uid)

        self.all_user = self.seed_worker + self.other_worker + self.normal

        self.follow_edge = []

        for uid in self.all_user:
            for result in self.sqlhelper.select_sql('SELECT uid, followeeUid FROM edge WHERE uid=%s' % uid):
                uid = str(result[0])
                followeeUid = str(result[1])
                if followeeUid not in self.all_user:
                    continue
                self.follow_edge.append((uid, followeeUid))

    def run(self):
        """
        Mainly a call to the HITS algorithm, with slight modifications
        :return: hub, auth
        """
        logging.info('compute hits')
        hub = {}
        auth = {}
        graph = HITSMapReduce(self.all_user, self.follow_edge, self.seed_worker).hits()
        for user in self.all_user:
            hub[user] = graph[user]['hub'][0]
            auth[user] = graph[user]['authority'][0]

        logging.info('user results')
        scores = []
        test_result = []
        for uid in self.user_prior_list:
            test_result.append(int(self.user_prior_dict[uid]['label']))
            scores.append(float(hub[uid]))
        user_res = Evaluation.evaluation_self(scores, test_result)

        # ap
        p, r, thresholds = metrics.precision_recall_curve(test_result, scores)
        ap = metrics.average_precision_score(test_result, scores)
        logging.info('user AP:%s' % str(ap))
        with open('../main/detect_vc/user_ap'+self.file_name_appendix+'.txt', 'w') as my_file:
            my_file.write('p r\n')
            for i in range(len(p)):
                my_file.write('%s %s\n' % (str(p[i]), str(r[i])))

        # roc
        fpr, tpr, thresholds = metrics.roc_curve(test_result, scores)
        logging.info('user AUC:%s' % str(metrics.auc(fpr, tpr)))
        with open('../main/detect_vc/user_roc'+self.file_name_appendix+'.txt', 'w') as my_file:
            my_file.write('fpr tpr\n')
            for i in range(len(fpr)):
                my_file.write('%s %s\n' % (str(fpr[i]), str(tpr[i])))

        # top k precision
        worker_score = {}
        for i in range(len(scores)):
            worker_score[self.user_prior_list[i]] = scores[i]
        worker_score = sorted(worker_score.items(), key=lambda im: float(im[1]), reverse=True)
        with open('../main/detect_vc/res_user_top'+self.file_name_appendix+'.txt', 'w') as my_file:
            my_file.write('type uid score precision top_k\n')
            worker_count_now = 0
            top_k = 0
            for itm in worker_score:
                uid = itm[0]
                score = itm[1]
                if uid in self.spammer:
                    u_type = 'w'
                    worker_count_now += 1
                else:
                    u_type = 'n'
                top_k += 1
                precision = str(float(worker_count_now) / top_k)
                my_file.write(u_type + ' ' + str(uid) + ' ' + str(score) + ' ' + precision + ' ' + str(top_k) + '\n')

        hub = sorted(hub.items(), key=lambda im: float(im[1]), reverse=True)
        with open('../main/detect_vc/hub'+self.file_name_appendix+'.txt', 'w') as my_file:
            my_file.write('type uid hub worker_per total_per\n')
            worker_count_now = 0
            worker_count_all = len(self.other_worker)
            all_count_now = 0
            all_count_all = len(self.all_user) - len(self.seed_worker)
            for itm in hub:
                uid = str(itm[0])
                u_type = '-'
                if uid in self.seed_worker:
                    continue
                if uid in self.other_worker:
                    u_type = 'o'
                    worker_count_now += 1
                if uid in self.normal:
                    u_type = 'n'
                all_count_now += 1

                hub_score = str(itm[1])
                worker_per = str(float(worker_count_now) / worker_count_all)
                total_per = str(float(all_count_now) / all_count_all)
                my_file.write(u_type + ' ' + uid + ' ' + hub_score + ' ' + worker_per + ' ' + total_per + '\n')

        return hub, auth
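# A minimal, self-contained sketch of the seed-biased HITS idea that DetectVC
# relies on. HITSMapReduce is not shown in this file, so this is an assumed
# reading of "HITS, slightly modified": standard hub/authority sweeps, with the
# known seed workers' hub scores pinned to the maximum after each sweep.
def hits_sketch(nodes, edges, seeds, iterations=50):
    out_nb = {n: [] for n in nodes}
    in_nb = {n: [] for n in nodes}
    for u, v in edges:
        out_nb[u].append(v)
        in_nb[v].append(u)
    hub = {n: 1.0 for n in nodes}
    auth = {n: 1.0 for n in nodes}
    for _ in range(iterations):
        # authority: how strongly a node is pointed to by good hubs
        auth = {n: sum(hub[u] for u in in_nb[n]) for n in nodes}
        norm = max(auth.values()) or 1.0
        auth = {n: a / norm for n, a in auth.items()}
        # hub: how strongly a node points to good authorities
        hub = {n: sum(auth[v] for v in out_nb[n]) for n in nodes}
        norm = max(hub.values()) or 1.0
        hub = {n: h / norm for n, h in hub.items()}
        # assumption: seed spammers keep the maximal hub score throughout
        for s in seeds:
            hub[s] = 1.0
    return hub, auth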