コード例 #1
0
def CalculateTFIDF(usercandidate, user_id, table="StandardUsers"):
    '''
    Re-weight the candidate keywords of one user by an IDF-style factor.

    For every other user id returned by ``mysql.getUsersId(table)`` the
    collection returned by ``GenerateWords`` is tested for each lower-cased
    candidate keyword.  The per-keyword counter starts at 1 and is
    incremented once per matching user; the final score of candidate ``i`` is
    ``log(userNumber / counter[i]) * usercandidate[i][1]``.

    :param usercandidate: sequence of items whose element 0 is the keyword
        and element 1 is its numeric weight
    :param user_id: id of the user owning the candidates; skipped while counting
    :param table: table the user ids are read from
    :return: list of scores where position ``i`` belongs to
        ``usercandidate[i]``.  The list is never shorter than 50 entries
        (unused trailing slots keep the initial value 1) so callers see the
        same shape as before, but it now grows when more than 50 candidates
        are passed instead of raising IndexError.
    :raises ValueError: from ``math.log`` when the table holds no users and
        at least one candidate is given
    '''
    userids = mysql.getUsersId(table)
    userNumber = len(userids)
    print("正在计算TFIDF,数据库中共计%d个用户" % (userNumber))

    # Lower-case every keyword once instead of once per user.
    keywords = [candidate[0].lower() for candidate in usercandidate]

    # One counter per candidate; 50 is kept as the minimum length purely for
    # backward compatibility with the previous fixed-size list.
    tfidf = [1] * max(50, len(keywords))

    count = 0
    for userid in userids:
        if userid == user_id:
            continue
        wordsSet = GenerateWords(userid)
        for index, keyword in enumerate(keywords):
            if keyword in wordsSet:
                tfidf[index] += 1
        count += 1
        print(count)

    # Turn each counter into the final score.
    for index, uc in enumerate(usercandidate):
        tfidf[index] = math.log(userNumber * 1.0 / tfidf[index]) * uc[1]
    return tfidf
コード例 #2
0
def GenerateAllUsersInterestTags(table="StandardUsers"):
    '''
    Generate the interest tags of every user in ``table`` and store them.

    Each user returned by ``mysql.getUsersInfo(table)`` is passed to
    ``GenerateInterestsWithFollowers``; the result is printed and written
    back through ``mysql.updateUserInterest`` as UTF-8.  Any exception raised
    for a user is caught, counted and reported, and processing continues with
    the next user.

    :param table: table the users are read from and updated in
    :return: None
    '''
    processed = 0
    failed = 0
    for user in mysql.getUsersInfo(table):
        try:
            tags = GenerateInterestsWithFollowers(user.id)
            print("%s:" % user.id)
            print(tags)
            # Persist the tags for this user.
            mysql.updateUserInterest(table, user.id, tags.encode('utf-8'))
        except Exception:
            failed += 1
            print("lose userid:%s" % user.id)
            print("loss %d users" % failed)
        processed += 1
        print("finished %d users" % processed)
コード例 #3
0
def getUsers(table):
    '''
    Fetch all users stored in ``table``.

    :param table: table name
    :return: whatever ``mysql.getUsersInfo(table)`` returns (documented by
        the original author as a list of TwitterUser objects)
    '''
    return mysql.getUsersInfo(table)
コード例 #4
0
def UserProfileFromDB(userid):
    '''
    Build the profile of a stored user and export it as XML.

    The user record is loaded through ``mysql.getUserInfo``; category,
    interest tags, influence score and psychological state are computed with
    the corresponding ``GetUser*`` helpers, attached to the record and
    printed, and the record is finally handed to ``xml.GenerateUserXml``.
    The elapsed wall-clock time is printed at the end.

    The function returns early (after printing a message) when
    ``mysql.checkUser(userid)`` equals False or when
    ``mongo.getUserTweets(userid)`` equals the empty string.

    :param userid: id of the user to profile
    :return: None
    '''
    began = time.time()

    if mysql.checkUser(userid) == False:
        print("数据库中不存在该用户")
        return
    profile = mysql.getUserInfo(userid)

    # Tweet text is only fetched to verify that the user has any tweets.
    timeline = mongo.getUserTweets(userid)
    if timeline == "":
        print("mongodb中没有该用户的推文")
        return
    print("已获取推文")

    # Field (category) the person belongs to.
    domain = GetUserCategory(userid)
    profile.category = domain
    print("人物所属领域:%s" % domain)

    # Interest tags of the person.
    tags = GetUserInterestTags(userid)
    profile.interest_tags = tags
    print("人物兴趣爱好标签:%s" % tags)

    # Influence score and level; the original comment states the level is
    # one element of {1, 2, 3}.
    total_score, activity, reach, level = GetUserInfluence(userid)
    profile.influenceScore = total_score
    print("人物活跃度分数:%f,影响力度分数:%f,影响力分数:%f,影响力等级:%s" % (
        activity, reach, total_score, level))

    # Psychological state: start time of the analysed tweets, the state
    # sequence over the period and the recent overall state (the original
    # comment states it is one of {1, -1, 0}).
    first_tweet_time, state_sequence, state_code = GetUserPsychology(userid)
    profile.psy_tweets_starttime = first_tweet_time
    profile.psy = state_code
    profile.psy_seq = state_sequence
    # Map the numeric state to its display string for printing only.
    state_label = config.psychological[state_code]
    print("人物近期心理状态:%s" % state_label)

    # Write the XML document.
    xml.GenerateUserXml(profile)

    finished = time.time()
    print("用时:%f" % (finished - began))
コード例 #5
0
def InsertRelsToNeoFromMysql(table="relation_temp"):
    '''
    Insert every relation read from ``table`` via ``InsertFollowsRel``.

    The total number of relations is printed first; a running counter is
    printed after each insertion.

    :param table: table the relations are read from
    :return: None
    '''
    pairs = mysql.getUserRelation(table)
    print(len(pairs))
    # Elements 0 and 1 of each relation are passed on, in that order.
    for inserted, pair in enumerate(pairs, 1):
        InsertFollowsRel(pair[0], pair[1])
        print("insert %d relations" % inserted)
コード例 #6
0
def GetUserInfo(userid):
    '''
    Load the stored record of one user.

    NOTE(review): existence is checked through ``mongo.CheckUser`` while the
    record itself is read through ``mysql.getUserInfo`` -- confirm that
    checking one store and reading the other is intended.

    :param userid: id of the user
    :return: None (after printing a message) when ``mongo.CheckUser(userid)``
        equals False, otherwise the result of ``mysql.getUserInfo(userid)``
        (documented by the original author as a TwitterUser object)
    '''
    exists = mongo.CheckUser(userid)
    if exists == False:
        print("数据库中不存在该用户")
        return None
    return mysql.getUserInfo(userid)
コード例 #7
0
def InsertStandardUsers(table):
    '''
    Copy every user of ``table`` into the ``StandardUsers`` collection.

    ``Conn()`` is presumably a PyMongo database handle (TODO confirm); the
    collection is obtained with ``db['StandardUsers']`` and one document is
    inserted per user returned by ``mysql.getUsersInfo(table)``.  Afterwards
    a unique index on ``user_id`` is requested.

    The index used to be requested with ``collection.ensureIndex`` -- the
    mongo-shell spelling.  On a PyMongo collection that attribute is not a
    method, so the call always ended in the ``except`` branch and the index
    was never built.  ``create_index`` is used instead.

    :param table: MySQL table the users are read from
    :return: None
    '''
    def recode(text):
        # Re-interpret the stored byte string as Latin-1 and emit UTF-8.
        return text.decode("Latin-1").encode('utf-8')

    # Attributes copied verbatim / after recoding; names double as keys.
    plain_fields = (
        'screen_name', 'statuses_count', 'friends_count', 'followers_count',
        'favourites_count', 'verified', 'category', 'influenceScore',
        'rank_influ', 'psy', 'psy_seq', 'psy_tweets_starttime',
        'crawler_date',
    )
    text_fields = ('name', 'location', 'interest_tags', 'description')

    db = Conn()
    collection = db['StandardUsers']
    users = mysql.getUsersInfo(table)

    # Insert one document per user.
    for count, user in enumerate(users, 1):
        data = {'user_id': long(user.id)}
        for field in plain_fields:
            data[field] = getattr(user, field)
        for field in text_fields:
            data[field] = recode(getattr(user, field))
        collection.insert(data)
        print("insert %d users" % count)

    # Build the unique index after loading, as before.
    try:
        collection.create_index("user_id", unique=True)
    except Exception as e:
        print("索引建立失败")
        # Show why it failed (e.g. duplicate user_id values) instead of
        # discarding the cause.
        print(repr(e))
コード例 #8
0
def getUsersByCategory(table,category):
    '''
    Delegate to ``mydb.getUsersByCategory``.

    :param table: table name, passed through unchanged
    :param category: category, passed through unchanged
    :return: the result of ``mydb.getUsersByCategory(table, category)``
    '''
    return mydb.getUsersByCategory(table, category)
コード例 #9
0
def getUserInfo(id,table):
    '''
    Delegate to ``mydb.getUserInfo``.

    :param id: user id, passed through unchanged
    :param table: table name, passed through unchanged
    :return: the result of ``mydb.getUserInfo(id, table)``
    '''
    return mydb.getUserInfo(id, table)
コード例 #10
0
def getUsersInfo(table):
    '''
    Delegate to ``mydb.getUsersInfo``.

    :param table: table name, passed through unchanged
    :return: the result of ``mydb.getUsersInfo(table)``
    '''
    return mydb.getUsersInfo(table)
コード例 #11
0
def _safe_ratio(numerator, denominator):
    '''Return ``numerator / denominator`` as a float, or 0.0 when the denominator is zero.'''
    if not denominator:
        return 0.0
    return numerator * 1.0 / denominator


def _evaluate_classifier(results, categories, category_dic):
    '''
    Score one classifier's results against the ground truth.

    :param results: classification results of one model, as accepted by
        ``calcCorrectN`` / ``calcCategoryN`` / ``calcCategoryCorrectN``
    :param categories: mapping whose keys are the category names
    :param category_dic: ground truth produced by ``GetCategoryById``
    :return: tuple ``(accuracy, macro precision, macro recall, F-score)``
    '''
    accuracy = _safe_ratio(calcCorrectN(results, category_dic),
                           len(category_dic.keys()))

    precisions = []
    recalls = []
    for category in categories.keys():
        # How many items the model assigned to this category ...
        predicted = calcCategoryN(results, category)
        # ... how many of those are right ...
        correct = calcCategoryCorrectN(results, category, category_dic)
        # ... and how many items truly belong to the category.
        actual = calcCategoryN(category_dic, category)
        # A category that was never predicted (or never occurs) scores 0
        # instead of raising ZeroDivisionError.
        precisions.append(_safe_ratio(correct, predicted))
        recalls.append(_safe_ratio(correct, actual))

    # Macro average over the categories actually present, not a fixed 9.
    precision = _safe_ratio(sum(precisions), len(precisions))
    recall = _safe_ratio(sum(recalls), len(recalls))
    f_score = _safe_ratio(precision * recall * 2, precision + recall)
    return accuracy, precision, recall, f_score


def Accuracy(table="StandardUsers"):
    '''
    Evaluate the three classifiers on the users of ``table`` and print a report.

    The users' ids are classified with ``GetClassifyResultsByWords``; each
    model's raw results are pickled to the current directory
    (``S_results.pickle``, ``r_results.pickle``, ``a_results.pickle``) and
    one summary line per model is printed with overall accuracy,
    macro-averaged precision, macro-averaged recall and the F-score derived
    from those two averages.

    :param table: table the users and the category list are read from
    :return: None
    '''
    StandardUsers = mysql.getUsersInfo(table)
    categories = mysql.getCategoriesAndNumber(table)

    # Ids to classify, and the ground truth to compare against.
    StandardUsers_id = [user.id for user in StandardUsers]
    category_dic = GetCategoryById(StandardUsers)

    RandomForest_results, Multinomial_results, AdaBoost_results = \
        GetClassifyResultsByWords(StandardUsers_id)

    # Persist the raw results; ``with`` closes the file even if dump fails.
    dumps = (
        ("S_results.pickle", Multinomial_results),
        ("r_results.pickle", RandomForest_results),
        ("a_results.pickle", AdaBoost_results),
    )
    for filename, results in dumps:
        with open(filename, "wb") as save_file:
            pickle.dump(results, save_file)

    # Compute every score before printing, so a failure prints nothing.
    s_scores = _evaluate_classifier(Multinomial_results, categories,
                                    category_dic)
    r_scores = _evaluate_classifier(RandomForest_results, categories,
                                    category_dic)
    a_scores = _evaluate_classifier(AdaBoost_results, categories,
                                    category_dic)

    print("单模型多项式贝叶斯 精确率:%f\t平均准确率:%f\t平均召回率:%f\t平均F-Score:%f" % s_scores)
    print("多模型融合随机森林 精确率:%f\t平均准确率:%f\t平均召回率:%f\t平均F-Score:%f" % r_scores)
    print("多模型融合AdaBoost 精确率:%f\t平均准确率:%f\t平均召回率:%f\t平均F-Score:%f" % a_scores)
コード例 #12
0
def ProcessBio(userid, table="StandardUsers"):
    '''
    Run a user's stored description through ``PreProcess`` and ``Generation``.

    :param userid: id of the user whose description is read
    :param table: table the description is read from
    :return: the result of ``Generation(PreProcess(description))``
    '''
    bio = mysql.getUserDescription(table, userid)
    cleaned = PreProcess(bio)
    return Generation(cleaned)