def CalculateTFIDF(usercandidate, user_id, table="StandardUsers"):
    '''
    Weight the candidate words of one user with a TF-IDF style score.

    Every other user id returned by ``mysql.getUsersId(table)`` is treated as
    one document: a candidate's document frequency grows by one for each of
    those users whose ``GenerateWords`` result contains the lower-cased word.
    The final score of a candidate is
    ``log(userNumber / document_frequency) * weight``, where ``userNumber``
    is the total number of ids in the table (the target user included).

    :param usercandidate: re-iterable sequence of ``(word, weight)`` pairs;
        only ``candidate[0]`` and ``candidate[1]`` are read
    :param user_id: id of the user owning the candidates; skipped while
        counting document frequencies
    :param table: table whose user ids form the corpus
    :return: list whose first ``len(usercandidate)`` entries are the scores
        in candidate order; shorter inputs are padded with ``1`` up to 50
        entries, which is the length the fixed-size version always returned
    '''
    userids = mysql.getUsersId(table)
    userNumber = len(userids)
    print("正在计算TFIDF,数据库中共计%d个用户" % (userNumber))

    # Lower-case every candidate word once instead of once per user.
    words = [candidate[0].lower() for candidate in usercandidate]

    # Frequencies start at 1 so the division below can never hit zero.
    # The list used to be exactly 50 long, which raised IndexError for more
    # than 50 candidates; it now grows with the input but never shrinks
    # below the legacy length.
    legacy_length = 50
    tfidf = [1] * max(legacy_length, len(words))

    count = 0
    for userid in userids:
        if userid == user_id:
            continue
        wordsSet = GenerateWords(userid)
        for index, word in enumerate(words):
            if word in wordsSet:
                tfidf[index] += 1
        count += 1
        print(count)

    # Replace each document frequency by the final weighted score.
    for index, candidate in enumerate(usercandidate):
        tfidf[index] = math.log(userNumber * 1.0 / tfidf[index]) * candidate[1]
    return tfidf
def GenerateAllUsersInterestTags(table="StandardUsers"):
    '''
    Compute the interest tags of every user in ``table`` and store them.

    For each user returned by ``mysql.getUsersInfo(table)`` the tags come
    from ``GenerateInterestsWithFollowers`` and are written back, UTF-8
    encoded, through ``mysql.updateUserInterest``.  A failure for one user
    is reported and counted, then processing continues with the next user.

    :param table: name of the user table to read from and update
    :return: None; progress and failures are printed
    '''
    users = mysql.getUsersInfo(table)
    count = 0
    loss = 0
    for user in users:
        try:
            interests = GenerateInterestsWithFollowers(user.id)
            print("%s:" % user.id)
            print(interests)
            # Write the tags to the database.
            mysql.updateUserInterest(table, user.id, interests.encode('utf-8'))
        except Exception as e:
            # Best-effort batch job: keep going, but report why this user
            # failed instead of dropping the exception silently.
            loss += 1
            print("lose userid:%s" % user.id)
            print("reason: %r" % (e,))
            print("loss %d users" % loss)
        count += 1
        print("finished %d users" % count)
def getUsers(table):
    '''
    Return all users stored in ``table``.

    :param table: table name
    :return: whatever ``mysql.getUsersInfo`` yields for that table (a list
        of TwitterUser objects according to the original docstring)
    '''
    return mysql.getUsersInfo(table)
def UserProfileFromDB(userid):
    '''
    Build the profile of one stored user and export it as XML.

    The user record is loaded through ``mysql``, enriched with category,
    interest tags, influence score and psychological state, and finally
    handed to ``xml.GenerateUserXml``.  Each step prints its result, and the
    elapsed wall-clock time is printed at the end.  The function returns
    early (None) when ``mysql.checkUser`` reports False or when
    ``mongo.getUserTweets`` returns an empty string.

    :param userid: id of the user to profile
    :return: None
    '''
    began = time.time()

    if mysql.checkUser(userid) == False:
        print("数据库中不存在该用户")
        return
    profile = mysql.getUserInfo(userid)

    # Fetch the user's tweet text; an empty string means nothing is stored.
    if mongo.getUserTweets(userid) == "":
        print("mongodb中没有该用户的推文")
        return
    print("已获取推文")

    # Domain / category the person belongs to.
    domain = GetUserCategory(userid)
    profile.category = domain
    print("人物所属领域:%s" % domain)

    # Interest tags.  An alternative source exists:
    # ExtractTargetUserInterest.GenerateInterestsWithFollowers(userid)
    tags = GetUserInterestTags(userid)
    profile.interest_tags = tags
    print("人物兴趣爱好标签:%s" % tags)

    # Influence score and level; per the original note the level is an
    # element of {1, 2, 3}.
    total_score, activity, influence, level = GetUserInfluence(userid)
    profile.influenceScore = total_score
    print("人物活跃度分数:%f,影响力度分数:%f,影响力分数:%f,影响力等级:%s" % (activity, influence, total_score, level))

    # Psychological state.  Per the original note the call returns the start
    # time of the most recent tweet, the state sequence for the period
    # before it, and the recent state in {1, -1, 0}; an optional ``period``
    # argument (in months) can set the length of that period.
    first_tweet_time, state_sequence, recent_state = GetUserPsychology(userid)
    profile.psy_tweets_starttime = first_tweet_time
    profile.psy = recent_state
    profile.psy_seq = state_sequence
    # Map the integer state to its display string.
    print("人物近期心理状态:%s" % config.psychological[recent_state])

    # Generate the XML document.
    xml.GenerateUserXml(profile)

    print("用时:%f" % (time.time() - began))
def InsertRelsToNeoFromMysql(table="relation_temp"):
    '''
    Insert every relation row of ``table`` via ``InsertFollowsRel``.

    The rows come from ``mysql.getUserRelation(table)``; the first two
    fields of each row are passed to ``InsertFollowsRel``.  The total row
    count is printed first, then a running counter after each insert.

    :param table: name of the MySQL table holding the relations
    :return: None
    '''
    rows = mysql.getUserRelation(table)
    print(len(rows))

    # Insert each relation one by one (target is neo4j per the function
    # name and the original comment).
    for inserted, row in enumerate(rows, 1):
        InsertFollowsRel(row[0], row[1])
        print("insert %d relations" % inserted)
def GetUserInfo(userid):
    '''
    Look up the stored record of one user.

    :param userid: id of the user
    :return: the result of ``mysql.getUserInfo(userid)`` (a TwitterUser
        object according to the original docstring), or None when
        ``mongo.CheckUser(userid)`` reports False
    '''
    known = mongo.CheckUser(userid)
    if known == False:
        print("数据库中不存在该用户")
        return None
    return mysql.getUserInfo(userid)
def InsertStandardUsers(table):
    '''
    Copy every user of MySQL table ``table`` into the ``StandardUsers``
    collection and then build a unique index on ``user_id``.

    One document per user is inserted and a running counter is printed
    after each insert.  Index creation is best-effort: a failure is
    reported but not raised.

    NOTE(review): assumes ``Conn()`` returns a pymongo database object;
    confirm against its definition.

    :param table: name of the MySQL table to read the users from
    :return: None
    '''
    db = Conn()
    collection = db['StandardUsers']
    users = mysql.getUsersInfo(table)

    def _to_utf8(text):
        # Free-text columns are re-encoded Latin-1 -> UTF-8 before storing.
        return text.decode("Latin-1").encode('utf-8')

    # Start inserting.
    count = 0
    for user in users:
        data = {
            'user_id': long(user.id),
            'screen_name': user.screen_name,
            'name': _to_utf8(user.name),
            'location': _to_utf8(user.location),
            'statuses_count': user.statuses_count,
            'friends_count': user.friends_count,
            'followers_count': user.followers_count,
            'favourites_count': user.favourites_count,
            'verified': user.verified,
            'category': user.category,
            'influenceScore': user.influenceScore,
            'rank_influ': user.rank_influ,
            'psy': user.psy,
            'psy_seq': user.psy_seq,
            'psy_tweets_starttime': user.psy_tweets_starttime,
            'interest_tags': _to_utf8(user.interest_tags),
            'description': _to_utf8(user.description),
            'crawler_date': user.crawler_date,
        }
        collection.insert(data)
        count += 1
        print("insert %d users" % count)

    # Build the index.  "ensureIndex" is the mongo shell spelling; a pymongo
    # collection has no such method, so the old call always raised and the
    # index was never created.  create_index is the pymongo equivalent.
    try:
        collection.create_index("user_id", unique=True)
    except Exception as e:
        print("索引建立失败")
        print("reason: %r" % (e,))
def getUsersByCategory(table,category):
    '''
    Return the users of ``table`` that belong to ``category``.

    Thin pass-through to ``mydb.getUsersByCategory``.

    :param table: table name
    :param category: category to filter by
    :return: whatever ``mydb.getUsersByCategory`` returns
    '''
    return mydb.getUsersByCategory(table, category)
def getUserInfo(id,table):
    '''
    Return the record of user ``id`` from ``table``.

    Thin pass-through to ``mydb.getUserInfo``.

    :param id: id of the user
    :param table: table name
    :return: whatever ``mydb.getUserInfo`` returns
    '''
    return mydb.getUserInfo(id, table)
def getUsersInfo(table):
    '''
    Return the records of all users in ``table``.

    Thin pass-through to ``mydb.getUsersInfo``.

    :param table: table name
    :return: whatever ``mydb.getUsersInfo`` returns
    '''
    return mydb.getUsersInfo(table)
def _dump_pickle(path, obj):
    '''Pickle ``obj`` into ``path``; the file is closed even if dumping fails.'''
    with open(path, "wb") as save_file:
        pickle.dump(obj, save_file)


def _safe_ratio(numerator, denominator):
    '''Return ``numerator / denominator`` as a float, or 0.0 for a zero denominator.'''
    if not denominator:
        return 0.0
    return numerator * 1.0 / denominator


def _evaluate_classifier(results, category_dic, categories):
    '''Return (accuracy, macro precision, macro recall, F-score) for one classifier.'''
    accuracy = _safe_ratio(calcCorrectN(results, category_dic),
                           len(category_dic))

    precisions = []
    recalls = []
    for category in categories.keys():
        # How many results were assigned to this category.
        predicted = calcCategoryN(results, category)
        # How many of those assignments are correct.
        correct = calcCategoryCorrectN(results, category, category_dic)
        # How many ground-truth entries carry this category.
        actual = calcCategoryN(category_dic, category)
        precisions.append(_safe_ratio(correct, predicted))
        recalls.append(_safe_ratio(correct, actual))

    # Macro averages over the categories actually present; this used to be a
    # hard-coded division by 9.
    precision = _safe_ratio(sum(precisions), len(precisions))
    recall = _safe_ratio(sum(recalls), len(recalls))
    fscore = _safe_ratio(precision * recall * 2, precision + recall)
    return accuracy, precision, recall, fscore


def Accuracy(table="StandardUsers"):
    '''
    Evaluate three classifiers on the users of ``table`` and print the scores.

    The users' ids are classified with ``GetClassifyResultsByWords``; its
    three results are pickled to ``S_results.pickle`` (multinomial),
    ``r_results.pickle`` (random forest) and ``a_results.pickle`` (AdaBoost)
    in the working directory.  Each result is then compared with the ground
    truth from ``GetCategoryById`` and one summary line per classifier is
    printed: overall accuracy, macro-averaged precision, macro-averaged
    recall and the F-score of those two averages.

    Any ratio whose denominator is zero (a category that is never predicted
    or never occurs in the ground truth, an empty category list, or
    precision + recall == 0) counts as 0.0 instead of raising
    ZeroDivisionError.

    :param table: table holding the labelled users
    :return: None
    '''
    StandardUsers = mysql.getUsersInfo(table)
    categories = mysql.getCategoriesAndNumber(table)

    # Keep the ids of all users.
    StandardUsers_id = [user.id for user in StandardUsers]

    # Ground truth.
    category_dic = GetCategoryById(StandardUsers)

    RandomForest_results, Multinomial_results, AdaBoost_results = \
        GetClassifyResultsByWords(StandardUsers_id)

    _dump_pickle("S_results.pickle", Multinomial_results)
    _dump_pickle("r_results.pickle", RandomForest_results)
    _dump_pickle("a_results.pickle", AdaBoost_results)

    S_Correct, S_Precision, S_Recall, S_FScore = _evaluate_classifier(
        Multinomial_results, category_dic, categories)
    R_Correct, R_Precision, R_Recall, R_FScore = _evaluate_classifier(
        RandomForest_results, category_dic, categories)
    A_Correct, A_Precision, A_Recall, A_FScore = _evaluate_classifier(
        AdaBoost_results, category_dic, categories)

    print("单模型多项式贝叶斯 精确率:%f\t平均准确率:%f\t平均召回率:%f\t平均F-Score:%f" % (
        S_Correct, S_Precision, S_Recall, S_FScore))
    print("多模型融合随机森林 精确率:%f\t平均准确率:%f\t平均召回率:%f\t平均F-Score:%f" % (
        R_Correct, R_Precision, R_Recall, R_FScore))
    print("多模型融合AdaBoost 精确率:%f\t平均准确率:%f\t平均召回率:%f\t平均F-Score:%f" % (
        A_Correct, A_Precision, A_Recall, A_FScore))
def ProcessBio(userid, table="StandardUsers"):
    '''
    Run the stored description (bio) of one user through the text pipeline.

    The description is read with ``mysql.getUserDescription``, passed
    through ``PreProcess`` and the outcome is fed to ``Generation``.

    :param userid: id of the user whose description is processed
    :param table: table the description is read from
    :return: whatever ``Generation`` returns for the pre-processed text
    '''
    bio = mysql.getUserDescription(table, userid)
    cleaned = PreProcess(bio)
    return Generation(cleaned)