def getGroundTruth():
    """Build Google<->Twitter ground-truth mappings from the raw mapping file.

    Validates each (googleId, twitterUrl) row against locally stored data and
    writes out:
      - a "loose" mapping  (profiles exist on both sides),
      - a "strict" mapping (profiles AND posts exist on both sides),
      - Twitter name<->id lookup tables,
      - the rows that could not be validated (mappingLoss).

    Fix over the original: the loose mapping was appended in BOTH branches of
    the posts check; the duplicated append is collapsed into one statement.
    """
    mapping = ut.readCommaLine2List(inputPath, mappingFileName)
    mappingIdLoose = list()
    mappingIdStrict = list()
    twitterNameId = dict()
    twitterIdName = dict()
    mappingLoss = list()
    for m in mapping:
        googleId, twitterUrl = m[0], m[1]
        twitterName = getTwitterUsername(twitterUrl)
        if twitterName == "":
            continue
        (google_profile_bool, google_posts_bool) = checkGoogleData(googleId)
        (twitter_profile_bool, twitter_posts_bool, twitter_profile) = checkTwitterData(twitterName)
        # A pair with a missing profile on either side cannot be used at all.
        if not (google_profile_bool and twitter_profile_bool):
            mappingLoss.append(m)
            continue
        twitterId = twitter_profile.get("id_str", 0)
        # Both profiles exist -> loose mapping; posts on both sides -> strict too.
        mappingIdLoose.append([googleId, twitterId])
        if google_posts_bool and twitter_posts_bool:
            mappingIdStrict.append([googleId, twitterId])
        twitterIdName[twitterId] = twitterName
        twitterNameId[twitterName] = twitterId
    ut.writeList2CommaLine(interPath, gtLooseFileName, mappingIdLoose)
    ut.writeList2CommaLine(interPath, gtStrictFileName, mappingIdStrict)
    ut.writeDict2Json(interPath, twitterNameIdFileName, twitterNameId)
    ut.writeDict2Json(interPath, twitterIdNameFileName, twitterIdName)
    ut.writeList2CommaLine(interPath, mappingLossFileName, mappingLoss)
def structUserData(sn, uid):
    """Normalize one user's profile and wall, persist them, and return stats.

    Returns a tuple of (merged term frequencies, language distribution,
    mean sentiment polarity, normalized topic distribution) computed over
    the user's normalized wall posts.
    """
    print(uid)
    # Normalize the raw profile and wall data read from inputPath.
    raw_profile = ut.readJson2Dict(inputPath + sn + "/profile/", uid)
    raw_posts = ut.readJson2Dict(inputPath + sn + "/wall/", uid)
    print("profile:" + interPath + sn + "/profile/" + uid)
    norm_profile = normProfile(sn, raw_profile)
    print("wall:" + interPath + sn + "/wall/" + uid)
    norm_posts = normWall(sn, raw_posts)
    ut.writeDict2Json(interPath + sn + "/profile/", uid, norm_profile)
    ut.writeDict2Json(interPath + sn + "/wall/", uid, norm_posts)
    # Wall statistics: language distribution over posts.
    langDistri = ut.getDistri([p["lang"] for p in norm_posts])
    # Mean sentiment polarity (0 for a user with no posts).
    polarities = [p["sentiment"]["polarity"] for p in norm_posts]
    if polarities:
        sentiment_score = sum(polarities) / len(polarities)
    else:
        sentiment_score = 0
    # Merge per-post topic distributions into one normalized user vector.
    merged_topics = ut.mergeDict([p["topic_distri"] for p in norm_posts])
    userTopicDistri = ut.normVector(merged_topics)
    # Aggregate per-post term frequencies.
    userTf = ut.mergeDict([p["tf"] for p in norm_posts])
    return (userTf, langDistri, sentiment_score, userTopicDistri)
def structUserData(sn, uid):
    """Normalize a user's profile/wall and derive per-user text statistics.

    Side effects: writes the normalized profile and wall JSON under interPath.
    Returns (userTf, langDistri, sentiment_score, userTopicDistri).
    """
    print(uid)
    # Load and normalize the profile, then the wall.
    profile = ut.readJson2Dict(inputPath + sn + "/profile/", uid)
    posts = ut.readJson2Dict(inputPath + sn + "/wall/", uid)
    print("profile:" + interPath + sn + "/profile/" + uid)
    newProfile = normProfile(sn, profile)
    print("wall:" + interPath + sn + "/wall/" + uid)
    newPosts = normWall(sn, posts)
    ut.writeDict2Json(interPath + sn + "/profile/", uid, newProfile)
    ut.writeDict2Json(interPath + sn + "/wall/", uid, newProfile if False else newPosts)
    # Language distribution over the normalized posts.
    langDistri = ut.getDistri([post["lang"] for post in newPosts])
    # Average sentiment polarity; guard the empty-wall case.
    sentiments = [post["sentiment"]["polarity"] for post in newPosts]
    sentiment_score = sum(sentiments) / len(sentiments) if sentiments else 0
    # Topic distribution: merge per-post vectors, then normalize.
    userTopicDistri = ut.normVector(ut.mergeDict([post["topic_distri"] for post in newPosts]))
    # Term frequencies: merge per-post tf dicts.
    userTf = ut.mergeDict([post["tf"] for post in newPosts])
    return (userTf, langDistri, sentiment_score, userTopicDistri)
def writeTextStat(usersTf, usersLangDistri, idf, sn, usersSentimentScore, usersTopicDistri):
    """Turn each user's raw term counts into a unit tf-idf vector and write
    the per-user text statistics to interPath/<sn>/text.

    Mutates usersTf in place: raw frequencies become normalized tf-idf
    weights. Fix over the original: a user with an empty tf dict produced
    norm == 0 and a ZeroDivisionError; the normalization is now guarded.
    Also drops the unused tf_top5/tfidf_top5 locals.
    """
    for user, tf in usersTf.items():
        result = dict()
        # Top raw-frequency terms, captured before tf-idf weighting.
        result["tf_top5"] = sorted(tf.items(), key=operator.itemgetter(1), reverse=True)[:5]
        norm = 0.0
        for term, fre in tf.items():
            usersTf[user][term] = fre * idf[term]
            norm += usersTf[user][term] ** 2
        # Scale to a unit vector; skip when there are no terms (norm == 0).
        norm = math.sqrt(norm)
        if norm > 0:
            for term in tf.keys():
                usersTf[user][term] = usersTf[user][term] / norm
        result["tfidf_top5"] = sorted(usersTf[user].items(), key=operator.itemgetter(1), reverse=True)[:5]
        result["tfidf"] = usersTf[user]
        result["lang_distri"] = usersLangDistri[user]
        # Dominant language, or "none" when no language data exists.
        if result["lang_distri"]:
            result["lang"] = max(usersLangDistri[user].items(), key=operator.itemgetter(1))[0]
        else:
            result["lang"] = "none"
        result["sentiment"] = usersSentimentScore[user]
        result["topic_distri"] = usersTopicDistri[user]
        ut.writeDict2Json(interPath + sn + "/text", user, result)
def writeMostPopularCount(g1, sn1, users_sn1, g2, sn2, users_sn2):
    """Record the highest in-degree (most-followed user) of each network.

    Writes {sn1: max in-degree in g1, sn2: max in-degree in g2} to
    interPath/popularCountFileName. Raises ValueError if either user list
    is empty (max of an empty sequence), matching the original behavior.

    Fix over the original: the manual append loops are replaced by
    generator expressions fed straight into max().
    """
    result = {
        sn1: max(g1.in_degree(user) for user in users_sn1),
        sn2: max(g2.in_degree(user) for user in users_sn2),
    }
    ut.writeDict2Json(interPath, popularCountFileName, result)
def writeMostPopularCount(g1, sn1, users_sn1, g2, sn2, users_sn2):
    """Write the maximum in-degree per social network to a JSON file.

    For each network, the in-degree of every listed user is taken from the
    corresponding graph and the maximum is stored under the network's name.
    Raises ValueError on an empty user list, as the original did.

    Fix over the original: append-loop-then-max replaced with max() over a
    generator (no intermediate lists).
    """
    top_sn1 = max(g1.in_degree(user) for user in users_sn1)
    top_sn2 = max(g2.in_degree(user) for user in users_sn2)
    result = {sn1: top_sn1, sn2: top_sn2}
    ut.writeDict2Json(interPath, popularCountFileName, result)
def writeStatWalls(usersTf1, usersTf2, usersLangDistri1, usersLangDistri2,
                   usersSentimentScore1, usersSentimentScore2,
                   usersTopicDistri1, usersTopicDistri2):
    """Compute IDF over both networks' users, persist it, then write the
    per-user text statistics for each network.

    NOTE(review): relies on module-level sn1/sn2 for output sub-directories.
    Fix over the original: the document count used len(usersTf1) * 2, which
    is only correct when both dicts are the same size; len(a) + len(b) is
    identical in that case and correct for unequal collections too.
    """
    # Document frequency: number of users whose wall contains each term.
    idf = dict()
    for tf in usersTf1.values():
        for term in tf:
            idf[term] = idf.get(term, 0) + 1
    for tf in usersTf2.values():
        for term in tf:
            idf[term] = idf.get(term, 0) + 1
    # Total document (user) count across both collections.
    n = len(usersTf1) + len(usersTf2)
    for term, df in idf.items():
        idf[term] = math.log(n / df)
    # Persist the IDF table and the sorted vocabulary.
    ut.writeDict2Json(interPath, "idf.json", idf)
    ut.writeList2Json(interPath, "dictionary.txt", sorted(idf.keys()))
    # Write unit tf-idf vectors and related per-user stats.
    writeTextStat(usersTf1, usersLangDistri1, idf, sn1, usersSentimentScore1, usersTopicDistri1)
    writeTextStat(usersTf2, usersLangDistri2, idf, sn2, usersSentimentScore2, usersTopicDistri2)
def getGroundTruth():
    """Derive Google<->Twitter id mappings from the raw mapping file.

    Extracts a Twitter username from each row's URL, skips rows whose URL
    does not name a user, and keeps only pairs whose Twitter profile file
    exists locally. Writes the id pairs plus name<->id lookup tables.

    Fix over the original: the bare `except: pass` (which hid every error,
    including programming bugs) is narrowed to the file/JSON failures this
    lookup can legitimately hit; dead commented-out code removed.
    """
    mapping = ut.readCommaLine2List(inputPath, mappingFileName)
    mappingId = list()
    twitterNameId = dict()
    twitterIdName = dict()
    for m in mapping:
        googleId, twitterUrl = m[0], m[1]
        twitterName = twitterUrl.split("/")[-1].strip()
        # Trailing slash -> empty last segment; fall back to the previous one.
        if twitterName == "":
            twitterName = twitterUrl.split("/")[-2]
        # Discard URLs that point at twitter.com itself rather than a user.
        if twitterName == "#%21" or "twitter.com" in twitterName or "twitter" == twitterName:
            continue
        # Keep the pair only if the Twitter profile was crawled successfully.
        try:
            location = inputPath + "twitter/profile/" + twitterName
            with open(location, "r") as fi:
                jresult = json.loads(fi.read())
        except (OSError, ValueError):
            # Missing/unreadable profile file or malformed JSON: skip row.
            continue
        twitterId = jresult.get("id_str", 0)
        if twitterId != 0:
            mappingId.append([googleId, twitterId])
            twitterNameId[twitterName] = twitterId
            twitterIdName[twitterId] = twitterName
    ut.writeList2CommaLine(interPath, "gt", mappingId)
    ut.writeDict2Json(interPath, twitterNameIdFileName, twitterNameId)
    ut.writeDict2Json(interPath, twitterIdNameFileName, twitterIdName)
def getGroundTruth():
    """Build the Google<->Twitter ground-truth id mapping and lookup tables.

    For each (googleId, twitterUrl) row: parse the username out of the URL,
    reject non-user URLs, and verify a crawled Twitter profile file exists.
    Outputs the id pairs ("gt") and both name<->id dictionaries.

    Fix over the original: replaces the bare `except: pass` with targeted
    handling of I/O and JSON-decoding errors and deletes the commented-out
    Google-profile check that was dead code.
    """
    mapping = ut.readCommaLine2List(inputPath, mappingFileName)
    mappingId = list()
    twitterNameId = dict()
    twitterIdName = dict()
    for m in mapping:
        twitterUrl = m[1]
        googleId = m[0]
        twitterName = twitterUrl.split("/")[-1].strip()
        if twitterName == "":
            # URL ended with "/"; use the segment before it.
            twitterName = twitterUrl.split("/")[-2]
        if twitterName == "#%21" or "twitter.com" in twitterName or "twitter" == twitterName:
            # Not a username (hash-bang fragment or the bare domain).
            continue
        location = inputPath + "twitter/profile/" + twitterName
        try:
            with open(location, "r") as fi:
                jresult = json.loads(fi.read())
        except (OSError, ValueError):
            # Profile not crawled or file corrupt: best-effort skip.
            continue
        twitterId = jresult.get("id_str", 0)
        if twitterId != 0:
            mappingId.append([googleId, twitterId])
            twitterNameId[twitterName] = twitterId
            twitterIdName[twitterId] = twitterName
    ut.writeList2CommaLine(interPath, "gt", mappingId)
    ut.writeDict2Json(interPath, twitterNameIdFileName, twitterNameId)
    ut.writeDict2Json(interPath, twitterIdNameFileName, twitterIdName)
def writeStatWalls(usersTf1, usersTf2, usersLangDistri1, usersLangDistri2,
                   usersSentimentScore1, usersSentimentScore2,
                   usersTopicDistri1, usersTopicDistri2):
    """Build the shared IDF table from both user collections and emit the
    per-user text statistics for each social network.

    NOTE(review): sn1/sn2 are read from module scope, not parameters.
    Fix over the original: total document count was len(usersTf1) * 2
    (assumes equal-sized collections); len(usersTf1) + len(usersTf2) is
    equivalent for equal sizes and correct otherwise.
    """
    # Count, per term, how many users' walls contain it (document frequency).
    idf = dict()
    for tf in usersTf1.values():
        for term in tf:
            idf[term] = idf.get(term, 0) + 1
    for tf in usersTf2.values():
        for term in tf:
            idf[term] = idf.get(term, 0) + 1
    n = len(usersTf1) + len(usersTf2)  # total number of "documents" (users)
    for term, df in idf.items():
        idf[term] = math.log(n / df)
    # Write the IDF table and vocabulary for later reuse.
    ut.writeDict2Json(interPath, "idf.json", idf)
    ut.writeList2Json(interPath, "dictionary.txt", sorted(idf.keys()))
    # Emit unit tf-idf vectors plus language/sentiment/topic stats per user.
    writeTextStat(usersTf1, usersLangDistri1, idf, sn1, usersSentimentScore1, usersTopicDistri1)
    writeTextStat(usersTf2, usersLangDistri2, idf, sn2, usersSentimentScore2, usersTopicDistri2)
def writeTextStat(usersTf, usersLangDistri, idf, sn, usersSentimentScore, usersTopicDistri):
    """Normalize each user's tf vector to unit-length tf-idf and write the
    per-user text statistics JSON under interPath/<sn>/text.

    Mutates usersTf in place (raw counts -> normalized tf-idf weights).
    Fixes over the original: guards the unit-vector division against an
    empty tf dict (norm == 0 raised ZeroDivisionError) and removes the
    unused tf_top5/tfidf_top5 locals.
    """
    for user, tf in usersTf.items():
        result = dict()
        # Top-5 terms by raw frequency (recorded before re-weighting).
        result["tf_top5"] = sorted(tf.items(), key=operator.itemgetter(1), reverse=True)[:5]
        norm = 0.0
        for term, fre in tf.items():
            usersTf[user][term] = fre * idf[term]
            norm += usersTf[user][term] ** 2
        norm = math.sqrt(norm)
        # Unit vector; a user with no terms keeps an empty tf-idf dict.
        if norm > 0:
            for term in tf.keys():
                usersTf[user][term] = usersTf[user][term] / norm
        result["tfidf_top5"] = sorted(usersTf[user].items(), key=operator.itemgetter(1), reverse=True)[:5]
        result["tfidf"] = usersTf[user]
        result["lang_distri"] = usersLangDistri[user]
        if result["lang_distri"]:
            # Most frequent language wins.
            result["lang"] = max(usersLangDistri[user].items(), key=operator.itemgetter(1))[0]
        else:
            result["lang"] = "none"
        result["sentiment"] = usersSentimentScore[user]
        result["topic_distri"] = usersTopicDistri[user]
        ut.writeDict2Json(interPath + sn + "/text", user, result)