def statNameScore():
    """Write name and display-name scores for every strict ground-truth pair.

    Each row read from ``gtStrictFileName`` supplies a google id (column 0)
    and a twitter id (column 1). Both stored profiles are loaded, scored with
    ``ft.calNameScore`` and ``ft.calDisplayNameScore``, and one row
    ``[googleId, twitterId, nameScore, displaynameScore, totalScore]`` per
    pair is written to the ``name_score`` file under ``interPath``.
    """
    # The loose ground truth and the name -> id map are not consulted below;
    # the reads are kept so the function touches the same files as before.
    _looseRows = ut.readCommaLine2List(interPath, gtLooseFileName)
    strictRows = ut.readCommaLine2List(interPath, gtStrictFileName)
    idToName = ut.readJson2Dict(interPath, twitterIdNameFileName)
    _nameToId = ut.readJson2Dict(interPath, twitterNameIdFileName)

    scoreRows = []
    for row in strictRows:
        googleId, twitterId = row[0], row[1]
        # twitter profiles are looked up by screen name, not numeric id
        screenName = idToName[twitterId]
        print(googleId)
        print(screenName)

        googleProfile = ut.readJson2Dict(interPath + "google/profile/", googleId)
        twitterProfile = ut.readJson2Dict(interPath + "twitter/profile/", screenName)

        byName = ft.calNameScore(googleProfile, twitterProfile)
        byDisplayName = ft.calDisplayNameScore(googleProfile, twitterProfile)
        combined = byName + byDisplayName

        scoreRows.append(
            [googleId, twitterId, str(byName), str(byDisplayName), str(combined)]
        )

    ut.writeList2CommaLine(interPath, "name_score", scoreRows)
def structData():
    """Structure per-user data for every strict ground-truth pair.

    For each row of the file named by ``gtStrictFileName`` the two user ids
    are taken from columns 0 and 1. When a network is ``"twitter"`` its id is
    translated to a screen name through the id -> name map; pairs whose id is
    missing from that map are skipped. ``structUserData`` is called for both
    sides and its four return values are collected per user, then everything
    is handed to ``writeStatWalls``. The elapsed time in seconds is printed.
    """
    started = time.time()

    usersTf1, usersTf2 = {}, {}
    usersLangDistri1, usersLangDistri2 = {}, {}
    usersSentimentScore1, usersSentimentScore2 = {}, {}
    usersTopicDistri1, usersTopicDistri2 = {}, {}

    twitterIdName = ut.readJson2Dict(interPath, twitterIdNameFileName)
    gts = ut.readCommaLine2List(interPath, gtStrictFileName)

    # Create every output directory on its own. The previous guard only looked
    # at interPath + sn1, so a missing sn2 tree (or a missing sub-directory)
    # was never created, and a partially existing tree could make makedirs
    # raise.
    for network in (sn1, sn2):
        for section in ("profile", "wall", "text"):
            os.makedirs(interPath + network + "/" + section, exist_ok=True)

    # norm profile and wall
    for gt in gts:
        uid1 = gt[0]
        uid2 = gt[1]
        try:
            if sn1 == "twitter":
                uid1 = twitterIdName[uid1]
            if sn2 == "twitter":
                uid2 = twitterIdName[uid2]
        except KeyError:
            # no screen name recorded for this twitter id: skip the pair
            continue

        # norm profile and posts: google and twitter
        (userTf1, langDistri1,
         userSentimentScore1, userTopicDistri1) = structUserData(sn1, uid1)
        (userTf2, langDistri2,
         userSentimentScore2, userTopicDistri2) = structUserData(sn2, uid2)

        usersTf1[uid1] = userTf1
        usersTf2[uid2] = userTf2
        usersLangDistri1[uid1] = langDistri1
        usersLangDistri2[uid2] = langDistri2
        usersSentimentScore1[uid1] = userSentimentScore1
        usersSentimentScore2[uid2] = userSentimentScore2
        usersTopicDistri1[uid1] = userTopicDistri1
        usersTopicDistri2[uid2] = userTopicDistri2

    # build dictionary and idf
    writeStatWalls(usersTf1, usersTf2,
                   usersLangDistri1, usersLangDistri2,
                   usersSentimentScore1, usersSentimentScore2,
                   usersTopicDistri1, usersTopicDistri2)

    print(time.time() - started)
def structData():
    """Structure per-user data for every strict ground-truth pair.

    Rows of the file named by ``gtStrictFileName`` give the sn1 user id in
    column 0 and the sn2 user id in column 1. Ids belonging to ``"twitter"``
    are replaced by the screen name found in the id -> name map; a pair whose
    id is not in that map is skipped. The four values returned by
    ``structUserData`` for each side are stored per user and finally passed
    to ``writeStatWalls``. The elapsed time in seconds is printed at the end.
    """
    # init
    started = time.time()
    usersTf1 = {}
    usersTf2 = {}
    usersLangDistri1 = {}
    usersLangDistri2 = {}
    usersSentimentScore1 = {}
    usersSentimentScore2 = {}
    usersTopicDistri1 = {}
    usersTopicDistri2 = {}

    twitterIdName = ut.readJson2Dict(interPath, twitterIdNameFileName)
    gts = ut.readCommaLine2List(interPath, gtStrictFileName)

    # exist_ok makes each directory independent of the others: testing only
    # interPath + sn1 (as before) left sn2 or single sub-directories missing
    # whenever the sn1 directory was already there.
    outputDirs = [
        interPath + network + section
        for network in (sn1, sn2)
        for section in ("/profile", "/wall", "/text")
    ]
    for outputDir in outputDirs:
        os.makedirs(outputDir, exist_ok=True)

    # norm profile and wall
    for gt in gts:
        uid1, uid2 = gt[0], gt[1]
        try:
            if sn1 == "twitter":
                uid1 = twitterIdName[uid1]
            if sn2 == "twitter":
                uid2 = twitterIdName[uid2]
        except KeyError:
            # the twitter id has no known screen name, nothing to structure
            continue

        # norm profile and posts: google and twitter
        structured1 = structUserData(sn1, uid1)
        structured2 = structUserData(sn2, uid2)
        userTf1, langDistri1, userSentimentScore1, userTopicDistri1 = structured1
        userTf2, langDistri2, userSentimentScore2, userTopicDistri2 = structured2

        usersTf1[uid1] = userTf1
        usersTf2[uid2] = userTf2
        usersLangDistri1[uid1] = langDistri1
        usersLangDistri2[uid2] = langDistri2
        usersSentimentScore1[uid1] = userSentimentScore1
        usersSentimentScore2[uid2] = userSentimentScore2
        usersTopicDistri1[uid1] = userTopicDistri1
        usersTopicDistri2[uid2] = userTopicDistri2

    # build dictionary and idf
    writeStatWalls(
        usersTf1, usersTf2,
        usersLangDistri1, usersLangDistri2,
        usersSentimentScore1, usersSentimentScore2,
        usersTopicDistri1, usersTopicDistri2,
    )
    print(time.time() - started)
def getGroundTruth():
    """Derive loose and strict ground-truth id pairs from the raw mapping file.

    Each mapping row holds a google id (column 0) and a twitter URL
    (column 1). Rows whose URL yields no username are ignored. For the rest,
    ``checkGoogleData`` and ``checkTwitterData`` report whether a profile and
    posts are available:

    * a missing profile on either side, or a twitter profile without an
      ``id_str``, sends the row to the mapping-loss list;
    * both profiles present puts ``[googleId, twitterId]`` in the loose list;
    * both profiles and both post sets present also puts it in the strict
      list.

    The two pair lists, the twitter name <-> id maps and the loss list are
    written out through ``ut``.
    """
    mapping = ut.readCommaLine2List(inputPath, mappingFileName)
    mappingIdLoose = []
    mappingIdStrict = []
    twitterNameId = {}
    twitterIdName = {}
    mappingLoss = []

    for m in mapping:
        twitterUrl = m[1]
        twitterName = getTwitterUsername(twitterUrl)
        googleId = m[0]
        if twitterName == "":
            continue

        # NOTE(review): both helpers are assumed to return real booleans for
        # the availability flags, as their names suggest - confirm.
        google_profile_ok, google_posts_ok = checkGoogleData(googleId)
        twitter_profile_ok, twitter_posts_ok, twitter_profile = checkTwitterData(twitterName)

        if not (google_profile_ok and twitter_profile_ok):
            mappingLoss.append(m)
            continue

        twitterId = twitter_profile.get("id_str")
        if not twitterId:
            # Without an id there is nothing usable to pair: recording a
            # placeholder id would make all such users collide on one key
            # in twitterIdName.
            mappingLoss.append(m)
            continue

        pair = [googleId, twitterId]
        mappingIdLoose.append(pair)
        if google_posts_ok and twitter_posts_ok:
            # separate list object so the two result lists never share rows
            mappingIdStrict.append(list(pair))

        twitterIdName[twitterId] = twitterName
        twitterNameId[twitterName] = twitterId

    ut.writeList2CommaLine(interPath, gtLooseFileName, mappingIdLoose)
    ut.writeList2CommaLine(interPath, gtStrictFileName, mappingIdStrict)
    ut.writeDict2Json(interPath, twitterNameIdFileName, twitterNameId)
    ut.writeDict2Json(interPath, twitterIdNameFileName, twitterIdName)
    ut.writeList2CommaLine(interPath, mappingLossFileName, mappingLoss)
def nm(threshold=0.67):
    """Threshold the name-matching features into "1"/"0" predictions.

    Reads the feature rows from ``featureNmFilename`` under ``outputPath``.
    A row is labelled ``"1"`` when column 1 or column 2, parsed as a float,
    is strictly greater than ``threshold``; otherwise ``"0"``. The labels are
    written, in row order, to ``predictionNmFilename`` under ``predPath``.
    """
    featureRows = ut.readCommaLine2List(outputPath, featureNmFilename)
    labels = [
        "1" if float(row[1]) > threshold or float(row[2]) > threshold else "0"
        for row in featureRows
    ]
    ut.writeList2Line(predPath, predictionNmFilename, labels)
def statNameScore():
    """Compute name-based scores for the strict ground truth and store them.

    For every row of the file named by ``gtStrictFileName`` the google
    profile (by google id) and the twitter profile (by screen name) are
    loaded and passed to ``ft.calNameScore`` and ``ft.calDisplayNameScore``.
    The ids, both scores and their sum are written as one row per pair to
    ``name_score`` under ``interPath``.
    """
    # Results of the first and last read are not used; the calls themselves
    # are kept so the function reads the same four files as before.
    ut.readCommaLine2List(interPath, gtLooseFileName)
    pairs = ut.readCommaLine2List(interPath, gtStrictFileName)
    screenNames = ut.readJson2Dict(interPath, twitterIdNameFileName)
    ut.readJson2Dict(interPath, twitterNameIdFileName)

    def scorePair(pair):
        """Return the output row for one (google id, twitter id) pair."""
        googleId = pair[0]
        twitterId = pair[1]
        twitterName = screenNames[twitterId]
        print(googleId)
        print(twitterName)
        googleProfile = ut.readJson2Dict(interPath + "google/profile/", googleId)
        twitterProfile = ut.readJson2Dict(interPath + "twitter/profile/", twitterName)
        nameScore = ft.calNameScore(googleProfile, twitterProfile)
        displaynameScore = ft.calDisplayNameScore(googleProfile, twitterProfile)
        totalScore = nameScore + displaynameScore
        return [googleId, twitterId] + [
            str(score) for score in (nameScore, displaynameScore, totalScore)
        ]

    results = [scorePair(pair) for pair in pairs]
    ut.writeList2CommaLine(interPath, "name_score", results)
def writeMappingCandidates():
    """Write the google ids and twitter names that have a usable mapping.

    Reads the ``twitterMapping`` rows under ``inputPath`` (google id in
    column 0, twitter URL in column 1). Rows for which
    ``getTwitterUsername`` returns an empty string are dropped. The remaining
    google ids go to ``google/ids_mapping`` and the matching twitter names,
    in the same order, to ``twitter/names_mapping``.
    """
    rows = ut.readCommaLine2List(inputPath, "twitterMapping")

    candidates = []
    for row in rows:
        google_id, twitter_url = row[0], row[1]
        twitter_name = getTwitterUsername(twitter_url)
        if twitter_name == "":
            continue
        candidates.append((google_id, twitter_name))

    google_ids = [google_id for google_id, _ in candidates]
    twitter_names = [twitter_name for _, twitter_name in candidates]

    ut.writeList2Line(inputPath, "google/ids_mapping", google_ids)
    ut.writeList2Line(inputPath, "twitter/names_mapping", twitter_names)
def _nodeAttrs(graph):
    """Return the node -> attribute-dict mapping of *graph*.

    Older networkx exposes it as ``graph.node``; releases that dropped that
    attribute offer the subscriptable ``graph.nodes`` view instead.
    """
    attrs = getattr(graph, "node", None)
    return graph.nodes if attrs is None else attrs


def _linkNeighborMappings(graph, nodeId, mappedKey, resultKey):
    """Store on *nodeId* a dict {neighbor: neighbor's ``mappedKey`` value}.

    Only neighbors that carry a non-zero ``mappedKey`` attribute are listed.
    An already present ``resultKey`` dict is extended. When the node has no
    neighbors at all the attribute is left untouched.
    """
    neighbors = list(graph.neighbors(nodeId))
    if not neighbors:
        return
    nodes = _nodeAttrs(graph)
    linked = nodes[nodeId].get(resultKey, dict())
    for n in neighbors:
        counterpart = nodes[n].get(mappedKey, 0)
        if counterpart != 0:
            linked[n] = counterpart
    nodes[nodeId][resultKey] = linked


def buildGraphs(users1, users2):
    """Build the google, twitter and combined directed graphs.

    Args:
        users1: google users, forwarded to ``buildDiGraph``.
        users2: twitter users, forwarded to ``buildDiGraph``.

    Returns:
        ``(googleDiGraph, twitterDiGraph, allDiGraph)``. The combined graph
        holds the edges of both networks plus an edge in each direction
        between the two accounts of every strict ground-truth pair. Mapped
        google nodes carry a ``"twitter"`` attribute (twitter nodes a
        ``"google"`` one) and, when they have neighbors, a
        ``"neighbor_twitter"`` / ``"neighbor_google"`` dict of the mapped
        neighbors.
    """
    # google graph
    print("google graph")
    googleDiGraph = buildDiGraph(interPath + "google/relationship_file", users1)
    # twitter graph
    print("twitter graph")
    twitterDiGraph = buildDiGraph(
        interPath + "twitter/relationship_file_revise", users2)
    # full graph
    print("full graph")
    allDiGraph = nx.DiGraph()
    allDiGraph.add_edges_from(googleDiGraph.edges())
    allDiGraph.add_edges_from(twitterDiGraph.edges())

    # add mapping info
    gts = ut.readCommaLine2List(interPath, gtStrictFileName)
    for gt in gts:
        googleId = gt[0]
        twitterId = gt[1]
        # Keyword attributes are accepted by every networkx release; the
        # positional attribute dict used before is rejected by newer ones.
        googleDiGraph.add_node(googleId, twitter=twitterId)
        twitterDiGraph.add_node(twitterId, google=googleId)
        allDiGraph.add_edge(googleId, twitterId)
        allDiGraph.add_edge(twitterId, googleId)

    # build neighbor mapping list
    for gt in gts:
        _linkNeighborMappings(googleDiGraph, gt[0], "twitter", "neighbor_twitter")
        _linkNeighborMappings(twitterDiGraph, gt[1], "google", "neighbor_google")

    return googleDiGraph, twitterDiGraph, allDiGraph
def getGroundTruth():
    """Build the google id / twitter id ground truth from stored profiles.

    Each mapping row holds a google id (column 0) and a twitter URL
    (column 1). The twitter name is the last path segment of the URL, or the
    one before it when the URL ends with a slash. Names that are empty,
    ``"#%21"``, ``"twitter"`` or contain ``"twitter.com"`` are skipped. The
    profile file ``twitter/profile/<name>`` under ``inputPath`` is then read
    as JSON; if it has a non-zero ``id_str`` the pair is recorded. The pairs
    are written to ``gt`` and the name <-> id maps to their JSON files.
    """
    mapping = ut.readCommaLine2List(inputPath, mappingFileName)
    mappingId = []
    twitterNameId = {}
    twitterIdName = {}

    for m in mapping:
        twitterUrl = m[1]
        googleId = m[0]

        segments = twitterUrl.split("/")
        twitterName = segments[-1].strip()
        if twitterName == "" and len(segments) > 1:
            # URL ended with "/": the name is the segment before it
            twitterName = segments[-2]
        if (twitterName in ("", "#%21", "twitter")
                or "twitter.com" in twitterName):
            continue

        # check if the twitter name exist
        location = inputPath + "twitter/profile/" + twitterName
        try:
            with open(location, "r") as fi:
                jresult = json.loads(fi.read())
        except (OSError, ValueError):
            # profile file missing/unreadable or not valid JSON: no ground
            # truth for this user (json decode errors are ValueErrors)
            continue
        if not isinstance(jresult, dict):
            continue

        twitterId = jresult.get("id_str", 0)
        if twitterId != 0:
            mappingId.append([googleId, twitterId])
            twitterNameId[twitterName] = twitterId
            twitterIdName[twitterId] = twitterName

    ut.writeList2CommaLine(interPath, "gt", mappingId)
    ut.writeDict2Json(interPath, twitterNameIdFileName, twitterNameId)
    ut.writeDict2Json(interPath, twitterIdNameFileName, twitterIdName)
def getGroundTruth():
    """Build the google id / twitter id ground truth from stored profiles.

    For every mapping row (google id in column 0, twitter URL in column 1)
    the twitter name is taken from the end of the URL, the JSON profile
    stored at ``twitter/profile/<name>`` under ``inputPath`` is loaded, and
    the pair ``[googleId, id_str]`` is kept when the profile has a non-zero
    ``id_str``. Writes the pairs to ``gt`` and both twitter name <-> id maps.
    """
    mapping = ut.readCommaLine2List(inputPath, mappingFileName)
    mappingId = list()
    twitterNameId = dict()
    twitterIdName = dict()

    for m in mapping:
        twitterUrl = m[1]
        googleId = m[0]

        # Name is the last URL segment; with a trailing slash that segment is
        # empty and the previous one is used. A URL without any "/" has no
        # previous segment, which used to raise IndexError.
        head, _, tail = twitterUrl.rpartition("/")
        twitterName = tail.strip()
        if twitterName == "" and head != "":
            twitterName = head.split("/")[-1]
        if twitterName == "":
            continue
        if twitterName == "#%21" or "twitter.com" in twitterName or twitterName == "twitter":
            continue

        # check if the twitter name exist
        jresult = None
        try:
            with open(inputPath + "twitter/profile/" + twitterName, "r") as fi:
                jresult = json.loads(fi.read())
        except OSError:
            # profile file is missing or cannot be read
            pass
        except ValueError:
            # profile file is not valid JSON
            pass
        if not isinstance(jresult, dict):
            continue

        twitterId = jresult.get("id_str", 0)
        if twitterId == 0:
            continue
        mappingId.append([googleId, twitterId])
        twitterNameId[twitterName] = twitterId
        twitterIdName[twitterId] = twitterName

    ut.writeList2CommaLine(interPath, "gt", mappingId)
    ut.writeDict2Json(interPath, twitterNameIdFileName, twitterNameId)
    ut.writeDict2Json(interPath, twitterIdNameFileName, twitterIdName)
def createSNMapping():
    """Split the social-network list into one mapping file per network.

    Reads the rows of ``snFile`` from ``../data/``. Column 0 is the user id;
    a non-empty column 1, 2 or 3 adds ``[id, value]`` to the youtube,
    facebook or twitter mapping respectively. The number of input rows and
    the number of twitter mappings are printed, and the three mappings are
    written back to the same directory.
    """
    path = "../data/"
    snLists = ut.readCommaLine2List(path, snFile)
    print(len(snLists))

    youtubeMapping = []
    fbMapping = []
    twitterMapping = []
    for snList in snLists:
        uid = snList[0]
        if snList[1] != "":
            youtubeMapping.append([uid, snList[1]])
        if snList[2] != "":
            fbMapping.append([uid, snList[2]])
        if snList[3] != "":
            twitterMapping.append([uid, snList[3]])
    print(len(twitterMapping))

    # Write with the same directory string that was used for reading. The
    # literal "../data" used before lacked the trailing separator that every
    # other path handed to ut carries.
    # NOTE(review): presumably ut joins directory and file name by plain
    # concatenation - confirm against ut.
    ut.writeList2CommaLine(path, "youtubeMapping", youtubeMapping)
    ut.writeList2CommaLine(path, "fbMapping", fbMapping)
    ut.writeList2CommaLine(path, "twitterMapping", twitterMapping)
def createSNMapping():
    """Split the social-network list into youtube, facebook and twitter mappings.

    Every row of ``snFile`` (read from ``../data/``) starts with a user id,
    followed by the youtube, facebook and twitter entries in columns 1-3.
    Each non-empty entry yields a ``[id, entry]`` row in the matching
    mapping. Prints the number of rows read and of twitter mappings found,
    then writes ``youtubeMapping``, ``fbMapping`` and ``twitterMapping``.
    """
    path = "../data/"
    snLists = ut.readCommaLine2List(path, snFile)
    print(len(snLists))

    # output file name -> column of the input row that feeds it
    columns = (("youtubeMapping", 1), ("fbMapping", 2), ("twitterMapping", 3))
    mappings = {name: [] for name, _ in columns}
    for snList in snLists:
        for name, column in columns:
            if snList[column] != "":
                mappings[name].append([snList[0], snList[column]])
    print(len(mappings["twitterMapping"]))

    # Same directory string for reading and writing: the former "../data"
    # had no trailing separator, unlike every other path passed to ut.
    # NOTE(review): assumes ut builds the file path from directory + name
    # without adding a separator - confirm against ut.
    for name, _ in columns:
        ut.writeList2CommaLine(path, name, mappings[name])
def _nodeAttrs(graph):
    """Return the node -> attribute-dict mapping of *graph*.

    Older networkx exposes it as ``graph.node``; releases that dropped that
    attribute offer the subscriptable ``graph.nodes`` view instead.
    """
    attrs = getattr(graph, "node", None)
    return graph.nodes if attrs is None else attrs


def _linkNeighborMappings(graph, nodeId, mappedKey, resultKey):
    """Store on *nodeId* a dict {neighbor: neighbor's ``mappedKey`` value}.

    Only neighbors that carry a non-zero ``mappedKey`` attribute are listed.
    An already present ``resultKey`` dict is extended. When the node has no
    neighbors at all the attribute is left untouched.
    """
    neighbors = list(graph.neighbors(nodeId))
    if not neighbors:
        return
    nodes = _nodeAttrs(graph)
    linked = nodes[nodeId].get(resultKey, dict())
    for n in neighbors:
        counterpart = nodes[n].get(mappedKey, 0)
        if counterpart != 0:
            linked[n] = counterpart
    nodes[nodeId][resultKey] = linked


def buildGraphs(users1, users2):
    """Build the google, twitter and combined directed graphs.

    Args:
        users1: google users, forwarded to ``buildDiGraph``.
        users2: twitter users, forwarded to ``buildDiGraph``.

    Returns:
        The tuple ``(googleDiGraph, twitterDiGraph, allDiGraph)``.
        ``allDiGraph`` contains the edges of both network graphs and, for
        every strict ground-truth pair, an edge in both directions between
        the google id and the twitter id. Each mapped node stores the id of
        its counterpart (``"twitter"`` / ``"google"``) and, if it has
        neighbors, the mapped neighbors under ``"neighbor_twitter"`` /
        ``"neighbor_google"``.
    """
    # google graph
    print("google graph")
    googleDiGraph = buildDiGraph(interPath + "google/relationship_file", users1)

    # twitter graph
    print("twitter graph")
    twitterDiGraph = buildDiGraph(interPath + "twitter/relationship_file_revise", users2)

    # full graph
    print("full graph")
    allDiGraph = nx.DiGraph()
    for networkGraph in (googleDiGraph, twitterDiGraph):
        allDiGraph.add_edges_from(networkGraph.edges())

    # add mapping info
    gts = ut.readCommaLine2List(interPath, gtStrictFileName)
    for gt in gts:
        googleId, twitterId = gt[0], gt[1]
        # Attributes are passed as keywords: that form is valid on all
        # networkx releases, the positional dict only on old ones.
        googleDiGraph.add_node(googleId, twitter=twitterId)
        twitterDiGraph.add_node(twitterId, google=googleId)
        allDiGraph.add_edge(googleId, twitterId)
        allDiGraph.add_edge(twitterId, googleId)

    # build neighbor mapping list
    for gt in gts:
        googleId, twitterId = gt[0], gt[1]
        _linkNeighborMappings(googleDiGraph, googleId, "twitter", "neighbor_twitter")
        _linkNeighborMappings(twitterDiGraph, twitterId, "google", "neighbor_google")

    return googleDiGraph, twitterDiGraph, allDiGraph
def getUsersFeatures(procNum=10):
    """Compute pairwise features for all ground-truth users and write them out.

    The strict ground truth gives one google user (column 0) and one twitter
    user (column 1) per row. After building the graphs and writing the
    popularity counts, every index pair ``(a, b)`` with ``b >= a`` is scored
    by ``getScoresWorker`` threads that report through a queue. The feature
    file then gets one line for every (google user i, twitter user j)
    combination, reusing the score stored for ``(min(i, j), max(i, j))`` and
    using rank 1 for ``i == j`` and 0 otherwise.

    Args:
        procNum: number of worker threads the pairs are split across.
    """
    # init user pair by mapping
    gts = ut.readCommaLine2List(interPath, gtStrictFileName)
    sn1, sn2 = "google", "twitter"
    users_sn1 = [gt[0] for gt in gts]
    users_sn2 = [gt[1] for gt in gts]
    userCount = len(gts)
    scoresMatrix = {}

    # build graph
    print("build graph")
    s = time.time()
    g1, g2, g0 = buildGraphs(users_sn1, users_sn2)
    print("build graph over cost: " + str(time.time() - s))

    # for profile using
    print("popular count")
    s = time.time()
    writeMostPopularCount(g1, sn1, users_sn1, g2, sn2, users_sn2)
    print("popular count over cost: " + str(time.time() - s))

    # calculate features
    print("calculate features start")
    s = time.time()
    pairs = [(a, b) for a in range(userCount) for b in range(a, userCount)]
    twitterIdName = ut.readJson2Dict(interPath, twitterIdNameFileName)
    # not used below; read kept so the same files are touched as before
    twitterNameId = ut.readJson2Dict(interPath, twitterNameIdFileName)
    (profileGoogle, profileTwitter,
     wallGoogle, wallTwitter,
     textGoogle, textTwitter) = readData(users_sn1, users_sn2, twitterIdName)

    # parallel: each worker gets one contiguous slice of the pair list
    batchNum = round(len(pairs) / procNum)
    q = mp.Queue()
    sharedArgs = (sn1, users_sn1, sn2, users_sn2, g1, g2, g0, q,
                  profileGoogle, profileTwitter,
                  wallGoogle, wallTwitter,
                  textGoogle, textTwitter)
    procs = []
    for i in range(procNum):
        start = i * batchNum
        # the last worker takes whatever is left after rounding
        stop = None if i == procNum - 1 else start + batchNum
        worker = td.Thread(target=getScoresWorker,
                           args=(pairs[start:stop],) + sharedArgs)
        worker.start()
        procs.append(worker)

    # one queue item is expected per pair
    print("update start")
    for received in range(len(pairs)):
        print(received)
        item = q.get()
        scoresMatrix[item["key"]] = item["value"]
    print("update over")
    print(len(scoresMatrix))

    for worker in procs:
        worker.join()

    # output feature
    with open(outputPath + featureFileName, "w") as fo:
        for i in range(userCount):
            for j in range(userCount):
                rank = 1 if i == j else 0
                # only the (smaller, larger) index order was scored
                scores = scoresMatrix[(j, i)] if i > j else scoresMatrix[(i, j)]
                fo.write(getFeatureStr(rank, users_sn1[i], users_sn2[j], scores))

    print("write feature costs:" + str(time.time() - s))
def getUsersFeatures(procNum=10):
    """Score every google/twitter user combination and write the feature file.

    Users come from the strict ground-truth file (google id in column 0,
    twitter id in column 1). Graphs are built, popularity counts written,
    and all index pairs ``(a, b)`` with ``b >= a`` are distributed over
    ``procNum`` threads running ``getScoresWorker``; their results arrive on
    a queue as dicts with ``"key"`` and ``"value"``. One feature line is
    written per (i, j) combination, with rank 1 on the diagonal and 0
    elsewhere.

    Args:
        procNum: number of worker threads.
    """
    def elapsedSince(startTime):
        """Return the seconds passed since *startTime* as a string."""
        return str(time.time() - startTime)

    # init user pair by mapping
    gts = ut.readCommaLine2List(interPath, gtStrictFileName)
    sn1 = "google"
    sn2 = "twitter"
    users_sn1 = []
    users_sn2 = []
    for row in gts:
        users_sn1.append(row[0])
        users_sn2.append(row[1])
    scoresMatrix = dict()

    # build graph
    print("build graph")
    s = time.time()
    g1, g2, g0 = buildGraphs(users_sn1, users_sn2)
    print("build graph over cost: " + elapsedSince(s))

    # for profile using
    print("popular count")
    s = time.time()
    writeMostPopularCount(g1, sn1, users_sn1, g2, sn2, users_sn2)
    print("popular count over cost: " + elapsedSince(s))

    print("calculate features start")
    # calculate features
    s = time.time()
    indices = range(len(gts))
    pairs = []
    for a in indices:
        for b in indices:
            if b >= a:
                pairs.append((a, b))

    twitterIdName = ut.readJson2Dict(interPath, twitterIdNameFileName)
    # result unused; the read is kept so file access is unchanged
    twitterNameId = ut.readJson2Dict(interPath, twitterNameIdFileName)
    userData = readData(users_sn1, users_sn2, twitterIdName)
    profileGoogle, profileTwitter, wallGoogle, wallTwitter, textGoogle, textTwitter = userData

    # parallel
    batchNum = round(len(pairs) / procNum)
    q = mp.Queue()
    procs = list()
    lastWorker = procNum - 1
    for i in range(procNum):
        if i == lastWorker:
            # remainder of the list, whatever its length
            batchPairs = pairs[i * batchNum:]
        else:
            batchPairs = pairs[i * batchNum:(i + 1) * batchNum]
        p = td.Thread(
            target=getScoresWorker,
            args=(batchPairs, sn1, users_sn1, sn2, users_sn2, g1, g2, g0, q,
                  profileGoogle, profileTwitter, wallGoogle, wallTwitter,
                  textGoogle, textTwitter),
        )
        p.start()
        procs.append(p)

    print("update start")
    collected = 0
    while collected < len(pairs):
        print(collected)
        result = q.get()
        scoresMatrix[result["key"]] = result["value"]
        collected += 1
    print("update over")
    print(len(scoresMatrix))

    for proc in procs:
        proc.join()

    # output feature
    with open(outputPath + featureFileName, "w") as fo:
        for i, googleUser in enumerate(users_sn1):
            for j, twitterUser in enumerate(users_sn2):
                rank = int(i == j)
                # scores exist only for keys with first index <= second index
                scores = scoresMatrix[(min(i, j), max(i, j))]
                outputStr = getFeatureStr(rank, googleUser, twitterUser, scores)
                fo.write(outputStr)

    print("write feature costs:" + elapsedSince(s))