Esempi in Python per readCommaLine2List, esempi in Python per utility.readCommaLine2List

Esempio n. 1

0

Mostra file

def statNameScore():
    """Write name-similarity scores for every strict ground-truth pair.

    For each (google id, twitter id) row of the strict ground-truth file,
    the google profile (stored under the google id) and the twitter profile
    (stored under the twitter screen name) are loaded from ``interPath`` and
    handed to ``ft.calNameScore`` and ``ft.calDisplayNameScore``.  One row
    per pair -- google id, twitter id, name score, display-name score and
    their sum, with the three scores stringified -- is written to
    ``name_score`` under ``interPath``.

    The google id and twitter screen name of each pair are printed as a
    progress trace.
    """
    # Only the strict ground truth and the id -> name map are consumed
    # below, so the loose list and the name -> id map are not loaded.
    gts = ut.readCommaLine2List(interPath, gtStrictFileName)
    twitterIdName = ut.readJson2Dict(interPath, twitterIdNameFileName)
    results = list()

    for gt in gts:
        googleId = gt[0]
        twitterId = gt[1]
        # Twitter profiles are looked up on disk by screen name, not by id.
        twitterName = twitterIdName[twitterId]
        print(googleId)
        print(twitterName)
        googleProfile = ut.readJson2Dict(interPath + "google/profile/",
                                         googleId)
        twitterProfile = ut.readJson2Dict(interPath + "twitter/profile/",
                                          twitterName)
        nameScore = ft.calNameScore(googleProfile, twitterProfile)
        displaynameScore = ft.calDisplayNameScore(googleProfile,
                                                  twitterProfile)
        totalScore = nameScore + displaynameScore
        results.append([
            googleId, twitterId,
            str(nameScore),
            str(displaynameScore),
            str(totalScore)
        ])
    ut.writeList2CommaLine(interPath, "name_score", results)

Esempio n. 2

0

Mostra file

File: preprocess.py Progetto: sychen1121/sn_crawler

def structData():
    """Structure per-user data for every strict ground-truth pair.

    Reads the strict ground-truth pairs and the twitter id -> screen-name
    map from ``interPath``, makes sure the ``profile``/``wall``/``text``
    output directories exist for both networks (``sn1`` and ``sn2``), runs
    ``structUserData`` on both accounts of each pair, collects the four
    values it returns per user (by their names: term frequencies, language
    distribution, sentiment score, topic distribution) and hands the
    collected dictionaries to ``writeStatWalls``.  The elapsed wall-clock
    time in seconds is printed at the end.

    Pairs whose twitter id has no entry in the id -> name map are skipped.
    """
    s = time.time()
    usersTf1 = dict()
    usersTf2 = dict()
    usersLangDistri1 = dict()
    usersLangDistri2 = dict()
    usersSentimentScore1 = dict()
    usersSentimentScore2 = dict()
    usersTopicDistri1 = dict()
    usersTopicDistri2 = dict()
    twitterIdName = ut.readJson2Dict(interPath, twitterIdNameFileName)
    # Only the strict ground truth is used; the loose list is not loaded.
    gts = ut.readCommaLine2List(interPath, gtStrictFileName)

    # Create every directory independently.  Gating all six on the existence
    # of interPath + sn1 alone would skip the sn2 tree whenever sn1 already
    # exists; exist_ok also keeps a partially created tree (or sn1 == sn2)
    # from raising.
    for sn in (sn1, sn2):
        for sub in ("profile", "wall", "text"):
            os.makedirs(interPath + sn + "/" + sub, exist_ok=True)

    # norm profile and wall
    for gt in gts:
        uid1 = gt[0]
        uid2 = gt[1]
        try:
            # Twitter data is addressed by screen name rather than by id.
            if sn1 == "twitter":
                uid1 = twitterIdName[uid1]
            if sn2 == "twitter":
                uid2 = twitterIdName[uid2]
        except KeyError:
            # No screen name recorded for this twitter id: skip the pair.
            continue
        (userTf1, langDistri1, userSentimentScore1,
         userTopicDistri1) = structUserData(sn1, uid1)
        (userTf2, langDistri2, userSentimentScore2,
         userTopicDistri2) = structUserData(sn2, uid2)
        usersTf1[uid1] = userTf1
        usersTf2[uid2] = userTf2
        usersLangDistri1[uid1] = langDistri1
        usersLangDistri2[uid2] = langDistri2
        usersSentimentScore1[uid1] = userSentimentScore1
        usersSentimentScore2[uid2] = userSentimentScore2
        usersTopicDistri1[uid1] = userTopicDistri1
        usersTopicDistri2[uid2] = userTopicDistri2

    # build dictionary and idf
    writeStatWalls(usersTf1, usersTf2, usersLangDistri1, usersLangDistri2,
                   usersSentimentScore1, usersSentimentScore2,
                   usersTopicDistri1, usersTopicDistri2)
    e = time.time()
    print(e - s)

Esempio n. 3

0

Mostra file

File: preprocess.py Progetto: imsorry1121/sn_crawler

def structData():
	"""Structure per-user data for every strict ground-truth pair.

	Reads the strict ground-truth pairs and the twitter id -> screen-name
	map from ``interPath``, makes sure the ``profile``/``wall``/``text``
	output directories exist for both networks (``sn1`` and ``sn2``), runs
	``structUserData`` on both accounts of each pair, collects the four
	values it returns per user (by their names: term frequencies, language
	distribution, sentiment score, topic distribution) and hands the
	collected dictionaries to ``writeStatWalls``.  The elapsed wall-clock
	time in seconds is printed at the end.

	Pairs whose twitter id has no entry in the id -> name map are skipped.
	"""
	s = time.time()
	usersTf1 = dict()
	usersTf2 = dict()
	usersLangDistri1 = dict()
	usersLangDistri2 = dict()
	usersSentimentScore1 = dict()
	usersSentimentScore2 = dict()
	usersTopicDistri1 = dict()
	usersTopicDistri2 = dict()
	twitterIdName = ut.readJson2Dict(interPath, twitterIdNameFileName)
	# Only the strict ground truth is used; the loose list is not loaded.
	gts = ut.readCommaLine2List(interPath, gtStrictFileName)

	# Create every directory independently.  Gating all six on the existence
	# of interPath+sn1 alone would skip the sn2 tree whenever sn1 already
	# exists; exist_ok also keeps a partially created tree (or sn1 == sn2)
	# from raising.
	for sn in (sn1, sn2):
		for sub in ("profile", "wall", "text"):
			os.makedirs(interPath+sn+"/"+sub, exist_ok=True)

	# norm profile and wall
	for gt in gts:
		uid1 = gt[0]
		uid2 = gt[1]
		try:
			# Twitter data is addressed by screen name rather than by id.
			if sn1 == "twitter":
				uid1 = twitterIdName[uid1]
			if sn2 == "twitter":
				uid2 = twitterIdName[uid2]
		except KeyError:
			# No screen name recorded for this twitter id: skip the pair.
			continue
		(userTf1, langDistri1, userSentimentScore1, userTopicDistri1) = structUserData(sn1, uid1)
		(userTf2, langDistri2, userSentimentScore2, userTopicDistri2) = structUserData(sn2, uid2)
		usersTf1[uid1] = userTf1
		usersTf2[uid2] = userTf2
		usersLangDistri1[uid1] = langDistri1
		usersLangDistri2[uid2] = langDistri2
		usersSentimentScore1[uid1] = userSentimentScore1
		usersSentimentScore2[uid2] = userSentimentScore2
		usersTopicDistri1[uid1] = userTopicDistri1
		usersTopicDistri2[uid2] = userTopicDistri2

	# build dictionary and idf
	writeStatWalls(usersTf1, usersTf2, usersLangDistri1, usersLangDistri2, usersSentimentScore1, usersSentimentScore2, usersTopicDistri1, usersTopicDistri2)
	e = time.time()
	print(e-s)

Esempio n. 4

0

Mostra file

File: preprocess.py Progetto: imsorry1121/sn_crawler

def getGroundTruth():
	"""Derive the loose and strict ground-truth id pairs from the raw mapping.

	Each mapping row holds a google id (column 0) and a twitter URL
	(column 1).  Rows whose URL yields no username are ignored.  Rows for
	which either profile check reports ``False`` are recorded as lost.
	Every remaining row contributes a [google id, twitter id] pair to the
	loose list, and to the strict list as well unless one of the posts
	checks reports ``False``.  The twitter id <-> name maps are filled for
	every non-lost row.  All five collections are written under
	``interPath``.
	"""
	rows = ut.readCommaLine2List(inputPath, mappingFileName)
	looseIds = list()
	strictIds = list()
	nameToId = dict()
	idToName = dict()
	lostRows = list()
	for row in rows:
		twitterName = getTwitterUsername(row[1])
		googleId = row[0]

		if twitterName == "":
			continue
		(googleHasProfile, googleHasPosts) = checkGoogleData(googleId)
		(twitterHasProfile, twitterHasPosts, twitterProfile) = checkTwitterData(twitterName)

		if googleHasProfile == False or twitterHasProfile == False:
			# At least one side has no profile: keep the raw row for the
			# loss report and move on.
			lostRows.append(row)
			continue

		# 0 is the placeholder id when the profile carries no "id_str".
		twitterId = twitterProfile.get("id_str", 0)
		postsMissing = googleHasPosts == False or twitterHasPosts == False
		looseIds.append([googleId, twitterId])
		if not postsMissing:
			strictIds.append([googleId, twitterId])
		idToName[twitterId] = twitterName
		nameToId[twitterName] = twitterId
	ut.writeList2CommaLine(interPath, gtLooseFileName, looseIds)
	ut.writeList2CommaLine(interPath, gtStrictFileName, strictIds)
	ut.writeDict2Json(interPath, twitterNameIdFileName, nameToId)
	ut.writeDict2Json(interPath, twitterIdNameFileName, idToName)
	ut.writeList2CommaLine(interPath, mappingLossFileName, lostRows)

Esempio n. 5

0

Mostra file

File: preprocess.py Progetto: sychen1121/sn_crawler

def getGroundTruth():
    """Derive the loose and strict ground-truth id pairs from the raw mapping.

    Each mapping row holds a google id (column 0) and a twitter URL
    (column 1).  Rows whose URL yields no username are ignored.  Rows for
    which either profile check reports ``False`` are recorded as lost.
    Every remaining row contributes a [google id, twitter id] pair to the
    loose list, and to the strict list as well unless one of the posts
    checks reports ``False``.  The twitter id <-> name maps are filled for
    every non-lost row.  All five collections are written under
    ``interPath``.
    """
    rows = ut.readCommaLine2List(inputPath, mappingFileName)
    looseIds = list()
    strictIds = list()
    nameToId = dict()
    idToName = dict()
    lostRows = list()
    for row in rows:
        twitterName = getTwitterUsername(row[1])
        googleId = row[0]

        if twitterName == "":
            continue
        (googleHasProfile, googleHasPosts) = checkGoogleData(googleId)
        (twitterHasProfile, twitterHasPosts,
         twitterProfile) = checkTwitterData(twitterName)

        if googleHasProfile == False or twitterHasProfile == False:
            # At least one side has no profile: keep the raw row for the
            # loss report and move on.
            lostRows.append(row)
            continue

        # 0 is the placeholder id when the profile carries no "id_str".
        twitterId = twitterProfile.get("id_str", 0)
        postsMissing = googleHasPosts == False or twitterHasPosts == False
        looseIds.append([googleId, twitterId])
        if not postsMissing:
            strictIds.append([googleId, twitterId])
        idToName[twitterId] = twitterName
        nameToId[twitterName] = twitterId
    ut.writeList2CommaLine(interPath, gtLooseFileName, looseIds)
    ut.writeList2CommaLine(interPath, gtStrictFileName, strictIds)
    ut.writeDict2Json(interPath, twitterNameIdFileName, nameToId)
    ut.writeDict2Json(interPath, twitterIdNameFileName, idToName)
    ut.writeList2CommaLine(interPath, mappingLossFileName, lostRows)

Esempio n. 6

0

Mostra file

File: model.py Progetto: sychen1121/sn_crawler

def nm(threshold=0.67):
	"""Write threshold-based predictions for the name-matching features.

	Each instance read from ``featureNmFilename`` is labelled ``"1"`` when
	the value in column 1 or column 2 (parsed as float) is strictly greater
	than ``threshold``, otherwise ``"0"``.  The labels are written, in input
	order, to ``predictionNmFilename`` under ``predPath``.

	Args:
		threshold: exclusive lower bound a score must exceed to count as a
			match.
	"""
	instances = ut.readCommaLine2List(outputPath, featureNmFilename)
	# Column 2 is only parsed when column 1 does not already exceed the
	# threshold (short-circuit ``or``).
	predictions = [
		"1" if (float(instance[1]) > threshold or float(instance[2]) > threshold) else "0"
		for instance in instances
	]
	ut.writeList2Line(predPath, predictionNmFilename, predictions)

Esempio n. 7

0

Mostra file

File: stat.py Progetto: imsorry1121/sn_crawler

def statNameScore():
	"""Write name-similarity scores for every strict ground-truth pair.

	For each (google id, twitter id) row of the strict ground-truth file,
	the google profile (stored under the google id) and the twitter profile
	(stored under the twitter screen name) are loaded from ``interPath`` and
	handed to ``ft.calNameScore`` and ``ft.calDisplayNameScore``.  One row
	per pair -- google id, twitter id, name score, display-name score and
	their sum, with the three scores stringified -- is written to
	``name_score`` under ``interPath``.

	The google id and twitter screen name of each pair are printed as a
	progress trace.
	"""
	# Only the strict ground truth and the id -> name map are consumed
	# below, so the loose list and the name -> id map are not loaded.
	gts = ut.readCommaLine2List(interPath, gtStrictFileName)
	twitterIdName = ut.readJson2Dict(interPath, twitterIdNameFileName)
	results = list()

	for gt in gts:
		googleId = gt[0]
		twitterId = gt[1]
		# Twitter profiles are looked up on disk by screen name, not by id.
		twitterName = twitterIdName[twitterId]
		print(googleId)
		print(twitterName)
		googleProfile = ut.readJson2Dict(interPath+"google/profile/", googleId)
		twitterProfile = ut.readJson2Dict(interPath+"twitter/profile/", twitterName)
		nameScore = ft.calNameScore(googleProfile, twitterProfile)
		displaynameScore = ft.calDisplayNameScore(googleProfile, twitterProfile)
		totalScore = nameScore + displaynameScore
		results.append([googleId, twitterId, str(nameScore), str(displaynameScore), str(totalScore)])
	ut.writeList2CommaLine(interPath, "name_score", results)

Esempio n. 8

0

Mostra file

def writeMappingCandidates():
    """Write the google ids and twitter names that have a usable mapping.

    Reads the ``twitterMapping`` rows (google id, twitter URL) from
    ``inputPath``.  For every row whose URL yields a non-empty username the
    google id and the username are collected, in row order, and written to
    ``google/ids_mapping`` and ``twitter/names_mapping`` respectively.
    """
    googleIds = list()
    twitterNames = list()
    for row in ut.readCommaLine2List(inputPath, "twitterMapping"):
        googleId, twitterUrl = row[0], row[1]
        username = getTwitterUsername(twitterUrl)
        if username == "":
            # No username could be extracted from this URL.
            continue
        googleIds.append(googleId)
        twitterNames.append(username)
    ut.writeList2Line(inputPath, "google/ids_mapping", googleIds)
    ut.writeList2Line(inputPath, "twitter/names_mapping", twitterNames)

Esempio n. 9

0

Mostra file

File: process.py Progetto: imsorry1121/sn_crawler

def writeMappingCandidates():
	"""Write the google ids and twitter names that have a usable mapping.

	Reads the ``twitterMapping`` rows (google id, twitter URL) from
	``inputPath``.  For every row whose URL yields a non-empty username the
	google id and the username are collected, in row order, and written to
	``google/ids_mapping`` and ``twitter/names_mapping`` respectively.
	"""
	googleIds = list()
	twitterNames = list()
	for row in ut.readCommaLine2List(inputPath, "twitterMapping"):
		googleId, twitterUrl = row[0], row[1]
		username = getTwitterUsername(twitterUrl)
		if username == "":
			# No username could be extracted from this URL.
			continue
		googleIds.append(googleId)
		twitterNames.append(username)
	ut.writeList2Line(inputPath, "google/ids_mapping", googleIds)
	ut.writeList2Line(inputPath, "twitter/names_mapping", twitterNames)

Esempio n. 10

0

Mostra file

def buildGraphs(users1, users2):
    """Build the google, twitter and combined directed graphs.

    The per-network graphs come from ``buildDiGraph``; the combined graph
    holds the edges of both.  For every strict ground-truth pair the google
    node gets a ``"twitter"`` attribute with the matching twitter id, the
    twitter node gets a ``"google"`` attribute with the matching google id,
    and the combined graph gets an edge in each direction between the two
    accounts.  Afterwards every ground-truth node gets a
    ``"neighbor_twitter"`` / ``"neighbor_google"`` dict mapping each of its
    neighbours that is itself a ground-truth account to that neighbour's
    counterpart id (the attribute is only created when at least one such
    neighbour exists).

    Args:
        users1: forwarded to ``buildDiGraph`` for the google relationship
            file.
        users2: forwarded to ``buildDiGraph`` for the twitter relationship
            file.

    Returns:
        Tuple ``(googleDiGraph, twitterDiGraph, allDiGraph)``.
    """
    # google graph
    print("google graph")
    googleDiGraph = buildDiGraph(interPath + "google/relationship_file",
                                 users1)
    # twitter graph
    print("twitter graph")
    twitterDiGraph = buildDiGraph(
        interPath + "twitter/relationship_file_revise", users2)
    # full graph
    print("full graph")
    allDiGraph = nx.DiGraph()
    allDiGraph.add_edges_from(googleDiGraph.edges())
    allDiGraph.add_edges_from(twitterDiGraph.edges())

    # add mapping info
    gts = ut.readCommaLine2List(interPath, gtStrictFileName)
    for gt in gts:
        googleId = gt[0]
        twitterId = gt[1]
        # Node attributes are passed as keywords: the positional attr_dict
        # argument was dropped in NetworkX 2.0, while the keyword form is
        # accepted by both 1.x and 2.x.
        googleDiGraph.add_node(googleId, twitter=twitterId)
        twitterDiGraph.add_node(twitterId, google=googleId)
        allDiGraph.add_edge(googleId, twitterId)
        allDiGraph.add_edge(twitterId, googleId)

    # build neighbor mapping list
    # NOTE(review): ``Graph.node`` is kept for the NetworkX version this
    # project runs on; newer releases expose it as ``Graph.nodes`` only --
    # confirm the pinned version before upgrading.
    for gt in gts:
        googleId = gt[0]
        twitterId = gt[1]
        neighbors_g = googleDiGraph.neighbors(googleId)
        neighbors_t = twitterDiGraph.neighbors(twitterId)
        for n in neighbors_g:
            # 0 marks a neighbour without a known twitter counterpart.
            counterpart = googleDiGraph.node[n].get("twitter", 0)
            if counterpart != 0:
                googleDiGraph.node[googleId].setdefault(
                    "neighbor_twitter", dict())[n] = counterpart
        for n in neighbors_t:
            # 0 marks a neighbour without a known google counterpart.
            counterpart = twitterDiGraph.node[n].get("google", 0)
            if counterpart != 0:
                twitterDiGraph.node[twitterId].setdefault(
                    "neighbor_google", dict())[n] = counterpart
    return googleDiGraph, twitterDiGraph, allDiGraph

Esempio n. 11

0

Mostra file

def getGroundTruth():
    """Build the ground-truth id pairs from the raw google -> twitter mapping.

    Each mapping row holds a google id (column 0) and a twitter URL
    (column 1).  The twitter name is the last path segment of the URL, or
    the one before it when the URL ends with a slash.  Rows whose name is
    ``#%21``, ``twitter`` or contains ``twitter.com`` are skipped, as are
    rows with an empty URL field.  For the rest, the crawled profile file
    ``twitter/profile/<name>`` under ``inputPath`` is read; when it carries
    an ``id_str`` the [google id, twitter id] pair and both name <-> id
    entries are recorded.  Results are written under ``interPath``.

    Profiles that cannot be opened or parsed are skipped (best effort).
    """
    mapping = ut.readCommaLine2List(inputPath, mappingFileName)
    mappingId = list()
    twitterNameId = dict()
    twitterIdName = dict()
    for m in mapping:
        twitterUrl = m[1]
        googleId = m[0]
        urlParts = twitterUrl.split("/")
        twitterName = urlParts[-1].strip()
        if twitterName == "":
            if len(urlParts) < 2:
                # Empty URL field: there is no earlier segment to fall back
                # on, so there is nothing to look up.
                continue
            # Trailing slash: the name is the previous path segment.
            twitterName = urlParts[-2]
        if twitterName == "#%21" or "twitter.com" in twitterName or "twitter" == twitterName:
            continue

        # TODO: the check that the google plus id belongs to a person
        # (profile "objectType" == "person") is not implemented yet.

        # check if the twitter name exist
        location = inputPath + "twitter/profile/" + twitterName
        try:
            with open(location, "r") as fi:
                jresult = json.loads(fi.read())
        except (OSError, ValueError):
            # Missing/unreadable profile file or invalid JSON: skip the row
            # without also swallowing unrelated errors or Ctrl-C.
            continue
        if not isinstance(jresult, dict):
            # A profile is expected to be a JSON object.
            continue
        twitterId = jresult.get("id_str", 0)
        if twitterId != 0:
            mappingId.append([googleId, twitterId])
            twitterNameId[twitterName] = twitterId
            twitterIdName[twitterId] = twitterName
    ut.writeList2CommaLine(interPath, "gt", mappingId)
    ut.writeDict2Json(interPath, twitterNameIdFileName, twitterNameId)
    ut.writeDict2Json(interPath, twitterIdNameFileName, twitterIdName)

Esempio n. 12

0

Mostra file

File: process.py Progetto: imsorry1121/sn_crawler

def getGroundTruth():
	"""Build the ground-truth id pairs from the raw google -> twitter mapping.

	Each mapping row holds a google id (column 0) and a twitter URL
	(column 1).  The twitter name is the last path segment of the URL, or
	the one before it when the URL ends with a slash.  Rows whose name is
	``#%21``, ``twitter`` or contains ``twitter.com`` are skipped, as are
	rows with an empty URL field.  For the rest, the crawled profile file
	``twitter/profile/<name>`` under ``inputPath`` is read; when it carries
	an ``id_str`` the [google id, twitter id] pair and both name <-> id
	entries are recorded.  Results are written under ``interPath``.

	Profiles that cannot be opened or parsed are skipped (best effort).
	"""
	mapping = ut.readCommaLine2List(inputPath, mappingFileName)
	mappingId = list()
	twitterNameId = dict()
	twitterIdName = dict()
	for m in mapping:
		twitterUrl = m[1]
		googleId = m[0]
		urlParts = twitterUrl.split("/")
		twitterName = urlParts[-1].strip()
		if twitterName == "":
			if len(urlParts) < 2:
				# Empty URL field: there is no earlier segment to fall back
				# on, so there is nothing to look up.
				continue
			# Trailing slash: the name is the previous path segment.
			twitterName = urlParts[-2]
		if twitterName == "#%21" or "twitter.com" in twitterName or "twitter" == twitterName:
			continue

		# TODO: the check that the google plus id belongs to a person
		# (profile "objectType" == "person") is not implemented yet.

		# check if the twitter name exist
		location = inputPath+"twitter/profile/"+twitterName
		try:
			with open(location, "r") as fi:
				jresult = json.loads(fi.read())
		except (OSError, ValueError):
			# Missing/unreadable profile file or invalid JSON: skip the row
			# without also swallowing unrelated errors or Ctrl-C.
			continue
		if not isinstance(jresult, dict):
			# A profile is expected to be a JSON object.
			continue
		twitterId = jresult.get("id_str", 0)
		if twitterId != 0:
			mappingId.append([googleId, twitterId])
			twitterNameId[twitterName] = twitterId
			twitterIdName[twitterId] = twitterName
	ut.writeList2CommaLine(interPath, "gt", mappingId)
	ut.writeDict2Json(interPath, twitterNameIdFileName, twitterNameId)
	ut.writeDict2Json(interPath, twitterIdNameFileName, twitterIdName)

Esempio n. 13

0

Mostra file

def createSNMapping():
    """Split the social-network list into per-network mapping files.

    Every row of ``snFile`` starts with a user id (column 0) followed by the
    user's links; columns 1, 2 and 3 feed the youtube, facebook and twitter
    mappings respectively.  A [user id, link] pair is recorded for each
    non-empty column and the three lists are written as ``youtubeMapping``,
    ``fbMapping`` and ``twitterMapping`` in the data directory.

    The number of input rows and the number of twitter pairs are printed.
    """
    path = "../data/"
    snLists = ut.readCommaLine2List(path, snFile)
    print(len(snLists))
    fbMapping = list()
    twitterMapping = list()
    youtubeMapping = list()
    for snList in snLists:
        uid = snList[0]
        if snList[1] != "":
            youtubeMapping.append([uid, snList[1]])
        if snList[2] != "":
            fbMapping.append([uid, snList[2]])
        if snList[3] != "":
            twitterMapping.append([uid, snList[3]])
        # Google+ links (last column) are not extracted yet.
    print(len(twitterMapping))
    # Write with the same directory string that was used for reading
    # (trailing slash included); "../data" without the slash would land
    # outside the data directory if the helper joins by plain concatenation
    # -- presumably it does, given the other callers; verify against ut.
    ut.writeList2CommaLine(path, "youtubeMapping", youtubeMapping)
    ut.writeList2CommaLine(path, "fbMapping", fbMapping)
    ut.writeList2CommaLine(path, "twitterMapping", twitterMapping)

Esempio n. 14

0

Mostra file

File: process.py Progetto: imsorry1121/sn_crawler

def createSNMapping():
	"""Split the social-network list into per-network mapping files.

	Every row of ``snFile`` starts with a user id (column 0) followed by the
	user's links; columns 1, 2 and 3 feed the youtube, facebook and twitter
	mappings respectively.  A [user id, link] pair is recorded for each
	non-empty column and the three lists are written as ``youtubeMapping``,
	``fbMapping`` and ``twitterMapping`` in the data directory.

	The number of input rows and the number of twitter pairs are printed.
	"""
	path = "../data/"
	snLists = ut.readCommaLine2List(path, snFile)
	print(len(snLists))
	fbMapping = list()
	twitterMapping = list()
	youtubeMapping = list()
	for snList in snLists:
		uid = snList[0]
		if snList[1] != "":
			youtubeMapping.append([uid, snList[1]])
		if snList[2] != "":
			fbMapping.append([uid, snList[2]])
		if snList[3] != "":
			twitterMapping.append([uid, snList[3]])
		# Google+ links (last column) are not extracted yet.
	print(len(twitterMapping))
	# Write with the same directory string that was used for reading
	# (trailing slash included); "../data" without the slash would land
	# outside the data directory if the helper joins by plain concatenation
	# -- presumably it does, given the other callers; verify against ut.
	ut.writeList2CommaLine(path, "youtubeMapping", youtubeMapping)
	ut.writeList2CommaLine(path, "fbMapping", fbMapping)
	ut.writeList2CommaLine(path, "twitterMapping", twitterMapping)

Esempio n. 15

0

Mostra file

File: feature.py Progetto: imsorry1121/sn_crawler

def buildGraphs(users1, users2):
	"""Build the google, twitter and combined directed graphs.

	The per-network graphs come from ``buildDiGraph``; the combined graph
	holds the edges of both.  For every strict ground-truth pair the google
	node gets a ``"twitter"`` attribute with the matching twitter id, the
	twitter node gets a ``"google"`` attribute with the matching google id,
	and the combined graph gets an edge in each direction between the two
	accounts.  Afterwards every ground-truth node gets a
	``"neighbor_twitter"`` / ``"neighbor_google"`` dict mapping each of its
	neighbours that is itself a ground-truth account to that neighbour's
	counterpart id (the attribute is only created when at least one such
	neighbour exists).

	Args:
		users1: forwarded to ``buildDiGraph`` for the google relationship
			file.
		users2: forwarded to ``buildDiGraph`` for the twitter relationship
			file.

	Returns:
		Tuple ``(googleDiGraph, twitterDiGraph, allDiGraph)``.
	"""
	# google graph
	print("google graph")
	googleDiGraph = buildDiGraph(interPath+"google/relationship_file", users1)
	# twitter graph
	print("twitter graph")
	twitterDiGraph = buildDiGraph(interPath+"twitter/relationship_file_revise", users2)
	# full graph
	print("full graph")
	allDiGraph = nx.DiGraph()
	allDiGraph.add_edges_from(googleDiGraph.edges())
	allDiGraph.add_edges_from(twitterDiGraph.edges())

	# add mapping info
	gts = ut.readCommaLine2List(interPath, gtStrictFileName)
	for gt in gts:
		googleId = gt[0]
		twitterId = gt[1]
		# Node attributes are passed as keywords: the positional attr_dict
		# argument was dropped in NetworkX 2.0, while the keyword form is
		# accepted by both 1.x and 2.x.
		googleDiGraph.add_node(googleId, twitter=twitterId)
		twitterDiGraph.add_node(twitterId, google=googleId)
		allDiGraph.add_edge(googleId, twitterId)
		allDiGraph.add_edge(twitterId, googleId)

	# build neighbor mapping list
	# NOTE(review): ``Graph.node`` is kept for the NetworkX version this
	# project runs on; newer releases expose it as ``Graph.nodes`` only --
	# confirm the pinned version before upgrading.
	for gt in gts:
		googleId = gt[0]
		twitterId = gt[1]
		neighbors_g = googleDiGraph.neighbors(googleId)
		neighbors_t = twitterDiGraph.neighbors(twitterId)
		for n in neighbors_g:
			# 0 marks a neighbour without a known twitter counterpart.
			counterpart = googleDiGraph.node[n].get("twitter", 0)
			if counterpart != 0:
				googleDiGraph.node[googleId].setdefault("neighbor_twitter", dict())[n] = counterpart
		for n in neighbors_t:
			# 0 marks a neighbour without a known google counterpart.
			counterpart = twitterDiGraph.node[n].get("google", 0)
			if counterpart != 0:
				twitterDiGraph.node[twitterId].setdefault("neighbor_google", dict())[n] = counterpart
	return googleDiGraph, twitterDiGraph, allDiGraph

Esempio n. 16

0

Mostra file

def getUsersFeatures(procNum=10):
    """Compute pairwise features for all ground-truth users and write them out.

    Steps, in order:
      1. Load the strict ground-truth pairs and split them into the google
         user list and the twitter user list (same index = same person).
      2. Build the graphs via ``buildGraphs`` and call
         ``writeMostPopularCount``.
      3. Score every index pair ``(a, b)`` with ``b >= a`` using
         ``getScoresWorker`` running in ``procNum`` threads; results are
         collected from a shared queue into ``scoresMatrix``.
      4. Write one feature line per ``(i, j)`` index combination to
         ``outputPath + featureFileName``; the rank is 1 on the diagonal
         (true match) and 0 elsewhere.

    Timing information for each phase is printed.

    Args:
        procNum: number of worker threads the pair list is split across.
    """
    # init user pair by mapping
    gts = ut.readCommaLine2List(interPath, gtStrictFileName)
    sn1 = "google"
    sn2 = "twitter"
    users_sn1 = list()
    users_sn2 = list()
    # Scores are kept in a plain dict keyed by index pair (a sparse
    # lil_matrix was tried here earlier).
    scoresMatrix = dict()
    for gt in gts:
        users_sn1.append(gt[0])
        users_sn2.append(gt[1])
    # build graph
    print("build graph")
    s = time.time()
    g1, g2, g0 = buildGraphs(users_sn1, users_sn2)
    e = time.time()
    print("build graph over cost: " + str(e - s))
    # for profile using

    print("popular count")
    s = time.time()
    writeMostPopularCount(g1, sn1, users_sn1, g2, sn2, users_sn2)
    e = time.time()
    print("popular count over cost: " + str(e - s))

    print("calculate features start")
    # calculate features
    # This timer keeps running until the end of the function, so the final
    # "write feature costs" figure covers scoring as well as writing.
    s = time.time()
    # Only the upper triangle (b >= a) of the index grid is scored.
    pairs = [(a, b) for a in range(len(gts)) for b in range(len(gts))
             if b >= a]
    twitterIdName = ut.readJson2Dict(interPath, twitterIdNameFileName)
    # NOTE(review): twitterNameId is loaded but not used in this function.
    twitterNameId = ut.readJson2Dict(interPath, twitterNameIdFileName)

    profileGoogle, profileTwitter, wallGoogle, wallTwitter, textGoogle, textTwitter = readData(
        users_sn1, users_sn2, twitterIdName)

    # (An earlier sequential version called getScores for each pair here.)

    # parallel
    # Batch size per worker; the last worker takes whatever remains, so all
    # pairs are covered even when the division is not exact.
    batchNum = round(len(pairs) / procNum)
    procs = list()
    # NOTE(review): workers are threads (td.Thread) although the queue comes
    # from mp -- presumably threading and multiprocessing; confirm imports.
    q = mp.Queue()

    for i in range(procNum):
        batchPairs = list()
        if i == procNum - 1:
            batchPairs = pairs[i * batchNum:]
        else:
            batchPairs = pairs[i * batchNum:(i + 1) * batchNum]
        p = td.Thread(target=getScoresWorker,
                      args=(batchPairs, sn1, users_sn1, sn2, users_sn2, g1, g2,
                            g0, q, profileGoogle, profileTwitter, wallGoogle,
                            wallTwitter, textGoogle, textTwitter))
        p.start()
        procs.append(p)
    print("update start")
    # Exactly one queue item is consumed per pair, before the workers are
    # joined.  NOTE(review): q.get() has no timeout, so this presumably
    # blocks forever if a worker puts fewer items than it was given pairs.
    for i in range(len(pairs)):
        print(i)
        result = q.get()
        # presumably result["key"] is the (a, b) index pair, given the tuple
        # lookups in the output loop below -- verify against getScoresWorker
        scoresMatrix[result["key"]] = result["value"]
    print("update over")
    print(len(scoresMatrix))
    for proc in procs:
        proc.join()

    # output feature
    with open(outputPath + featureFileName, "w") as fo:
        for i in range(len(gts)):
            for j in range(len(gts)):
                # Same index on both sides means the ground-truth match.
                if i == j:
                    rank = 1
                else:
                    rank = 0
                # Only (a, b) with b >= a was scored, so the lower triangle
                # reuses the mirrored entry.  NOTE(review): this treats the
                # score of (google j, twitter i) as the score of
                # (google i, twitter j) -- confirm that symmetry is intended.
                if i > j:
                    scores = scoresMatrix[(j, i)]
                else:
                    scores = scoresMatrix[(i, j)]
                outputStr = getFeatureStr(rank, users_sn1[i], users_sn2[j],
                                          scores)
                fo.write(outputStr)

    # (An earlier sequential version computed getScores for every (i, j)
    # inside this output loop instead of using the precomputed dict.)
    e = time.time()
    print("write feature costs:" + str(e - s))

Esempio n. 17

0

Mostra file

File: feature.py Progetto: imsorry1121/sn_crawler

def getUsersFeatures(procNum = 10):
	"""Compute pairwise features for all ground-truth users and write them out.

	Steps, in order:
	  1. Load the strict ground-truth pairs and split them into the google
	     user list and the twitter user list (same index = same person).
	  2. Build the graphs via ``buildGraphs`` and call
	     ``writeMostPopularCount``.
	  3. Score every index pair ``(a, b)`` with ``b >= a`` using
	     ``getScoresWorker`` running in ``procNum`` threads; results are
	     collected from a shared queue into ``scoresMatrix``.
	  4. Write one feature line per ``(i, j)`` index combination to
	     ``outputPath + featureFileName``; the rank is 1 on the diagonal
	     (true match) and 0 elsewhere.

	Timing information for each phase is printed.

	Args:
		procNum: number of worker threads the pair list is split across.
	"""
	# init user pair by mapping
	gts = ut.readCommaLine2List(interPath, gtStrictFileName)
	sn1 = "google"
	sn2 = "twitter"
	users_sn1 = list()
	users_sn2 = list()
	# Scores are kept in a plain dict keyed by index pair (a sparse
	# lil_matrix was tried here earlier).
	scoresMatrix = dict()
	for gt in gts:
		users_sn1.append(gt[0])
		users_sn2.append(gt[1])
	# build graph
	print("build graph")
	s = time.time()
	g1, g2, g0 = buildGraphs(users_sn1, users_sn2)
	e = time.time()
	print("build graph over cost: "+str(e-s))
	# for profile using

	print("popular count")
	s = time.time()
	writeMostPopularCount(g1, sn1, users_sn1, g2, sn2, users_sn2)
	e = time.time()
	print("popular count over cost: "+str(e-s))

	print("calculate features start")
	# calculate features
	# This timer keeps running until the end of the function, so the final
	# "write feature costs" figure covers scoring as well as writing.
	s = time.time()
	# Only the upper triangle (b >= a) of the index grid is scored.
	pairs = [(a,b) for a in range(len(gts)) for b in range(len(gts)) if b>=a]
	twitterIdName = ut.readJson2Dict(interPath, twitterIdNameFileName)
	# NOTE(review): twitterNameId is loaded but not used in this function.
	twitterNameId = ut.readJson2Dict(interPath, twitterNameIdFileName)

	profileGoogle, profileTwitter, wallGoogle, wallTwitter, textGoogle, textTwitter = readData(users_sn1, users_sn2, twitterIdName)

	# (An earlier sequential version called getScores for each pair here.)

	# parallel
	# Batch size per worker; the last worker takes whatever remains, so all
	# pairs are covered even when the division is not exact.
	batchNum = round(len(pairs)/procNum)
	procs = list()
	# NOTE(review): workers are threads (td.Thread) although the queue comes
	# from mp -- presumably threading and multiprocessing; confirm imports.
	q = mp.Queue()

	for i in range(procNum):
		batchPairs = list()
		if i == procNum-1:
			batchPairs = pairs[i*batchNum:]
		else:
			batchPairs = pairs[i*batchNum:(i+1)*batchNum]
		p = td.Thread(target=getScoresWorker, args=(batchPairs, sn1, users_sn1, sn2, users_sn2, g1, g2, g0, q, profileGoogle, profileTwitter, wallGoogle, wallTwitter, textGoogle, textTwitter))
		p.start()
		procs.append(p)
	print("update start")
	# Exactly one queue item is consumed per pair, before the workers are
	# joined.  NOTE(review): q.get() has no timeout, so this presumably
	# blocks forever if a worker puts fewer items than it was given pairs.
	for i in range(len(pairs)):
		print(i)
		result = q.get()
		# presumably result["key"] is the (a, b) index pair, given the tuple
		# lookups in the output loop below -- verify against getScoresWorker
		scoresMatrix[result["key"]] = result["value"]
	print("update over")
	print(len(scoresMatrix))
	for proc in procs:
		proc.join()

	# output feature
	with open(outputPath+featureFileName, "w") as fo:
		for i in range(len(gts)):
			for j in range(len(gts)):
				# Same index on both sides means the ground-truth match.
				if i == j:
					rank = 1
				else:
					rank = 0
				# Only (a, b) with b >= a was scored, so the lower triangle
				# reuses the mirrored entry.  NOTE(review): this treats the
				# score of (google j, twitter i) as the score of
				# (google i, twitter j) -- confirm that symmetry is intended.
				if i > j:
					scores = scoresMatrix[(j, i)]
				else:
					scores = scoresMatrix[(i, j)]
				outputStr = getFeatureStr(rank, users_sn1[i], users_sn2[j], scores)
				fo.write(outputStr)

	# (An earlier sequential version computed getScores for every (i, j)
	# inside this output loop instead of using the precomputed dict.)
	e = time.time()
	print("write feature costs:" + str(e-s))