コード例 #1
0
ファイル: preprocess.py プロジェクト: imsorry1121/sn_crawler
def getGroundTruth():
	"""Build Google+/Twitter ground-truth id mappings from the raw mapping file.

	Resolves each mapping row's Twitter URL to a username, checks that
	profile/post data exists on both networks, and writes loose/strict
	id pairs plus name<->id lookup tables. Rows missing a profile on
	either side are recorded as losses.
	"""
	mapping = ut.readCommaLine2List(inputPath, mappingFileName)
	mappingIdLoose = list()
	mappingIdStrict = list()
	twitterNameId = dict()
	twitterIdName = dict()
	mappingLoss = list()
	for m in mapping:
		twitterUrl = m[1]
		twitterName = getTwitterUsername(twitterUrl)
		googleId = m[0]

		# Skip rows whose Twitter URL did not yield a usable username.
		if not twitterName:
			continue
		(google_profile_bool, google_posts_bool) = checkGoogleData(googleId)
		(twitter_profile_bool, twitter_posts_bool, twitter_profile) = checkTwitterData(twitterName)

		if not google_profile_bool or not twitter_profile_bool:
			# A profile is missing on one side: the pair is unusable.
			mappingLoss.append(m)
		else:
			twitterId = twitter_profile.get("id_str", 0)
			# Loose mapping only needs both profiles; strict additionally
			# requires posts on both sides.
			mappingIdLoose.append([googleId, twitterId])
			if google_posts_bool and twitter_posts_bool:
				mappingIdStrict.append([googleId, twitterId])
			twitterIdName[twitterId] = twitterName
			twitterNameId[twitterName] = twitterId
	ut.writeList2CommaLine(interPath, gtLooseFileName, mappingIdLoose)
	ut.writeList2CommaLine(interPath, gtStrictFileName, mappingIdStrict)
	ut.writeDict2Json(interPath, twitterNameIdFileName, twitterNameId)
	ut.writeDict2Json(interPath, twitterIdNameFileName, twitterIdName)
	ut.writeList2CommaLine(interPath, mappingLossFileName, mappingLoss)
コード例 #2
0
ファイル: preprocess.py プロジェクト: imsorry1121/sn_crawler
def structUserData(sn, uid):
	"""Normalise one user's profile and wall for social network *sn*.

	Reads the raw profile/wall JSON for user *uid*, writes normalised
	versions under interPath, and returns per-user wall statistics.

	Returns:
		(userTf, langDistri, sentiment_score, userTopicDistri):
		merged term-frequency dict, post language distribution, mean
		sentiment polarity (0 when the user has no posts), and the
		normalised merged topic distribution.
	"""
	print(uid)
	# Normalise profile and wall posts.
	profile = ut.readJson2Dict(inputPath+sn+"/profile/", uid)
	posts = ut.readJson2Dict(inputPath+sn+"/wall/", uid)

	print("profile:"+interPath+sn+"/profile/"+uid)
	newProfile = normProfile(sn, profile)
	print("wall:"+interPath+sn+"/wall/"+uid)
	newPosts = normWall(sn, posts)

	ut.writeDict2Json(interPath+sn+"/profile/", uid, newProfile)
	ut.writeDict2Json(interPath+sn+"/wall/", uid, newPosts)

	# Wall statistics: language distribution over all posts.
	langDistri = ut.getDistri([post["lang"] for post in newPosts])
	# Mean sentiment polarity; 0 for users with no posts.
	sentiments = [post["sentiment"]["polarity"] for post in newPosts]
	sentiment_score = sum(sentiments)/len(sentiments) if sentiments else 0
	# Merged, normalised topic distribution.
	topicDistris = [post["topic_distri"] for post in newPosts]
	userTopicDistri = ut.normVector(ut.mergeDict(topicDistris))
	# Merged term frequencies across all posts.
	tfs = [post["tf"] for post in newPosts]
	userTf = ut.mergeDict(tfs)
	return (userTf, langDistri, sentiment_score, userTopicDistri)
コード例 #3
0
ファイル: preprocess.py プロジェクト: sychen1121/sn_crawler
def structUserData(sn, uid):
    """Normalise a user's profile/wall data and compute wall statistics.

    Reads raw JSON for *uid* on network *sn*, writes the normalised
    profile and wall under interPath, and returns
    (userTf, langDistri, sentiment_score, userTopicDistri).
    """
    print(uid)
    # Normalise profile and wall posts.
    profile = ut.readJson2Dict(inputPath + sn + "/profile/", uid)
    posts = ut.readJson2Dict(inputPath + sn + "/wall/", uid)

    print("profile:" + interPath + sn + "/profile/" + uid)
    newProfile = normProfile(sn, profile)
    print("wall:" + interPath + sn + "/wall/" + uid)
    newPosts = normWall(sn, posts)

    ut.writeDict2Json(interPath + sn + "/profile/", uid, newProfile)
    ut.writeDict2Json(interPath + sn + "/wall/", uid, newPosts)

    # Wall statistics: language distribution over all posts.
    langDistri = ut.getDistri([post["lang"] for post in newPosts])
    # Mean sentiment polarity; 0 for users with no posts.
    sentiments = [post["sentiment"]["polarity"] for post in newPosts]
    sentiment_score = sum(sentiments) / len(sentiments) if sentiments else 0
    # Merged, normalised topic distribution.
    topicDistris = [post["topic_distri"] for post in newPosts]
    userTopicDistri = ut.normVector(ut.mergeDict(topicDistris))
    # Merged term frequencies across all posts.
    tfs = [post["tf"] for post in newPosts]
    userTf = ut.mergeDict(tfs)
    return (userTf, langDistri, sentiment_score, userTopicDistri)
コード例 #4
0
ファイル: preprocess.py プロジェクト: sychen1121/sn_crawler
def getGroundTruth():
    """Build Google+/Twitter ground-truth id mappings from the raw mapping file.

    Resolves each mapping row's Twitter URL to a username, checks that
    profile/post data exists on both networks, and writes loose/strict
    id pairs plus name<->id lookup tables. Rows missing a profile on
    either side are recorded as losses.
    """
    mapping = ut.readCommaLine2List(inputPath, mappingFileName)
    mappingIdLoose = list()
    mappingIdStrict = list()
    twitterNameId = dict()
    twitterIdName = dict()
    mappingLoss = list()
    for m in mapping:
        twitterUrl = m[1]
        twitterName = getTwitterUsername(twitterUrl)
        googleId = m[0]

        # Skip rows whose Twitter URL did not yield a usable username.
        if not twitterName:
            continue
        (google_profile_bool, google_posts_bool) = checkGoogleData(googleId)
        (twitter_profile_bool, twitter_posts_bool,
         twitter_profile) = checkTwitterData(twitterName)

        if not google_profile_bool or not twitter_profile_bool:
            # A profile is missing on one side: the pair is unusable.
            mappingLoss.append(m)
        else:
            twitterId = twitter_profile.get("id_str", 0)
            # Loose mapping only needs both profiles; strict additionally
            # requires posts on both sides.
            mappingIdLoose.append([googleId, twitterId])
            if google_posts_bool and twitter_posts_bool:
                mappingIdStrict.append([googleId, twitterId])
            twitterIdName[twitterId] = twitterName
            twitterNameId[twitterName] = twitterId
    ut.writeList2CommaLine(interPath, gtLooseFileName, mappingIdLoose)
    ut.writeList2CommaLine(interPath, gtStrictFileName, mappingIdStrict)
    ut.writeDict2Json(interPath, twitterNameIdFileName, twitterNameId)
    ut.writeDict2Json(interPath, twitterIdNameFileName, twitterIdName)
    ut.writeList2CommaLine(interPath, mappingLossFileName, mappingLoss)
コード例 #5
0
ファイル: preprocess.py プロジェクト: sychen1121/sn_crawler
def writeTextStat(usersTf, usersLangDistri, idf, sn, usersSentimentScore,
                  usersTopicDistri):
    """Convert per-user term frequencies to unit tf-idf vectors and write
    one JSON stat record per user under interPath/<sn>/text.

    Mutates usersTf in place: raw frequencies are replaced by normalised
    tf-idf weights. Each record holds the top-5 tf and tf-idf terms, the
    full tf-idf vector, the language distribution and dominant language,
    the sentiment score, and the topic distribution.
    """
    for user, tf in usersTf.items():
        result = dict()
        # Top-5 raw-frequency terms (captured before tf-idf weighting).
        result["tf_top5"] = sorted(tf.items(),
                                   key=operator.itemgetter(1),
                                   reverse=True)[:5]
        # Weight each term by idf and accumulate the squared L2 norm.
        norm = 0.0
        for term, fre in tf.items():
            usersTf[user][term] = fre * idf[term]
            norm += math.pow(usersTf[user][term], 2)
        # Scale to a unit vector; skip when every weight is zero to
        # avoid a ZeroDivisionError on empty/all-zero walls.
        norm = math.sqrt(norm)
        if norm > 0:
            for term in tf.keys():
                usersTf[user][term] = usersTf[user][term] / norm
        result["tfidf_top5"] = sorted(usersTf[user].items(),
                                      key=operator.itemgetter(1),
                                      reverse=True)[:5]
        result["tfidf"] = usersTf[user]
        result["lang_distri"] = usersLangDistri[user]
        # Dominant language, or "none" when no language data exists.
        if len(result["lang_distri"]) > 0:
            result["lang"] = max(usersLangDistri[user].items(),
                                 key=operator.itemgetter(1))[0]
        else:
            result["lang"] = "none"
        result["sentiment"] = usersSentimentScore[user]
        result["topic_distri"] = usersTopicDistri[user]
        ut.writeDict2Json(interPath + sn + "/text", user, result)
コード例 #6
0
ファイル: feature.py プロジェクト: imsorry1121/sn_crawler
def writeMostPopularCount(g1, sn1, users_sn1, g2, sn2, users_sn2):
	"""Record the highest in-degree among the given users of each network
	and write the result as JSON, keyed by network name.

	Raises ValueError if either user list is empty (max() of an empty
	sequence), matching the original loop-based version.
	"""
	# Generator expressions avoid materialising full in-degree lists.
	result = {sn1: max(g1.in_degree(user) for user in users_sn1),
	          sn2: max(g2.in_degree(user) for user in users_sn2)}
	ut.writeDict2Json(interPath, popularCountFileName, result)
コード例 #7
0
def writeMostPopularCount(g1, sn1, users_sn1, g2, sn2, users_sn2):
    """Record the highest in-degree among the given users of each network
    and write the result as JSON, keyed by network name.

    Raises ValueError if either user list is empty (max() of an empty
    sequence), matching the original loop-based version.
    """
    # Generator expressions avoid materialising full in-degree lists.
    result = {sn1: max(g1.in_degree(user) for user in users_sn1),
              sn2: max(g2.in_degree(user) for user in users_sn2)}
    ut.writeDict2Json(interPath, popularCountFileName, result)
コード例 #8
0
ファイル: preprocess.py プロジェクト: imsorry1121/sn_crawler
def writeStatWalls(usersTf1, usersTf2, usersLangDistri1, usersLangDistri2, usersSentimentScore1, usersSentimentScore2, usersTopicDistri1, usersTopicDistri2):
	"""Build the cross-network idf table and write per-user text statistics.

	Document frequency counts how many user walls (documents) on either
	network contain each term; idf is log(n / df).
	"""
	# Document frequency over both networks' user walls (deduplicated loop).
	idf = dict()
	for usersTf in (usersTf1, usersTf2):
		for tf in usersTf.values():
			for term in tf:
				idf[term] = idf.get(term, 0) + 1
	# NOTE(review): total document count assumes both networks have the
	# same number of users — confirm len(usersTf1) == len(usersTf2).
	n = len(usersTf1) * 2
	for term, df in idf.items():
		idf[term] = math.log(n/df)
	# Persist the idf table and the sorted vocabulary.
	ut.writeDict2Json(interPath, "idf.json", idf)
	ut.writeList2Json(interPath, "dictionary.txt", sorted(idf.keys()))
	# Write unit tf-idf vectors and stats per user for both networks.
	writeTextStat(usersTf1, usersLangDistri1, idf, sn1, usersSentimentScore1, usersTopicDistri1)
	writeTextStat(usersTf2, usersLangDistri2, idf, sn2, usersSentimentScore2, usersTopicDistri2)
コード例 #9
0
ファイル: process.py プロジェクト: imsorry1121/sn_crawler
def getGroundTruth():
	"""Build the Google+/Twitter ground-truth mapping from the raw file.

	Extracts the Twitter username from each mapping row's URL, looks up
	the crawled Twitter profile on disk to resolve the numeric id, and
	writes the (googleId, twitterId) pairs plus name<->id lookup tables.
	Rows whose profile file is missing or unparsable are skipped.
	"""
	mapping = ut.readCommaLine2List(inputPath, mappingFileName)
	mappingId = list()
	twitterNameId = dict()
	twitterIdName = dict()
	for m in mapping:
		twitterUrl = m[1]
		twitterName = twitterUrl.split("/")[-1].strip()
		googleId = m[0]
		# URL ends with "/": username is the second-to-last component.
		if twitterName == "":
			twitterName = twitterUrl.split("/")[-2]
		# Drop fragment/bare-domain links that are not real usernames.
		if twitterName == "#%21" or "twitter.com" in twitterName or "twitter" == twitterName:
			continue

		# Resolve the numeric Twitter id from the crawled profile file.
		# Best-effort: a missing or malformed profile just skips the row
		# (the original used a bare except; narrowed to file/parse errors).
		try:
			location = inputPath+"twitter/profile/"+twitterName
			with open(location, "r") as fi:
				jresult = json.loads(fi.read())
		except (OSError, ValueError):
			continue
		twitterId = jresult.get("id_str", 0)
		if twitterId != 0:
			mappingId.append([googleId, twitterId])
			twitterNameId[twitterName] = twitterId
			twitterIdName[twitterId] = twitterName
	ut.writeList2CommaLine(interPath, "gt", mappingId)
	ut.writeDict2Json(interPath, twitterNameIdFileName, twitterNameId)
	ut.writeDict2Json(interPath, twitterIdNameFileName, twitterIdName)
コード例 #10
0
def getGroundTruth():
    """Build the Google+/Twitter ground-truth mapping from the raw file.

    Extracts the Twitter username from each mapping row's URL, looks up
    the crawled Twitter profile on disk to resolve the numeric id, and
    writes the (googleId, twitterId) pairs plus name<->id lookup tables.
    Rows whose profile file is missing or unparsable are skipped.
    """
    mapping = ut.readCommaLine2List(inputPath, mappingFileName)
    mappingId = list()
    twitterNameId = dict()
    twitterIdName = dict()
    for m in mapping:
        twitterUrl = m[1]
        twitterName = twitterUrl.split("/")[-1].strip()
        googleId = m[0]
        # URL ends with "/": username is the second-to-last component.
        if twitterName == "":
            twitterName = twitterUrl.split("/")[-2]
        # Drop fragment/bare-domain links that are not real usernames.
        if twitterName == "#%21" or "twitter.com" in twitterName or "twitter" == twitterName:
            continue

        # Resolve the numeric Twitter id from the crawled profile file.
        # Best-effort: a missing or malformed profile just skips the row
        # (the original used a bare except; narrowed to file/parse errors).
        try:
            location = inputPath + "twitter/profile/" + twitterName
            with open(location, "r") as fi:
                jresult = json.loads(fi.read())
        except (OSError, ValueError):
            continue
        twitterId = jresult.get("id_str", 0)
        if twitterId != 0:
            mappingId.append([googleId, twitterId])
            twitterNameId[twitterName] = twitterId
            twitterIdName[twitterId] = twitterName
    ut.writeList2CommaLine(interPath, "gt", mappingId)
    ut.writeDict2Json(interPath, twitterNameIdFileName, twitterNameId)
    ut.writeDict2Json(interPath, twitterIdNameFileName, twitterIdName)
コード例 #11
0
ファイル: preprocess.py プロジェクト: sychen1121/sn_crawler
def writeStatWalls(usersTf1, usersTf2, usersLangDistri1, usersLangDistri2,
                   usersSentimentScore1, usersSentimentScore2,
                   usersTopicDistri1, usersTopicDistri2):
    """Build the cross-network idf table and write per-user text statistics.

    Document frequency counts how many user walls (documents) on either
    network contain each term; idf is log(n / df).
    """
    # Document frequency over both networks' user walls (deduplicated loop).
    idf = dict()
    for usersTf in (usersTf1, usersTf2):
        for tf in usersTf.values():
            for term in tf:
                idf[term] = idf.get(term, 0) + 1
    # NOTE(review): total document count assumes both networks have the
    # same number of users — confirm len(usersTf1) == len(usersTf2).
    n = len(usersTf1) * 2
    for term, df in idf.items():
        idf[term] = math.log(n / df)
    # Persist the idf table and the sorted vocabulary.
    ut.writeDict2Json(interPath, "idf.json", idf)
    ut.writeList2Json(interPath, "dictionary.txt", sorted(idf.keys()))
    # Write unit tf-idf vectors and stats per user for both networks.
    writeTextStat(usersTf1, usersLangDistri1, idf, sn1, usersSentimentScore1,
                  usersTopicDistri1)
    writeTextStat(usersTf2, usersLangDistri2, idf, sn2, usersSentimentScore2,
                  usersTopicDistri2)
コード例 #12
0
ファイル: preprocess.py プロジェクト: imsorry1121/sn_crawler
def writeTextStat(usersTf, usersLangDistri, idf, sn, usersSentimentScore, usersTopicDistri):
	"""Convert per-user term frequencies to unit tf-idf vectors and write
	one JSON stat record per user under interPath/<sn>/text.

	Mutates usersTf in place: raw frequencies are replaced by normalised
	tf-idf weights. Each record holds the top-5 tf and tf-idf terms, the
	full tf-idf vector, the language distribution and dominant language,
	the sentiment score, and the topic distribution.
	"""
	for user, tf in usersTf.items():
		result = dict()
		# Top-5 raw-frequency terms (captured before tf-idf weighting).
		result["tf_top5"] = sorted(tf.items(), key=operator.itemgetter(1), reverse=True)[:5]
		# Weight each term by idf and accumulate the squared L2 norm.
		norm = 0.0
		for term, fre in tf.items():
			usersTf[user][term] = fre * idf[term]
			norm += math.pow(usersTf[user][term], 2)
		# Scale to a unit vector; skip when every weight is zero to
		# avoid a ZeroDivisionError on empty/all-zero walls.
		norm = math.sqrt(norm)
		if norm > 0:
			for term in tf.keys():
				usersTf[user][term] = usersTf[user][term]/norm
		result["tfidf_top5"] = sorted(usersTf[user].items(), key=operator.itemgetter(1), reverse=True)[:5]
		result["tfidf"] = usersTf[user]
		result["lang_distri"] = usersLangDistri[user]
		# Dominant language, or "none" when no language data exists.
		if len(result["lang_distri"])>0:
			result["lang"] = max(usersLangDistri[user].items(), key=operator.itemgetter(1))[0]
		else:
			result["lang"] = "none"
		result["sentiment"] = usersSentimentScore[user]
		result["topic_distri"] = usersTopicDistri[user]
		ut.writeDict2Json(interPath+sn+"/text",user,result)