Beispiel #1
0
def calculate(givenWord, loop, first, second):
    """Compute and store missing similarities between ``givenWord`` and tweet words.

    For every tweet id in ``tweetIdList[first:second]`` each word of that
    tweet (looked up in ``tweetId_words``) is stemmed and resolved to an id.
    When ``cooccurrences.main`` reports a value greater than zero for the
    pair and ``getWordsSimilarity.main`` returns ``None``, the similarity is
    computed with ``countSim`` from the module-level ``vector1`` and the
    JSON distribution file of the other word, then saved through
    ``storeWordsSimilarity.main``.

    Args:
        givenWord: Reference word; stemmed and resolved to an id once.
        loop: Not used by this function (kept for the caller's signature).
        first: Start index (inclusive) into ``tweetIdList``.
        second: End index (exclusive) into ``tweetIdList``.

    Returns:
        None.
    """
    distributionDir = "/data/CikmTwitterProject/Paper/SearchTweets/WordsDistribution#5/"
    wordId1 = getId.main(stemming.main(givenWord))
    for tweetId in tweetIdList[first:second]:
        for word in tweetId_words[tweetId]:
            wordId2 = getId.main(stemming.main(word))
            co = cooccurrences.main(wordId1, wordId2)
            if not co > 0:
                continue
            # Only pairs without a stored similarity need to be computed.
            if getWordsSimilarity.main(wordId1, wordId2) is not None:
                continue
            # Context manager so the distribution file is closed right away
            # instead of leaking one handle per computed pair.
            with open(distributionDir + str(wordId2)) as handle:
                vector2 = json.load(handle)
            similarityWord = countSim(vector1, vector2)
            storeWordsSimilarity.main(wordId1, wordId2, similarityWord)
Beispiel #2
0
def main(word):
    """Append the highest-valued distribution entries of ``word`` to a file.

    The distribution list of the word (``getDistribution.main``) is ranked
    by value in descending order.  Up to ``n`` entries (``n`` is a
    module-level name defined outside this block) are appended to
    ``topN/<word>_Top<n>`` as ``<position>,<value>`` lines, where
    ``position`` is the 1-based index of the value in the distribution
    list.  Writing stops at the first entry whose value equals zero.

    Entries with equal values are emitted in ascending position order.

    Args:
        word: Word whose distribution is ranked; also used in the file name.

    Returns:
        None.
    """
    print("Start: %s" % (word,))
    wordId = getId.main(word)
    if wordId is not None:
        distributionList = getDistribution.main(wordId)
    else:
        distributionList = []

    # Stable sort on the value alone: ties keep their list order, which
    # replaces the original quadratic "find key by value, then pop" lookup.
    ranked = sorted(enumerate(distributionList, 1),
                    key=lambda entry: entry[1], reverse=True)

    # The file is opened in append mode even when there is nothing to write,
    # exactly as before; the context manager guarantees it gets closed.
    with open("topN/" + word + "_Top" + str(n), "a+") as fd2:
        # Slicing (instead of indexing 0..n-1) avoids the IndexError the
        # original raised when fewer than n values were available.
        for position, value in ranked[:max(n, 0)]:
            if float(value) == float(0):
                break
            fd2.write(str(position) + "," + str(value) + "\n")
Beispiel #3
0
def main(givenWord):
    """Load the tweet words for ``givenWord`` and run ``calculate`` on all tweets.

    Fills three module-level globals that ``calculate`` reads:
    ``tweetId_words`` (tweet id -> list of words), ``tweetIdList`` (sorted
    tweet ids) and ``vector1`` (JSON distribution of the given word).

    Args:
        givenWord: Word to process.  It is also interpolated into the SQL
            text as a table name, so it must come from a trusted source.

    Returns:
        None.
    """
    global tweetIdList
    global tweetId_words
    global vector1

    # Word ids filtered out of the result in addition to the given word.
    # Original author's note: "removed words itself and love, like, day, lol,
    # today, tomorrow, time, tonight, thing, ready, found, free"
    # (the id-to-word mapping is not visible here -- TODO confirm).
    excludedWordIds = (1804507, 1040690, 1111170, 991563, 13304, 3368935,
                       2113819, 1990840, 2977454, 3489500, 1326944, 419686)

    # NOTE(review): host and credentials are hard-coded; consider moving them
    # to configuration.
    db_mysql = MySQLdb.connect('141.117.3.92','lunafeng','luna222','CikmTwitterDataSet')
    try:
        wordId = getId.main(stemming.main(givenWord))
        tweetId_words = {}
        while db_mysql.open != 1:
            db_mysql.ping()
        cursor = db_mysql.cursor()
        try:
            # Produces exactly the same SQL text as the former one-line literal.
            getTweetWords = (
                "select A.tweetId,w.word from Words w join (select c.tweetId,tw.wordId from "
                + givenWord
                + " c join TweetsWords tw on (c.TweetId = tw.TweetId)) A on (w.id = A.wordId)"
                + " where A.wordId !=" + str(wordId)
                + "".join(" and A.wordId != %d" % excluded
                          for excluded in excludedWordIds)
            )
            cursor.execute(getTweetWords)
            resultsRaw = cursor.fetchall()
        finally:
            cursor.close()
    finally:
        # Close the connection even when the lookup or the query fails.
        db_mysql.close()

    for tweetId, word in resultsRaw:
        tweetId_words.setdefault(tweetId, []).append(word)

    with open("/data/CikmTwitterProject/Paper/SearchTweets/WordsDistribution#5/" + str(wordId)) as handle:
        vector1 = json.load(handle)

    tweetIdList = sorted(tweetId_words)
    print("length: %d" % len(tweetIdList))
    # One-second pause kept from the original; its purpose is not visible here.
    time.sleep(1)
    calculate(givenWord, 0, 0, len(tweetIdList))
def main(word):
    """Write the distribution vector of ``word`` to ``distributionList/<id>.List``.

    Nothing is computed when the target file already exists.  Otherwise all
    words are read from ``TotalOccurrencesAll`` into the module-level
    ``element_list``, the matrix is loaded from
    ``../Import/MatrixCPNoOneNoCommonNoSame_b4`` and ``stationary_dist`` is
    called with a walk number of 5.  The file then receives one line per
    index 1..28498; indices missing from the result become empty lines.

    Args:
        word: Word whose id (``getId.main``) selects the vector and file name.

    Returns:
        None.
    """
    global element_list

    filePath = "distributionList/"
    # NOTE(review): host and credentials are hard-coded; consider moving them
    # to configuration.
    db_mysql = MySQLdb.connect('141.117.3.92','lunafeng','luna222','WordsDisambiguation_b4')
    try:
        Id1 = getId.main(word)
        listPath = filePath + str(Id1) + ".List"
        if os.path.exists(listPath):
            return

        db_mysql.ping(True)
        cursor = db_mysql.cursor()
        try:
            cursor.execute("select Word from TotalOccurrencesAll")
            allWords = cursor.fetchall()
        finally:
            cursor.close()

        element_list = [int(element[0]) for element in allWords]
        with open("../Import/MatrixCPNoOneNoCommonNoSame_b4") as matrixFile:
            matrix_hash = json.load(matrixFile)

        walkNumber = 5
        vector1 = stationary_dist(Id1, matrix_hash, element_list, walkNumber)

        # Hard-coded in the original; presumably the number of word ids
        # -- TODO confirm.
        vectorLength = 28498
        # The context manager flushes and closes the file, so the former
        # per-line flush() is no longer needed.
        with open(listPath, "w+") as fd:
            for index in range(1, vectorLength + 1):
                try:
                    value = vector1[index]
                except Exception:
                    # The container type returned by stationary_dist is not
                    # visible here, so lookup failures stay best-effort; unlike
                    # the former bare except this no longer swallows
                    # KeyboardInterrupt/SystemExit.
                    value = ''
                fd.write(str(value) + "\n")
    finally:
        # Close the connection on every path, including errors.
        db_mysql.close()
Beispiel #5
0
def main(word):
    """Write the distribution vector of ``word`` to ``distributionList/<id>.List``.

    Nothing is computed when the target file already exists.  Otherwise all
    words are read from ``TotalOccurrencesAll`` into the module-level
    ``element_list``, the matrix is loaded from
    ``../Import/MatrixCPNoOneNoCommonNoSame_b4`` and ``stationary_dist`` is
    called with a walk number of 5.  The file then receives one line per
    index 1..28498; indices missing from the result become empty lines.

    Args:
        word: Word whose id (``getId.main``) selects the vector and file name.

    Returns:
        None.
    """
    global element_list

    filePath = "distributionList/"
    # NOTE(review): host and credentials are hard-coded; consider moving them
    # to configuration.
    db_mysql = MySQLdb.connect('141.117.3.92', 'lunafeng', 'luna222',
                               'WordsDisambiguation_b4')
    try:
        Id1 = getId.main(word)
        listPath = filePath + str(Id1) + ".List"
        if os.path.exists(listPath):
            return

        db_mysql.ping(True)
        cursor = db_mysql.cursor()
        try:
            cursor.execute("select Word from TotalOccurrencesAll")
            allWords = cursor.fetchall()
        finally:
            cursor.close()

        element_list = [int(element[0]) for element in allWords]
        with open("../Import/MatrixCPNoOneNoCommonNoSame_b4") as matrixFile:
            matrix_hash = json.load(matrixFile)

        walkNumber = 5
        vector1 = stationary_dist(Id1, matrix_hash, element_list, walkNumber)

        # Hard-coded in the original; presumably the number of word ids
        # -- TODO confirm.
        vectorLength = 28498
        # The context manager flushes and closes the file, so the former
        # per-line flush() is no longer needed.
        with open(listPath, "w+") as fd:
            for index in range(1, vectorLength + 1):
                try:
                    value = vector1[index]
                except Exception:
                    # The container type returned by stationary_dist is not
                    # visible here, so lookup failures stay best-effort; unlike
                    # the former bare except this no longer swallows
                    # KeyboardInterrupt/SystemExit.
                    value = ''
                fd.write(str(value) + "\n")
    finally:
        # Close the connection on every path, including errors.
        db_mysql.close()
Beispiel #6
0
def calculate(givenWord, loop, first, second):
    """Write the average refined similarity per tweet to a result file.

    The file ``Luna<givenWord>RemoveCo0<loop>`` is (re)created and receives
    one ``<tweetId>\\t<average>`` line for every tweet id in
    ``tweetIdList[first:second]``.  The average is the sum of
    ``getWordsRefinedSimilarity.main`` over the tweet's words that co-occur
    with the given word (``cooccurrences.main`` > 0), divided by the total
    number of words of the tweet.

    Args:
        givenWord: Reference word; stemmed and resolved to an id once.
        loop: Suffix of the output file name.
        first: Start index (inclusive) into ``tweetIdList``.
        second: End index (exclusive) into ``tweetIdList``.

    Returns:
        None.
    """
    # Context manager so the result file is closed even if a lookup fails.
    with open("Luna" + givenWord + "RemoveCo0" + str(loop), "w+") as fd:
        wordId1 = getId.main(stemming.main(givenWord))
        for tweetId in tweetIdList[first:second]:
            oneTweetWords = tweetId_words[tweetId]
            similaritySum = 0.0
            for word in oneTweetWords:
                wordId2 = getId.main(stemming.main(word))
                if cooccurrences.main(wordId1, wordId2) > 0:
                    similaritySum += float(
                        getWordsRefinedSimilarity.main(wordId1, wordId2))
            # A tweet with no words scores 0.0 instead of raising
            # ZeroDivisionError.
            if oneTweetWords:
                similarityFinal = similaritySum / len(oneTweetWords)
            else:
                similarityFinal = 0.0
            fd.write(str(tweetId) + "\t" + str(similarityFinal) + "\n")
            # Flush per tweet so partial results are on disk while the run
            # is still in progress.
            fd.flush()
def main(givenWord):
    """Return the entries of the word's filtered top list that lack a distribution file.

    Reads ``filtered/<givenWord>_Top<n>`` (``n`` is a module-level name
    defined outside this block), strips the trailing newline of every line
    and keeps the entries for which ``distributions/<entry>.List`` does not
    exist.

    Args:
        givenWord: Word whose filtered top list is inspected.

    Returns:
        list: The entries (as strings) without a distribution file; empty
        when the top list file does not exist.
    """
    # NOTE(review): the original stored this lookup in a variable that was
    # overwritten before use; the call is kept in case getId.main has side
    # effects -- remove it if it is a pure lookup.
    getId.main(givenWord)

    topPath = "filtered/" + givenWord + "_Top" + str(n)
    if not os.path.exists(topPath):
        return []

    # Context manager: the original never closed this file.
    with open(topPath, "r") as top:
        candidates = [raw.strip("\n") for raw in top.readlines()]

    return [wordId for wordId in candidates
            if not os.path.exists("distributions/" + str(wordId) + ".List")]
def main(givenWord):
    """Fill ``RefinedSimilarity`` for all ``WordsSimilarity`` rows where it is NULL.

    Executes and commits
    ``RefinedSimilarity = (2.62507 - ABS(Log(10, Similarity))) * 10`` for
    every row whose ``RefinedSimilarity`` is NULL.  The statement does not
    depend on ``givenWord``.  The SQL text is printed before it is run.

    Args:
        givenWord: Only passed to the stemming/id lookup; its result is not
            used by the update.

    Returns:
        None.
    """
    # NOTE(review): the lookup result was unused in the original as well; the
    # call is kept in case it has side effects.
    getId.main(stemming.main(givenWord))

    # NOTE(review): host and credentials are hard-coded; consider moving them
    # to configuration.
    db_mysql = MySQLdb.connect('141.117.3.92','lunafeng','luna222','CikmTwitterDataSet')
    try:
        db_mysql.ping()
        cursor = db_mysql.cursor()
        try:
            Max = 2.62507
            update = "UPDATE WordsSimilarity SET RefinedSimilarity=(" + str(Max) + "-ABS(Log(10,Similarity)))*10 WHERE RefinedSimilarity IS NULL"
            print(update)
            cursor.execute(update)
            db_mysql.commit()
        finally:
            cursor.close()
    finally:
        # Close the connection even when execute/commit raises.
        db_mysql.close()
def main(givenWord):
    """Fill ``RefinedSimilarity`` for all ``WordsSimilarity`` rows where it is NULL.

    Executes and commits
    ``RefinedSimilarity = (2.62507 - ABS(Log(10, Similarity))) * 10`` for
    every row whose ``RefinedSimilarity`` is NULL.  The statement does not
    depend on ``givenWord``.  The SQL text is printed before it is run.

    Args:
        givenWord: Only passed to the stemming/id lookup; its result is not
            used by the update.

    Returns:
        None.
    """
    # NOTE(review): the lookup result was unused in the original as well; the
    # call is kept in case it has side effects.
    getId.main(stemming.main(givenWord))

    # NOTE(review): host and credentials are hard-coded; consider moving them
    # to configuration.
    db_mysql = MySQLdb.connect('141.117.3.92', 'lunafeng', 'luna222',
                               'CikmTwitterDataSet')
    try:
        db_mysql.ping()
        cursor = db_mysql.cursor()
        try:
            Max = 2.62507
            update = "UPDATE WordsSimilarity SET RefinedSimilarity=(" + str(
                Max) + "-ABS(Log(10,Similarity)))*10 WHERE RefinedSimilarity IS NULL"
            print(update)
            cursor.execute(update)
            db_mysql.commit()
        finally:
            cursor.close()
    finally:
        # Close the connection even when execute/commit raises.
        db_mysql.close()
Beispiel #10
0
def main(givenWord):
    """Load the tweet words for ``givenWord`` and process them in 8 forked workers.

    Fills three module-level globals that ``calculate`` reads:
    ``tweetId_words`` (tweet id -> list of words), ``tweetIdList`` (sorted
    tweet ids) and ``vector1`` (JSON distribution of the given word).  The
    id list is then split into 8 contiguous slices; each slice is handled
    by ``calculate`` in its own forked child, and the parent waits for all
    children.

    Args:
        givenWord: Word to process.  It is also interpolated into the SQL
            text as a table name, so it must come from a trusted source.

    Returns:
        None.
    """
    global tweetIdList
    global tweetId_words
    global vector1
    import traceback

    wordId = getId.main(stemming.main(givenWord))
    tweetId_words = {}

    # NOTE(review): host and credentials are hard-coded; consider moving them
    # to configuration.
    db_mysql = MySQLdb.connect('141.117.3.92','lunafeng','luna222','CikmTwitterDataSet')
    try:
        db_mysql.ping()
        cursor = db_mysql.cursor()
        try:
            getTweetWords = (
                "select A.tweetId,w.word from Words w join (select c.tweetId,tw.wordId from "
                + givenWord
                + " c join TweetsWords tw on (c.TweetId = tw.TweetId)) A on (w.id = A.wordId)"
                + " where A.wordId !=" + str(wordId)
            )
            cursor.execute(getTweetWords)
            resultsRaw = cursor.fetchall()
        finally:
            cursor.close()
    finally:
        # The connection is closed before forking, also on errors.
        db_mysql.close()

    for tweetId, word in resultsRaw:
        tweetId_words.setdefault(tweetId, []).append(word)

    with open("/data/CikmTwitterProject/Paper/SearchTweets/WordsDistribution#5/" + str(wordId)) as handle:
        vector1 = json.load(handle)

    tweetIdList = sorted(tweetId_words)
    total = len(tweetIdList)
    workerCount = 8

    pids = []
    for loop in range(workerCount):
        pid = os.fork()
        if pid == 0:
            # Child: must never fall back into the parent's loop, so every
            # path ends in os._exit().
            exitCode = 1
            try:
                # Floor division keeps the slice bounds integral regardless
                # of the interpreter's "/" semantics.
                first = loop * total // workerCount
                second = (loop + 1) * total // workerCount
                calculate(givenWord, loop, first, second)
                exitCode = 0
            except Exception:
                traceback.print_exc()
            finally:
                os._exit(exitCode)
        pids.append(pid)

    for pid in pids:
        os.waitpid(pid, 0)
Beispiel #11
0
#!/usr/bin/python
import MySQLdb
import getId, stemming


def main(word1, word2):
    """Return the stored ``Similarity`` of a word pair, or ``None``.

    Looks up the row of ``WordsSimilarity`` with ``Word1 = word1`` and
    ``Word2 = word2``.

    Args:
        word1: Value compared against the ``Word1`` column.
        word2: Value compared against the ``Word2`` column.

    Returns:
        The ``Similarity`` value of the first matching row, or ``None`` when
        there is no such row or the query fails with a database error.
    """
    # NOTE(review): host and credentials are hard-coded; consider moving them
    # to configuration.
    db_mysql = MySQLdb.connect('141.117.3.92', 'lunafeng', 'luna222',
                               'CikmTwitterDataSet')
    try:
        db_mysql.ping()
        cursor = db_mysql.cursor()
        try:
            # Values are passed as parameters so the driver does the quoting.
            cursor.execute(
                "SELECT Similarity FROM WordsSimilarity"
                " WHERE Word1=%s AND Word2=%s",
                (word1, word2))
            row = cursor.fetchone()
        except MySQLdb.Error:
            # Best-effort lookup: a failed query is reported as "no value".
            db_mysql.rollback()
            row = None
        finally:
            cursor.close()
    finally:
        db_mysql.close()

    # fetchone() yields None when no row matched.
    if row is None:
        return None
    return row[0]


# Runs at import time: resolve the ids of the stemmed words "popcorn" and
# "microwave" and print their stored similarity.
words1 = getId.main(stemming.main("popcorn"))
words2 = getId.main(stemming.main("microwave"))
print(main(words1, words2))
#!/usr/bin/python
import MySQLdb
import getId,stemming

def main(word1, word2):
    """Return the stored ``Similarity`` of a word pair, or ``None``.

    Looks up the row of ``WordsSimilarity`` with ``Word1 = word1`` and
    ``Word2 = word2``.

    Args:
        word1: Value compared against the ``Word1`` column.
        word2: Value compared against the ``Word2`` column.

    Returns:
        The ``Similarity`` value of the first matching row, or ``None`` when
        there is no such row or the query fails with a database error.
    """
    # NOTE(review): host and credentials are hard-coded; consider moving them
    # to configuration.
    db_mysql = MySQLdb.connect('141.117.3.92','lunafeng','luna222','CikmTwitterDataSet')
    try:
        db_mysql.ping()
        cursor = db_mysql.cursor()
        try:
            # Values are passed as parameters so the driver does the quoting.
            cursor.execute(
                "SELECT Similarity FROM WordsSimilarity"
                " WHERE Word1=%s AND Word2=%s",
                (word1, word2))
            row = cursor.fetchone()
        except MySQLdb.Error:
            # Best-effort lookup: a failed query is reported as "no value".
            db_mysql.rollback()
            row = None
        finally:
            cursor.close()
    finally:
        db_mysql.close()

    # fetchone() yields None when no row matched.
    if row is None:
        return None
    return row[0]


# Runs at import time: resolve the ids of the stemmed words "popcorn" and
# "microwave" and print their stored similarity.
words1 = getId.main(stemming.main("popcorn"))
words2 = getId.main(stemming.main("microwave"))
print(main(words1, words2))