def calculate(givenWord, loop, first, second):
    """Compute and store missing similarities between ``givenWord`` and tweet words.

    Walks the tweet ids in ``tweetIdList[first:second]`` and, for every word of
    each tweet (looked up in ``tweetId_words``), stems the word and resolves it
    to an id. When the pair co-occurs (``cooccurrences.main`` > 0) and
    ``getWordsSimilarity.main`` returns ``None``, the similarity is computed
    with ``countSim`` from ``vector1`` and the other word's distribution file,
    then saved through ``storeWordsSimilarity.main``.

    ``tweetIdList``, ``tweetId_words`` and ``vector1`` are read as module-level
    globals and must be set before this function is called.

    Args:
        givenWord: The reference word; it is stemmed before its id is looked up.
        loop: Not used by this function; kept so the signature stays unchanged.
        first: Start index (inclusive) of the slice of ``tweetIdList``.
        second: End index (exclusive) of the slice of ``tweetIdList``.
    """
    distribution_dir = "/data/CikmTwitterProject/Paper/SearchTweets/WordsDistribution#5/"
    wordId1 = getId.main(stemming.main(givenWord))

    for tweetId in tweetIdList[first:second]:
        for word in tweetId_words[tweetId]:
            wordId2 = getId.main(stemming.main(word))

            # Only co-occurring pairs are of interest.
            if not cooccurrences.main(wordId1, wordId2) > 0:
                continue
            # A similarity that is already stored is not recomputed.
            if getWordsSimilarity.main(wordId1, wordId2) is not None:
                continue

            # Close the distribution file right away: this runs once per
            # missing pair and would otherwise leak one descriptor each time.
            with open(distribution_dir + str(wordId2)) as distribution_file:
                vector2 = json.load(distribution_file)

            similarityWord = countSim(vector1, vector2)
            storeWordsSimilarity.main(wordId1, wordId2, similarityWord)
def main(word):
    """Append the top-``n`` non-zero distribution entries of ``word`` to a file.

    The distribution returned by ``getDistribution.main`` is numbered from 1,
    ranked by value in descending order, and at most ``n`` (module-level)
    entries are appended to ``topN/<word>_Top<n>`` as ``position,value`` lines.
    Writing stops at the first entry whose value equals zero. Entries with
    equal values keep ascending position order because the sort is stable.

    Args:
        word: Word to look up. When ``getId.main`` returns ``None`` for it, the
            output file is still opened (and created if missing) but nothing
            is written.
    """
    print("Start: %s" % (word,))

    wordId = getId.main(word)
    distributionList = getDistribution.main(wordId) if wordId is not None else []

    # Pair every value with its 1-based position, then rank by raw value.
    # NOTE(review): values are compared as returned by getDistribution; if they
    # are strings rather than numbers the ranking is lexicographic - confirm.
    ranked = sorted(
        enumerate(distributionList, 1),
        key=lambda pair: pair[1],
        reverse=True,
    )

    with open("topN/" + word + "_Top" + str(n), "a+") as fd2:
        # Slicing copes with distributions shorter than n entries.
        for position, value in ranked[:n]:
            if float(value) == 0:
                break
            fd2.write(str(position) + "," + str(value) + "\n")
def main(givenWord):
    """Load the words of every tweet for ``givenWord`` and run ``calculate`` on all of them.

    Queries the ``CikmTwitterDataSet`` database for ``(tweetId, word)`` rows
    from the table named ``givenWord`` joined with ``TweetsWords`` and
    ``Words``, leaving out the given word's own id and a fixed list of word
    ids. The rows are grouped per tweet into the global ``tweetId_words``, the
    given word's distribution is loaded into the global ``vector1``, the sorted
    tweet ids are stored in the global ``tweetIdList`` and ``calculate`` is
    called once over the whole list.

    Args:
        givenWord: Word to process. It is also used verbatim as the table name
            in the SQL statement, so it must be a trusted identifier.
    """
    global tweetIdList
    global tweetId_words
    global vector1

    # Word ids filtered out besides the given word itself. According to the
    # original comment these are: love, like, day, lol, today, tomorrow, time,
    # tonight, thing, ready, found, free (presumably in this order - verify).
    excluded_word_ids = (
        1804507, 1040690, 1111170, 991563, 13304, 3368935,
        2113819, 1990840, 2977454, 3489500, 1326944, 419686,
    )

    tweetId_words = {}
    wordId = getId.main(stemming.main(givenWord))

    getTweetWords = (
        "select A.tweetId,w.word from Words w join (select c.tweetId,tw.wordId from "
        + givenWord
        + " c join TweetsWords tw on (c.TweetId = tw.TweetId)) A on (w.id = A.wordId)"
        + " where A.wordId !=" + str(wordId)
        + "".join(" and A.wordId != " + str(word_id) for word_id in excluded_word_ids)
    )

    # NOTE(review): credentials are hard-coded; consider moving them to configuration.
    db_mysql = MySQLdb.connect('141.117.3.92', 'lunafeng', 'luna222', 'CikmTwitterDataSet')
    try:
        while db_mysql.open != 1:
            db_mysql.ping()
        cursor = db_mysql.cursor()
        try:
            cursor.execute(getTweetWords)
            resultsRaw = cursor.fetchall()
        finally:
            cursor.close()
    finally:
        # Release the connection even when the query fails.
        db_mysql.close()

    for tweetId, word in resultsRaw:
        tweetId_words.setdefault(tweetId, []).append(word)

    distribution_path = (
        "/data/CikmTwitterProject/Paper/SearchTweets/WordsDistribution#5/" + str(wordId)
    )
    with open(distribution_path) as distribution_file:
        vector1 = json.load(distribution_file)

    tweetIdList = sorted(tweetId_words)
    print("length: %s" % (len(tweetIdList),))
    # Pause kept from the original implementation; its purpose is not evident here.
    time.sleep(1)

    calculate(givenWord, 0, 0, len(tweetIdList))
def main(word):
    """Write ``distributionList/<id>.List`` for ``word`` unless it already exists.

    When the file is missing, all entries of ``TotalOccurrencesAll.Word`` are
    read into the global ``element_list``, the matrix stored in
    ``../Import/MatrixCPNoOneNoCommonNoSame_b4`` is loaded, and
    ``stationary_dist`` is evaluated with a walk number of 5. The result is
    written one value per line for the indices 1..28498; an index missing from
    the result produces an empty line.

    Args:
        word: Word whose id is resolved through ``getId.main``.
    """
    global element_list

    filePath = "distributionList/"
    # Number of lines written. Presumably the vocabulary size - TODO confirm.
    vector_length = 28498

    # NOTE(review): credentials are hard-coded; consider moving them to configuration.
    db_mysql = MySQLdb.connect('141.117.3.92', 'lunafeng', 'luna222', 'WordsDisambiguation_b4')
    try:
        Id1 = getId.main(word)
        target = filePath + str(Id1) + ".List"
        if os.path.exists(target):
            return

        db_mysql.ping(True)
        cursor = db_mysql.cursor()
        try:
            cursor.execute("select Word from TotalOccurrencesAll")
            allWords = cursor.fetchall()
        finally:
            cursor.close()

        element_list = [int(element[0]) for element in allWords]

        with open("../Import/MatrixCPNoOneNoCommonNoSame_b4") as matrix_file:
            matrix_hash = json.load(matrix_file)

        walkNumber = 5
        vector1 = stationary_dist(Id1, matrix_hash, element_list, walkNumber)

        with open(target, "w+") as fd:
            for index in range(1, vector_length + 1):
                try:
                    value = vector1[index]
                except LookupError:
                    # Index absent from the result (KeyError or IndexError).
                    value = ''
                fd.write(str(value) + "\n")
    finally:
        # Close the connection on every path, including the early return.
        db_mysql.close()
def main(word):
    """Write ``distributionList/<id>.List`` for ``word`` unless it already exists.

    When the file is missing, all entries of ``TotalOccurrencesAll.Word`` are
    read into the global ``element_list``, the matrix stored in
    ``../Import/MatrixCPNoOneNoCommonNoSame_b4`` is loaded, and
    ``stationary_dist`` is evaluated with a walk number of 5. The result is
    written one value per line for the indices 1..28498; an index missing from
    the result produces an empty line.

    Args:
        word: Word whose id is resolved through ``getId.main``.
    """
    global element_list

    filePath = "distributionList/"
    # Number of lines written. Presumably the vocabulary size - TODO confirm.
    vector_length = 28498

    # NOTE(review): credentials are hard-coded; consider moving them to configuration.
    db_mysql = MySQLdb.connect('141.117.3.92', 'lunafeng', 'luna222',
                               'WordsDisambiguation_b4')
    try:
        Id1 = getId.main(word)
        target = filePath + str(Id1) + ".List"
        if os.path.exists(target):
            return

        db_mysql.ping(True)
        cursor = db_mysql.cursor()
        try:
            cursor.execute("select Word from TotalOccurrencesAll")
            allWords = cursor.fetchall()
        finally:
            cursor.close()

        element_list = [int(element[0]) for element in allWords]

        with open("../Import/MatrixCPNoOneNoCommonNoSame_b4") as matrix_file:
            matrix_hash = json.load(matrix_file)

        walkNumber = 5
        vector1 = stationary_dist(Id1, matrix_hash, element_list, walkNumber)

        with open(target, "w+") as fd:
            for index in range(1, vector_length + 1):
                try:
                    value = vector1[index]
                except LookupError:
                    # Index absent from the result (KeyError or IndexError).
                    value = ''
                fd.write(str(value) + "\n")
    finally:
        # Close the connection on every path, including the early return.
        db_mysql.close()
def calculate(givenWord, loop, first, second):
    """Write the averaged refined similarity of each tweet to a result file.

    For every tweet id in ``tweetIdList[first:second]`` the refined
    similarities (``getWordsRefinedSimilarity.main``) between ``givenWord`` and
    each co-occurring word of the tweet are summed and divided by the number of
    words in the tweet. One ``<tweetId>\\t<score>`` line per tweet is written to
    ``Luna<givenWord>RemoveCo0<loop>``.

    ``tweetIdList`` and ``tweetId_words`` are read as module-level globals.

    Args:
        givenWord: Reference word; it is stemmed before its id is looked up.
        loop: Suffix of the output file name.
        first: Start index (inclusive) of the slice of ``tweetIdList``.
        second: End index (exclusive) of the slice of ``tweetIdList``.
    """
    with open("Luna" + givenWord + "RemoveCo0" + str(loop), "w+") as fd:
        wordId1 = getId.main(stemming.main(givenWord))

        for tweetId in tweetIdList[first:second]:
            oneTweetWords = tweetId_words[tweetId]

            similarityFinal = 0
            for word in oneTweetWords:
                wordId2 = getId.main(stemming.main(word))
                if cooccurrences.main(wordId1, wordId2) > 0:
                    similarityFinal += float(
                        getWordsRefinedSimilarity.main(wordId1, wordId2))

            length = len(oneTweetWords)
            # A tweet without words scores 0.0 instead of dividing by zero.
            average = float(similarityFinal) / length if length else 0.0

            fd.write(str(tweetId) + "\t" + str(average) + "\n")
            # Flush per tweet so finished rows reach disk during long runs.
            fd.flush()
def main(givenWord):
    """Return the entries of the filtered top-``n`` file that lack a distribution file.

    Reads ``filtered/<givenWord>_Top<n>`` (``n`` is a module-level value) line
    by line, strips the trailing newline from each line and keeps the entries
    for which ``distributions/<entry>.List`` does not exist.

    The original implementation also resolved ``givenWord`` through
    ``getId.main`` but overwrote the result before using it; that dead lookup
    has been removed.

    Args:
        givenWord: Word whose filtered top-``n`` file is inspected.

    Returns:
        A list of strings in file order; empty when the filtered file is missing.
    """
    top_path = "filtered/" + givenWord + "_Top" + str(n)
    if not os.path.exists(top_path):
        return []

    with open(top_path, "r") as top:
        candidates = [raw.strip("\n") for raw in top]

    return [
        wordId for wordId in candidates
        if not os.path.exists("distributions/" + str(wordId) + ".List")
    ]
def main(givenWord):
    """Fill in ``RefinedSimilarity`` for every row where it is still NULL.

    Runs one UPDATE on ``WordsSimilarity`` that sets ``RefinedSimilarity`` to
    ``(Max - ABS(Log(10, Similarity))) * 10`` with ``Max = 2.62507`` and
    commits it. The statement is printed before it is executed.

    Args:
        givenWord: Not used by the statement; kept so existing callers keep
            working. The original resolved it to a word id that was never
            read, so that lookup has been removed.
    """
    Max = 2.62507
    update = (
        "UPDATE WordsSimilarity SET RefinedSimilarity=(" + str(Max)
        + "-ABS(Log(10,Similarity)))*10 WHERE RefinedSimilarity IS NULL"
    )

    # NOTE(review): credentials are hard-coded; consider moving them to configuration.
    db_mysql = MySQLdb.connect('141.117.3.92', 'lunafeng', 'luna222', 'CikmTwitterDataSet')
    try:
        db_mysql.ping()
        cursor = db_mysql.cursor()
        try:
            print(update)
            cursor.execute(update)
            db_mysql.commit()
        finally:
            cursor.close()
    finally:
        # Release the connection even when the UPDATE fails.
        db_mysql.close()
def main(givenWord):
    """Fill in ``RefinedSimilarity`` for every row where it is still NULL.

    Runs one UPDATE on ``WordsSimilarity`` that sets ``RefinedSimilarity`` to
    ``(Max - ABS(Log(10, Similarity))) * 10`` with ``Max = 2.62507`` and
    commits it. The statement is printed before it is executed.

    Args:
        givenWord: Not used by the statement; kept so existing callers keep
            working. The original resolved it to a word id that was never
            read, so that lookup has been removed.
    """
    Max = 2.62507
    update = (
        "UPDATE WordsSimilarity SET RefinedSimilarity=(" + str(Max)
        + "-ABS(Log(10,Similarity)))*10 WHERE RefinedSimilarity IS NULL"
    )

    # NOTE(review): credentials are hard-coded; consider moving them to configuration.
    db_mysql = MySQLdb.connect('141.117.3.92', 'lunafeng', 'luna222',
                               'CikmTwitterDataSet')
    try:
        db_mysql.ping()
        cursor = db_mysql.cursor()
        try:
            print(update)
            cursor.execute(update)
            db_mysql.commit()
        finally:
            cursor.close()
    finally:
        # Release the connection even when the UPDATE fails.
        db_mysql.close()
def main(givenWord):
    """Load the words of every tweet for ``givenWord`` and process them in 8 forked workers.

    Queries the ``CikmTwitterDataSet`` database for ``(tweetId, word)`` rows
    from the table named ``givenWord`` joined with ``TweetsWords`` and
    ``Words``, leaving out the given word's own id. The rows are grouped per
    tweet into the global ``tweetId_words``, the given word's distribution is
    loaded into the global ``vector1`` and the sorted tweet ids are stored in
    the global ``tweetIdList``. The id list is then split into 8 contiguous
    slices; each slice is handed to ``calculate`` in its own forked child and
    the parent waits for all children.

    Args:
        givenWord: Word to process. It is also used verbatim as the table name
            in the SQL statement, so it must be a trusted identifier.
    """
    import traceback

    global tweetIdList
    global tweetId_words
    global vector1

    worker_count = 8

    wordId = getId.main(stemming.main(givenWord))
    tweetId_words = {}

    getTweetWords = (
        "select A.tweetId,w.word from Words w join (select c.tweetId,tw.wordId from "
        + givenWord
        + " c join TweetsWords tw on (c.TweetId = tw.TweetId)) A on (w.id = A.wordId)"
        + " where A.wordId !=" + str(wordId)
    )

    # NOTE(review): credentials are hard-coded; consider moving them to configuration.
    db_mysql = MySQLdb.connect('141.117.3.92', 'lunafeng', 'luna222', 'CikmTwitterDataSet')
    try:
        db_mysql.ping()
        cursor = db_mysql.cursor()
        try:
            cursor.execute(getTweetWords)
            resultsRaw = cursor.fetchall()
        finally:
            cursor.close()
    finally:
        # Release the connection even when the query fails.
        db_mysql.close()

    for tweetId, word in resultsRaw:
        tweetId_words.setdefault(tweetId, []).append(word)

    distribution_path = (
        "/data/CikmTwitterProject/Paper/SearchTweets/WordsDistribution#5/" + str(wordId)
    )
    with open(distribution_path) as distribution_file:
        vector1 = json.load(distribution_file)

    tweetIdList = sorted(tweetId_words)
    total = len(tweetIdList)

    pids = []
    for loop in range(worker_count):
        pid = os.fork()
        if pid == 0:
            # Child: always leave through os._exit so a failure in calculate()
            # cannot fall through into the parent's code path.
            exit_status = 1
            try:
                # Floor division keeps the bounds integral on Python 2 and 3.
                first = loop * total // worker_count
                second = (loop + 1) * total // worker_count
                calculate(givenWord, loop, first, second)
                exit_status = 0
            except Exception:
                traceback.print_exc()
            finally:
                os._exit(exit_status)
        pids.append(pid)

    for pid in pids:
        os.waitpid(pid, 0)
#!/usr/bin/python
import MySQLdb
import getId
import stemming


def main(word1, word2):
    """Return the stored similarity of a word-id pair, or ``None``.

    Selects ``Similarity`` from ``WordsSimilarity`` for the row matching
    ``Word1`` and ``Word2``.

    Args:
        word1: Value matched against the ``Word1`` column.
        word2: Value matched against the ``Word2`` column.

    Returns:
        The first column of the fetched row, or ``None`` when no row matches
        or the query raises a ``MySQLdb.Error``.
    """
    # The driver substitutes the values, so no SQL text is built by hand.
    sql = "SELECT Similarity FROM WordsSimilarity WHERE Word1=%s AND Word2=%s"

    # NOTE(review): credentials are hard-coded; consider moving them to configuration.
    db_mysql = MySQLdb.connect('141.117.3.92', 'lunafeng', 'luna222',
                               'CikmTwitterDataSet')
    try:
        db_mysql.ping()
        cursor = db_mysql.cursor()
        try:
            cursor.execute(sql, (word1, word2))
            row = cursor.fetchone()
        except MySQLdb.Error:
            db_mysql.rollback()
            row = None
        finally:
            cursor.close()
    finally:
        db_mysql.close()

    return row[0] if row else None


# Sample lookup, run only when the file is executed as a script so that
# importing the module no longer triggers a database query and a print.
if __name__ == "__main__":
    words1 = getId.main(stemming.main("popcorn"))
    words2 = getId.main(stemming.main("microwave"))
    print(main(words1, words2))
#!/usr/bin/python
import MySQLdb
import getId
import stemming


def main(word1, word2):
    """Return the stored similarity of a word-id pair, or ``None``.

    Selects ``Similarity`` from ``WordsSimilarity`` for the row matching
    ``Word1`` and ``Word2``.

    Args:
        word1: Value matched against the ``Word1`` column.
        word2: Value matched against the ``Word2`` column.

    Returns:
        The first column of the fetched row, or ``None`` when no row matches
        or the query raises a ``MySQLdb.Error``.
    """
    # The driver substitutes the values, so no SQL text is built by hand.
    sql = "SELECT Similarity FROM WordsSimilarity WHERE Word1=%s AND Word2=%s"

    # NOTE(review): credentials are hard-coded; consider moving them to configuration.
    db_mysql = MySQLdb.connect('141.117.3.92', 'lunafeng', 'luna222', 'CikmTwitterDataSet')
    try:
        db_mysql.ping()
        cursor = db_mysql.cursor()
        try:
            cursor.execute(sql, (word1, word2))
            row = cursor.fetchone()
        except MySQLdb.Error:
            db_mysql.rollback()
            row = None
        finally:
            cursor.close()
    finally:
        db_mysql.close()

    return row[0] if row else None


# Sample lookup, run only when the file is executed as a script so that
# importing the module no longer triggers a database query and a print.
if __name__ == "__main__":
    words1 = getId.main(stemming.main("popcorn"))
    words2 = getId.main(stemming.main("microwave"))
    print(main(words1, words2))