def synsetFrequency(freqEntry):
    """Re-key ``freqEntry['wordlist']`` by synset name and report the mapping used.

    For every word in the entry, the synset stored in
    ``dbsoesvm.wordSynsetMap`` for the entry's category is used when one
    exists.  Otherwise the candidates returned by
    ``chooseSimKSynsets(word, 3, ...)`` are scored by summing, over each
    candidate's lemmas, the values found in the category's ``wordKfirf``
    word list, and the highest-scoring candidate is chosen.  Frequencies of
    words that resolve to the same synset are added together.

    Synset names are used as keys with '.' replaced by '__' (presumably
    because they end up as MongoDB field names -- confirm).

    Args:
        freqEntry: dict with a 'category' value and a 'wordlist' dict of
            word -> numeric frequency.  Its 'wordlist' is replaced in place.

    Returns:
        A ``(freqEntry, wordToSynsetMap, category)`` tuple, where
        ``wordToSynsetMap`` maps each original word to its dotted synset
        name.

    Raises:
        IndexError: if a word has no stored synset and either no candidate
            synsets are returned or the category has no ``wordKfirf``
            document.
    """
    category = freqEntry['category']
    wordCounts = freqEntry['wordlist']
    newWordlist = {}
    wordToSynsetMap = {}
    # Per-category kfirf word list; fetched at most once, and only if some
    # word actually needs the fallback scoring.
    kfirfWordlist = None

    for word in wordCounts:
        # One round trip instead of a count() followed by a second find().
        mapping = dbsoesvm.wordSynsetMap.find_one(
            {'category': category, 'word': word})
        if mapping is not None:
            synsetName = mapping['synset']
        else:
            candidates = chooseSimKSynsets(
                word, 3, category=ctgryName.get(category, category))
            if kfirfWordlist is None:
                kfirfWordlist = dbsoesvm.wordKfirf.find(
                    {'category': category})[0]['wordlist']
            scores = Counter({
                candidate: sum(kfirfWordlist.get(lemma.name, 0)
                               for lemma in candidate.lemmas)
                for candidate in candidates
            })
            # most_common() yields (synset, score) pairs; take the synset
            # of the best-scoring pair.
            synsetName = scores.most_common()[0][0].name

        key = synsetName.replace('.', '__')
        newWordlist[key] = newWordlist.get(key, 0) + wordCounts[word]
        # Derived from the escaped key, exactly as the mapping was built
        # before, so the reported name round-trips through the escaping.
        wordToSynsetMap[word] = key.replace('__', '.')

    freqEntry['wordlist'] = newWordlist
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print(freqEntry)
    return freqEntry, wordToSynsetMap, category
def frequencySynset(db):
    """Rebuild ``db.synsetFrequency`` from ``db.frequency`` with synset keys.

    The ``synsetFrequency`` collection is dropped, then every document in
    ``db.frequency`` has its 'wordlist' (word -> numeric frequency) re-keyed
    by synset name, with '.' replaced by '__', and is inserted into
    ``db.synsetFrequency``.  Frequencies of words that resolve to the same
    synset are added together.

    A word's synset comes from ``db.wordSynsetMap`` (matched on word and
    category).  Words with no stored synset are reported (``'XXX'`` on
    stdout and a "word category" line in the file ``'XXXX'``) and are
    assigned the best of the candidates from ``chooseSimKSynsets``, scored
    by summing the category's ``wordKfirf`` values over each candidate's
    lemmas.

    Args:
        db: database handle exposing the ``frequency``, ``wordSynsetMap``,
            ``wordKfirf`` and ``synsetFrequency`` collections.

    Raises:
        IndexError: if an unmapped word has no candidate synsets, or its
            category has no ``wordKfirf`` document.
    """
    db.synsetFrequency.drop()
    # The log of unmapped words is closed even if processing fails midway.
    with open('XXXX', 'w') as f:
        for entry in db.frequency.find({}, timeout=False):
            category = entry['category']
            wordCounts = entry['wordlist']
            newWordlist = {}
            # Per-category kfirf word list; fetched lazily, once per entry.
            kfirfWordlist = None

            for word in wordCounts:
                mapping = db.wordSynsetMap.find_one(
                    {'word': word, 'category': category})
                if mapping is not None:
                    synsetName = mapping['synset']
                else:
                    print('XXX')
                    f.write(word + ' ' + category + '\n')
                    # In real test/training runs, words in the test set are
                    # not always in the training set, so a synset has to be
                    # assigned here.
                    # NOTE(review): candidates are treated as Synset objects
                    # (``.lemmas`` / ``.name``), matching synsetFrequency in
                    # this file -- confirm against chooseSimKSynsets.
                    candidates = chooseSimKSynsets(
                        word, 3, category=ctgryName.get(category, category))
                    if kfirfWordlist is None:
                        kfirfWordlist = db.wordKfirf.find(
                            {'category': category})[0]['wordlist']
                    scores = Counter({
                        candidate: sum(kfirfWordlist.get(lemma.name, 0)
                                       for lemma in candidate.lemmas)
                        for candidate in candidates
                    })
                    # most_common() yields (synset, score) pairs; keep only
                    # the synset of the best pair.
                    synsetName = scores.most_common()[0][0].name

                key = synsetName.replace('.', '__')
                newWordlist[key] = newWordlist.get(key, 0) + wordCounts[word]

            entry['wordlist'] = newWordlist
            db.synsetFrequency.insert(entry)