def wordToSynset(db, isInit = False): if isInit: db.wordSynsetMap.drop() else: db.wordSynsetMap.remove({'category':'Travel'}) if isInit: query = {} else: query = {'category':'Travel'} for entry in db.freqbyCtgry.find(query): synsetWordMap = {} for word in entry['wordlist']: for synset in chooseSimKSynsets(word, 3, category = ctgryName.get(entry['category'], entry['category'])): if not synsetWordMap.has_key(synset.name): synsetWordMap[synset.name] = set([word]) else: synsetWordMap[synset.name].add(word) synsetKfirfSumMap = Counter({k:sum(db.wordKfirf.find({'category':entry['category']})[0]['wordlist'][word] for word in synsetWordMap[k]) for k in synsetWordMap}) for pair in synsetKfirfSumMap.most_common(): mostSynset = pair[0] for word in synsetWordMap[mostSynset]: #Actually we can only insert <word synset> never inserted before. if db.wordSynsetMap.find({'word': word, 'synset': mostSynset, 'category': entry['category']}).count() == 0: db.wordSynsetMap.insert({'word': word, 'synset': mostSynset, 'category': entry['category'], 'depth': 100}) """
def wordToSynset():
    # Legacy, argument-less variant that works on a module-level `db`.
    # NOTE(review): it follows an opening triple quote in the original file,
    # so it is presumably disabled -- confirm before relying on it.
    #
    # For every entry of db.freqbyCtgry, words are grouped by the candidate
    # synsets returned by chooseSimKSynsets(word, 3, ...). Synsets are then
    # visited by descending summed kfirfbyCtgry weight; each visited synset
    # claims its remaining words (one wordSynsetMap row per word, depth 100)
    # and those words are removed from every synset not yet visited.
    # todo: should use freqinctgry(only train)
    for entry in db.freqbyCtgry.find():
        wordsBySynset = {}
        for word in entry['wordlist']:
            simCategory = ctgryName.get(entry['category'], entry['category'])
            for candidate in chooseSimKSynsets(word, 3, category=simCategory):
                wordsBySynset.setdefault(candidate.name, set()).add(word)

        # The ranking is computed once, from the word sets as they are now.
        synsetKfirfSumMap = Counter({
            name: sum(
                db.kfirfbyCtgry.find({'category': entry['category']})[0]['wordlist'][word]
                for word in wordsBySynset[name]
            )
            for name in wordsBySynset
        })

        for mostSynset, _ in synsetKfirfSumMap.most_common():
            claimedWords = wordsBySynset.pop(mostSynset)
            for word in claimedWords:
                db.wordSynsetMap.insert({
                    'word': word,
                    'synset': mostSynset,
                    'category': entry['category'],
                    'depth': 100,
                })
            # The word sets shrink as words get assigned, while the ranking
            # in synsetKfirfSumMap is left unchanged.
            for name in wordsBySynset:
                wordsBySynset[name] = wordsBySynset[name] - claimedWords
def frequencySynset():
    # Convert every per-word frequency entry of dbRepo.frequency into a
    # per-synset entry and insert it into dbRepo.synsetFrequency.
    #
    # Only words contained in keeplist are considered. A word with a stored
    # <word, category> row in dbRepo.wordSynsetMap uses that synset. Any other
    # word is logged to the file 'XXXX' and assigned the candidate synset
    # (from chooseSimKSynsets) whose lemmas carry the largest summed
    # kfirfbyCtgry weight for the category. Dots in synset names are replaced
    # by '__' in the resulting wordlist keys.
    #
    # Fixes over the previous revision: the fallback branch referenced an
    # undefined name, passed a synset object to wn.synset() and handed the
    # (synset, score) tuple to re.sub(); the log file was never closed.
    # NOTE(review): assumes the NLTK attribute API (synset.name,
    # synset.lemmas, lemma.name) used by the other functions of this file.
    # NOTE(review): most_common()[0] raises IndexError when
    # chooseSimKSynsets yields no candidate for a word -- confirm that
    # cannot happen.
    with open('XXXX', 'w') as f:
        for entry in dbRepo.frequency.find():
            category = entry['category']
            wordlist = entry['wordlist']
            kfirf = None  # category weights, fetched on the first fallback only
            newWordlist = {}
            for word in wordlist:
                if word not in keeplist:
                    continue
                mapped = dbRepo.wordSynsetMap.find_one(
                    {'word': word, 'category': category})
                if mapped is not None:
                    synsetName = mapped['synset']
                else:
                    f.write(word + ' ' + category + '\n')
                    # Words of the test set are not always in the train set,
                    # so a synset has to be assigned here.
                    if kfirf is None:
                        kfirf = dbRepo.kfirfbyCtgry.find(
                            {'category': category})[0]['wordlist']
                    cnt = Counter({
                        candidate: sum(kfirf.get(lemma.name, 0)
                                       for lemma in candidate.lemmas)
                        for candidate in chooseSimKSynsets(
                            word, 3, category=ctgryName.get(category, category))
                    })
                    synsetName = cnt.most_common()[0][0].name
                key = synsetName.replace('.', '__')
                newWordlist[key] = newWordlist.get(key, 0) + wordlist[word]
            entry['wordlist'] = newWordlist
            dbRepo.synsetFrequency.insert(entry)
def synsetFrequency(freqEntry):
    # Rewrite one frequency entry so that its wordlist is keyed by synset.
    #
    # freqEntry -- dict with a 'category' and a 'wordlist' (word -> count);
    #              its 'wordlist' is replaced in place.
    #
    # Returns (freqEntry, wordToSynsetMap, category). wordToSynsetMap holds
    # only the words that had no row in dbsoesvm.wordSynsetMap and were
    # assigned the candidate synset (from chooseSimKSynsets) whose lemmas
    # carry the largest summed wordKfirf weight. Wordlist keys use '__' in
    # place of '.'; wordToSynsetMap values use the dotted form.
    #
    # The wordKfirf document is now read at most once per call and the
    # wordSynsetMap lookup is a single query per word.
    # NOTE(review): most_common()[0] raises IndexError when
    # chooseSimKSynsets yields no candidate for a word -- confirm that
    # cannot happen.
    category = freqEntry['category']
    wordFreq = freqEntry['wordlist']
    newWordlist = {}
    wordToSynsetMap = {}
    kfirf = None  # category weights, fetched on the first fallback only
    for word in wordFreq:
        mapped = dbsoesvm.wordSynsetMap.find_one(
            {'category': category, 'word': word})
        if mapped is not None:
            synset = mapped['synset'].replace('.', '__')
        else:
            if kfirf is None:
                kfirf = dbsoesvm.wordKfirf.find(
                    {'category': category})[0]['wordlist']
            cnt = Counter({
                candidate: sum(kfirf.get(lemma.name, 0)
                               for lemma in candidate.lemmas)
                for candidate in chooseSimKSynsets(
                    word, 3, category=ctgryName.get(category, category))
            })
            synset = cnt.most_common()[0][0].name.replace('.', '__')
            # Same round trip as before, so the reported name is unchanged.
            wordToSynsetMap[word] = synset.replace('__', '.')
        newWordlist[synset] = newWordlist.get(synset, 0) + wordFreq[word]
    freqEntry['wordlist'] = newWordlist
    # Parenthesised single argument: same output under the Python 2 print
    # statement and the Python 3 function.
    print(freqEntry)
    return freqEntry, wordToSynsetMap, category
def frequencySynset(db):
    # Rebuild db.synsetFrequency from db.frequency.
    #
    # db -- database handle exposing the synsetFrequency, frequency,
    #       wordSynsetMap and wordKfirf collections (presumably pymongo).
    #
    # synsetFrequency is dropped first. Each frequency entry then has its
    # wordlist re-keyed by synset (dots replaced by '__') and is inserted
    # into synsetFrequency. A word with a stored <word, category> row in
    # wordSynsetMap uses that synset. Any other word prints 'XXX', is logged
    # to the file 'XXXX' and is assigned the candidate synset (from
    # chooseSimKSynsets) whose lemmas carry the largest summed wordKfirf
    # weight for the category.
    #
    # Fixes over the previous revision: the fallback branch referenced an
    # undefined name, passed a synset object to wn.synset() and handed the
    # (synset, score) tuple to re.sub(); the log file was never closed.
    # NOTE(review): assumes the NLTK attribute API (synset.name,
    # synset.lemmas, lemma.name) used by the other functions of this file.
    # NOTE(review): most_common()[0] raises IndexError when
    # chooseSimKSynsets yields no candidate for a word -- confirm that
    # cannot happen.
    db.synsetFrequency.drop()
    query = {}
    with open('XXXX', 'w') as f:
        for entry in db.frequency.find(query, timeout=False):
            category = entry['category']
            wordlist = entry['wordlist']
            kfirf = None  # category weights, fetched on the first fallback only
            newWordlist = {}
            for word in wordlist:
                mapped = db.wordSynsetMap.find_one(
                    {'word': word, 'category': category})
                if mapped is not None:
                    synsetName = mapped['synset']
                else:
                    print('XXX')
                    f.write(word + ' ' + category + '\n')
                    # Words of the test set are not always in the train set,
                    # so a synset has to be assigned here.
                    if kfirf is None:
                        kfirf = db.wordKfirf.find(
                            {'category': category})[0]['wordlist']
                    cnt = Counter({
                        candidate: sum(kfirf.get(lemma.name, 0)
                                       for lemma in candidate.lemmas)
                        for candidate in chooseSimKSynsets(
                            word, 3, category=ctgryName.get(category, category))
                    })
                    synsetName = cnt.most_common()[0][0].name
                key = synsetName.replace('.', '__')
                newWordlist[key] = newWordlist.get(key, 0) + wordlist[word]
            entry['wordlist'] = newWordlist
            db.synsetFrequency.insert(entry)