Esempio n. 1
0
def getRightSyns2(word, tokenized, pos1, sentence, fdist):
    """Return thesaurus neighbours of ``word`` that ``ourLesk`` maps to the same result.

    The first two characters of ``pos1`` choose the thesaurus file
    (``VB`` -> simV.lsp, ``JJ``/``RB`` -> simA.lsp, ``NN`` -> simN.lsp).
    The nine highest-scoring entries of ``thes.scored_synonyms`` are then
    kept only if ``ourLesk(sentence, candidate, None)`` is not None and
    equals ``ourLesk(sentence, word, None)``.

    ``tokenized`` and ``fdist`` are accepted but never read here.

    Returns:
        The list of surviving synonyms (best score first), or None when the
        tag prefix is not mapped, ``ourLesk`` yields None for ``word``, or no
        candidate survives.
    """
    lsp_by_tag = {'VB': "simV.lsp", 'JJ': "simA.lsp", 'RB': "simA.lsp", 'NN': "simN.lsp"}
    tag_prefix = pos1[0:2]
    if tag_prefix not in lsp_by_tag:
        return None

    # The word's own result is the reference every candidate must match.
    word_sense = ourLesk(sentence, word, None)
    if word_sense is None:
        return None

    scored = thes.scored_synonyms(word, fileid=lsp_by_tag[tag_prefix])
    candidates = sorted(scored, key=lambda pair: pair[1], reverse=True)[0:9]

    matching = []
    for candidate in candidates:
        candidate_sense = ourLesk(sentence, candidate[0], None)
        if candidate_sense is not None and word_sense == candidate_sense:
            matching.append(candidate[0])

    # An empty result is reported as None, not as an empty list.
    return matching if matching else None
Esempio n. 2
0
	def GetSynonyms(self, word):
		"""Return words similar to ``word`` from the thesaurus, best score first.

		Results are cached per configuration key (as returned by
		``self._get_configuration_key()``) and per word. On a cache miss the
		word (lemmatized first when ``self.__use_lemma`` is set) is looked up
		with ``thes.scored_synonyms`` in ``self.__fileId``. Entries whose score
		is within ``self.__similarity`` of the best score are kept, lower-cased,
		and truncated to ``self.__max_words`` when that limit is positive. The
		original ``word`` is appended (in its original casing) if its
		lower-cased form is not already in the list.

		Returns:
			A list of words. ``[word]`` is returned, and not cached, when the
			thesaurus has no entry or when any error occurs during the lookup.
		"""
		try:
			# check, if the result is not cached
			current_key = self._get_configuration_key()
			if current_key not in self.__found_synonyms:
				self.__found_synonyms[current_key] = {}
			cache = self.__found_synonyms[current_key]

			if word in cache:
				return cache[word]

			# it is not, so find synonyms
			wordLemma = word
			if self.__use_lemma:
				wordLemma = self.__lemmatizer.lemmatize(word)
			results = thes.scored_synonyms(wordLemma, fileid=self.__fileId)
			if len(results) == 0:
				# no synonym found
				return [word]

			sorted_results = sorted(results, key=lambda cell: cell[1], reverse=True)
			best_score = sorted_results[0][1]
			final_results = [w[0].lower() for w in sorted_results if (best_score - w[1]) <= self.__similarity]
			if self.__max_words > 0 and len(final_results) > self.__max_words:
				final_results = final_results[:self.__max_words]
			if word.lower() not in final_results:
				final_results.append(word)

			cache[word] = final_results
			return final_results
		except Exception:
			# Best effort: any lookup failure degrades to the word itself.
			# Narrowed from a bare except so KeyboardInterrupt/SystemExit propagate.
			return [word]
def prevalent_sense(word,pos=wn.NOUN):
    """Rank the WordNet senses of ``word`` using its Lin-thesaurus neighbours.

    The top ``k`` neighbours (``k`` is a module-level name defined outside
    this function) are read from the Lin thesaurus file matching ``pos``
    ('a' -> simA.lsp, 'v' -> simV.lsp, anything else -> simN.lsp). For each
    neighbour that has at least one WordNet synset, ``wnss`` is evaluated
    against every sense of ``word``; each sense then receives
    ``dss * wnss_score / total`` where ``dss`` is the neighbour's thesaurus
    score and ``total`` is the sum of its wnss scores over all senses.

    Progress is printed along the way (debug output kept from the original).

    Returns:
        The ``(wnsynset.name, score)`` pair with the highest accumulated score.

    Raises:
        IndexError: when ``wn.synsets(word, pos)`` yields no senses, because
            there is then no score to return.
    """
    #determine the prevalence
    if pos == 'a':
        lin_type = 'simA.lsp'
    elif pos == 'v':
        lin_type = 'simV.lsp'
    else:
        lin_type = 'simN.lsp'

    distthes = lin.scored_synonyms(word, fileid=lin_type)
    sortedthes = sorted(distthes, key=operator.itemgetter(1), reverse=True)[0:k]

    # Senses of the target word; looked up once and reused by every loop below.
    senses = wn.synsets(word, pos)

    scores = {}  #accumulated score per sense, contributed to by each neighbour
    for wnsynset in senses:
        #initialise scores for each synset as 0
        print("wnsynset.name")
        ##bow.n.01
        print(wnsynset.name)
        scores[wnsynset.name] = 0

    print("sortedthes:(lin scored_synonyms)")
    print(sortedthes)
    for (neigh, dss) in sortedthes:
        print("neigh:%r" % neigh)
        if len(wn.synsets(neigh)) > 0:  #check neighbour is in WN otherwise all sims will be 0
            total = 0  #sum of wnss scores for this neighbour over all senses
            neighscores = {}  #wnss score of this neighbour for each sense
            print("pos%r" % pos)
            print("word%r" % word)
            for wnsynset in senses:
                wnss_score = wnss(neigh, wnsynset, pos=pos)  #wnss score for this neighbour and this sense
                total += wnss_score
                neighscores[wnsynset.name] = wnss_score  #kept so each value can be divided by total below
            print("neighscores,sum,dss")
            print("%s %s %s" % (neighscores, total, dss))
            if total == 0:
                # The neighbour is unrelated to every sense (e.g. it is in
                # WordNet only under another part of speech). It carries no
                # evidence, and normalising by 0 would raise ZeroDivisionError.
                continue
            #second pass is needed because total is only known after the first;
            #it differs per neighbour so it cannot be dropped as a constant
            for wnsynset in senses:
                #weight each sense by the neighbour's dss score and inversely
                #by the neighbour's total wnss score, then add to the sense total
                scores[wnsynset.name] += dss * neighscores[wnsynset.name] / total
        else:
            #likely for proper-noun neighbours; ideally the top k would exclude words not in WN
            print("Warning: ignoring distributional neighbour "+neigh+" : not in WordNet as noun")

    print("scores.tiems")
    print(scores.items())
    sortedscores = sorted(scores.items(), key=operator.itemgetter(1), reverse=True)
    print("sortedscores")
    print(sortedscores)
    return sortedscores[0]
Esempio n. 4
0
 def get_similar_words(self, term):
     """Return the two highest-scoring thesaurus neighbours of ``term``.

     Results are memoized in ``self.terms_dict``; a cached term is returned
     without consulting the thesaurus. On a miss the noun file
     (``simN.lsp``) of ``lin_thesaurus`` is queried, the two entries with
     the highest score are kept (fewer if fewer exist) and stored in the
     cache.

     Returns:
         A list of at most two words, best score first.
     """
     # Test membership on the mapping itself; going through .keys() builds
     # a list and scans it on Python 2 and is redundant on Python 3.
     if term in self.terms_dict:
         return self.terms_dict[term]
     scored_synonyms = lin_thesaurus.scored_synonyms(term,
                                                     fileid="simN.lsp")
     best_2 = sorted(scored_synonyms, key=lambda x: x[1], reverse=True)[:2]
     best_2_list = [tup[0] for tup in best_2]
     self.terms_dict[term] = best_2_list
     return best_2_list
    def do_thesaurus(query):
        """Collect thesaurus expansion terms for the words in ``query``.

        Every query word is lower-cased and looked up with
        ``thes.scored_synonyms``. Only the ``[1][1]`` element of the result is
        used (the scored entries of the second thesaurus file; presumably the
        noun file, verify against the reader's file order). For each word, at
        most four entries with a score above 0.21 that are not themselves
        query words are accepted. A multi-word entry is split on whitespace
        and only its parts that are not query words are added.

        Returns:
            A list of the collected terms, lower-cased. The order is that of
            the underlying set and therefore not meaningful.
        """
        lowered = [word.lower() for word in query]
        query_words = set(lowered)  # same membership as the list, O(1) per test
        toAdd = set()

        for word in lowered:
            accepted = 0

            # find similar expressions and their scores
            scored = dict(thes.scored_synonyms(word)[1][1])

            for key, score in scored.items():
                if accepted == 4:
                    # Quota for this word is used up; nothing further can be added.
                    break
                if score > 0.21 and key not in query_words:
                    accepted += 1
                    if ' ' in key:
                        # keep only the parts of a multi-word entry that are new
                        toAdd.update(part for part in key.split() if part not in query_words)
                    else:
                        toAdd.add(key)

        return [term.lower() for term in toAdd]
 def expand_query(self, parsed_query) -> list:
     """
     Expand a query with terms taken from the thesaurus package.

     For every term, the ``[1][1]`` element of ``thes.scored_synonyms(term)``
     is ranked by score (highest first) and its first 17 words are kept.

     :param parsed_query: list of query terms; it is concatenated to the
         result with ``+``, so it has to be a list.
     :return: all expansion words (grouped per term, in query order)
         followed by the original query terms.
     """
     expansion = []
     for term in parsed_query:
         candidates = thes.scored_synonyms(term)[1][1]
         ranked = sorted(candidates, key=lambda pair: pair[1], reverse=True)
         expansion.extend(neighbour for neighbour, _ in ranked[:17])
     return expansion + parsed_query
Esempio n. 7
0
def getRightSyns3(word, tokenized, pos1, sentence, fdist):
    """Return the four highest-scoring thesaurus neighbours of ``word``.

    The first two characters of ``pos1`` choose the thesaurus file
    (``VB`` -> simV.lsp, ``JJ``/``RB`` -> simA.lsp, ``NN`` -> simN.lsp).
    ``tokenized``, ``sentence`` and ``fdist`` are accepted but never read.

    Returns:
        Up to four synonyms, best score first, or None when the tag prefix
        is not mapped or the thesaurus has no entry for ``word``.
    """
    lsp_by_tag = {'VB': "simV.lsp", 'JJ': "simA.lsp", 'RB': "simA.lsp", 'NN': "simN.lsp"}
    lsp_file = lsp_by_tag.get(pos1[0:2])
    if lsp_file is None:
        return None

    ranked = sorted(
        thes.scored_synonyms(word, fileid=lsp_file),
        key=lambda pair: pair[1],
        reverse=True,
    )
    top_four = [pair[0] for pair in ranked[0:4]]
    # An empty result is reported as None, not as an empty list.
    return top_four or None
Esempio n. 8
0
def demo():
    """Print sample Lin-thesaurus lookups for "business" and "enterprise".

    Shows, in order: plain synonyms, scored synonyms, plain and scored
    synonyms restricted to the noun file ``simN.lsp``, and the similarity
    score of the two words.
    """
    from nltk.corpus import lin_thesaurus as thes

    word1 = "business"
    word2 = "enterprise"
    print("Getting synonyms for " + word1)
    print(thes.synonyms(word1))

    print("Getting scored synonyms for " + word1)
    print(thes.scored_synonyms(word1))

    print("Getting synonyms from simN.lsp (noun subsection) for " + word1)
    print(thes.synonyms(word1, fileid="simN.lsp"))

    # This stanza used to repeat the previous one verbatim; it now shows the
    # scored variant of the same noun-file lookup.
    print("Getting scored synonyms from simN.lsp (noun subsection) for " + word1)
    print(thes.scored_synonyms(word1, fileid="simN.lsp"))

    print(f"Similarity score for {word1} and {word2}:")
    print(thes.similarity(word1, word2))
Esempio n. 9
0
def demo():
    """Print sample Lin-thesaurus lookups for "business" and "enterprise".

    Shows, in order: plain synonyms, scored synonyms, plain and scored
    synonyms restricted to the noun file ``simN.lsp``, and the similarity
    score of the two words.
    """
    from nltk.corpus import lin_thesaurus as thes

    word1 = "business"
    word2 = "enterprise"
    print("Getting synonyms for " + word1)
    print(thes.synonyms(word1))

    print("Getting scored synonyms for " + word1)
    print(thes.scored_synonyms(word1))

    print("Getting synonyms from simN.lsp (noun subsection) for " + word1)
    print(thes.synonyms(word1, fileid="simN.lsp"))

    # This stanza used to repeat the previous one verbatim; it now shows the
    # scored variant of the same noun-file lookup.
    print("Getting scored synonyms from simN.lsp (noun subsection) for " + word1)
    print(thes.scored_synonyms(word1, fileid="simN.lsp"))

    print("Similarity score for %s and %s:" % (word1, word2))
    print(thes.similarity(word1, word2))
    def get_lin_terms(self, term, n, pos):
        """Return up to ``n`` scored thesaurus entries of ``term`` for ``pos``.

        ``thes.scored_synonyms(term)`` is expected to yield pairs of
        ``(label, scored_entries)``. The entries of every pair whose label
        equals ``pos`` are pooled together.

        Returns:
            The pooled ``(synonym, score)`` entries. When more than ``n``
            were found, only the ``n`` with the highest score are returned
            (best first); otherwise all of them are returned in the order
            they were found.
        """
        per_label = thes.scored_synonyms(term)

        # Pool the entries of every group labelled with the requested POS.
        pooled = [
            scored
            for group in per_label
            if group[0] == pos
            for scored in group[1]
        ]

        if len(pooled) > n:
            # More than requested: keep the n best by similarity score.
            return heapq.nlargest(n, pooled, key=lambda pair: pair[1])
        return pooled
Esempio n. 11
0
def demo():
	"""Store Lin-thesaurus synonyms of frequent keywords in the ``synonyms`` table.

	Connects through ``mysql_handler``, reads every ``key_word`` of
	``email_trie`` whose ``occur_time`` exceeds 30, and inserts one row
	``(key_word, field_name, syn, score)`` per scored synonym that
	``thes.scored_synonyms`` returns for it. Progress is printed for every
	keyword and every insert. A failing insert is skipped so the remaining
	rows are still processed.
	"""
	sql_handler = mysql_handler('scam', 'scam', 'test')
	sql_handler.connect()
	result = sql_handler.do_query('select key_word from email_trie\
			where occur_time > 30', True)
	for line in result:
		kw = line[0]
		print("searching for synonyms for key_word:   %s" % (kw,))
		for field in thes.scored_synonyms(kw):
			field_name = field[0]
			for syn_score in field[1]:
				syn = syn_score[0]
				score = syn_score[1]
				# NOTE(review): the statement is assembled by string concatenation, so a
				# double quote inside kw or syn breaks (or alters) the query. Switch to
				# bound parameters if do_query supports them; its API is not visible here.
				param = '("' + kw + '","' + field_name + '","' + syn + '",' + str(score) + ')'
				statement = "insert into synonyms (key_word, field_name, syn, score) values " + param
				print("executing: " + statement)
				try:
					sql_handler.do_query(statement)
				except Exception:
					# Best effort: skip this row and keep going. Narrowed from a bare
					# except so that Ctrl-C / SystemExit can still stop the run.
					continue
Esempio n. 12
0
def lin_synonyms(word, pos):
    """Return the Lin-thesaurus neighbours of ``word``, highest score first.

    Args:
        word: the word to look up.
        pos: part-of-speech letter; it is upper-cased to build the file name
            ``sim<POS>.lsp``. ``None`` selects the noun file ``simN.lsp``.

    Returns:
        A list of neighbour words ordered by descending score.
    """
    # A missing POS falls back to nouns instead of failing on None.upper().
    fileid = 'sim%s.lsp' % ('N' if pos is None else pos.upper())
    thes_entry = lin.scored_synonyms(word, fileid=fileid)
    thes_entry = sorted(thes_entry, key=(lambda x: x[1]), reverse=True)
    # return words ordered by score
    return [syn for syn, score in thes_entry]
Esempio n. 13
0
import nltk
from nltk.corpus import wordnet
from nltk.corpus import lin_thesaurus as cs

#syns = cs.scored_synonyms('pillage')
#print syns
#syn = cs.synonyms('pillage')
# For each entry returned for 'pillage', print its label followed by the
# highest-scoring synonym and that score. The prints use the single-argument
# call form, which produces the same output on Python 2 and Python 3.
for entry in cs.scored_synonyms('pillage'):
    print(entry[0])
    # Track the best (word, score) seen so far; only scores above 0.0 count.
    score = 0.0
    word = ""
    for words in entry[1]:
        #print words[0], words[1]
        if score < words[1]:
            score = words[1]
            word = words[0]
    print("%s %s" % (word, score))
print(cs.scored_synonyms('pillage'))

#meaning = wordnet.synsets('pillage')[0].lemmas()[0].name()
#print wordnet.synsets('pillage')[0].lch_similarity(meaning, "n")
#print syns
#print dir(wordnet.synsets('pillage'))
#print meaning

#print wordnet.wup_similiarity(wordnet.synsets('pillage'), meaning)
#w1 = wordnet.synset('run.v.01')
#w2 = wordnet.synset('sprint.v.01')
#print w1.wup_similarity(w2)
Esempio n. 14
0
__author__ = 'arkanath'

from nltk.corpus import lin_thesaurus as thes

# Print whatever the Lin thesaurus reader returns for 'scorn'. No fileid is
# passed, so the lookup is not restricted to a single thesaurus file.
print(thes.scored_synonyms('scorn'))
Esempio n. 15
0
 def lin_synonyms(word, pos):
     """Return the Lin-thesaurus neighbours of ``word``, highest score first.

     ``pos`` is upper-cased to build the file name ``sim<POS>.lsp``; ``None``
     selects the noun file ``simN.lsp``.
     """
     pos_letter = pos.upper() if pos is not None else 'N'
     ranked = sorted(
         lin_thesaurus.scored_synonyms(word, fileid='sim%s.lsp' % pos_letter),
         key=lambda pair: pair[1],
         reverse=True,
     )
     return [neighbour for neighbour, _ in ranked]