def relevancy_score(desiredDoc):
    # Each keyword scores between 0 and 1 for similarity to the search word;
    # path_similarity returns None if there is no path between the synsets.
    # (A usage sketch follows this function.)
    searchWordwn = wn.synset(searchWord + ".n.01")
    relevancyScore = 0
    memo = {}
    for entry in keywords:
        currentWord = entry[0]
        if currentWord in memo:
            currentWordScore = memo[currentWord]
        else:
            currentWordScore = None
            nounSynsets = wn.synsets(currentWord, pos=wn.NOUN)
            if nounSynsets:
                currentWordScore = wn.path_similarity(searchWordwn, nounSynsets[0])
            memo[currentWord] = currentWordScore
        if currentWordScore is not None:
            relevancyScore += currentWordScore
    return relevancyScore
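# A minimal usage sketch for relevancy_score, assuming the module-level names the
# function relies on: `wn` (nltk.corpus.wordnet), `searchWord` (a noun string) and
# `keywords` (a list whose entries start with a word). These names come from the
# function body above; the example values are purely illustrative.
from nltk.corpus import wordnet as wn

searchWord = "dog"
keywords = [("cat", 3), ("kennel", 1), ("philosophy", 2)]
print(relevancy_score(None))  # sum of path similarities between dog.n.01 and each keyword's first noun synset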
def sentence_similarity(idx, ob, mode):

    s_list = list()
    pbar = ProgressBar(widgets=['%s: image ' % mode, SimpleProgress()],
                       maxval=len(sentences)).start()

    for im_idx, sentence_group in enumerate(np.array(sentences)[idx, :]):

        pbar.update(im_idx + 1)
        for sent in sentence_group:

            words = analyze(sent)

            sim = list()
            for w in words:

                syn1 = wn.synsets(w)
                syn2 = wn.synsets(ob)

                if syn1 and syn2:
                    # best path similarity over all synset pairs, skipping
                    # pairs for which WordNet defines no path
                    pair_sims = [s1.path_similarity(s2) for (s1, s2)
                                 in product(syn1, syn2)]
                    pair_sims = [s for s in pair_sims if s is not None]
                    if pair_sims:
                        sim.append(max(pair_sims))
                # words with no synset (or no connecting path) are ignored

            if sim:
                s_list.append(max(sim))
            else:
                # ignore sentence if no word could be scored
                s_list.append(float('nan'))

    pbar.finish()
    return s_list
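# The core scoring step above, in isolation: the best path similarity between any
# synset of a word and any synset of the object label (the words here are illustrative).
from itertools import product
from nltk.corpus import wordnet as wn

syn1, syn2 = wn.synsets('kitten'), wn.synsets('cat')
pair_sims = [s1.path_similarity(s2) for s1, s2 in product(syn1, syn2)]
print(max(s for s in pair_sims if s is not None))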
def relation1_old(a, b):
    ''' This method takes two words as arguments and returns their similarity based on the
    wup_similarity method of the NLTK WordNet interface. Only the first noun synset of each
    word is compared; a usage sketch follows this function.

    Parameters
    ----------
    a : string
    b : string

    Returns
    -------
    float
        relation between the two strings (0 if no synsets are found or no path exists)

    References
    ----------
    .. [1] NLTK WordNet <http://www.nltk.org/howto/wordnet.html>

    '''
    syna = wn.synsets(a, pos=wn.NOUN)
    synb = wn.synsets(b, pos=wn.NOUN)
    mx = 0
    for i in syna[:1]:
        for j in synb[:1]:
            temp = wn.wup_similarity(i, j)
            if temp is not None and temp > mx:
                mx = temp
    return mx
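# A minimal sketch of calling relation1_old, assuming the module-level `wn` alias
# the function uses (nltk.corpus.wordnet); the word pair is illustrative.
from nltk.corpus import wordnet as wn

print(relation1_old('dog', 'cat'))     # Wu-Palmer similarity of dog.n.01 and cat.n.01 (about 0.86 in WordNet 3.0)
print(relation1_old('dog', 'asdfgh'))  # 0, since the second word has no noun synsets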
 def findSimilarity(self):
     # We recommend only one item: recoItems[0] is the recommended item and
     # recoItems[1] is the list of items it was preferred over.
     '''Get the synsets of the recommended item (filtered to those whose name contains the
     item itself) and at most 10 synsets of each alternative (to reduce computation cost),
     then score every alternative by its best path similarity to the recommendation.'''

     recommendation = wn.synsets(self.recoItems[0])  # @UndefinedVariable
     recommendationFiltered = []
     for eachSyn in recommendation:
         if self.recoItems[0] in str(eachSyn):
             recommendationFiltered.append(eachSyn)

     choices = {}
     for eachItem in self.recoItems[1]:
         choices[eachItem] = wn.synsets(eachItem)[:10]   # @UndefinedVariable only the first 10 synsets

     choiceScores = {}
     for key, value in choices.items():
         choiceScores[key] = []
         for eachValue in value:
             for eachRecoSyn in recommendationFiltered:
                 score = eachRecoSyn.path_similarity(eachValue)
                 if score is not None:
                     choiceScores[key].append(score)

     maxChoiceScores = {}
     for eachKey in choiceScores.keys():
         maxChoiceScores[eachKey] = max(choiceScores[eachKey]) if choiceScores[eachKey] else 0

     return maxChoiceScores
def relation(a,b) :
    ''' Given two words (strings), returns a number that denotes the relation between
    the two words.

    Parameters
    ----------
    a : string
    b : string

    Returns
    -------
    float
        relation (less than 1) between the two strings

    Notes
    -----
    First it applies a bounded BFS on the NLTK WordNet graph and finds the least distance
    between the two given words. If that distance is x, the function returns 1/(x+1);
    otherwise it returns 0.

    '''
    a = wn.synsets(a)
    b = wn.synsets(b)
    visited_a = set([])
    visited_b = set([])
    stemmed_a = set([])
    stemmed_b = set([])
    depth = 0
    while True:
        if depth > 2:
            return 0
        new_a = set([])
        depth += 1
        for syn in a:
            if stemmer.stem(syn.lemma_names[0]) in stemmed_b:
                return 1.0/depth
            if syn in visited_a:
                continue
            visited_a.add(syn)
            stemmed_a.add(stemmer.stem(syn.lemma_names[0]))
            hyp = set(syn.hyponyms())
            for lemma in syn.lemma_names:
                hyp |= set(wn.synsets(lemma))
            new_a |= hyp
        a = new_a
        new_b = set([])
        depth += 1
        for syn in b:
            if stemmer.stem(syn.lemma_names[0]) in stemmed_a:
                return 1.0/depth
            if syn in visited_b:
                continue
            visited_b.add(syn)
            stemmed_b.add(stemmer.stem(syn.lemma_names[0]))
            hyp = set(syn.hyponyms())
            for lemma in syn.lemma_names:
                hyp |= set(wn.synsets(lemma))
            new_b |= hyp
        b = new_b
def find_nearest_synset(in_tag, in_taglist):
    """
        function find_nearest_synset
          for given in_tag, find its nearest (most similar) tag in in_taglist, return its tag name

        in_tag is a string
        in_taglist is a big list which is produced from   fileparser.parse_imageclef_concepts_wn
    """

    # process the input parameters
    concept_tag = in_taglist[0]
    concept_type = in_taglist[1]
    concept_sense = in_taglist[2]

    numConcepts = len(concept_tag)

    # allocate a score matrix
    dist_score = np.zeros([1, numConcepts])

    # loop to calculate the similarity between in_tag and each concept
    syn_in_tag = wn.synsets(in_tag)[0]
    for idx in range(numConcepts):
        offset = concept_sense[idx] - 1
        syn_concept = wn.synsets(concept_tag[idx])[offset]

        path_sim = syn_in_tag.path_similarity(syn_concept)
        if path_sim is None:
            path_sim = 0
        # path_sim = compare(in_tag, concept_tag[idx])
        dist_score[0][idx] = path_sim
    # sort in column and flip to descending order
    indices = np.argsort(dist_score, axis=1)
    sorted_indices = np.fliplr(indices)

    return concept_tag[sorted_indices[0][0]]
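# A minimal sketch for find_nearest_synset; the hand-built in_taglist below mimics the
# (tags, types, 1-based sense indices) structure described in the docstring and is purely
# illustrative, since fileparser.parse_imageclef_concepts_wn is not shown here.
import numpy as np
from nltk.corpus import wordnet as wn

toy_taglist = [['dog', 'tree', 'car'],   # concept tags
               ['object'] * 3,           # concept types (unused by this function)
               [1, 1, 1]]                # 1-based WordNet sense indices
print(find_nearest_synset('puppy', toy_taglist))  # expected: 'dog'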
def parseLyrics2(outlist):
	bandLyricInfo = {} 
	master = [['death', 0],['violence',0],['sacrifice',0],['nature',0],['peace',0],['storm',0],['spirit',0],[ 'dark',0],['scream',0],['pain',0],['blood',0],['flesh',0],['love',0],['greed',0],['poison',0],['anger',0],['revenge',0],['misery',0],['hell',0],['heaven',0],['hate',0],['soul',0],['battle',0],['ghost',0],['joy',0],['light',0],['omen',0],['miracle',0],['magic',0],['universe',0],['disease',0],['god',0],['satan',0],['struggle',0],['heart',0]]
	for key in outlist:
		templist = copy.deepcopy(master)
		#key = 'Queensryche'
		raw = outlist[key]
		raw = raw.lower()
		words = re.findall(r'\w+', raw, flags=re.UNICODE | re.LOCALE)  # keep word characters only
		imp_words = filter(lambda x: x not in stopwords.words('english'), words)  # filter out stopword noise
		lmt = WordNetLemmatizer()
		words_new = [lmt.lemmatize(x) for x in imp_words]
		dw = list(set(words_new))

		for word in dw:
			for m in templist:
				p1 = wordnet.synsets(word)
				p2 = wordnet.synsets(m[0])
				if(len(p1) > 0 and len(p2) > 0):
					c = p1[0].wup_similarity(p2[0])
					if(c is not None and c > m[1]):
						m[1] = c
		# sort theme words by similarity and keep the ten closest
		tnew = sorted(templist, key=lambda val: val[1], reverse=True)[0:10]
		# remove the score column
		for l in tnew:
			del l[1]
		print 'Done ',key
		#break ;
		bandLyricInfo[key] = tnew
		#del templist
	return bandLyricInfo
Example #8
def get_attributes():
    """
    Gets all attributes for all vehicles ("GROUND * such as VEHICLE")
    """
    wd = Data()

    for vehicle in wd.vehicles.keys():
        if not wn.synsets(vehicle, wn.NOUN):
            del wd.vehicles[vehicle]

    for ground in wd.grounds.keys():
        if not wn.synsets(ground, wn.ADJ):
            del wd.grounds[ground]

    with open('../res/wiki/allsuchas.txt', 'r') as f:

        for line in f:
            split_line = line.split(" such as ")
            left_side = split_line[0]
            right_side = split_line[1]

            for ground in wd.grounds.keys():
                ground_temp = " " + ground + " "
                if ground_temp.replace("_", " ") in left_side:
                    for vehicle in wd.vehicles.keys():
                        vehicle_temp = " a " + vehicle.replace("_", " ") + " "
                        vehicle_plural = " " + vehicle.replace("_", " ") + "s "
                        vehicle_temp = vehicle_temp.replace("_", " ")
                        if vehicle_temp in right_side or vehicle_plural in right_side:

                            wd.get_vehicle(vehicle).add_attribute(wd.get_ground(ground))
                            print str(vehicle) + " " + str(ground) + "  ... " + line

    wd.save()
def parse_file(f):
  for l in f.readlines():    
    word = l.strip()
    synsets = wn.synsets(word)
    
    if word in synonym_values:
      continue
    
    # get first order synonyms
    synonyms = set()
    for synset in synsets:
      synonyms = set(synonyms) | set(synset.lemma_names)
  
    # add in synonyms of those synonyms
    for syn in synonyms:
      for syn_synset in wn.synsets(syn):
        synonyms = set(synonyms) | set(syn_synset.lemma_names)
    
    synonyms_with_values = set(synonyms) & set(synonym_values.keys())
    
    if not len(synonyms_with_values):
      continue
    
    avg = 0
    total = 0
    for syn in synonyms_with_values:
      value = synonym_values[syn]
      avg = (avg * total + float(value)) / (total + 1)
      total += 1
    
    # print "Adding", word, avg
    synonym_values[word] = int(abs_ceil(avg))
  
  f.close()
    def get_similarity(self, string1, string2):
        """
        Calculate the similarity of two statements.
        This is based on the total similarity between
        each word in each sentence.
        """
        import itertools

        tokens1 = self.get_tokens(string1)
        tokens2 = self.get_tokens(string2)

        total_similarity = 0

        # Get the highest matching value for each possible combination of words
        for combination in itertools.product(*[tokens1, tokens2]):

            synset1 = wordnet.synsets(combination[0])
            synset2 = wordnet.synsets(combination[1])

            if synset1 and synset2:

                # Compare the first synset in each list of synsets
                similarity = synset1[0].path_similarity(synset2[0])

                if similarity:  
                    total_similarity = total_similarity + similarity

        return total_similarity
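# A minimal standalone sketch of the same idea, assuming a plain whitespace tokenizer
# in place of self.get_tokens (which is not shown here); the sentences are illustrative.
import itertools
from nltk.corpus import wordnet

def sentence_similarity_sketch(string1, string2):
    tokens1, tokens2 = string1.lower().split(), string2.lower().split()
    total = 0
    for w1, w2 in itertools.product(tokens1, tokens2):
        synsets1, synsets2 = wordnet.synsets(w1), wordnet.synsets(w2)
        if synsets1 and synsets2:
            # compare the first synset in each list, as the method above does
            sim = synsets1[0].path_similarity(synsets2[0])
            if sim:
                total += sim
    return total

print(sentence_similarity_sketch("the cat sleeps", "a dog runs"))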
def tell(para1,para2):
	# Strip anything that is not alphanumeric or a space
	para1=re.sub(r'[^\w ]+', '', para1)
	para2=re.sub(r'[^\w ]+', '', para2)
	
	para1=para1.lower().split()
	para2=para2.lower().split()
	
	if para1==[] or para2==[]:
		return 0

	if not filter(lambda t:t.lower() not in stopwords, para1) == []:
		para1=filter(lambda t:t.lower() not in stopwords, para1)
	if not filter(lambda t:t.lower() not in stopwords, para2) == []:
		para2=filter(lambda t:t.lower() not in stopwords, para2)
	
	score=len(set(para1).intersection(para2))
	score_1=float(score)/math.sqrt(len(para2)*len(para1))
	
	para1_with_dictionary=reduce(lambda x,y:x+y, map(lambda word:[l.name for s in wordnet.synsets(word) for l in s.lemmas],para1))
	para1_with_dictionary=map(lambda ele:ele.lower(), para1_with_dictionary)
	#^^ Returns duplicated elements as well. So we need to remove the duplicates. Converting into set does that
	
	para2_with_dictionary=reduce(lambda x,y:x+y, map(lambda word:[l.name for s in wordnet.synsets(word) for l in s.lemmas],para2))
	para2_with_dictionary=map(lambda ele:ele.lower(), para2_with_dictionary)
	
	#^^ Returns duplicated elements as well. So we need to remove the duplicates. While taking intersection the same is handled

	score1=len(set(para1_with_dictionary).intersection(para2))
	score2=len(set(para2_with_dictionary).intersection(para1))

	score_2=float(max(score1,score2))/min(len(para2),len(para1))
	
	score=(score_1+score_2)/2
	return score
Example #12
def add_word(word):
    maximum = 0
    maxJCN = 0
    flag = 0
    maxChain = None
    maxSynset = None
    for chain in lexical_chains:  # for all chains that are present
        for synset in wn.synsets(word):  # for all synsets of the current word
            for sense in chain.senses:  # for all senses already stored in the current chain
                similarity = sense.wup_similarity(synset)  # using wup_similarity
                if similarity is not None and similarity >= maximum:
                    if similarity >= threshold:
                        JCN = sense.jcn_similarity(synset, brown_ic)  # using jcn_similarity
                        if JCN >= jcnTreshold:
                            path_sim = sense.path_similarity(synset)  # using path similarity
                            if path_sim is not None and path_sim >= 0.2:
                                if JCN >= maxJCN:
                                    maximum = similarity
                                    maxJCN = JCN
                                    maxChain = chain
                                    maxSynset = synset
                                    flag = 1
    if flag == 1:
        maxChain.addWord(word)
        maxChain.addSense(maxSynset)
        return

    lexical_chains.append(Chain([word], wn.synsets(word)))
Example #13
	def hypernyms(self, word, question):
		hyper = []
		sentence = self.parse(question)
		pos = ''
		for sent, tag in sentence[0]: 
			if sent == word:
				pos = tag
				break
		tag_to_pos = {'JJ': wn.ADJ, 'JJR': wn.ADJ, 'JJS': wn.ADJ,
		              'NN': wn.NOUN, 'NNS': wn.NOUN,
		              'VB': wn.VERB, 'VBG': wn.VERB, 'VBD': wn.VERB,
		              'VBN': wn.VERB, 'VBP': wn.VERB, 'VBZ': wn.VERB,
		              'RB': wn.ADV, 'RBR': wn.ADV, 'RBS': wn.ADV}
		if pos in tag_to_pos:
			for synset in wn.synsets(word, pos=tag_to_pos[pos]):
				for lemma in synset.lemmas():
					if lemma.name() not in hyper and len(hyper) < 7:
						hyper.append(lemma.name())
		return hyper
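# The collection step above, in isolation: gather up to 7 lemma names for a word
# under a single WordNet POS (the word and POS here are illustrative).
from nltk.corpus import wordnet as wn

hyper = []
for synset in wn.synsets('quick', pos=wn.ADJ):
    for lemma in synset.lemmas():
        if lemma.name() not in hyper and len(hyper) < 7:
            hyper.append(lemma.name())
print(hyper)  # up to seven lemma names drawn from the adjective synsets of 'quick'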
def subclass(feats):
    if string_match(feats).endswith("False"):
        try:
            result = False
            i_clean = wn.morphy(feats.i_cleaned.lower(), wn.NOUN)
            i_synsets = wn.synsets(i_clean)
            j_clean = wn.morphy(feats.j_cleaned.lower(), wn.NOUN)
            j_synsets = wn.synsets(j_clean)
            def get_common_hypernym(i_synset,j_synset):
                i_hypernyms = i_synset.hypernyms()
                j_hypernyms = j_synset.hypernyms()
                if len(i_hypernyms) == 0:
                    i_synset = i_synset.instance_hypernyms()[0]
                if len(j_hypernyms) == 0:
                    j_synset = j_synset.instance_hypernyms()[0]
                subc = i_synset.common_hypernyms(j_synset)
                return (i_synset in subc) or (j_synset in subc)

            for synset in i_synsets:
                for syn in j_synsets:
                    result = get_common_hypernym(synset,syn)
                    if result: break
                if result:break
            return "subclass={}".format(result)
        except:
            # fall back to False if the WordNet lookups fail
            return "subclass={}".format(False)

    else:
        return "subclass={}".format(False)
Example #15
def scoreFile(filename, targetWords, verbose=False):
    meanScore = 0.0
    baseWordCount = 0
    wordCount = 0
    f = file(filename)
    for l in f:
        wordScored = False
        fields = [x.strip().lower() for x in re.split(r"\s+", l)]
        if (targetWords is not None) and (fields[0] not in targetWords):
            continue
        baseSynsets = wordnet.synsets(fields[0])
        if not baseSynsets:  # wordnet.synsets returns an empty list, not None
            continue
        for word in fields[1:]:
            # Ignore identical word if it occurs
            if word == fields[0]:
                continue
            targetSynsets = wordnet.synsets(word)
            if not targetSynsets:
                continue
            wordScore = scoreWord(baseSynsets, targetSynsets)
            meanScore += wordScore
            wordCount += 1
            wordScored = True
        baseWordCount += 1 if wordScored else 0
        if verbose:
            if (baseWordCount > 0) and (baseWordCount % 1000 == 0):
                print "Words scored : %d, Current Score : %f" % (
                    baseWordCount,
                    meanScore / (wordCount if wordCount > 0 else 1),
                )
    f.close()
    meanScore /= wordCount if wordCount > 0 else 1
    return {"baseWordCount": baseWordCount, "totalWordCount": wordCount, "meanScore": meanScore}
Example #16
 def compare(self, word1, word2):
     # compare the first synsets of the two words using Wu-Palmer similarity
     w1 = wn.synsets(word1)[0]
     w2 = wn.synsets(word2)[0]
     val = w1.wup_similarity(w2)
     return val
def wndist(fs):
    """
    Distance between NP1 and NP2 in WordNet (using the first sense only)
    """

    wndist=-100000

    i_pos=__get_pos__(fs.article,fs.sentence,fs.offset_begin,fs.offset_end)
    j_pos=__get_pos__(fs.article,fs.sentence_ref,fs.offset_begin_ref,fs.offset_end_ref)

    #print "Orig:", fs.token, '\t', fs.token_ref

    if i_pos.startswith('NN') and j_pos.startswith('NN') and not i_pos.endswith('P') and not j_pos.endswith('P'):
        # considering only common nouns
        lemmatizer = nltk.WordNetLemmatizer()
        i=lemmatizer.lemmatize(fs.i_cleaned, pos='n')
        j=lemmatizer.lemmatize(fs.j_cleaned, pos='n')
        synsets_i=wn.synsets(i)
        synsets_j=wn.synsets(j)
        if len(synsets_i)>0 and len(synsets_j)>0:
            wn_sense1_i=synsets_i[0]
            wn_sense1_j=synsets_j[0]
            wn_pos_i=str(wn_sense1_i).split('.')[1]
            wn_pos_j=str(wn_sense1_j).split('.')[1]
            if wn_pos_i==wn_pos_j:
                wndist=wn_sense1_i.lch_similarity(wn_sense1_j)
                wndist=(ceil(wndist * 100) / 100.0)
                #print "Lemmatized:", i, '\t', j, '\t', str(wndist)

    #print
    #print

    return "wndist={}".format(wndist)
Example #18
def semantic_similarity(word1, word2):
    words1 = word1.split('_')
    words2 = word2.split('_')
    if fast_semantic_similarity(word1, word2) == 1:
        return 1
    max_p = 0
    word1_sim = set([])
    for s1 in wn.synsets(word1):
        word1_sim.add(s1)
        word1_sim.update(s1.similar_tos())
        # for st1 in [s1] + s1.similar_tos():
        #     word1_sim.append(st1)

    word2_sim = set([])
    for s2 in wn.synsets(word2):
        word2_sim.add(s2)
        word2_sim.update(s2.similar_tos())

    for st1 in word1_sim:
        for st2 in word2_sim:
            p = wn.wup_similarity(st1, st2)
            if p is None:
                continue
            if p == 1:
                return p
            if p > max_p:
                max_p = p
    if len(words1) > 1 or len(words2) > 1:
        sub_similarity = .9 * semantic_similarity(words1[-1], words2[-1])
    else:
        sub_similarity = 0
    return max(max_p, sub_similarity)
Example #19
def processarticle(self, articleid):
	self.update_state(state='PROCESSING', meta={'current': 5, 'total': 100, 'status': 'Downloading article...'})
	wp = wikipedia.page(articleid)
	content = wp.content
	self.update_state(state='PROCESSING', meta={'current': 10, 'total': 100, 'status': 'Processing article...'})
	words = content.split()
	replacecount = 0
	output = ""
	for i in range(0,len(words)):
		word = random.choice(words)
		if len(wn.synsets(word)) >= 1 and checkword(word):
			newword = wn.synsets(word)[0].lemma_names()[0]
			if not checksyn(newword):
				if len(wn.synsets(word)) >= 2:
					newword = wn.synsets(word)[1].lemma_names()[0]
					if not checksyn(newword):
						i -= 2
						continue
				else:
					i -= 2
					continue
			if newword == word:
				i -= 2
				continue
			else:
				content = content.replace(" " + word + " ", " " + newword + " ", 1)
				output += "Replaced " + word + " with " + newword + "\n"
				replacecount += 1
		else:
			i -= 2
		self.update_state(state='PROCESSING', meta={'current': i/100, 'total': 100, 'status': 'Editing article...'})
#		sleep(0.5)
	return {'current': 100, 'total': 100, 'status': 'Processing complete!', 'article': content, 'info': output}
Example #20
def syns(q): return set(wn.synsets(q) +  [x for y in wn.synsets(q) for x in  set(
  set(y.closure(lambda a: a.hypernyms(), depth=3)) |
  set(y.closure(lambda a: a.hyponyms(), depth=3)) |
  set(y.closure(lambda a: a.hyponyms() + a.similar_tos(), depth=3))
)]) - ABSTRACT()

def defs(q): return [s.definition() for s in wn.synsets(q)]
def SynsetwithCategry():
  hypo = lambda s:s.hyponyms()
  for entry in db.freqbyCtgry.find():
    synsetLists = []
    category = ctgryName.get(entry['category'], entry['category']) 
    if category == 'Other':
      continue
    if category == 'Travel':
      synsetLists.append(getTreesList(wn.synset('travel.n.01').tree(hypo)))
      synsetLists.append(getTreesList(wn.synset('travel.v.03').tree(hypo)))
      synsetLists.append(getTreesList(wn.synset('travel.v.04').tree(hypo)))
      synsetLists.append(getTreesList(wn.synset('travel.v.05').tree(hypo)))
      synsetLists.append(getTreesList(wn.synset('travel.v.06').tree(hypo)))
    else:
      for word in category.split():
        synsets = wn.synsets(word, 'n')   
        synsets += wn.synsets(word, 'v')
        for synset in synsets:
          synsetLists.append(getTreesList(synset.tree(hypo)))
      
    for synsetList in synsetLists:
      for synset in synsetList:
        for lemma in wn.synset(synset[0]).lemmas:
          if db.wordSynsetMap.find({'word': lemma.name, 'category': entry['category']}).count():
            #if the word is in several synsets, we keep the one with the least distance from the root
            if db.wordSynsetMap.find({'word': lemma.name, 'category': entry['category']})[0]['depth'] > synset[1]:
              db.wordSynsetMap.remove({'word': lemma.name, 'category': entry['category']})
              db.wordSynsetMap.insert({'word': lemma.name, 'synset': synset[0], 'depth': synset[1], 'category':entry['category']})
              print lemma.name, synset[0], synset[1], entry['category']
          else: 
            db.wordSynsetMap.insert({'word': lemma.name, 'synset': synset[0], 'depth': synset[1], 'category':entry['category']})
            print lemma.name, synset[0], synset[1], entry['category']
Example #22
def get_least_specific(n, word_list):
    # pair each word with the minimum depth of its noun synsets
    word_list = [(w, min(synset.min_depth() for synset in wn.synsets(w, 'n')))
                 for w in word_list if len(wn.synsets(w, 'n')) > 0]

    # keep the n shallowest (most generic) words, ignoring depth-0 roots
    return [w for (w, depth) in sorted(filter(lambda pair: pair[1] > 0, word_list),
                                       key=itemgetter(1))[:n]]
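# A minimal sketch for get_least_specific, assuming the imports the snippet relies on
# (`wn` for nltk.corpus.wordnet and itemgetter from operator); the word list is illustrative.
from operator import itemgetter
from nltk.corpus import wordnet as wn

words = ['dalmatian', 'dog', 'animal', 'poodle']
print(get_least_specific(2, words))  # the two most generic terms, e.g. ['animal', 'dog']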
Example #23
def c_wn_max_path_similarity(score,word_from,word_to):
	"""
	WordNet path similarity for the most similar synsets. (1 if same word)
	
	This feature can be precomputed by EQUALS
	"""
	
	# Enforce returning 1 when words are equal (would be 0 if synset not found)
	# NOTE: since EQUALS precomputes this feature, the assignment in the second
	#       if is double. It is mantained to keep the indipendence on the imple-
	#       mentation of EQUALS.
	if not score.is_feature_set[score.EQUALS]:
		c_equals(score,word_from,word_to)
	if score.features[score.EQUALS] == 1:
		score.set_feature(score.WN_MAX_PATH_SIMILARITY,1)
		return
	
	# Compute the actual distance
	_r = 0
	
	for ss_from in wn.synsets(word_from.text):
		for ss_to in wn.synsets(word_to.text):
			current_similarity = ss_to.path_similarity(ss_from)
			if current_similarity > _r:
				_r = current_similarity
	
	score.set_feature(score.WN_MAX_PATH_SIMILARITY,_r)
def generatesynsets(table):
    table2 = []
    table3 = {}
    for i in table:
        if re.findall("N.*", i[1]):
            x = wns.synsets(i[0], pos=wns.NOUN)
        elif re.findall("V.*", i[1]):
            x = wns.synsets(i[0], pos=wns.VERB)
        else:
            continue  # skip tokens that are neither nouns nor verbs
        for z in range(len(x)):
            for y in x[z].lemma_names:
                if y not in ['match', 'be', 'in', 'is']:
                    table2.append((y, 'SYN'))

    # count how often each (lemma, 'SYN') pair occurs
    for i in table2:
        try:
            table3[i] += 1
        except KeyError:
            table3[i] = 1

    return table3
    def polar_values(self, positive_seeds, negative_seeds):
        self.values = []
        POS_tags = list(set(nltk.pos_tag(WordPunctTokenizer().tokenize(self.data))))
        words = []
        for (w, s) in POS_tags:
            w = w.lower()
            POS = self.get_wordnet_pos(s)
            if POS == '' or re.match(r"^[\w]+$", w) is None:
                words.append('0')
            else:
                w += "." + POS + ".01"
                words.append(w)
        negative_set = []
        for nw in negative_seeds:
            for s in wordnet.synsets(nw):
                negative_set.append(s)

        positive_set = []
        for pw in positive_seeds:
            for s in wordnet.synsets(pw):
                positive_set.append(s)

        self.eval_words(words, positive_set, negative_set)
        return self.values
Example #26
def xhyper(words)->[str]:
    '''returns the highest order x hypernyms'''
    x = UI.request_x()
    print("\nNote: this program will use the first parallel synset if there are any")
    print("\nGathering data...")
    result = [x]
    hyp = lambda w: w.hypernyms()
    #This would pick up the deepest branch's depth -> valueAt returns None -> returns None
    #depth = lambda L: isinstance(L, list) and max(map(depth, L))+1
    for i in range(len(words)):
        synsets = wordnet.synsets(words[i]) 
        if len(synsets) > 0:
            for s in range(len(synsets)):
                hyper = wordnet.synsets(words[i])[s].tree(hyp)
                if (hyper[0].pos() in ['a','s','r']):
                    result.append([words[i], 'None', 'None', [None]])
                    continue
                d = first_depth(hyper) - 1
                xhyper = []
                for j in range(x):
                    xhyper.append(valueAt(d - j, hyper))
                    if xhyper[-1] is None:
                        break
                result.append([words[i], pos_redef(hyper[0].pos()), hyper[0], xhyper])
        else:
            result.append([words[i], 'None', 'None', [None]])
    return result
Example #27
def getSynonym(word, tag):
    # map Penn Treebank tags to coarse POS labels, then to WordNet POS constants
    pos_list = {"JJ": "ADJ", "JJR": "ADJ", "JJS": "ADJ",
                "NN": "NOUN", "NNS": "NOUN", "NPS": "NOUN", "NP": "NOUN",
                "RBR": "ADV", "RBS": "ADV", "RB": "ADV",
                "VB": "VERB", "VBD": "VERB", "VBG": "VERB",
                "VBN": "VERB", "VBP": "VERB", "VBZ": "VERB"}
    wn_pos = {"NOUN": wn.NOUN, "VERB": wn.VERB, "ADJ": wn.ADJ, "ADV": wn.ADV}
    li = {}
    if tag in pos_list:
        dd = pos_list.get(tag)
        for key in wn.synsets(word, pos=wn_pos[dd]):
            ss = key.lemma_names
            for s in ss:
                li[s] = s
    return li.keys()
def CollectSemcorSupersenses():
  oracle_matrix = collections.defaultdict(WordSupersenses)
  for sent in semcor.tagged_sents(tag='both'):
    for chk in sent:
      if chk.node and len(chk.node)>3 and chk.node[-3]=='.' and chk.node[-2:].isdigit():
        if chk[0].node.startswith('N'):
          pos = "n"
        elif chk[0].node.startswith('V'):
          pos = "v"
        else:
          continue
        lemmas = chk.node[:-3]
        wnsn = int(chk.node[-2:])
        ssets = wn.synsets(lemmas, pos)
        sorted_ssets = sorted(ssets, key=lambda x: x.name)
        filtered_ssets = None
        for lemma in lemmas.split("_"):  
          if not filtered_ssets or len(filtered_ssets) == 0:
            filtered_ssets = filter(lambda x: lemma in x.name, sorted_ssets)
        if filtered_ssets and len(filtered_ssets) > 0:
          sorted_ssets = filtered_ssets
        try:
          supersense = sorted_ssets[wnsn-1].lexname # e.g. 'noun.group'
        except:
          #print("."),
          continue
        for lemma in lemmas.split("_"):        
          ssets = wn.synsets(lemma, pos)
          if len(ssets) > 0:
            if lemma.isdigit():
              lemma = "0"
            oracle_matrix[lemma].Add(supersense, "semcor")  
  return oracle_matrix      
def userEnteredWordSensor(user_input):
	# Which stage are we currently in -- AS, IM or WI? And what response did the user enter?
	if exactly_right:
		# save our total action plan
		cursor.executeQuery("insert into path values('', session['uid'], session['wordid'])")
		pathid = cursor.executeQuery("select pathid from path where wordid = session['wordid']")
		cursor.executeQuery("insert into waypoint values('', pathid, session['type'], session['waypoint_info'])")
		# LOG the path
		# proceed to the next word
		perform()
	elif nearly_right:
		# "nearly right" means the input matches one of the tags,
		# a synonym of the word, or a synonym of one of the tags
		wid = cursor.executeQuery("select wordid from words where word like session['word']")
		tags = cursor.executeQuery("select tags from words where wordid = wid")
		for tag in tags:
			if tag == word:
				# perform action sequence for NEXT
				break
			# its synonym
			for s in wn.synsets(session['word']):
				if s == user_input:
					# perform action sequence for NEXT
					break
			else:
				# the tag's synonym...?
				for s in wn.synsets(tag):
					if s == user_input:
						# perform action sequence for NEXT
						break
Example #30
def ch03_42_wordnet_semantic_index():
  from nltk.corpus import webtext
  from nltk.corpus import wordnet as wn
  postings = []
  docids = {}
  for (pos, fileid) in enumerate(webtext.fileids()):
    docids[pos] = fileid
    wpos = 0
    words = webtext.words(fileid)
    for word in words:
      try:
        postings.append((word.lower(), (pos, wpos)))
        offset = wn.synsets(word)[0].offset
        postings.append((offset, (pos, wpos)))
        poffset = wn.synsets(word)[0].hypernyms()[0].offset
        postings.append((poffset, (pos, wpos)))
      except IndexError:
        continue
      wpos = wpos + 1
  index = nltk.Index(postings)
  query = "canine"
  qpostings = []
  qpostings.extend([(pos, wpos) for (pos, wpos) in index[query]])
  try:
    offset = wn.synsets(query)[0].offset
    qpostings.extend([(pos, wpos) for (pos, wpos) in index[offset]])
  except IndexError:
    pass
  for (pos, wpos) in qpostings:
    left = webtext.words(docids[pos])[wpos-4:wpos]
    right = webtext.words(docids[pos])[wpos:wpos+4]
    print left, right
Example #31
 def gloss(self, word):
     if wordnet.synsets(word):
         syn = wordnet.synsets(word)[0]
         return syn.definition()
     else:
         return None
Example #32
def get_all_synsets(word, pos=None):
    for ss in wn.synsets(word, pos):
        for lemma in ss.lemma_names():
            # yield (lemma, ss.name())
            yield (lemma, ss)
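# A quick usage sketch for get_all_synsets: it yields one (lemma name, synset) pair per
# lemma of every synset of the word; the word and POS here are illustrative.
from nltk.corpus import wordnet as wn

for lemma, synset in get_all_synsets('dog', pos=wn.NOUN):
    print(lemma, synset.name())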
# synonyms test
from nltk.corpus import wordnet
synonyms = []
for syn in wordnet.synsets('sailing'):
    for lemma in syn.lemmas():
        synonyms.append(lemma.name())
print(synonyms)
Example #34
def generateQuestion():
        
    
    item = [ 'Pens', 'Books', 'Boxes', 'Chocolates', 'Biscuits', 'Mangos', 'Bananas', 'Dolls', 'Flowers', 'Breads', 'Watches', 'Apples', 'Apricots', 'Avocadoes', 'Blackberries', 'Blueberries', 'Cherries', 'Figs', 'toys', 'kiwi(fruit)', 'lemons', 'oranges','Papers', 'Peaches', 'pears', 'pineapples', 'plums', 'raspberries', 'strawberries', 'watermelons']
    z = (random.choice(item))
    addition = ['originally', 'in the first', 'in the beginning', 'earlier', 'to begin with', 'primitively', 'at first', 'initially', 'incipiently']
    az1 = (random.choice(addition))
    __title__ = 'names'
    __version__ = '0.2'
    __author__ = 'Trey Hunner'
    __license__ = 'MIT'

#

    def multiwordReplace(text, wordDic):
        """
        take a text and replace words that match a key in a dictionary with
        the associated value, return the changed text
        """
        rc = re.compile('|'.join(map(re.escape, wordDic)))

        def translate(match):
            return wordDic[match.group(0)]

        return rc.sub(translate, text)


    p1 = random.randint(2, 9)
    p2 = random.randint(10, 50)
    q = str(p1)
    x = str(p2)


#    def get_name(filename):
#        selected = random.random() * 90
#        with open(filename) as name_file:
#            for line in name_file:
#                name, _, cummulative, _ = line.split()
#                if float(cummulative) > selected:
#                    return name


    def get_first_name(gender=None):
        if gender not in ('male', 'female'):
            gender = random.choice(('male', 'female'))
        return get_name(FILES['first:%s' % gender]).capitalize()


    def get_last_name():
        return get_name(FILES['last']).capitalize()


    def get_full_name(gender=None):
        return u"%s %s" % (get_first_name(gender), get_last_name())


    tn = (names.get_first_name())
    p = (names.get_first_name())

    str1 = """John had some marbles, 
    Jim gave him 3 more, 
    Now John has 8 marbles. 
    How many marbles did John 
    have to begin with ?"""
    # the dictionary has target_word : replacement_word pairs
    # print (str1)
    wordDic = {
        'John': tn,
        'marbles': z,
        'Jim': p,
        '3': q,
        '8': x,
        'to begin with' : az1}
    # call the function and get the changed text
    str2 = multiwordReplace(str1, wordDic)
    str3 = (str2)
    #print (str2)
    output = ""

    # Use WordNet synsets to replace nouns, verbs, etc. with synonyms
    # Load the pretrained Punkt sentence tokenizer

    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    # Tokenize the text
    tokenized = tokenizer.tokenize(str3)

    # Get the list of words from the entire text
    words = word_tokenize(str3)

    # Identify the parts of speech
    tagged = nltk.pos_tag(words)

    

    for i in range(0, len(words)):
        replacements = []

        # Only replace nouns with nouns, verbs with verbs, etc.
        for syn in wordnet.synsets(words[i]):

            # Do not attempt to replace proper nouns or determiners
            if tagged[i][1] == 'NNP' or tagged[i][1] == 'DT':
                break

            # The POS tagger returns tags like NNP, VBP etc.,
            # but WordNet synset names carry tags like .n.
            # So we take the first character of the tag (e.g. n from NNP)
            # and check whether the synset name contains ".n.", ".v." etc.
            word_type = tagged[i][1][0].lower()
            if "." + word_type + "." in syn.name():
                # extract the word only
                r = syn.name()[0:syn.name().find(".")]
                replacements.append(r)

        if len(replacements) > 0:
            # Choose a random replacement
            replacement = replacements[randint(0, len(replacements) - 1)]
            output = output + " " + replacement
        else:
            # If no replacement could be found, then just use the
            # original word
            output = output + " " + words[i]
    
    strr = str3
    tn = [int(s) for s in strr.split() if s.isdigit()]
    t[0] = tn[0]
    t[1] = tn[1]
    return strr
def my_get_term_sim(term1, term2, score_level):
    term1 = term1.encode()
    term2 = term2.encode()
    term1 = str.lower(term1).replace('\t', ' ').replace('-',
                                                        ' ').replace(',', '')
    term2 = str.lower(term2).replace('\t', ' ').replace('-',
                                                        ' ').replace(',', '')
    term2.strip()
    term1.strip()

    term1 = [w for w in term1.split(" ") if not w in filter_words]
    term2 = [w for w in term2.split(" ") if not w in filter_words]

    words1 = []
    for word in term1:
        if word is "" or len(wn.synsets(word)) == 0:
            continue
        else:
            words1.append(word)

    words2 = []
    for word in term2:
        if word is "" or len(wn.synsets(word)) == 0:
            continue
        else:
            words2.append(word)
    match = 0.0

    max_i = len(words1)
    max_j = len(words2)
    if max_i <= 0 or max_j <= 0:
        direct = [x for x in term1 if x in term2]
        return 2 * (float(len(direct))) / (len(term1) + len(term2))

    flag_i = -1
    flag_j = -1

    max_step_num = min(max_i, max_j)

    max_sim = []
    for k in range(max_step_num):
        max_temp = -1

        for i in range(k, max_i):
            for j in range(k, max_j):
                sim = get_wordnet_sim(words1[i], words2[j])
                if sim > 1 or sim < 0:
                    print sim
                    raise Exception("Sorry, similarity score is not in [0,1]!")
                if sim > max_temp:
                    flag_i = i
                    flag_j = j
                    max_temp = sim

        temp = words1[flag_i]
        words1[flag_i] = words1[k]
        words1[k] = temp
        temp = words2[flag_j]
        words2[flag_j] = words2[k]
        words2[k] = temp

        max_sim.append(max_temp)

        match += f_identify(max_temp, score_level)

    return 2 * match / (max_i + max_j)
import nltk
from nltk.corpus import wordnet as wn

synonyms_1 = wn.synsets("auto")
print(synonyms_1)
synonyms_2 = wn.synsets("car")
print(synonyms_2)
print()
if len(synonyms_1) == 0 or len(synonyms_2) == 0:
    print("No results")
    # return None, None
else:
    max_sim = -1
    best_pair = None, None
    for synonym in synonyms_1:
        for synonym_2 in synonyms_2:
            sim = synonym.path_similarity(synonym_2)
            if sim is None:
                continue
            print(1 - sim)
            if sim > max_sim:
                max_sim = sim
                best_pair = synonym, synonym_2
    max_sim = 1 - max_sim
    print(best_pair)
    print(max_sim)
    # return best_pair
Example #37
from nltk.corpus import wordnet

antonyms = []

for syn in wordnet.synsets("pain"):

    for l in syn.lemmas():

        if l.antonyms():

            antonyms.append(l.antonyms()[0].name())

print(antonyms)
def tps_word_embeddings(
    word_embeddings_name: str,
    neighbourhood_sizes: list,
    semeval_target_words: np.ndarray,
    semeval_target_words_gs_clusters: np.ndarray,
    word_embeddings_normalized: np.ndarray,
    word_to_int: dict,
    word_vocabulary: list,
    num_top_k_words_frequencies: int,
    output_dir: str,
    word_counts: Optional[list] = None,
    ann_instance: ApproxNN = None,
) -> None:
    """
    Computes TPS for word embeddings and saves correlation plots.

    Parameters
    ----------
    word_embeddings_name : str
        Name of the word embeddings.
    neighbourhood_sizes : list
        Neighbourhood sizes to compute TPS scores of.
    semeval_target_words : np.ndarray
        SemEval-2010 task 14 target words.
    semeval_target_words_gs_clusters : np.ndarray
        SemEval-2010 task 14 GS clusters.
    word_embeddings_normalized : np.ndarray
        Normalized word embeddings.
    word_to_int : dict
        Dictionary for mapping a word to its integer representation.
    word_vocabulary : list
        List of words/word ints to use for the vocabulary.
    num_top_k_words_frequencies : int
        Number of top words to use when computing TPS scores vs. word frequencies.
    output_dir : str
        Output directory.
    word_counts : list, optional
        List containing word counts.
    ann_instance : ApproxNN
        ApproxNN instance to use for computing TPS scores.
    """
    # Ensure output directory exists
    output_dir_plots = join(output_dir, word_embeddings_name)
    makedirs(output_dir_plots, exist_ok=True)

    # Only use the SemEval-2010 task 14 words in vocabulary
    semeval_target_words_in_vocab_filter = [
        i for i, word in enumerate(semeval_target_words) if word in word_to_int
    ]
    semeval_target_words_in_vocab = semeval_target_words[
        semeval_target_words_in_vocab_filter
    ]
    semeval_target_words_gs_clusters_in_vocab = semeval_target_words_gs_clusters[
        semeval_target_words_in_vocab_filter
    ]

    tps_vs_gs_key = "TPS_n vs. GS"
    tps_vs_synsets_key = "TPS_n vs. synsets"
    tps_vs_frequency_key = "TPS_n vs. frequency"
    result_dict: dict = {
        "n": neighbourhood_sizes,
        tps_vs_gs_key: [],
        tps_vs_synsets_key: [],
    }
    has_word_counts = word_counts is not None
    if has_word_counts:
        result_dict[tps_vs_frequency_key] = []

    for neighbourhood_size in neighbourhood_sizes:
        print(f"-- Neighbourhood size: {neighbourhood_size} --")

        # -- Compute TPS scores and correlation vs GS words --
        output_plot_filepath = join(
            output_dir_plots,
            f"tps_{neighbourhood_size}_vs_gs.pdf",
        )
        output_tps_filepath = join(
            output_dir_plots,
            f"tps_{neighbourhood_size}_vs_gs.npy",
        )
        if not isfile(output_plot_filepath):
            print("Computing TPS scores for GS words")
            tps_scores_semeval = tps_multiple(
                target_words=semeval_target_words_in_vocab,
                word_to_int=word_to_int,
                neighbourhood_size=neighbourhood_size,
                word_embeddings_normalized=word_embeddings_normalized,
                ann_instance=ann_instance,
                n_jobs=-1,
                progressbar_enabled=True,
            )

            # Compute correlation vs GS word meanings
            tps_score_vs_gs_correlation, _ = pearsonr(
                x=tps_scores_semeval, y=semeval_target_words_gs_clusters_in_vocab
            )
            result_dict[tps_vs_gs_key].append(tps_score_vs_gs_correlation)

            # Save plot of TPS scores vs. GS
            tps_word_embeddings_correlation_plot(
                tps_scores=tps_scores_semeval,
                y_values=semeval_target_words_gs_clusters_in_vocab,
                y_label="Clusters in GS",
                tps_vs_y_correlation=tps_score_vs_gs_correlation,
                output_plot_filepath=output_plot_filepath,
                neighbourhood_size=neighbourhood_size,
            )

            # Save TPS scores to file
            np.save(output_tps_filepath, tps_scores_semeval)

        # -- Compute TPS scores and correlation vs Wordnet synsets words --
        output_plot_filepath = join(
            output_dir_plots,
            f"tps_{neighbourhood_size}_vs_synsets.pdf",
        )
        output_tps_filepath = join(
            output_dir_plots,
            f"tps_{neighbourhood_size}_vs_synsets.npy",
        )
        if not isfile(output_plot_filepath):

            # Find words in vocabulary that have synsets in Wordnet
            tps_scores_wordnet_synsets = []
            wordnet_synsets_words_in_vocab = []
            wordnet_synsets_words_in_vocab_meanings = []
            print("Computing TPS scores for words in vocabulary with Wordnet synsets")
            for word in tqdm(word_vocabulary):
                num_synsets_word = len(wn.synsets(word))
                if num_synsets_word > 0:
                    wordnet_synsets_words_in_vocab.append(word)
                    wordnet_synsets_words_in_vocab_meanings.append(num_synsets_word)
            wordnet_synsets_words_in_vocab = np.array(wordnet_synsets_words_in_vocab)

            tps_scores_wordnet_synsets = tps_multiple(
                target_words=wordnet_synsets_words_in_vocab,
                word_to_int=word_to_int,
                neighbourhood_size=neighbourhood_size,
                word_embeddings_normalized=word_embeddings_normalized,
                ann_instance=ann_instance,
                n_jobs=-1,
                progressbar_enabled=True,
            )

            # Compute correlation vs Wordnet synsets
            tps_score_vs_wordnet_synsets_correlation, _ = pearsonr(
                x=tps_scores_wordnet_synsets, y=wordnet_synsets_words_in_vocab_meanings
            )
            result_dict[tps_vs_synsets_key].append(
                tps_score_vs_wordnet_synsets_correlation
            )

            # Save plot of TPS scores vs. Wordnet synsets
            tps_word_embeddings_correlation_plot(
                tps_scores=tps_scores_wordnet_synsets,
                y_values=wordnet_synsets_words_in_vocab_meanings,
                y_label="Synsets in WordNet",
                tps_vs_y_correlation=tps_score_vs_wordnet_synsets_correlation,
                output_plot_filepath=output_plot_filepath,
                neighbourhood_size=neighbourhood_size,
            )

            # Save TPS scores to file
            np.save(output_tps_filepath, tps_scores_wordnet_synsets)

        # -- Compute TPS scores and correlation vs word frequencies --
        output_plot_filepath = join(
            output_dir_plots,
            f"tps_{neighbourhood_size}_vs_frequency.pdf",
        )
        output_tps_filepath = join(
            output_dir_plots,
            f"tps_{neighbourhood_size}_vs_frequency.npy",
        )
        if has_word_counts and not isfile(output_plot_filepath):
            print(
                f"Computing TPS scores for top {num_top_k_words_frequencies} words vs. word frequencies"
            )
            tps_score_word_frequencies = tps_multiple(
                target_words=word_vocabulary[:num_top_k_words_frequencies],
                word_to_int=word_to_int,
                neighbourhood_size=neighbourhood_size,
                word_embeddings_normalized=word_embeddings_normalized,
                ann_instance=ann_instance,
                n_jobs=-1,
                progressbar_enabled=True,
            )

            # Compute correlation vs word frequencies
            tps_score_vs_word_frequency_correlation, _ = pearsonr(
                x=tps_score_word_frequencies,
                y=word_counts[:num_top_k_words_frequencies],
            )
            result_dict[tps_vs_frequency_key].append(
                tps_score_vs_word_frequency_correlation
            )

            # Save plot of TPS scores vs. word frequencies
            tps_word_embeddings_correlation_plot(
                tps_scores=tps_score_word_frequencies,
                y_values=word_counts[:num_top_k_words_frequencies],
                y_label="Word frequency",
                tps_vs_y_correlation=tps_score_vs_word_frequency_correlation,
                output_plot_filepath=output_plot_filepath,
                neighbourhood_size=neighbourhood_size,
            )

            # Save TPS scores to file
            np.save(output_tps_filepath, tps_score_word_frequencies)
Example #39
from nltk.corpus import wordnet

word = input("Enter word : ")
# hypernyms are generic word
hypernyms = []
# hyponyms are specific words than given words
hyponyms = []

for syn in wordnet.synsets(word):

    if syn.hyponyms():
        for hypo in syn.hyponyms():
            if hypo.lemmas():
                for l in hypo.lemmas():
                    hyponyms.append(l.name())

    if syn.hypernyms():
        for hyper in syn.hypernyms():
            if hyper.lemmas():
                for l in hyper.lemmas():
                    hypernyms.append(l.name())

hypernyms = set(hypernyms)
hyponyms = set(hyponyms)

print("hypernyms :", ", ".join(hypernyms))
print("\n")
print("hyponyms :", ", ".join(hyponyms))
Example #40
 def pos(self, word):
     if wordnet.synsets(word):
         syn = wordnet.synsets(word)[0]
         return syn.pos()
     else:
         return None
Example #41
    def get_verbnet_args(verb, verbose=False):
        lemmatizer = WordNetLemmatizer()
        lemmatized_verb = lemmatizer.lemmatize(verb.lower(), 'v')

        classids = verbnet.classids(lemma=lemmatized_verb)
        if verbose:
            print('Class IDs for "{}": {}'.format(lemmatized_verb, classids))

        if len(classids) < 1:
            if verbose:
                print(
                    'No entry found on verbnet for "{}". Attempting WordNet synsets!'
                    .format(lemmatized_verb))

            wn_synsets = wordnet.synsets(lemmatized_verb)
            for synset in wn_synsets:
                if len(synset.lemmas()) < 1:
                    continue

                candidate = str(synset.lemmas()[0].name())
                classids = verbnet.classids(lemma=candidate)
                if verbose:
                    print('Class IDs for "{}": {}'.format(candidate, classids))

                if len(classids) > 0:
                    break

            if len(classids) < 1:
                if verbose:
                    print(
                        'Unable to find entries on verbnet for neither of the synsets... Will go recursive now (which is not a good thing!)'
                    )

                for synset in wn_synsets:
                    if len(synset.lemmas()) < 1:
                        continue

                    candidate = str(synset.hypernyms()[0].lemmas()[0].name())
                    return NLPUtils.get_verbnet_args(candidate,
                                                     verbose=verbose)

                if verbose:
                    print('Exhausted attempts... returning an empty list.')
                return []

        for id in classids:
            class_number = id[id.find('-') + 1:]
            try:
                v = verbnet.vnclass(class_number)
                roles = [
                    t.attrib['type'] for t in v.findall('THEMROLES/THEMROLE')
                ]
                pass
            except ValueError:
                print('VN class number not found: {}'.format(class_number))

                # Will handle these both below
                v = [None]
                roles = []
                pass

            while len(roles) < 1 and len(v) > 0:
                fallback_class_number = class_number[:class_number.rfind('-')]
                if verbose:
                    print('No roles found for class {}, falling back to {}.'.
                          format(class_number, fallback_class_number))
                class_number = fallback_class_number

                try:
                    v = verbnet.vnclass(class_number)
                    roles = [
                        t.attrib['type']
                        for t in v.findall('THEMROLES/THEMROLE')
                    ]
                    pass
                except ValueError:
                    # Go on with the loop
                    v = [None]
                    roles = []
                    pass

            if len(roles) > 0:
                if verbose:
                    print('Roles found: {}'.format(roles))

                return roles

        return None
        sense_key_regex, sense_key).groups()
    ss_idx = '.'.join([lemma, synset_types[int(ss_type)], lex_id])
    return wn.synset(ss_idx)


#x = "visit%2:38:00::"
#y = "visit%2:41:02::"

#x = "come%2:38:04::"
#y = "come%2:30:01::"

#x = "quit%2:38:00::"
x = "steal%2:38:01::"
#print(synset_from_sense_key(x))
#print(synset_from_sense_key(y))
ls = []
syn = wn.synsets("quit", pos=wn.VERB)
print(syn)
for item in syn:
    print(item)
    ls = ls + wn.synset(item.name()).lemma_names()
    print(wn.synset(item.name()).lemma_names())

print()
print(ls)
print("=============================")
#print(wn.synset("embark.v.02").lemma_names())
#eat = wn.lemma('arrive.v.01.arrive')
#print(eat.key())
#print(wn.synset("come.v.04").lemma_names())
Example #43
def openfile(f):
    x = []
    y = []
    finial = []
    i = 2
    csvfile = open(f, 'rb')
    reader = csv.DictReader(csvfile)
    similar = 0.0
    word_count = 0.0
    flag = 0
    text1 = ''
    text2 = ''
    text1_main = ''
    text2_main = ''
    text1_split = []
    text2_split = []
    synonyms = None
    word_count = 0
    similar = 0
    for row in reader:
        synonyms = None
        if i == 0:
            for j in text1_split:
                flag = 0
                if j not in text2_split:
                    synonyms = wordnet.synsets(j)
                    synonyms = set(
                        chain.from_iterable(
                            [word.lemma_names() for word in synonyms]))
                    for k in synonyms:
                        if k in text2_split:
                            flag = 1
                else:
                    similar += 1
                if flag == 1:
                    similar += 1
                word_count += 1

            for j in text2_split:
                flag = 0
                if j not in text1_split:
                    synonyms = wordnet.synsets(j)
                    synonyms = set(
                        chain.from_iterable(
                            [word.lemma_names() for word in synonyms]))
                    for k in synonyms:
                        if k in text1_split:
                            flag = 1
                else:
                    similar += 1
                if flag == 1:
                    similar += 1
                word_count += 1
            print similar / word_count
            i = 2
            word_count = 0
            similar = 0
            similar = 0.0
            word_count = 0.0
            flag = 0
            text1 = ''
            text1_main = ''
            text2 = ''
            text2_main = ''
        elif i == 1:
            text1 = main_text(row['url'])
            text1_split = text1.split(' ')
            text1_split = [x.upper() for x in text1_split if x]
        elif i == 2:
            text2 = main_text(row['url'])
            text2_split = text2.split(' ')
            text2_split = [x.upper() for x in text2_split if x]
    csvfile.close()
"""
Created on Thu Apr 29 11:12:07 2018

@author: Ayshwarya
"""


import nltk
from nltk.corpus import wordnet 
import re

# READ FILE Railway Station
Inputfile = open("Railway station.txt", "r")
nouns = [] 

# print sample synset
syn =  wordnet.synsets("Animal") 
print(syn[0].lemmas()[0])

# 
File = [ line for line in Inputfile ]
String = '' . join(File)
sentences = re.split(r'[.!?]', String)

# Find out all the noun words in all the sentences of the text
for sentence in sentences:
    for word,pos in nltk.pos_tag(nltk.word_tokenize(str(sentence))):
        if (pos == 'NNPS' or pos == 'NN' or pos == 'NNS' or pos == 'NNP'):
            nouns.append(word)

print("LIST OF NOUNS IN THE TEXT")
print("-------------------------")
def getWUPSimilarity(w1, w2):
    doc1 = nlp(w1)
    doc2 = nlp(w2)
    if doc1[0].lemma_ == doc2[0].lemma_:
        return 1
    synonyms, _ = getSynAnt(w1)
    if w2 in synonyms:
        return 0.9
    synonyms, _ = getSynAnt(w2)
    if w1 in synonyms:
        return 0.9

    # Try each POS category in a fixed order for both words and return the
    # Wu-Palmer similarity of the first synsets found for each.
    pos_order = [wordnet.NOUN, wordnet.VERB, wordnet.ADJ, wordnet.ADV]
    for pos1 in pos_order:
        synw1s = wordnet.synsets(w1, pos1)
        if len(synw1s) > 0:
            for pos2 in pos_order:
                synw2s = wordnet.synsets(w2, pos2)
                if len(synw2s) > 0:
                    return synw1s[0].wup_similarity(synw2s[0])
Example #46
list_synonyms.pop(0)
list_synonyms.pop(0)  # remove the header rows
tuplas_synonym_UNESCO.pop(0)  # remove the header rows
d = {}
list_synonyms_clear = [
    d.setdefault(x, x) for x in list_synonyms if x not in d
]  # remove duplicate values
list_synonyms_clear.pop()

dict_synonym = {}
dict_synonym_UNESCO = {}
for word in list_synonyms_clear:
    synonyms = []
    cont = 0
    for syn in wordnet.synsets(word, lang='spa'):
        for l in syn.lemmas(lang='spa'):
            synonyms.append(l.name().lower())
            cont += 1
            #if l.antonyms():
            #antonyms.append(l.antonyms()[0].name())
    if cont > 1:
        dict_synonym[word] = set(synonyms)
    else:
        list_tmp = []
        for i in tuplas_synonym_UNESCO:
            if word == i[0]:
                list_tmp.append(i[1])
        if len(list_tmp) > 1:
            dict_synonym[word] = set(list_tmp)
Example #47
def wsd_lesk(raw_df, algorithm_choice):
    """This finds the synset of the word using
        the original sentence as context and
        different lesk algorithms from nltk-
        and pywsd-packages.

        Algorithm choices are: 1. nltk's lesk,
        2. pywsd simple_lesk, 3. pywsd adapted_lesk (advanced),
        4. pywsd cosine_lesk."""
    start = timer()
    algorithm_dict = {1: "nltk_lesk", 2: "pywsd_simple_lesk",
                      3: "pywsd_advanced_lesk", 4: "pywsd_cosine_lesk"}
    df = raw_df
    full_aspect_synset_list = []
    full_aspect_synset_list_definition = []
    aspect_synset_list_definition = []
    aspect_synset_list = []
    opinion_synset_list = []
    opinion_synset_list_definition = []
    full_opinion_synset_list = []
    full_opinion_synset_list_definition = []
    aspect_opinion = ["aspect_tags", "opinion_tags"]
    tokenized_sentences = raw_df["tokenized_sentence"]
    non_tokenized_sentences = raw_df["original_text"]

    for opinion_list in aspect_opinion:
        for i, phrase in enumerate(df[opinion_list]):
            multiple_word_found = False
            for j, word in enumerate(phrase):
                special_word = False
                if multiple_word_found is False:
                    # Check here for special words such as "bug".
                    aspect = check_for_special_word(word)
                    if aspect is not None:
                        special_word = True
                    wn_check = []
                    if len(phrase) >= 2:
                        k = 0
                        temporary_combined_word = []
                        while k < len(phrase):
                            temporary_combined_word.append(phrase[k][0])
                            k += 1
                        combined_word_string = '_'.join(temporary_combined_word)
                        wn_check = wn.synsets(combined_word_string, pos=find_wordnet_pos(word[1]))
                        multiple_word_found = True
                    if len(wn_check) == 0:
                        wn_check = wn.synsets(word[0], pos=find_wordnet_pos(word[1]))
                        multiple_word_found = False
                    if len(wn_check) > 0:
                        if special_word is False:
                            if algorithm_choice == 1:
                                if multiple_word_found is True:
                                    aspect = lesk(tokenized_sentences[i], combined_word_string, find_wordnet_pos(word[1]))
                                else:
                                    aspect = lesk(tokenized_sentences[i], word[0], find_wordnet_pos(word[1]))
                            if algorithm_choice == 2:
                                if multiple_word_found is True:
                                    aspect = pylesk.simple_lesk(non_tokenized_sentences[i], combined_word_string, find_wordnet_pos(word[1]))
                                else:
                                    aspect = pylesk.simple_lesk(non_tokenized_sentences[i], word[0], find_wordnet_pos(word[1]))
                            if algorithm_choice == 3:
                                if multiple_word_found is True:
                                    aspect = pylesk.adapted_lesk(non_tokenized_sentences[i], combined_word_string,
                                                             find_wordnet_pos(word[1]))
                                else:
                                    aspect = pylesk.adapted_lesk(non_tokenized_sentences[i], word[0], find_wordnet_pos(word[1]))
                            if algorithm_choice == 4:
                                if multiple_word_found is True:
                                    aspect = pylesk.cosine_lesk(non_tokenized_sentences[i], combined_word_string,
                                                            find_wordnet_pos(word[1]))
                                else:
                                    aspect = pylesk.cosine_lesk(non_tokenized_sentences[i], word[0], find_wordnet_pos(word[1]))
                        if aspect is not None:
                            if opinion_list == "aspect_tags":
                                aspect_synset_list.append(aspect)
                                aspect_synset_list_definition.append(aspect.definition())
                            else:
                                opinion_synset_list.append(aspect)
                                opinion_synset_list_definition.append(aspect.definition())
            if opinion_list == "aspect_tags":
                full_aspect_synset_list.append(aspect_synset_list)
                full_aspect_synset_list_definition.append(aspect_synset_list_definition)
                aspect_synset_list = []
                aspect_synset_list_definition = []
            else:
                full_opinion_synset_list.append(opinion_synset_list)
                full_opinion_synset_list_definition.append(opinion_synset_list_definition)
                opinion_synset_list = []
                opinion_synset_list_definition = []
    df[algorithm_dict[algorithm_choice] + "_aspect_synset"] = pd.Series(full_aspect_synset_list).values
    df[algorithm_dict[algorithm_choice] + "_aspect_definition"] = pd.Series(full_aspect_synset_list_definition).values
    df[algorithm_dict[algorithm_choice] + "_opinion_synset"] = pd.Series(full_opinion_synset_list).values
    df[algorithm_dict[algorithm_choice] + "_opinion_definition"] = pd.Series(full_opinion_synset_list_definition).values
    end = timer()
    logging.debug("WSD Lesk Time: %.2f seconds" % (end - start))
    return df
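# For reference, a minimal sketch of nltk's own lesk (algorithm choice 1 in
# wsd_lesk above) resolving one ambiguous word against its sentence context.
# The example sentence and target word are invented for illustration only.
from nltk import word_tokenize
from nltk.wsd import lesk

context = word_tokenize("I sat on the bank of the river and watched the water")
sense = lesk(context, "bank", pos="n")  # returns a Synset or None
if sense is not None:
    print(sense.name(), "-", sense.definition())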
    if not line:
        break
    line = line.replace('\n', '')
    line = line.split(" ", 1)
    new_line = line[0]
    line[1] = line[1].lower()
    line[1] = line[1].translate(str.maketrans('', '', string.punctuation))
    word_tokens = word_tokenize(line[1])
    filtered_sentence = [w for w in word_tokens if w not in stop_words]

    synonyms = []

    count = 0
    for x in filtered_sentence:

        for syn in wordnet.synsets(x):
            for l in syn.lemmas():
                if (count < 3):
                    if l.name() not in synonyms:
                        synonyms.append(l.name())
                        count += 1

        count = 0

    synonyms_string = ' '.join(synonyms)
    new_line = " ".join([str(new_line), synonyms_string])
    synonyms = []
    fout.write(new_line)
    fout.write('\n')

f.close()
Example #49
 def candidates_for_word_type(self, trips, word, pos):
     ss = wn.synsets(word, pos)
     res = {s: self.trips_candidate(trips, s) for s in ss}
     return {s: t for s, t in res.items() if t}
def get_synsets(word):

    synsets = wn.synsets(word)
    return synsets
Example #51
    def precisionatk_nltk(self, pred_words, klist):
        '''
        precision at k function which takes into account polysemy
        using NLTK WordNet
        '''
        try:
            import nltk
            nltk.data.path = ['./nltk_data']
            from nltk.corpus import wordnet as wn
        except ImportError:
            raise RuntimeError("Need NLTK for this function.")

        nltk_map = {
            'es': 'spa',
            'fr': 'fra',
            'it': 'ita',
            'en': 'eng',
            'zh': 'cmn'
        }

        def set_correct(correct, val, prediction):
            for i, k in enumerate(klist):
                if len(set(correct) & set(prediction[:k])) > 0:
                    val[i] = 1

        ret_val = 1. * np.zeros_like(klist)

        word_map = {}
        for idx, (src, gold) in enumerate(self.word_map):
            if src not in word_map:
                word_map[src] = ([], pred_words[idx])
            word_map[src][0].append(gold)
        d = len(word_map)

        for word in word_map:
            prediction = self.tgt.ix2word[word_map[word][1]]
            val = np.zeros_like(klist)
            src_word = self.src.ix2word[word]
            gold = self.tgt.ix2word[word_map[word][0]]
            '''
            Normal Dictionary Matching
            '''
            set_correct(gold, val, prediction)
            '''
            Checking if any sense of the gold word matches with the prediction
            '''

            if self.tgt.name not in nltk_map:
                ret_val += val
                continue
            tgt_lang = nltk_map[self.tgt.name]
            synsets = [w for gold_word in gold for w in wn.synsets(gold_word)]
            similar_words = [
                w for synset in synsets for w in synset.lemma_names(tgt_lang)
            ]
            set_correct(similar_words, val, prediction)
            '''
            Checking if the prediction is the translation of any sense of the source word
            '''

            if self.src.name not in nltk_map:
                ret_val += val
                continue
            synsets = wn.synsets(src_word)
            similar_words = [
                w for synset in synsets for w in synset.lemma_names(tgt_lang)
            ]
            set_correct(similar_words, val, prediction)

            ret_val += val

        ret_val *= (100. / d)
        return ret_val, len(set(self.word_map[:, 0]))
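# A minimal sketch of the cross-lingual lookup used above: every English synset of
# a word is expanded into its lemma names in the target language via the Open
# Multilingual Wordnet. Assumes the WordNet and OMW corpora have been downloaded
# with nltk.download(); the example word and language code are illustrative.
from nltk.corpus import wordnet as wn


def translations(word, lang="spa"):
    return sorted({lemma
                   for synset in wn.synsets(word)
                   for lemma in synset.lemma_names(lang)})


# print(translations("dog", "spa"))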
Example #52
def find_wordnet_synonyms_nouns(noun_synset):
    start = timer()
    original_synset = noun_synset
    synonym_words = []
    # print("Original: %s" % (original_synset))

    # This is for the synonym words from this exact synset.
    # for synonym_word in original_synset.lemma_names():
    #     print("Original: %s synonym: %s" % (
    #     original_synset, synonym_word))
    #     if synonym_word != original_synset.lemma_names()[0]:
    #         synonym_words.append(synonym_word)

    # This is for the synonym synsets that compare
    # against the original synset.
    if original_synset.pos() == "n":
        for synonym_synset in wn.synsets(original_synset.lemma_names()[0], original_synset.pos()):
            # print(synonym)
            if (original_synset != synonym_synset) and (original_synset.lch_similarity(synonym_synset) >= 2.5):
                if synonym_synset.lemma_names()[0] not in synonym_words:
                    synonym_words.append(synonym_synset.lemma_names()[0])
                print("Original: %s other synsets: %s LCH-similarity %s" % (
                    original_synset, synonym_synset, original_synset.lch_similarity(synonym_synset)))
                for nested_hyponym_synset in synonym_synset.hyponyms():
                    if original_synset.lch_similarity(nested_hyponym_synset) >= 2.5:
                        synonym_words.append(nested_hyponym_synset.lemma_names()[0])
                        print("Other synset: %s nested_hyponym words: %s LCH(original) %s" % (synonym_synset, nested_hyponym_synset, original_synset.lch_similarity(nested_hyponym_synset)))

                        # This goes into the hyponyms of hyponyms, seems too deep for now.
                        # for double_nested_hyponym_synset in nested_hyponym_synset.hyponyms():
                        #     print("Hypernym: %s double_nested_hyponym words: %s LCH(original) %s" % (
                        #     nested_hyponym_synset, double_nested_hyponym_synset, original_synset.lch_similarity(double_nested_hyponym_synset)))

                # This iterates first to a higher level, e.g. from Synset computer.n.01
                # to machine.n.01, and then over all the hypernyms from machine.n.01.
                # This doesn't make sense at this level, as it produces too much noise
                # and all the distances are always the same.
                # for hypernym_synset in original_synset.hypernyms():
                #     print("Original: %s nested_hypernym words: %s LCH-similarity %s" % (original_synset, hypernym_synset, original_synset.lch_similarity(hypernym_synset)))
                #     for nested_synonym_synset in hypernym_synset.hyponyms():
                    #     print("Hypernym: %s nested_synonym synset: %s LCH (original&nested) %s" % (hypernym_synset, nested_synonym_synset, original_synset.lch_similarity(nested_synonym_synset)))
        # print("Original: %s other synset words: %s WUP-similarity %s" % (
        #     original_synset, synonym_synset, original_synset.wup_similarity(synonym_synset)))

    # This part deals with adjectives, that
    # have different relations than nouns.
    # if original_synset.pos() == "a":

        # This is for antonyms (opposites e.g. dry-wet), it
        # loops through all synonyms, although antonym seems
        # to be assigned only to the first for the set.
        #     for synonym in original_synset.lemmas():
        #         for antonym in synonym.antonyms():
        #             print("Original: %s antonym: %s" % (
        #                 synonym, antonym))

        # This is for similar adjectives, which are
        # also called satellites:
        # https://wordnet.princeton.edu/documentation/wngloss7wn
        # for similar in original_synset.similar_tos():
        #     print("Original: %s satellite_adjective: %s" % (
        #         original_synset, similar))
        #     synonym_words.append(similar.lemma_names()[0])
    end = timer()
    logging.debug("Wordnet cycle: %.2f seconds" % (end - start))
    return synonym_words
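# A minimal sketch of the Leacock-Chodorow filter used above: lch_similarity is
# only defined between synsets of the same part of speech, and larger values mean
# closer synsets. The 2.5 threshold and the example synsets are illustrative only.
from nltk.corpus import wordnet as wn

computer = wn.synset("computer.n.01")
for candidate in wn.synsets("machine", pos=wn.NOUN):
    score = computer.lch_similarity(candidate)
    if score is not None and score >= 2.5:
        print(candidate.name(), round(score, 2))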
Example #53
def run(queryList):

    # stemmer = PorterStemmer()
    stemmer = SnowballStemmer("english")

    f = open("data/expanded.txt", "w+")
    for query in queryList:
        querySplitted = query.split(",")

        # tokenizing the query
        tokens = nltk.word_tokenize(querySplitted[1])

        # removing stop words in the query
        filtered_words = [word for word in tokens if word not in stopwords.words('english')]

        # pos tagging of tokens
        pos = nltk.pos_tag(filtered_words)

        synonyms = []  # synonyms of all the tokens

        index = 0
        # iterating through the tokens
        for item in filtered_words:
            synsets = wordnet.synsets(item)

            if not synsets:
                # stemming the tokens in the query
                synsets = wordnet.synsets(stemmer.stem(item))

            # synonyms of the current token
            currentSynonyms = []
            currentPOS = get_wordnet_pos(pos[index])

            # iterating through the synsets
            for i in synsets:
                # first we check if token and synset have the same part of speech
                if str(i.pos()) == str(currentPOS):
                    for j in i.lemmas():
                        if j.name() not in currentSynonyms:  # if we have not added it yet
                            currentSynonyms.append(j.name().replace("_", " "))
            synonyms.append(currentSynonyms)  # collect once per token, outside the synset loop
            index += 1

        f.write(querySplitted[0] + ", " + querySplitted[1] + ", ")

        # removing duplicate lists in the synonyms list
        tmp = []
        for elem in synonyms:
            if elem and elem not in tmp:
                tmp.append(elem)
        synonyms = tmp

        # now that we have all the synonyms
        for x in itertools.product(*synonyms):
            current = ""
            for item in x:
                current += item
                current += " "
            current += ", "
            f.write(current)
        f.write("\n")
Example #54
# * **Hypernym:** A more general synset that can cover several words. The example from class is that artifact is a hypernym of motor vehicle (see the short hypernym/hyponym sketch after the synset exploration below).
# * **Hyponym:** A synset that is not general but more specific.
#
# We import **WordNet**
#

# In[16]:

nltk.download('omw')
from nltk.corpus import wordnet as wn

# **synset:** a group of synonyms for a word.

# In[17]:

ss = wn.synsets('carro', lang='spa')
ss

# Exploring the synsets

# In[18]:

for syn in ss:
    print(syn.name(), ': ', syn.definition())
    for name in syn.lemma_names():
        print(' * ', name)
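
# A small follow-up to the hypernym/hyponym notes above, using the English synset
# car.n.01 (typically among the results returned for 'carro'); it assumes the same
# wordnet/omw data downloaded in the cells above.
from nltk.corpus import wordnet as wn

car = wn.synset('car.n.01')
print('hypernyms:', car.hypernyms())    # more general synsets, e.g. motor_vehicle.n.01
print('hyponyms:', car.hyponyms()[:5])  # a few more specific kinds of car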

# ### visualization references
#
# [Visualizing WordNet relationships as graphs](http://www.randomhacks.net/2009/12/29/visualizing-wordnet-relationships-as-graphs/)
#
Example #55
def iterate(df):

    correct_count = 0
    wrong_count = 0
    logloss = 0

    for index, row in df.iterrows():

        res = row["is_duplicate"]
        terms1 = get_terms(row["question1"])
        terms2 = get_terms(row["question2"])

        sims = []

        for word1 in terms1:
            word1_sim = []

            try:
                syn1 = wn.synsets(word1)[0]
            except:
                sims.append([0 for i in range(0, len(terms2))])
                continue

            for word2 in terms2:

                try:
                    syn2 = wn.synsets(word2)[0]
                except:
                    word1_sim.append(0)
                    continue

                word_similarity = syn1.wup_similarity(syn2)
                word1_sim.append(word_similarity)

            sims.append(word1_sim)

        # print sims

        word1_score = 0

        for i in range(0, len(terms1), 1):
            try:
                word1_score += max(sims[i])
            except:
                continue
        word1_score /= len(terms1)

        word2_score = 0

        for i in range(0, len(terms2), 1):
            try:
                word2_score += max([j[i] for j in sims])
            except:
                continue
        word2_score /= len(terms2)

        pair_score = (word1_score + word2_score) / 2

        if res == 1:
            logloss += math.log(pair_score)

        if (pair_score > 0.5):
            pred = 1
        else:
            pred = 0

        if pred == res:
            correct_count += 1
        else:
            wrong_count += 1

        if index % 100 == 0:
            print correct_count, wrong_count
            print logloss / (correct_count + wrong_count)
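# A compact sketch of the directional scoring used in iterate() above: for each term
# of one question, take its best Wu-Palmer similarity against the other question's
# terms, guarding against missing synsets and the None that wup_similarity can
# return. The function name and the example terms are invented for illustration.
from nltk.corpus import wordnet as wn


def directional_score(terms_a, terms_b):
    best = []
    for a in terms_a:
        syns_a = wn.synsets(a)
        if not syns_a:
            continue
        scores = []
        for b in terms_b:
            syns_b = wn.synsets(b)
            if syns_b:
                sim = syns_a[0].wup_similarity(syns_b[0])
                scores.append(sim if sim is not None else 0)
        if scores:
            best.append(max(scores))
    return sum(best) / len(terms_a) if terms_a else 0.0


# print(directional_score(["car", "speed"], ["vehicle", "velocity"]))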
Example #56
def get_antonym(x):
    prefix = []
    f = open("semantic/prefixes.txt", "r")
    prefix = f.readlines()

    for i in range(0, len(prefix) - 1):
        prefix[i] = prefix[i][:-1]
    f.close()
    #print(prefix)
    dic_words, dic_ant = d.extract_more_antonyms()
    xx = []
    ant_a = []
    ant_n = []
    ant_v = []
    temp = []
    for syn in wn.synsets(str(x)):
        xx.append(chunck(syn.name(), 0))
    #print(xx)
    for synx in xx:
        for syn in wn.synsets(synx):
            for l in syn.lemmas():
                string = l.name()
                #print("jgdkgj" + string)
                if string in dic_words:
                    #print("morun1")
                    index = dic_words.index(string)
                    ant_v.append(dic_ant[index])
                    #print("!")
                    #print(dic_ant[index])
                    #print(index)
                if string in dic_ant:
                    #print("morun2")
                    index = dic_ant.index(string)
                    ant_v.append(dic_words[index])
                    #print("!!")
                    #print(dic_words[index])

                if l.antonyms():
                    i = 0
                    while i < len(l.antonyms()):
                        n = chunck(str(l.antonyms()[i]), 1)
                        if str(l.antonyms()[i].name()).startswith(
                                "re", 0, 2) and str(
                                    l.antonyms()[i].name()) in prefix:
                            string = str(l.antonyms()[i].name())
                            temp.append(string[2:])
                        elif (str(l.antonyms()[i].name()).startswith(
                                "un", 0, 2)
                              or str(l.antonyms()[i].name()).startswith(
                                  "ir", 0, 2)
                              or str(l.antonyms()[i].name()).startswith(
                                  "il", 0, 2)
                              or str(l.antonyms()[i].name()).startswith(
                                  "im", 0, 2)
                              or str(l.antonyms()[i].name()).startswith(
                                  "non", 0, 3)
                              or str(l.antonyms()[i].name()).startswith(
                                  "in", 0, 2)) and str(
                                      l.antonyms()[i].name()) in prefix:
                            temp.append(str(l.antonyms()[i].name()))
                        else:
                            if n == "a" or n == "s":
                                ant_a.append(l.antonyms()[i].name())
                            elif n == "n":
                                ant_n.append(l.antonyms()[i].name())
                            else:
                                ant_v.append(l.antonyms()[i].name())
                        i += 1

    #print(ant_a)
    #print(ant_n)
    #print(ant_v)
    #print(temp)

    if len(ant_a) == 0 and len(ant_v) == 0 and len(ant_n) == 0 and len(
            temp) == 0:
        return None
    else:
        if len(ant_a) >= 1 and n != "v":
            c = Counter(ant_a)
            for i in c.elements():
                print(i, c[i])
                return i

        if len(ant_n) >= 1 and n != "v":
            c = Counter(ant_n)
            for i in c.elements():
                print(i, c[i])
                return i

        if len(ant_v) >= 1:
            c = Counter(ant_v)
            for i in c.elements():
                print(i, c[i])
                return i
        else:
            ## read from the list of prefixes
            c = Counter(temp)
            for i in c.elements():
                print(i, c[i])
                return i
Example #57
import nltk
from nltk.corpus import wordnet

synonyms = []
antonyms = []

for syn in wordnet.synsets("good"):
    for l in syn.lemmas():
        synonyms.append(l.name())
        if l.antonyms():
            antonyms.append(l.antonyms()[0].name())

print(set(synonyms))
print(set(antonyms))
Example #58
tokens = word_tokenize(text_file)
# print tokens

# whitespace tokenizer
from nltk.tokenize import regexp_tokenize
tokenizer = regexp_tokenize(text_file, r'\s+', gaps=True)
# print tokenizer

from nltk.corpus import stopwords
english_stops = set(stopwords.words('english'))
words = tokenizer
# print [word for word in words if word not in english_stops]

#look up words and print synset
from nltk.corpus import wordnet
syn = wordnet.synsets('cookbook')[0]
print syn.name()
print syn.definition()
print syn.hypernyms()
print syn.hypernyms()[0].hyponyms()
print syn.root_hypernyms()
print syn.hypernym_paths()

#
# for w in words:
#     print w
#     syn = wordnet.synsets(w)
#     if (type(syn) == 'list'):
#         syn = syn[0]
#     # print syn
#     if (len(syn) != 0):
Example #59
spa_nodes = deepcopy(pickled_graph.nodes(data=True))

for term_ind, node in spa_nodes:
    term = dictionary[node['term_id']]

    # syn nodes have already been created, the values are in in the list
    if term in syns_per_term:
        for syn in syns_per_term[term]:
            pickled_graph.add_edge(term_ind,
                                   syn_to_node_map[syn_dict.token2id[syn]],
                                   attr_dict={'weight': 0.5})

    # synonyms have not been created for this term
    else:
        # get syns for term
        syns = wn.synsets(term)

        for syn_obj in syns:
            # extracts the text value from the syn object
            syn = syn_obj.name().split('.')[0]

            # We have not seen this syn yet
            if syn not in syn_dict.token2id:
                # add syn term to dictionary
                syn_dict.add_documents([[syn]])

                # add syn node to graph
                pickled_graph.add_node(node_count,
                                       type='SYN',
                                       term_id=syn_dict.token2id[syn],
                                       freq_per_doc=-1,
                with open(input_file, 'r', encoding='utf-8') as file:
                    for text in file:
                        text = re.sub(r"^AdvertisementSupported.*— ", '', text)
                        text = ''.join([i for i in text if not i.isdigit()])

                        if 'RT @' in text:
                            text = text[4:]
                        
                        text = clean_tweet_url(text)
                        text = re.sub(r"([:=;X][oO\-]?[D\)\]\(\]/\\OpP]) ", '', text)
                        emojis = extract_emojis(text)
                        tweet_xx = clean_tweet(text)

                        for em in emojis[:]:
                            tweet_xx = re.sub(em,'',tweet_xx)
                        tweet_xx=re.sub(emoji_pattern,'',tweet_xx)

                        stop = set(stopwords.words('english'))
                        sentence =  " ".join([word.lower() for word in word_tokenize(tweet_xx) if word.lower() not in stop])
                        sentence =  " ".join([word for word in word_tokenize(sentence) if wordnet.synsets(word)])
                        sentence = " ".join([stem(word) for word in word_tokenize(sentence)])
                        textOutFile.writelines(sentence + '\n')
                    
                textOutFile.close()
            except BaseException as e:
                print('processing file: %s' % filename)
                print("Error while search: %s" % str(e))
                exc_type, exc_obj, exc_tb = sys.exc_info()
                fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
                print(exc_type, fname, exc_tb.tb_lineno)
                continue