def getSimilarity(s1, s2, word_order=False):

    try:
        s1_wsd = disambiguate(s1)  # using default disambiguation
        s2_wsd = disambiguate(s2)
    except TypeError:
        print("s2:", s1)
        sys.exit(0)
    # remove None synsets
    s1_wsd = [syn for syn in s1_wsd if syn[1]]
    s2_wsd = [syn for syn in s2_wsd if syn[1]]

    #vector_length = max(len(s1_wsd), len(s2_wsd))
    try:
        L1, L2 = _synset_similarity(s1_wsd, s2_wsd)
        V1 = np.array([max(L1[key]) for key in L1.keys()])
        V2 = np.array([max(L2[key]) for key in L2.keys()])
        S = np.linalg.norm(V1) * np.linalg.norm(V2)
        C1 = sum(V1 >= benchmark_similarity)
        C2 = sum(V2 >= benchmark_similarity)

        Xi = (C1 + C2) / gamma

        if C1 + C2 == 0:
            Xi = max(V1.size, V2.size) / 2.0

        sem_similarity = S / Xi
    except ValueError:
        sem_similarity = 0
    # computing word order similarity
    word_ord_similarity = 0
    delta = 1.0
    if word_order:
        tokens1 = word_tokenize(s1)
        tokens2 = word_tokenize(s2)
        len1 = len(tokens1)
        len2 = len(tokens2)

        maxlen = len1
        if maxlen < len2:
            maxlen = len2
        r1 = list(range(maxlen))
        r2 = [0 for _ in range(maxlen)]
        if maxlen == len1:
            for i, v in enumerate(tokens2):
                if v in tokens1:
                    r2[i] = tokens1.index(v) + 1
                else:
                    r2[i] = i
        else:
            for i, v in enumerate(tokens1):
                if v in tokens2:
                    r2[i] = tokens2.index(v) + 1
                else:
                    r2[i] = i
        word_ord_similarity = np.linalg.norm(np.array(r1) - np.array(
            r2)) / np.linalg.norm(np.array(r1) + np.array(r2))
        delta = 0.8  # set delta for convex combination of semantic similarity and word order similarity

    return delta * sem_similarity + (1 - delta) * word_ord_similarity
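A minimal usage sketch for the function above; it assumes the module-level pieces it references (disambiguate, word_tokenize, np, benchmark_similarity, gamma and _synset_similarity) are defined as in the original project.

# Illustrative call only; s1/s2 are made-up inputs.
s1 = "A cat is sitting on the mat."
s2 = "A dog lies on the rug."
score = getSimilarity(s1, s2, word_order=True)  # delta = 0.8 blends semantic and word-order similarity
print(round(score, 3))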
Example #2
def wordnetScore(sentence, namedEntities):

    nouns = ['NN', 'NNP', 'NNS', 'NNPS']
    verbs = ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']
    adjectives = ['JJ', 'JJR', 'JJS']
    adverbs = ['RB', 'RBR', 'RBS']

    #removing tokens other than nouns, verbs, adverbs, adjectives
    posToConsider = nouns + verbs + adverbs + adjectives
    pos_tokens = nltk.pos_tag(word_tokenize(sentence))
    tokensToConsider = []
    for token in pos_tokens:
        if token[1] in posToConsider and (token[0] not in namedEntities):
            tokensToConsider.append(token)

    t0_ = time.time()
    #1.word sense disambiguation
    senses = {
        token[0]: token[1]
        for token in disambiguate(sentence) if token[1] is not None
    }
    #2.similarity calculation
    t1_ = time.time()

    for word, sense in senses.items():
        senses[word] = {}
        senses[word]['sense'] = sense
        calculateSimilarity(senses[word])
    print(senses)
    print(time.time() - t1_, t1_ - t0_)
Example #3
def GetDisambiguation(tweet_sentence):
    cleaned_tweet = p.clean(tweet_sentence)
    replaced_tweet = replace_word(cleaned_tweet)

    replaced_tweet_list = replaced_tweet.split(" ")

    ## Can replace this by using other WSD options (different Lesk algorithms / similarity options)

    da_token_pair_list = disambiguate(replaced_tweet,
                                      max_similarity,
                                      similarity_option='res')
    # da_token_pair_list = disambiguate(replaced_tweet, cosine_lesk)

    da_token_list = []
    for pair in da_token_pair_list:
        da_token_list.append(pair[0])

    token_negation_pair_list = negate(da_token_list)

    print(da_token_pair_list)
    print(len(da_token_pair_list))
    print(token_negation_pair_list)
    print(len(token_negation_pair_list))

    return da_token_pair_list, token_negation_pair_list
def process_jokes(joke_data, ref_dict):
    '''
    Return a DataFrame that contains sentences along with citations and information about detected heteronyms
    '''

    sense_list = list(ref_dict['sense'])
    het_list = set(ref_dict['word'])
    word_duplicate_sense = set(ref_dict[ref_dict.duplicated(['sense'])]['word'])

    joke_sents = pd.DataFrame(columns=['sentence', 'citation', 'heteronym'])

    for i, row in joke_data.iterrows():
        ## List of sentences in one joke. Sometimes jokes do not have proper punctuation.
        ## We may see 2-3 sentences in one output row if sentence tokenization fails to split the text.
        sents = nltk.sent_tokenize(row['sentence'])

        for sent in sents:

            het_in_row = []
            text_token = [
                w.lower() for w in nltk.word_tokenize(sent)
                if (w not in string.punctuation) and (w.lower() not in stopset)
            ]
            het_occur = set(text_token).intersection(het_list)

            if not het_occur:
                continue

            for (word, synset) in disambiguate(sent):
                if (word in het_list) and (synset) and (synset in sense_list):

                    ## Take care of sense-duplicated heteronyms (rare),
                    ## e.g. "project" and "projects" can have the same sense but different pronunciations.
                    if word.lower() in word_duplicate_sense:
                        pron = list(ref_dict[(ref_dict['word'] == word.lower())
                                             & (ref_dict['sense'] == synset)]
                                    ['pronunciation'])
                        if pron:
                            het_in_row.append((word.lower(), synset, pron[0]))

                    ## If the sense is not duplicated, the mapping to pronunciation is one-to-one
                    else:
                        pron = list(ref_dict[ref_dict['sense'] == synset]
                                    ['pronunciation'])[0]
                        word_in_ref = list(
                            ref_dict[ref_dict['sense'] == synset]['word'])[0]
                        if word.lower() == word_in_ref:
                            het_in_row.append((word_in_ref, synset, pron))

            if het_in_row:
                new_row = {
                    'sentence': sent,
                    'citation': row['citation'],
                    'heteronym': het_in_row
                }
                joke_sents = joke_sents.append(new_row, ignore_index=True)

    return joke_sents
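A hedged usage sketch; joke_data and ref_dict are assumed to be pandas DataFrames already loaded with the columns the lookups above expect.

# joke_data: DataFrame with 'sentence' and 'citation' columns
# ref_dict:  DataFrame with 'word', 'sense' (WordNet synset) and 'pronunciation' columns
het_df = process_jokes(joke_data, ref_dict)
print(het_df[['sentence', 'heteronym']].head())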
Example #5
def sentido_PYWSD(palabra, frase):
    frase_wsd = disambiguate(frase)
    for r in frase_wsd:
        if r[0] == palabra:
            if r[1] is not None:
                sentido = r[1]
                return (sentido, True)
    return (None, False)
def get_disambiguated_synonym(sentence):
    for word_dis_sense in disambiguate(sentence,
                                       algorithm=maxsim,
                                       similarity_option='wup',
                                       keepLemmas=True):
        if (word_dis_sense[2] is not None):
            for lemma in word_dis_sense[2].lemma_names():
                yield (word_dis_sense[0], lemma)
def classify(text, category, dictionary, total=0):  # category = "sense" or "token"
    # separate the text into tokens
    if category == "sense":
        tokens = [word[1].name() for word in disambiguate(text) if word[1] is not None]
    else:
        tokens = word_tokenize(text)
        tokens = [word for word in tokens if word not in stop]
        tokens = filter(lambda word: word not in [',', '.', '!', '?', '``', "'ve", "''", "n't", "'s"], tokens)
    return probability(tokens, category, dictionary, total)
def train_random_senses_set(title):
	global random_senses_dict
	# extract the words that are not none with their synset
	senses = [word[1].name() for word in disambiguate(str(title)) if word[1] is not None]
	for sense in senses:
		if sense in random_senses_dict:
			random_senses_dict[sense] += 1
		else:
			random_senses_dict[sense] = 1
	return random_senses_dict
Example #9
 def replace(self, sentence):
     # disambiguate returns (word, lemma, synset) triples when keepLemmas=True
     wording = []
     for word, lemma, synset in disambiguate(sentence,
                                             algorithm=maxsim,
                                             similarity_option='wup',
                                             keepLemmas=True):
         if synset is not None:
             # replace the surface word with the name of its disambiguated synset
             wording.append(synset.name())
         else:
             wording.append(word)
     return wording
Example #10
def wsd(request):
    if request.method == 'POST':
        input_sentence = request.POST['input_sentence']
        output = disambiguate(input_sentence,
                              algorithm=maxsim,
                              similarity_option='wup',
                              keepLemmas=True)
        output = [(record[0],
                   record[2].lexname()) if record[2] is not None else
                  (record[0], None) for record in output]
        return render(request, 'output_wsd.html', {'output': output})
    return render(request, 'form_wsd.html', {})
Example #11
def disambiguate_pipe(df, name=None):
    """Returns a list of 2-tuples (s1_disam, s2_disam), for each sentence pair in the 
    dataframe, where each tuple is a list of disambiguated 2-tuples (word, synset).
    
    Args:
        df: the source dataframe with columns: [s1, s2].
        name ([type], optional): the name of the dataframe. Defaults to None.
    
    Returns:
        list: a list of the disambiguated sentence pairs like:
            [
                (
                    tuple[0], for s1
                    [
                        (word:str, wn_synset),
                        (word:str, wn_synset),
                        ...
                    ],
                    tuple[1], for s2
                    [
                        (word:str, wn_synset),
                        (word:str, wn_synset),
                        ...
                    ]
                ),
                ...
            ]
    """
    from pywsd import disambiguate, max_similarity
    from pywsd.lesk import adapted_lesk

    print(f"Disambiguating {name}...")
    disambiguated = []
    for s1, s2 in zip(df["s1"], df["s2"]):
        s1_disam = disambiguate(s1, adapted_lesk, prefersNone=True)
        s2_disam = disambiguate(s2, adapted_lesk, prefersNone=True)
        disambiguated.append((s1_disam, s2_disam))

    return disambiguated
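A quick sanity check of the structure described in the docstring, assuming pandas is available:

import pandas as pd

df = pd.DataFrame({"s1": ["The bank raised interest rates."],
                   "s2": ["He sat on the river bank."]})
pairs = disambiguate_pipe(df, name="demo")
s1_disam, s2_disam = pairs[0]
print(s1_disam)  # list of (word, synset-or-None) tuples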
Example #12
 def get_expanded_query(self, q, args=None):
     res = []
     disamb = disambiguate(q)
     for i, t in enumerate(disamb):
         if t[1] is not None:
             if not self.replace:
                 res.append(t[0])
             x = t[1].name().split('.')[0].split('_')
             if t[0].lower() != (' '.join(x)).lower() or self.replace:
                 res.append(' '.join(x))
         else:
             res.append(t[0])
     return ' '.join(res)
    def produce_single_repr(
            self, field_data: Union[List[str], str]) -> FeaturesBagField:
        """
        Produces a bag of features whose key is a wordnet synset and whose value is the frequency of the synset in the
        field data text
        """

        field_data = check_not_tokenized(field_data)

        synsets = disambiguate(field_data)
        synsets = [synset for word, synset in synsets if synset is not None]

        return FeaturesBagField(Counter(synsets))
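The wrapper classes here (check_not_tokenized, FeaturesBagField) are project-specific; the underlying idea is just a frequency count over disambiguated synsets, sketched below.

from collections import Counter
from pywsd import disambiguate

text = "The bank approved the loan while the river bank flooded."
synsets = [synset for word, synset in disambiguate(text) if synset is not None]
print(Counter(synsets))  # synset -> frequency; exact senses depend on the WSD output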
Example #14
def getSimilarity(s1, s2):

    s1_wsd = disambiguate(s1)  # using default disambiguation
    s2_wsd = disambiguate(s2)

    # remove None synsets
    s1_wsd = [syn for syn in s1_wsd if syn[1]]
    s2_wsd = [syn for syn in s2_wsd if syn[1]]

    #vector_length = max(len(s1_wsd), len(s2_wsd))

    L1, L2 = _synset_similarity(s1_wsd, s2_wsd)
    V1 = np.array([max(L1[key]) for key in L1.keys()])
    V2 = np.array([max(L2[key]) for key in L2.keys()])
    S = np.linalg.norm(V1) * np.linalg.norm(V2)
    C1 = sum(V1 >= benchmark_similarity)
    C2 = sum(V2 >= benchmark_similarity)

    Xi = (C1 + C2) / gamma

    if C1 + C2 == 0:
        Xi = max(V1.size, V2.size) / 2
    return S / Xi
Example #15
    def produce_content(self, field_representation_name: str,
                        field_data) -> FeaturesBagField:
        """
        Produces a bag of features whose key is a wordnet synset
        and whose value is the frequency of the synset in the
        field data text
        """

        field_data = check_not_tokenized(field_data)

        synsets = disambiguate(field_data)
        synsets = [synset for word, synset in synsets if synset is not None]

        return FeaturesBagField(field_representation_name, Counter(synsets))
Example #16
    def wsd(self, sentence):
        result = disambiguate(sentence)

        list_item = []

        for item in result:
            item = {
                "word": item[0],
                "synset": str(item[1]).replace("Synset('",
                                               "").replace("')", "")
            }

            list_item.append(item)

        return list_item
def word_sense_disambiguate(query):
    query = query[0]
    print(query)
    res = []
    disamb = disambiguate(query)
    print(disamb)
    for i, t in enumerate(disamb):
        print((i, t))
        if t[1] is not None:
            res.append(t[0])
            x = t[1].name().split('.')
            y = x[0].split('_')
            if t[0].lower() != (' '.join(y)).lower():
                res = res + y
        else:
            res.append(t[0])
#     print(' '.join(res))
    return ' '.join(res)
def translate(text, year):
    parsing = disambiguate(text)

    translated = []
    for word in parsing:
        if word[-1] is not None:
            #synms = getNgramsWrapper(','.join(word[-1].lemma_names()), year)
            print ','.join(word[-1].lemma_names())
            print
            synms = {"hi":"bye"}
            maxWord = synms.keys()[0]
            max = synms.values()[0]
            for synm in synms:
                if synms[synm] > max:
                    max = synms[synm]
                    maxWord = synm
            translated.append(maxWord)
        else:
            translated.append(word[0])
    return " ".join(translated)
Example #19
def main(args):
    file_path = args.file_path
    sentences = MySentences(file_path)
    # for i in sentences:
    #     print(i)
    with open(args.pwd_file_path, 'w', encoding='utf-8') as f:
        for i in sentences:
            if len(i) > 5:
                ls = []
                try:
                    for word_sen in disambiguate(' '.join(i),
                                                 algorithm=cosine_lesk):
                        if word_sen[1] is None:
                            ls.append(word_sen[0])
                        else:
                            ls.append(word_sen[0] + '|' + word_sen[1].name())
                    f.write(' '.join(ls))
                    f.write('\n')
                except:
                    print(' '.join(i))
Example #21
def extract_global_bag_of_synsets(commentList):
    corpus = []
    global_synset_set = set()
    
    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
    
    # ISSUE throws away named entities
    i = 0
    for art in commentList.items():        
        for comm in art[1]:
            filtered_words = []
            for sentence in sent_detector.tokenize(comm.body.strip()):
                #print sentence       
                dis = disambiguate(sentence, algorithm=maxsim, similarity_option='wup')
                for w in dis:
                    # only keep words with a resolved sense, and only nouns and verbs
                    if w[1] is None:
                        continue  
                    
                    if not w[1].pos() == wn.NOUN and not w[1].pos() == wn.VERB:
                        continue
                               
                    #print w[0] ," - ", w[1], " - ", w[1].definition()
                      
                    filtered_words.append(w[1])
                    global_synset_set.add(w[1])
                 
            corpus.append(filtered_words)
            i += 1
            print i
            if i % 1000 == 0:
                print i, "processed"
                break
        if i % 1000 == 0:
            print i, "processed"
            break
            
    return global_synset_set, corpus
Example #22
    def tokenize_corpus(self, corpus, tokenize=True):
        """
        Method that tokenizes the corpus prior to training. For each word in the corpus we compute the sense of that
        word and replace it with its sense-tagged form. For example, "cat" can become cat.n.01.
        :param corpus: the corpus as a list of tokenized sentences.
        :param tokenize: True if the corpus should be tokenized beforehand.
        :return: the tokenized corpus.
        """
        # convert the corpus to be sentence
        corpus = [' '.join(sentence) for sentence in corpus]
        if not tokenize:
            return corpus

        print('Starting to tag corpus')
        corpus_tags = []
        counter = 0.0
        for sentence in corpus:
            if (counter % 100000) == 0:
                print(counter/len(corpus)*100, " percent complete         \r",)
            try:
                # get the sense of each word in the sentence
                tagged_sentence = disambiguate(sentence)
                corpus_tags.append(tagged_sentence)
            except IndexError:
                print("pywsd can't handle the sentence: " + sentence)
            counter += 1
        # create a dictionary of each word and all the senses it was mapped to
        for sentence in corpus_tags:
            for tag in sentence:
                if tag[1] is None:
                    continue
                cur_set = self.token_dict.get(tag[0], set())
                cur_set.add(tag[1].name())
                self.token_dict[tag[0]] = cur_set
        # create the tagged corpus in a format ready for training
        tagged_corpus = [[word[1].name() for word in sentence if word[1] is not None] for sentence in corpus_tags]
        return tagged_corpus
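For reference, a minimal sketch of the word-to-sense mapping this method builds (the actual senses depend on pywsd's output):

from pywsd import disambiguate

tagged = disambiguate("the cat chased the mouse")
print([(word, synset.name()) for word, synset in tagged if synset is not None])
# e.g. [('cat', 'cat.n.01'), ('chased', 'chase.v.01'), ('mouse', 'mouse.n.01')]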
Example #23
async def extract_wsd(request, target):
    """
    $ curl -d '{"sents":"The sheet is twenty centimeters."}' \
        -H "Content-Type: application/json" -X POST \
        localhost:1700/en/wsd/default | json

    :param request:
    :return:
    """
    from pywsd import disambiguate
    from pywsd.similarity import max_similarity as maxsim
    from pywsd.lesk import simple_lesk

    rd = request.json
    sents = rd['sents']

    extract_syn = lambda r: (r[0], r[1].name(), r[1].definition())

    def extract_sents():
        rs = disambiguate(sents)
        return [extract_syn(r) for r in rs if r[1]]

    fn_map = {
        'default':
        lambda: extract_sents(),
        'maxsim':
        lambda: [
            extract_syn(r) for r in disambiguate(sents,
                                                 algorithm=maxsim,
                                                 similarity_option='wup',
                                                 keepLemmas=False) if r[1]
        ],
        'lesk':
        lambda: simple_lesk(sents, rd['word']),
    }
    result = fn_map[target]() if target in fn_map else []
    return json(result)
Example #24
 def disambiguateWordSenses2(self,sentence):           #disambiguation without simple_lesk
     synsets = disambiguate(sentence)
     print synsets
     #print synsets
     lst=[]
     for word in synsets:
         if word[1]:
             pos=word[1].pos()
             offset=word[1].offset()
             print "$$$$$$$$$$$$$$$$"
             print word[1], pos,offset
             pos_score=0.0
             neg_score=0.0
             if(pos,offset) in self.db:
                 pos_score,neg_score = self.db[(pos,offset)]
                 #print word[0],pos_score,neg_score
             obj = 1.0-(pos_score+neg_score)
         else:
             pos = None
             obj=1.0
             pos_score=0.0
             neg_score=0.0
         lst.append((word[0],obj,pos,pos_score,neg_score))
     return lst
Example #25
def translate(text, year):
    parsing = disambiguate(text)
    print parsing

    translated = []
    for word in parsing:
        if word[-1] is not None:
            print word[-1].lemma_names()
            synms = word[-1].lemma_names()
            maxWord = synms[0]
            max = 0
            for synm in synms:
                freq_count = get_freq(str(synm), year)
                if freq_count > max:
                    max = freq_count
                    maxWord = synm
            translated.append(maxWord)
        else:
            translated.append(word[0])
    print 'input text: ', text
    print
    print "final result: ", ' '.join(translated)
    print
    return ' '.join(translated)
Example #26
def wordSense(text):
    # I will use a dictionary structure to store the data and transfer it into pandas dataframe
    dic = {}
    word = []
    sense = []
    definition = []
    
    # apply function
    answer = disambiguate(text)
    
    # iterate each elements from disambiguate()
    for t in answer:
        if all(t):
            word.append(t[0])
            sense.append(t[1])
            definition.append(t[1].definition())
            
    # store in list and define a dic
    dic['word'] = word
    dic['sense'] = sense
    dic['definition'] = definition
    
    # return as dataframe
    return DataFrame(dic)
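Example call, assuming pandas.DataFrame and pywsd.disambiguate are imported as the snippet expects:

df = wordSense("I went to the bank to deposit my money")
print(df)  # columns: word, sense, definition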
Example #28
def thesaurus_expand(query, wikimedia, size=3, threshold=4.23):
    """
    Wordent hierarchy
     - hyponyms concepts that are more specific (immediate), navigate down to the tree
     - hypernyms general concept, navigate up the hierarchy
     - meronyms components. For instance a tree have trunk, ...so on as meronym
     - holonyms things that contain meronyms (i.e. tree)

     Query expansion require good relevance feedback methods. Using a thesaurus based query expansion might decrease
     performance and has query drift problems with polysemic words. This method picks up keyword from gloss of the synsets
     and uses a lesk algorithm to disambiguate terms from each other
    :param query:
    :return:
    """
    analyzer = ThesaurusExpansionAnalyzer()
    wikimedia_analyzer = WikimediaAnalyzer()
    original_tokens = [i.text for i in analyzer(query)]
    # original_tokens = set([i.text for i in query.all_tokens()])
    print(original_tokens)

    synonyms = set()

    rule = r"""
           NBAR: {<NN>}
                 {<JJ>}
                 # {<JJS>}
                 {<NNS>}
                 # {<NNP>}
    """

    synsets = []
    # for i in original_tokens:
    #     for s in wordnet.synsets(i):
    #         for h in s.hypernyms():
    #             print(s, h , s.wup_similarity(h))

    # for i in original_tokens:
    #     for s in wordnet.synsets(i):
    #         print(s.definition())

    for w, s in disambiguate(" ".join(original_tokens), algorithm=adapted_lesk):
        if s:
            definition = s.definition()
            pke_text = definition + ' ' + ' '.join(s.lemma_names())
            # print(pke_key_phrase_extract(pke_text))
            tokens = [i.text for i in wikimedia_analyzer(definition)]
            synsets.append((w, wordnet.synset(s.name()), tokens))

    for word, sense, definition in synsets:
        if sense:
            synonyms = synonyms.union(noun_groups(word_tokenize(sense.definition()), chunk_size=1, rule=rule))
            text = " ".join([i.name() for i in sense.lemmas()])
            for lemma in wikimedia_analyzer(text):
                if lemma.text not in original_tokens:
                    synonyms.add(lemma.text)
                # vfor tok in wikimedia_analyzer(lemma.text):
                #     print(tok.text)
                #     if tok.text not in original_tokens:
                #         synonyms.add(tok.text)

    # for token in tokens: for _, original_sense, _ in synsets: for child_synset in wordnet.synsets(token):
    # if child_synset: # definition = [i.text for i in analyzer(child_synset.definition())] # pywsd. score =
    # wordnet.synset(original_sense.name()).path_similarity(child_synset, simulate_root=False) print(
    # child_synset, child_synset.definition(), original_sense, score)

    # print(tokens)
    # print([j.definition() for i, j in pywsd.disambiguate(query, algorithm=pywsd.simple_lesk)], '\n',
    #       [j.definition() for i, j in pywsd.disambiguate(query, algorithm=pywsd.adapted_lesk)], '\n',
    #       [j.definition() for i, j in pywsd.disambiguate(query, algorithm=pywsd.cosine_lesk)], '\n',
    #       [j.definition() for i, j in pywsd.disambiguate(query, algorithm=pywsd.max_similarity)])

    # if len(_concept) > 0:
    #     concept, similarity_strength = _concept[0]
    #     if similarity_strength > 0.7:
    #         __retrieve_definition_groupings(synsets)
    # else:
    #     print(__retrieve_definition_groupings(synsets))
    # disambiguated_senses = disambiguate(query, algorithm=adapted_lesk)

    # print(disambiguated_senses, '\n\n', simple_lesk, '\n\n', resnik_wsd(word_tokenize(query)), '\n')
    # for token in original_tokens:
    #     senses = wordnet.synsets(token, 'n')
    #     if len(senses) == 1:
    #         synonyms = synonyms.union(set(senses[0].lemma_names()))
    #     else:
    #
    # tokens += [i.text for i in analyzer(' '.join(list(synonyms)))]
    # return original_tokens + [i for i in tokens if i not in original_tokens]

    reader = wikimedia.reader

    terms_vec = {}
    for syn in synonyms:
        score = calc_syn_score(syn, reader)

        terms_vec[syn] = score
        # else:
        #     terms_vec[syn] = 0

    ranked_terms = sorted(terms_vec, key=lambda c: terms_vec[c], reverse=True)
    print('***Ranked terms')
    for i in list(map(lambda q: (q, terms_vec[q]), ranked_terms)):
        print(i[0], ' ', i[1], '\n')

    return list(map(lambda q: q[0], filter(lambda v: v[1] >= threshold, terms_vec.items())))
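The WordNet relations mentioned in the docstring above can be inspected directly through NLTK; a short sketch:

from nltk.corpus import wordnet as wn

tree = wn.synset('tree.n.01')
print(tree.hypernyms())        # more general concepts (up the hierarchy)
print(tree.hyponyms()[:3])     # more specific concepts (down the hierarchy)
print(tree.part_meronyms())    # components, e.g. trunk and limb
print(tree.member_holonyms())  # wholes containing it, e.g. forest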
Example #29
from pywsd.lesk import adapted_lesk
from pywsd.lesk import cosine_lesk
from pywsd import disambiguate
from pywsd.similarity import max_similarity as maxsim
from pywsd.utils import has_synset

simplelesk_answer = []
adaptedlesk_answer = []
cosinelesk_answer = []

print "\nSentence Context Disambiguation\n============================== \n"

raw_sentence="Some people are happy this sentence."
words = nltk.word_tokenize(raw_sentence)
print "\nChecking synsets of each word . . .\n==========================================\n"
print(disambiguate(raw_sentence))
print "\nDisambiguating your sentence word by word using Simple Lesk algorithm. Hold on. \n======================================================"
for eachword in words:
    if has_synset(eachword):
        answer = simple_lesk(raw_sentence, eachword)
        simplelesk_answer.append(answer)
        print "Sense :", answer
        print eachword+":"+answer.definition()+"\n"
    else:
        print eachword+": "+eachword+"\n"    
        simplelesk_answer.append(eachword)
""""
        
print "\nDisambiguating your sentence word by word using Adapted Lesk algorithm. Hold on. \n======================================================"

for eachword in words:
def get_disambiguated_synonym(sentence):
    for word_dis_sense in disambiguate(sentence, algorithm=maxsim, similarity_option='wup', keepLemmas=True):
        if(word_dis_sense[2] is not None):
            for lemma in word_dis_sense[2].lemma_names():
                yield(word_dis_sense[0],lemma)
                            AcText[act]):  # for each actor's untokenized text in txtuntkn
        #         txtuntkn=AcText[act][0]
        #         print(txtuntkn['texte'])
        txttkn = tokenize.sent_tokenize(txtuntkn['texte'])
        #         print(txttkn)
        #         print(txttkn)# tokenized text
        for tmp3, sent in enumerate(txttkn):  # for each sentence in the tokenized text of each actor
            #             sent=txttkn[0]
            # wordlist=sent.split()
            # for wrd in wordlist:
            try:
                # if counter <2:
                dissent = disambiguate(sent,
                                       algorithm=maxsim,
                                       similarity_option='wup',
                                       keepLemmas=True)
                counter = counter + 1
                print(counter)
                print(dissent)
                for tmp4, diswrd in enumerate(dissent):
                    print(diswrd)
                    if diswrd[2] is not None:
                        diswrd_brief = (diswrd[0], diswrd[1], diswrd[2].name())
                        txtuntkn['Textwsd'].append(diswrd_brief)
            except:
                pass
        #             sent=sent.replace('\n',' ')

#             print(tmp3)
#             print(disambiguate(sent, algorithm=maxsim, similarity_option='wup', keepLemmas=True))
Example #32
from pywsd import disambiguate
from nltk import sent_tokenize
text = "Python is a widely used general-purpose, high-level programming language. Its design philosophy emphasizes code readability, and its syntax allows programmers to express concepts in fewer lines of code than would be possible in languages such as C++ or Java. The language provides constructs intended to enable clear programs on both a small and large scale. Python supports multiple programming paradigms, including object-oriented, imperative and functional programming or procedural styles."
for sent in sent_tokenize(text):
    print disambiguate(sent, prefersNone=True)
Example #33
def nominalized_verb_detection(docID, doc, sent):
    sentences = tokenize.sent_tokenize(sent)
    result = []
    result1 = []
    true_word = []
    false_word = []
    # word count for the sentence
    word_count = []
    # number of nominalization in the sentence
    nomi_count = []
    sen_id = -1
    # to print the sentence in output
    sentence = []
    # to print the nominalizations in each sentence
    nomi_sen = []
    nomi_sen_ = ""

    def is_pos(s, pos):
        # print(s)
        return s.split('.')[1] == pos

    for each_sen in sentences:
        sen_id += 1
        nomi_count.append(0)
        word_count.append(0)
        sentence.append(each_sen)
        words_with_tags = disambiguate(each_sen)
        for tup in words_with_tags:
            word, syns = tup
            if (word in string.punctuation) or (word == "\"") or (
                    word[0] == "\'") or (word[0] == "`"):
                continue
            word_count[sen_id] += 1
            derivationals = []
            word = word.lower()
            if word in true_word:
                nomi_count[sen_id] += 1
                if nomi_sen_ == "":
                    nomi_sen_ = word
                else:
                    nomi_sen_ = nomi_sen_ + "; " + word
                noun_cnt[word] += 1
                nominalized_cnt[word] += 1
                continue
            if word in false_word:
                noun_cnt[word] += 1
                continue
            if syns:
                #look at only nouns
                if not is_pos(syns.name(), 'n'):
                    result.append([word, False])
                    false_word.append(word)
                    noun_cnt[word] += 1
                    continue
                if wn.lemmas(word):
                    for lemma in wn.lemmas(word):
                        derive = lemma.derivationally_related_forms()
                        if derive not in derivationals and derive:
                            derivationals.append(derive)
                else:
                    try:
                        derivationals = syns.lemmas(
                        )[0].derivationally_related_forms()
                    except:
                        pass
                stem = first_section.match(str(syns.name())).group(1)
                found = False
                for deriv in derivationals:
                    if is_pos(str(deriv), 'v'):
                        deriv_str = str(deriv)[7:-3].split('.')[3]
                        if len(word) <= len(deriv_str):
                            continue
                        result.append([word, True])
                        true_word.append(word)
                        noun_cnt[word] += 1
                        if nomi_sen_ == "":
                            nomi_sen_ = word
                        else:
                            nomi_sen_ = nomi_sen_ + "; " + word
                        nominalized_cnt[word] += 1
                        found = True
                        break
                if found:
                    nomi_count[sen_id] += 1
                    continue
                else:
                    result.append([word,
                                   False])  #includes word='NO NOMINALIZATION'
                    noun_cnt[word] += 1
        nomi_sen.append(nomi_sen_)
        nomi_sen_ = ""
    for i in range(sen_id + 1):
        #['Document ID', 'Document', 'Sentence ID', 'Sentence', 'Number of words in sentence', 'Nominalized verbs','Number of nominalizations in sentence', 'Percentage of nominalizations in sentence'])
        if word_count[i] > 0:
            result1.append([
                docID,
                IO_csv_util.dressFilenameForCSVHyperlink(doc), i + 1,
                sentence[i], word_count[i], nomi_sen[i], nomi_count[i],
                100.0 * nomi_count[i] / word_count[i]
            ])
        else:
            result1.append([
                docID,
                IO_csv_util.dressFilenameForCSVHyperlink(doc), i + 1,
                sentence[i], word_count[i], nomi_sen[i], nomi_count[i]
            ])
    # print(result1)
    # result contains a list of each word TRUE/FALSE values for nominalization
    # result1 contains a list of docID, docName, sentence...
    return result, result1
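The nominalization test above hinges on WordNet's derivationally related forms; a short sketch of that lookup:

from nltk.corpus import wordnet as wn

# a noun such as "destruction" is linked to verb lemmas like "destroy"
for lemma in wn.lemmas('destruction', pos='n'):
    print(lemma, lemma.derivationally_related_forms())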
Example #34
def text_to_emoji(text):
    tokens = disambiguate(text, algorithm=max_similarity, 
        similarity_option='wup', keepLemmas=True)
    return " ".join([token_to_emoji(t) for t in tokens])
Example #35
 def disambiguate_max(self, sentence):
     from pywsd.similarity import max_similarity as maxsim
     return disambiguate(sentence,
                         algorithm=maxsim,
                         similarity_option='wup',
                         keepLemmas=True)
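As the indexing in several snippets above assumes, disambiguate returns (surface word, lemma, synset-or-None) triples when keepLemmas=True; a quick check:

from pywsd import disambiguate
from pywsd.similarity import max_similarity as maxsim

for word, lemma, synset in disambiguate("I deposited cash at the bank",
                                        algorithm=maxsim,
                                        similarity_option='wup',
                                        keepLemmas=True):
    print(word, lemma, synset)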
Example #36
def compare_defin(defin1, defin2):
    overlap = set(disambiguate(defin1)).intersection(disambiguate(defin2))
    return len([item for item in overlap if filter_stopwords(item)])
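A hypothetical call; filter_stopwords is assumed to be defined elsewhere in the same module:

from nltk.corpus import wordnet as wn

d1 = wn.synset('bank.n.01').definition()  # the riverside sense
d2 = wn.synset('bank.n.02').definition()  # the financial-institution sense
print(compare_defin(d1, d2))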
Example #37
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
import pywsd

#p = input()
#p = 'I was walking around a river bank while looking at fishes in the water.'
p = 'I am going to bank to deposit my money'
new = [
    word for word in word_tokenize(p)
    if word.lower() not in stopwords.words('English')
    and word not in string.punctuation
]
new = ' '.join(new)
for word, context in pywsd.disambiguate(new):
    print(word.ljust(15, ' '), ':  ',
          context.definition() if context else 'Not Found')
        token_pair_list.append((word, negation))
        org_tokens.append(word)

    return org_tokens, token_pair_list


## get all definitions from all synsets

synset_list = list(wn.all_synsets())

all_pairs_from_definition = []

# for ss in tqdm([wn.synset('amazing.s.02'), wn.synset('good.a.01')]):
for ss in tqdm(synset_list):
    df = ss.definition()
    curr_df_pair_list = disambiguate(df, cosine_lesk)
    df_pair_txt_list = []
    for curr_df_pair in curr_df_pair_list:
        if curr_df_pair[1] is None:
            df_pair_txt_list.append(curr_df_pair)
        else:
            df_pair_txt_list.append((curr_df_pair[0], curr_df_pair[1].name()))
    all_pairs_from_definition.append((ss.name(), df_pair_txt_list))

with open('all_wn_synset_definition_da_cosine.txt', 'w') as fp:
    # write() needs a string, not a list
    fp.write(str(all_pairs_from_definition))

pickle.dump(all_pairs_from_definition,
            open("all_wn_synset_definition_da_cosine.p", "wb"))

# ## Prepare graphical model
Example #39
def lemmatized_pos_selected_overlap(question, story):
    q = question["question"]

    punct = set(string.punctuation)

    q_sent_token = nltk.sent_tokenize(q)
    q_word_token = [nltk.word_tokenize(word) for word in q_sent_token]
    q_word_tagged = [nltk.pos_tag(word) for word in q_word_token]
    stop_words = nltk.corpus.stopwords.words("english")
    key_question_words = set([(tup[0].lower(),tup[1]) for ls in q_word_tagged for tup in ls if tup[0].lower() not in stop_words \
         and tup[0].lower() not in punct and not '\'' in tup[0] ])
    #print(key_question_words)
    #print(q_word_tagged)

    sentence_id_sent = [(dic["sentenceid"],
                         nltk.sent_tokenize(dic["sentence"])) for dic in story]
    #print(sentence_id_sent)
    #story_sentences = list(map(lambda x: nltk.sent_tokenize(x), [dic["sentence"] for dic in story] ))
    #print(story_sentences)
    #sentences = [nltk.word_tokenize(word) for sent in story_sentences for word in sent ]
    #sentences_tagged = [nltk.pos_tag(ls) for ls in sentences]
    sentences = [(tup[0], nltk.word_tokenize(word), tup[1])
                 for tup in sentence_id_sent for word in tup[1]]
    sentences_tagged = [(tup[0], nltk.pos_tag(tup[1]), tup[2])
                        for tup in sentences]
    lemmatizer = WordNetLemmatizer()

    pos_match = {"NN": 'n', "JJ": 'a', "VB": 'v', "RB": 'r'}
    pos_match.setdefault('n')
    #key_question_words = set(map(lambda w: lemmatizer.lemmatize(w[0],pos=pos_match.get(re.match('^(..?)\w*',w[1]).group(0),'n')),key_question_words))
    stemmer = SnowballStemmer('english')
    key_question_words = set(
        map(lambda w: stemmer.stem(w[0]), key_question_words))
    #key_question_w_posDict = {}
    #for stem,pos in key_question_words:
    #        key_question_w_posDict[stem] = pos

    #key_question_words = set(map(lambda w: w[0],key_question_words))

    question_word = get_question_word(q)
    q_disambiguated = disambiguate(q)

    set_q_synsets = set(map(lambda w: w[1], q_disambiguated))

    # discard() avoids a KeyError when every word was disambiguated
    set_q_synsets.discard(None)

    answers = []
    for sent in sentences_tagged:
        key_sentence_words = set([ (tup[0].lower(),tup[1]) for tup in sent[1] if tup[0].lower() not in stop_words \
              and tup[0].lower() not in punct and not '\'' in tup[0] ])
        #key_sentence_words = set(map(lambda w: lemmatizer.lemmatize(w[0],pos=pos_match.get(re.match('^(..?)\w*',w[1]).group(0),'n')),key_sentence_words))
        key_sentence_words = set(
            map(lambda w: (stemmer.stem(w[0]), w[1]), key_sentence_words))
        key_sentence_w_posDict = {}

        for stem, pos in key_sentence_words:
            key_sentence_w_posDict[stem] = pos

        sen_disambiguated = disambiguate(sent[2][0])
        set_sen_synsets = set(map(lambda w: w[1], sen_disambiguated))
        set_sen_synsets.discard(None)
        #set_sen_synsets = set(map(lambda w: re.match()))
        # print(disambiguate('Where did Andrew and his dad go'))
        # for word, syn in sen_disambiguated:
        #     if syn is not None:
        #         print(syn)
        #         print (wn.synset('circus.n.05').definition())
        #         print (wn.synset('circus.n.05')._lexname)
        #         print(syn._pos)
        #         print(syn._lemmas)
        #         synRe = re.match('(\w+)\.\(w+)\.(\w+)',syn)

        key_sentence_words = set(map(lambda w: w[0], key_sentence_words))

        overlap = 0
        overlapList = (key_question_words & key_sentence_words)
        for word in overlapList:
            if "nn" in key_sentence_w_posDict[word].lower():
                overlap += .5
            elif "vb" in key_sentence_w_posDict[word].lower():
                overlap += 2.5
            elif "rb" in key_sentence_w_posDict[word].lower():
                overlap += .25
            else:
                overlap += 1

        synsetOverlap = (set_q_synsets & set_sen_synsets)
        synsetOverlap = set(filter(lambda q: q is not None, synsetOverlap))
        overlap += len(synsetOverlap)

        # print(key_sentence_words)

        answers.append((overlap, (sent[0], key_sentence_words)))

    answers = sorted(answers, key=operator.itemgetter(0), reverse=True)
    best_answer = (answers[0])[1]
    #print("answer:", " ".join(tup[0] for tup in best_answer))
    #print(best_answer)

    # if question_word == 'why':
    #     bestAnswerIndex = 0
    #     if (answers[0][0]/len(answers[0][1]) >= .9):
    #         for i in range(len(sentence_id_sent)):
    #             if sentence_id_sent[i][0] == best_answer[0]:
    #                 bestAnswerIndex = i
    #         if bestAnswerIndex < len(sentence_id_sent):
    #             # print (sentence_id_sent[i][0])

    #             return sentence_id_sent[i][0]

    answerid = best_answer[0]
    return answerid  #, " ".join(tup[0] for tup in best_answer[1])
                                ):
                                    if example == examplelist:
                                        context_syn.append(synsetelem)
                        #meaning specified -> map meaning to pronunciation
                        classified_pronun = map_meaning2pronun(
                            context_syn, targetword, targettag)
                        result_hetero.setdefault(targetword, [])
                        result_hetero[targetword].append(classified_pronun)

                    #noun case
                    elif 'NN' in targettag:
                        result_syn = []
                        hetero_num_in_sentence_noun += 1
                        iteration = 0
                        # pywsd used
                        wsdlist = disambiguate(sentence)
                        for wsdentry in wsdlist:
                            if wsdentry[0].lower(
                            ) == targetword and wsdentry[1] is not None:
                                if wsdentry[1].pos() == 'n':
                                    iteration += 1
                                    if hetero_num_in_sentence_noun == iteration:
                                        result_syn.append(wsdentry[1])

                        classified_pronun = map_meaning2pronun(
                            result_syn, targetword, targettag)
                        result_hetero.setdefault(targetword, [])
                        result_hetero[targetword].append(classified_pronun)

    # pattern heteronym
    if len(pos_heterolist) > 0: