def getWeightsForQuery(self, query):
    """Return, for each document, the tf-idf weight of every indexed term
    that also appears in the (stemmed) query; other terms get weight 0."""
    indexTf = self.index.getIndex()
    words = query.lower().split()
    words_stem = []
    for w in words:
        words_stem.append(porter.stem(w))
    resultat = dict()
    docs = indexTf.keys()
    for d in docs:
        doc = dict()
        terms = indexTf[d].keys()
        for t in terms:
            if t in words_stem:
                # Log-scaled term frequency times a smoothed idf.
                doc[t] = (1 + np.log(indexTf[d][t])) * np.log(
                    1 + len(docs) / (1 + len(self.index.getTfsForStem(t).keys())))
            else:
                doc[t] = 0
        resultat[d] = doc
    return resultat

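# A minimal stand-alone sketch of the weight formula used above, assuming the
# same log-tf / smoothed-idf combination; tfidf_weight, n_docs and doc_freq are
# illustrative names, not part of the class API.
import numpy as np

def tfidf_weight(tf, n_docs, doc_freq):
    """Log-scaled tf times a smoothed idf: (1 + ln tf) * ln(1 + N / (1 + df))."""
    return (1 + np.log(tf)) * np.log(1 + n_docs / (1 + doc_freq))

# Example: a term occurring 3 times in a document, in a collection of 100
# documents of which 10 contain the term: tfidf_weight(3, 100, 10) ~ 4.85.
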
def getWeightsForQuery(self, query):
    """Weight each indexed term by its number of occurrences in the
    (stemmed) query; terms absent from the query get weight 0."""
    words = query.lower().split()
    words_stem = []
    for w in words:
        words_stem.append(porter.stem(w))
    counter = dict(collections.Counter(words_stem))
    resultat = dict()
    indexTf = self.index.getIndex()
    docs = indexTf.keys()
    for d in docs:
        doc = dict()
        terms = indexTf[d].keys()
        for t in terms:
            if t in words_stem:
                doc[t] = counter[t]
            else:
                doc[t] = 0
        resultat[d] = doc
    return resultat

def simple_signature(ambiguous_word, pos=None, lemma=True, stem=False, \
                     hyperhypo=True, stop=True):
    """Return a {synset: signature words} dict built from each sense's
    definition, examples, lemma names and (optionally) hyper/hyponyms."""
    synsets_signatures = {}
    for ss in wn.synsets(ambiguous_word):
        try:
            if pos and str(ss.pos()) != pos:
                continue
        except:
            if pos and str(ss.pos) != pos:
                continue
        signature = []
        ss_definition = synset_properties(ss, 'definition')
        # Split the definition into words; concatenating the raw string would
        # add individual characters to the signature.
        signature += ss_definition.split()
        ss_examples = synset_properties(ss, 'examples')
        signature += list(chain(*[i.split() for i in ss_examples]))
        ss_lemma_names = synset_properties(ss, 'lemma_names')
        signature += ss_lemma_names
        if hyperhypo == True:
            ss_hyponyms = synset_properties(ss, 'hyponyms')
            ss_hypernyms = synset_properties(ss, 'hypernyms')
            ss_hypohypernyms = ss_hypernyms + ss_hyponyms
            signature += list(chain(*[i.lemma_names() for i in ss_hypohypernyms]))
        if stop == True:
            signature = [i for i in signature if i not in EN_STOPWORDS]
        if lemma == True:
            signature = [lemmatize(i) for i in signature]
        if stem == True:
            signature = [porter.stem(i) for i in signature]
        synsets_signatures[ss] = signature
    return synsets_signatures

def getScores(self, query, pertinence=None):
    """Compute document scores with Okapi BM25."""
    words = query.lower().split()
    words_stem = []
    for w in words:
        words_stem.append(porter.stem(w))
    index = self.index.getIndex()
    index_inverse = self.index.getIndexInverse()
    idf = {t: self.getIdf(t, index, index_inverse, pertinence) for t in words_stem}
    length_docs = {doc: sum([int(tf) for tf in index[doc].values()]) for doc in index.keys()}
    mean_length_docs = np.mean([int(val) for val in length_docs.values()])
    docs = index.keys()
    docsScores = {}
    for d in docs:
        score = 0
        for t in words_stem:
            score += idf[t] * index[d].get(t, 0) / (index[d].get(t, 0)
                     + self.K1 * (1 - self.B + self.B * (length_docs[d] / mean_length_docs)))
        docsScores[d] = score
    return docsScores

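# A stand-alone sketch of the per-term BM25 contribution computed above, with
# illustrative parameter names (k1 and b stand in for the constants stored in
# self.K1 and self.B); this mirrors the formula, not the class API.
def bm25_term_score(tf, idf, doc_len, avg_doc_len, k1=1.2, b=0.75):
    """Okapi BM25 contribution of one query term to one document's score."""
    return idf * tf / (tf + k1 * (1 - b + b * doc_len / avg_doc_len))

# Example: tf=2, idf=1.5, document twice the average length:
# bm25_term_score(2, 1.5, 200, 100) = 1.5 * 2 / (2 + 1.2 * 1.75) ~ 0.73.
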
def adapted_lesk(context_sentence, ambiguous_word, \
                 pos=None, lemma=True, stem=True, hyperhypo=True, \
                 stop=True, context_is_lemmatized=False, \
                 nbest=False, keepscore=False, normalizescore=False):
    """
    This function is the implementation of the Adapted Lesk algorithm,
    described in Banerjee and Pedersen (2002). It makes use of the lexical
    items from semantically related senses within the wordnet hierarchies
    to generate more lexical items for each sense.
    see www.d.umn.edu/~tpederse/Pubs/cicling2002-b.pdf
    """
    # Ensure that ambiguous word is a lemma.
    ambiguous_word = lemmatize(ambiguous_word)
    # If ambiguous word not in WordNet return None
    if not wn.synsets(ambiguous_word):
        return None
    # Get the signatures for each synset.
    ss_sign = simple_signature(ambiguous_word, pos, lemma, stem, hyperhypo)
    for ss in ss_sign:
        # Includes holonyms.
        ss_mem_holonyms = synset_properties(ss, 'member_holonyms')
        ss_part_holonyms = synset_properties(ss, 'part_holonyms')
        ss_sub_holonyms = synset_properties(ss, 'substance_holonyms')
        # Includes meronyms.
        ss_mem_meronyms = synset_properties(ss, 'member_meronyms')
        ss_part_meronyms = synset_properties(ss, 'part_meronyms')
        ss_sub_meronyms = synset_properties(ss, 'substance_meronyms')
        # Includes similar_tos
        ss_simto = synset_properties(ss, 'similar_tos')

        related_senses = list(set(ss_mem_holonyms + ss_part_holonyms +
                                  ss_sub_holonyms + ss_mem_meronyms +
                                  ss_part_meronyms + ss_sub_meronyms + ss_simto))

        signature = list([j for j in chain(
            *[synset_properties(i, 'lemma_names') for i in related_senses])
            if j not in EN_STOPWORDS])

        # Lemmatized context is preferred over stemmed context
        if lemma == True:
            signature = [lemmatize(i) for i in signature]
        # Matching exact words causes sparsity, so optional matching for stems.
        if stem == True:
            signature = [porter.stem(i) for i in signature]
        # Adds the extended signature to the simple signatures.
        ss_sign[ss] += signature

    # Disambiguate the sense in context.
    if context_is_lemmatized:
        context_sentence = context_sentence.split()
    else:
        context_sentence = lemmatize_sentence(context_sentence)
    best_sense = compare_overlaps(context_sentence, ss_sign, \
                                  nbest=nbest, keepscore=keepscore, \
                                  normalizescore=normalizescore)
    return best_sense

def adapted_lesk(context_sentence, ambiguous_word, \
                 pos=None, lemma=True, stem=True, hyperhypo=True, \
                 stop=True, context_is_lemmatized=False, \
                 nbest=False, keepscore=False, normalizescore=False):
    """
    This function is the implementation of the Adapted Lesk algorithm,
    described in Banerjee and Pedersen (2002). It makes use of the lexical
    items from semantically related senses within the wordnet hierarchies
    to generate more lexical items for each sense.
    see www.d.umn.edu/~tpederse/Pubs/cicling2002-b.pdf
    """
    # Ensure that ambiguous word is a lemma.
    ambiguous_word = lemmatize(ambiguous_word)
    # Get the signatures for each synset.
    ss_sign = simple_signature(ambiguous_word, pos, lemma, stem, hyperhypo)
    for ss in ss_sign:
        related_senses = list(set(ss.member_holonyms() + ss.member_meronyms() +
                                  ss.part_meronyms() + ss.part_holonyms() +
                                  ss.similar_tos() + ss.substance_holonyms() +
                                  ss.substance_meronyms()))
        try:
            signature = list([j for j in chain(*[i.lemma_names() for i in
                              related_senses]) if j not in stopwords.words('english')])
        except:
            signature = list([j for j in chain(*[i.lemma_names for i in
                              related_senses]) if j not in stopwords.words('english')])
        # Lemmatized context is preferred over stemmed context
        if lemma == True:
            signature = [lemmatize(i) for i in signature]
        # Matching exact words causes sparsity, so optional matching for stems.
        if stem == True:
            signature = [porter.stem(i) for i in signature]
        ss_sign[ss] += signature

    # Disambiguate the sense in context.
    if context_is_lemmatized:
        context_sentence = [porter.stem(i) for i in context_sentence.split()]
    else:
        context_sentence = lemmatize_sentence(context_sentence)
    best_sense = compare_overlaps(context_sentence, ambiguous_word, ss_sign, \
                                  nbest=nbest, keepscore=keepscore, \
                                  normalizescore=normalizescore)
    return best_sense

def adapted_lesk(context_sentence, ambiguous_word, \
                 pos=None, lemma=True, stem=True, hyperhypo=True, \
                 stop=True, context_is_lemmatized=False, \
                 nbest=False, keepscore=False, normalizescore=False):
    """
    This function is the implementation of the Adapted Lesk algorithm,
    described in Banerjee and Pedersen (2002). It makes use of the lexical
    items from semantically related senses within the wordnet hierarchies
    to generate more lexical items for each sense.
    see www.d.umn.edu/~tpederse/Pubs/cicling2002-b.pdf
    """
    # Ensure that ambiguous word is a lemma.
    ambiguous_word = lemmatize(ambiguous_word)
    # If ambiguous word not in WordNet return None
    if not wn.synsets(ambiguous_word):
        return None
    # Get the signatures for each synset.
    ss_sign = simple_signature(ambiguous_word, pos, lemma, stem, hyperhypo)
    for ss in ss_sign:
        # Includes holonyms.
        ss_mem_holonyms = synset_properties(ss, 'member_holonyms')
        ss_part_holonyms = synset_properties(ss, 'part_holonyms')
        ss_sub_holonyms = synset_properties(ss, 'substance_holonyms')
        # Includes meronyms.
        ss_mem_meronyms = synset_properties(ss, 'member_meronyms')
        ss_part_meronyms = synset_properties(ss, 'part_meronyms')
        ss_sub_meronyms = synset_properties(ss, 'substance_meronyms')
        # Includes similar_tos
        ss_simto = synset_properties(ss, 'similar_tos')

        related_senses = list(set(ss_mem_holonyms + ss_part_holonyms +
                                  ss_sub_holonyms + ss_mem_meronyms +
                                  ss_part_meronyms + ss_sub_meronyms +
                                  ss_simto))

        signature = list([j for j in chain(*[synset_properties(i, 'lemma_names')
                                             for i in related_senses])
                          if j not in EN_STOPWORDS])

        # Lemmatized context is preferred over stemmed context
        if lemma == True:
            signature = [lemmatize(i) for i in signature]
        # Matching exact words causes sparsity, so optional matching for stems.
        if stem == True:
            signature = [porter.stem(i) for i in signature]
        # Adds the extended signature to the simple signatures.
        ss_sign[ss] += signature

    # Disambiguate the sense in context.
    if context_is_lemmatized:
        context_sentence = context_sentence.split()
    else:
        context_sentence = lemmatize_sentence(context_sentence)
    best_sense = compare_overlaps(context_sentence, ss_sign, \
                                  nbest=nbest, keepscore=keepscore, \
                                  normalizescore=normalizescore)
    return best_sense

def counter(doc):
    """
    Process a document.
    Args:
        - doc: the document to process
    Return:
        - (word, occurrence count) for every word in the document
    """
    # Stem every lowercased token, then drop stop words.
    l = [pt.stem(w.lower()) for w in doc.split(" ")]
    stemmer = tr.PorterStemmer()
    l = [word for word in l if word not in stemmer.stopWords]
    return dict(collections.Counter(l).items())

def simple_signature(ambiguous_word, pos=None, lemma=True, stem=False, \
                     hyperhypo=True, stop=True):
    """
    Returns a synsets_signatures dictionary that includes signature
    words of a sense from its:
    (i)   definition
    (ii)  example sentences
    (iii) hypernyms and hyponyms
    """
    synsets_signatures = {}
    for ss in wn.synsets(ambiguous_word):
        # If POS is specified.
        try:
            if pos and str(ss.pos()) != pos:
                continue
        except:
            if pos and str(ss.pos) != pos:
                continue
        signature = []
        # Includes definition.
        try:
            signature += ss.definition().split()
        except:
            signature += ss.definition.split()
        # Includes examples
        try:
            signature += list(chain(*[i.split() for i in ss.examples()]))
        except:
            signature += list(chain(*[i.split() for i in ss.examples]))
        # Includes lemma_names.
        try:
            signature += ss.lemma_names()
        except:
            signature += ss.lemma_names
        # Optional: includes lemma_names of hypernyms and hyponyms.
        if hyperhypo == True:
            try:
                signature += list(chain(*[i.lemma_names() for i
                                          in ss.hypernyms() + ss.hyponyms()]))
            except:
                signature += list(chain(*[i.lemma_names for i
                                          in ss.hypernyms() + ss.hyponyms()]))
        # Optional: removes stopwords.
        if stop == True:
            signature = [i for i in signature if i not in stopwords.words('english')]
        # Lemmatized context is preferred over stemmed context
        if lemma == True:
            signature = [lemmatize(i) for i in signature]
        # Matching exact words causes sparsity, so optional matching for stems.
        if stem == True:
            signature = [porter.stem(i) for i in signature]
        signature = [i.lower() for i in signature]
        synsets_signatures[ss] = signature
    return synsets_signatures

def getScores(self, query):
    """Score documents with the vector space model: cosine similarity when
    self.normalized is True, plain dot product otherwise."""
    weighter = self.getWeighter()
    words = query.lower().split()
    self.norm_doc = {}  # Document norms, cached for reuse
    words_stem = []
    for w in words:
        words_stem.append(porter.stem(w))
    if self.normalized:
        # Cosine method
        docs = weighter.getWeightsForQuery(query)
        keysDocs = docs.keys()
        docsScores = {}
        for k in keysDocs:
            score = 0
            weighterDoc = weighter.getWeightsForDoc(k)
            for stem in words_stem:
                if stem in docs[k].keys():
                    score += docs[k][stem] * weighterDoc[stem]
            norm_vect1 = np.linalg.norm(np.array(list(docs[k].values())), ord=2)
            if k in self.norm_doc.keys():
                norm_vect2 = self.norm_doc[k]
            else:
                norm_vect2 = np.linalg.norm(np.array(list(weighterDoc.values())), ord=2)
                self.norm_doc[k] = norm_vect2
            # Cosine similarity divides the dot product by the product of the norms.
            docsScores[k] = score / (norm_vect1 * norm_vect2)
    else:
        # Dot product
        docs = weighter.getWeightsForQuery(query)
        keysDocs = docs.keys()
        docsScores = {}
        for k in keysDocs:
            score = 0
            weighterDoc = weighter.getWeightsForDoc(k)
            for stem in words_stem:
                if stem in docs[k].keys():
                    score += docs[k][stem] * weighterDoc[stem]
            docsScores[k] = score
    return docsScores

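# A minimal sketch of the cosine score used in the normalized branch above,
# written directly over two {term: weight} dicts; cosine_score is an
# illustrative helper, not a method of the class.
import numpy as np

def cosine_score(query_weights, doc_weights):
    """Dot product over shared terms divided by the product of the two norms."""
    dot = sum(w * doc_weights[t] for t, w in query_weights.items() if t in doc_weights)
    norm_q = np.linalg.norm(list(query_weights.values()))
    norm_d = np.linalg.norm(list(doc_weights.values()))
    return dot / (norm_q * norm_d) if norm_q and norm_d else 0.0
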
def cosine_lesk(
    context_sentence,
    ambiguous_word,
    pos=None,
    lemma=True,
    stem=True,
    hyperhypo=True,
    stop=True,
    context_is_lemmatized=False,
    nbest=False,
):
    """
    In line with vector space models, we can use cosine to calculate overlaps
    instead of using raw overlap counts. Essentially, the idea of using
    signatures (aka 'sense paraphrases') is lesk-like.
    """
    # Ensure that ambiguous word is a lemma.
    ambiguous_word = lemmatize(ambiguous_word)
    # If ambiguous word not in WordNet return None
    if not wn.synsets(ambiguous_word):
        return None
    synsets_signatures = simple_signature(ambiguous_word, pos, lemma, stem, hyperhypo)

    if context_is_lemmatized:
        context_sentence = " ".join(context_sentence.split())
    else:
        context_sentence = " ".join(lemmatize_sentence(context_sentence))

    scores = []
    for ss, signature in synsets_signatures.items():
        # Lowercase and replace "_" with spaces.
        signature = " ".join(map(str, signature)).lower().replace("_", " ")
        # Removes punctuation.
        signature = [i for i in word_tokenize(signature) if i not in string.punctuation]
        # Optional: remove stopwords.
        if stop:
            signature = [i for i in signature if i not in EN_STOPWORDS]
        # Optional: Lemmatize the tokens.
        if lemma == True:
            signature = [lemmatize(i) for i in signature]
        # Optional: stem the tokens.
        if stem:
            signature = [porter.stem(i) for i in signature]
        scores.append((cos_sim(context_sentence, " ".join(signature)), ss))

    if not nbest:
        return sorted(scores, reverse=True)[0][1]
    else:
        return [(j, i) for i, j in sorted(scores, reverse=True)]

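# cos_sim above is imported from elsewhere; as an assumption about what such a
# string similarity computes, here is a minimal bag-of-words cosine over two
# whitespace-tokenized strings (illustrative only, not the actual implementation).
from collections import Counter
import math

def bow_cosine(a, b):
    """Cosine similarity between the token-count vectors of two strings."""
    va, vb = Counter(a.split()), Counter(b.split())
    dot = sum(va[t] * vb[t] for t in va.keys() & vb.keys())
    norm = math.sqrt(sum(c * c for c in va.values())) * \
           math.sqrt(sum(c * c for c in vb.values()))
    return dot / norm if norm else 0.0
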
def simple_signature(ambiguous_word, pos=None, lemma=True, stem=False, \
                     hyperhypo=True, stop=True):
    """
    Returns a synsets_signatures dictionary that includes signature
    words of a sense from its:
    (i)   definition
    (ii)  example sentences
    (iii) hypernyms and hyponyms
    """
    synsets_signatures = {}
    for ss in wn.synsets(ambiguous_word):
        try:  # If POS is specified.
            if pos and str(ss.pos()) != pos:
                continue
        except:
            if pos and str(ss.pos) != pos:
                continue
        signature = []
        # Includes definition.
        ss_definition = synset_properties(ss, 'definition')
        signature += ss_definition.split()
        # Includes examples
        ss_examples = synset_properties(ss, 'examples')
        signature += list(chain(*[i.split() for i in ss_examples]))
        # Includes lemma_names.
        ss_lemma_names = synset_properties(ss, 'lemma_names')
        signature += ss_lemma_names
        # Optional: includes lemma_names of hypernyms and hyponyms.
        if hyperhypo == True:
            ss_hyponyms = synset_properties(ss, 'hyponyms')
            ss_hypernyms = synset_properties(ss, 'hypernyms')
            ss_hypohypernyms = ss_hypernyms + ss_hyponyms
            signature += list(chain(*[i.lemma_names() for i in ss_hypohypernyms]))
        # Optional: removes stopwords.
        if stop == True:
            signature = [i for i in signature if i not in EN_STOPWORDS]
        # Lemmatized context is preferred over stemmed context.
        if lemma == True:
            signature = [lemmatize(i) for i in signature]
        # Matching exact words may cause sparsity, so optional matching for stems.
        if stem == True:
            signature = [porter.stem(i) for i in signature]
        synsets_signatures[ss] = signature
    return synsets_signatures

def simple_signature(ambiguous_word, pos=None, lemma=True, stem=False, \
                     hyperhypo=True, stop=True):
    """
    Returns a synsets_signatures dictionary that includes signature
    words of a sense from its:
    (i)   definition
    (ii)  example sentences
    (iii) hypernyms and hyponyms
    """
    synsets_signatures = {}
    for ss in wn.synsets(ambiguous_word):
        try:  # If POS is specified.
            if pos and str(ss.pos()) != pos:
                continue
        except:
            if pos and str(ss.pos) != pos:
                continue
        signature = []
        # Includes definition, split into words; concatenating the raw string
        # would add individual characters to the signature.
        ss_definition = synset_properties(ss, 'definition')
        signature += ss_definition.split()
        # Includes examples
        ss_examples = synset_properties(ss, 'examples')
        signature += list(chain(*[i.split() for i in ss_examples]))
        # Includes lemma_names.
        ss_lemma_names = synset_properties(ss, 'lemma_names')
        signature += ss_lemma_names
        # Optional: includes lemma_names of hypernyms and hyponyms.
        if hyperhypo == True:
            ss_hyponyms = synset_properties(ss, 'hyponyms')
            ss_hypernyms = synset_properties(ss, 'hypernyms')
            ss_hypohypernyms = ss_hypernyms + ss_hyponyms
            signature += list(chain(*[i.lemma_names() for i in ss_hypohypernyms]))
        # Optional: removes stopwords.
        if stop == True:
            signature = [i for i in signature if i not in EN_STOPWORDS]
        # Lemmatized context is preferred over stemmed context.
        if lemma == True:
            signature = [lemmatize(i) for i in signature]
        # Matching exact words may cause sparsity, so optional matching for stems.
        if stem == True:
            signature = [porter.stem(i) for i in signature]
        synsets_signatures[ss] = signature
    return synsets_signatures

def cosine_lesk(context_sentence, ambiguous_word, \
                pos=None, lemma=True, stem=True, hyperhypo=True, \
                stop=True, context_is_lemmatized=False, \
                nbest=False):
    """
    In line with vector space models, we can use cosine to calculate overlaps
    instead of using raw overlap counts. Essentially, the idea of using
    signatures (aka 'sense paraphrases') is lesk-like.
    """
    # Ensure that ambiguous word is a lemma.
    ambiguous_word = lemmatize(ambiguous_word)
    # If ambiguous word not in WordNet return None
    if not wn.synsets(ambiguous_word):
        return None
    synsets_signatures = simple_signature(ambiguous_word, pos, lemma, stem, hyperhypo)

    if context_is_lemmatized:
        context_sentence = " ".join(context_sentence.split())
    else:
        context_sentence = " ".join(lemmatize_sentence(context_sentence))

    scores = []
    for ss, signature in synsets_signatures.items():
        # Lowercase and replace "_" with spaces.
        signature = " ".join(map(str, signature)).lower().replace("_", " ")
        # Removes punctuation.
        signature = [i for i in word_tokenize(signature) \
                     if i not in string.punctuation]
        # Optional: remove stopwords.
        if stop:
            signature = [i for i in signature if i not in EN_STOPWORDS]
        # Optional: Lemmatize the tokens.
        if lemma == True:
            signature = [lemmatize(i) for i in signature]
        # Optional: stem the tokens.
        if stem:
            signature = [porter.stem(i) for i in signature]
        scores.append((cos_sim(context_sentence, " ".join(signature)), ss))

    if not nbest:
        return sorted(scores, reverse=True)[0][1]
    else:
        return [(j, i) for i, j in sorted(scores, reverse=True)]

def adapted_lesk(context_sentence, ambiguous_word, \
                 pos=None, lemma=True, stem=True, hyperhypo=True, \
                 stop=True, context_is_lemmatized=False, \
                 nbest=False, keepscore=False, normalizescore=False):
    """Adapted Lesk: extend each sense's simple signature with lemma names of
    related senses (holonyms, meronyms, similar_tos), then compare overlaps
    with the context."""
    # Ensure ambiguous word is a lemma.
    ambiguous_word = lemmatize(ambiguous_word)
    # If ambiguous word not in WordNet return None
    if not wn.synsets(ambiguous_word):
        return None
    ss_sign = simple_signature(ambiguous_word, pos, lemma, stem, hyperhypo)
    for ss in ss_sign:
        ss_mem_holonyms = synset_properties(ss, 'member_holonyms')
        ss_part_holonyms = synset_properties(ss, 'part_holonyms')
        ss_sub_holonyms = synset_properties(ss, 'substance_holonyms')
        ss_mem_meronyms = synset_properties(ss, 'member_meronyms')
        ss_part_meronyms = synset_properties(ss, 'part_meronyms')
        ss_sub_meronyms = synset_properties(ss, 'substance_meronyms')
        ss_simto = synset_properties(ss, 'similar_tos')

        related_senses = list(set(ss_mem_holonyms + ss_part_holonyms +
                                  ss_sub_holonyms + ss_mem_meronyms +
                                  ss_part_meronyms + ss_sub_meronyms +
                                  ss_simto))

        signature = list([j for j in chain(*[synset_properties(i, 'lemma_names')
                                             for i in related_senses])
                          if j not in EN_STOPWORDS])

        if lemma == True:
            signature = [lemmatize(i) for i in signature]
        # if stem == True: signature = [porter.stem(i) for i in signature]
        ss_sign[ss] += signature

    if context_is_lemmatized:
        context_sentence = context_sentence.split()
    else:
        context_sentence = lemmatize_sentence(context_sentence)
    best_sense = compare_overlaps(context_sentence, ss_sign, \
                                  nbest=nbest, keepscore=keepscore, \
                                  normalizescore=normalizescore)
    return best_sense

def simple_lesk(context_sentence, ambiguous_word, \
                pos=None, lemma=True, stem=False, hyperhypo=True, \
                stop=True, context_is_lemmatized=False, \
                nbest=False, keepscore=False, normalizescore=False):
    """
    Simple Lesk uses more signature words than the original Lesk algorithm
    (1986) but fewer than Adapted Lesk (Banerjee and Pedersen, 2002).
    """
    # Ensure that ambiguous word is a lemma.
    ambiguous_word = lemmatize(ambiguous_word).lower()
    # Get the signatures for each synset.
    ss_sign = simple_signature(ambiguous_word, pos, lemma, stem, hyperhypo)
    # Disambiguate the sense in context.
    if context_is_lemmatized:
        context_sentence = [porter.stem(i.lower()) for i in context_sentence.split()]
    else:
        context_sentence = lemmatize_sentence(context_sentence)
    best_sense = compare_overlaps(context_sentence, ambiguous_word, ss_sign, \
                                  nbest=nbest, keepscore=keepscore, \
                                  normalizescore=normalizescore)
    return best_sense

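# compare_overlaps is defined elsewhere; as an illustration of the Lesk idea it
# relies on, a hypothetical overlap count between the context and each sense
# signature (an assumption, not the actual pywsd function) could look like:
def count_overlaps(context_tokens, synsets_signatures):
    """Return the synset whose signature shares the most tokens with the context."""
    scores = {ss: len(set(context_tokens) & set(sig))
              for ss, sig in synsets_signatures.items()}
    return max(scores, key=scores.get) if scores else None
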
def getScores(self, query):
    """
    Parameters
    ----------
    query : String

    Returns
    -------
    The document scores as (doc, score) pairs in a dict.
    The score is based on Jelinek-Mercer smoothing.
    """
    words = query.lower().split()
    words_stem = []
    for w in words:
        words_stem.append(porter.stem(w))
    index = self.index.getIndex()
    docs = index.keys()
    docsScores = {}
    probaCollection = {}
    sizeOfCollection = self.index.getCollectionSize()
    # Collection probability of each query term: total occurrences / collection size.
    for t in words_stem:
        probaCollection[t] = sum([doc.get(t, 0) for doc in index.values()]) / sizeOfCollection
    for d in docs:
        score = 1
        for t in words_stem:
            score *= (1 - self.lambda_) * index[d].get(t, 0) + self.lambda_ * probaCollection[t]
        docsScores[d] = score
    return docsScores

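# A stand-alone sketch of the Jelinek-Mercer mixture multiplied up above;
# term_weight_in_doc mirrors index[d].get(t, 0) and p_collection mirrors
# probaCollection[t] (illustrative names only, not part of the class API).
def jm_term_probability(term_weight_in_doc, p_collection, lambda_):
    """Linear interpolation between the document estimate and the collection estimate."""
    return (1 - lambda_) * term_weight_in_doc + lambda_ * p_collection

# Example: jm_term_probability(0.2, 0.05, 0.5) = 0.5*0.2 + 0.5*0.05 = 0.125.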