Code example #1
File: similarity.py Project: ChenglongChen/pywsd
def max_similarity(context_sentence, ambiguous_word, option="path", 
                   lemma=True, context_is_lemmatized=False, pos=None, best=True):
    """
    Perform WSD by maximizing the sum of maximum similarity between possible 
    synsets of all words in the context sentence and the possible synsets of the 
    ambiguous word (see http://goo.gl/XMq2BI):
    \argmax_{synset(a)} \sum_{i}^{n} \max_{synset(i)} sim(i, a)
    """
    ambiguous_word = lemmatize(ambiguous_word)
    # If ambiguous word not in WordNet return None
    if not wn.synsets(ambiguous_word):
        return None
    if context_is_lemmatized:
        context_sentence = word_tokenize(context_sentence)
    else:
        context_sentence = [lemmatize(w) for w in word_tokenize(context_sentence)]
    result = {}
    for i in wn.synsets(ambiguous_word):
        try:
            if pos and pos != str(i.pos()):
                continue
        except:
            if pos and pos != str(i.pos):
                continue 
        result[i] = sum(max([sim(i,k,option) for k in wn.synsets(j)]+[0]) \
                        for j in context_sentence)
    
    if option in ["res","resnik"]: # lower score = more similar
        result = sorted([(v,k) for k,v in result.items()])
    else: # higher score = more similar
        result = sorted([(v,k) for k,v in result.items()],reverse=True)
    if best: return result[0][1]
    return result
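A minimal usage sketch for the function above. It assumes the function is importable as pywsd.similarity.max_similarity (as in upstream pywsd); the example sentence and the printed synset are illustrative only.

from pywsd.similarity import max_similarity  # assumption: a pywsd-style package is installed

# Disambiguate "bank" against every other word in the sentence using path similarity.
sense = max_similarity("I went to the bank to deposit my money",
                       "bank", option="path", pos="n")
print(sense)  # e.g. Synset('depository_financial_institution.n.01')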
Code example #2
def adapted_lesk(context_sentence, ambiguous_word, \
                pos=None, lemma=True, stem=True, hyperhypo=True, \
                stop=True, context_is_lemmatized=False, \
                nbest=False, keepscore=False, normalizescore=False):
    """
    This function is an implementation of the Adapted Lesk algorithm
    described in Banerjee and Pedersen (2002). It makes use of the lexical
    items from semantically related senses within the WordNet
    hierarchies to generate more lexical items for each sense.
    See www.d.umn.edu/~tpederse/Pubs/cicling2002-b.pdf
    """
    # Ensure that ambiguous word is a lemma.
    ambiguous_word = lemmatize(ambiguous_word)
    # If ambiguous word not in WordNet return None
    if not wn.synsets(ambiguous_word):
        return None
    # Get the signatures for each synset.
    ss_sign = simple_signature(ambiguous_word, pos, lemma, stem, hyperhypo)
    for ss in ss_sign:
        # Includes holonyms.
        ss_mem_holonyms = synset_properties(ss, 'member_holonyms')
        ss_part_holonyms = synset_properties(ss, 'part_holonyms')
        ss_sub_holonyms = synset_properties(ss, 'substance_holonyms')
        # Includes meronyms.
        ss_mem_meronyms = synset_properties(ss, 'member_meronyms')
        ss_part_meronyms = synset_properties(ss, 'part_meronyms')
        ss_sub_meronyms = synset_properties(ss, 'substance_meronyms')
        # Includes similar_tos
        ss_simto = synset_properties(ss, 'similar_tos')

        related_senses = list(
            set(ss_mem_holonyms + ss_part_holonyms + ss_sub_holonyms +
                ss_mem_meronyms + ss_part_meronyms + ss_sub_meronyms +
                ss_simto))

        signature = list([
            j for j in chain(
                *[synset_properties(i, 'lemma_names') for i in related_senses])
            if j not in EN_STOPWORDS
        ])

        # Lemmatized context is preferred over stemmed context.
        if lemma == True:
            signature = [lemmatize(i) for i in signature]
        # Matching exact words causes sparsity, so optional matching for stems.
        if stem == True:
            signature = [porter.stem(i) for i in signature]
        # Adds the extended signature to the simple signature of each sense.
        ss_sign[ss] += signature

    # Disambiguate the sense in context.
    if context_is_lemmatized:
        context_sentence = context_sentence.split()
    else:
        context_sentence = lemmatize_sentence(context_sentence)
    best_sense = compare_overlaps(context_sentence, ss_sign, \
                                    nbest=nbest, keepscore=keepscore, \
                                    normalizescore=normalizescore)
    return best_sense
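A hedged usage sketch for adapted_lesk, assuming it is importable as pywsd.lesk.adapted_lesk; the sentence and the printed sense are illustrative only.

from pywsd.lesk import adapted_lesk  # assumption: a pywsd-style package is installed

answer = adapted_lesk("I went to the bank to deposit my money", "bank", pos="n")
print(answer)  # e.g. Synset('depository_financial_institution.n.01')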
Code example #3
File: lesk.py Project: shreyg/GitFiles
def adapted_lesk(context_sentence, ambiguous_word, \
                pos=None, lemma=True, stem=True, hyperhypo=True, \
                stop=True, context_is_lemmatized=False, \
                nbest=False, keepscore=False, normalizescore=False):
    """
    This function is an implementation of the Adapted Lesk algorithm
    described in Banerjee and Pedersen (2002). It makes use of the lexical
    items from semantically related senses within the WordNet
    hierarchies to generate more lexical items for each sense.
    See www.d.umn.edu/~tpederse/Pubs/cicling2002-b.pdf
    """
    # Ensure that ambiguous word is a lemma.
    ambiguous_word = lemmatize(ambiguous_word)
    # If ambiguous word not in WordNet return None
    if not wn.synsets(ambiguous_word):
        return None
    # Get the signatures for each synset.
    ss_sign = simple_signature(ambiguous_word, pos, lemma, stem, hyperhypo)
    for ss in ss_sign:
        # Includes holonyms.
        ss_mem_holonyms = synset_properties(ss, 'member_holonyms')
        ss_part_holonyms = synset_properties(ss, 'part_holonyms')
        ss_sub_holonyms = synset_properties(ss, 'substance_holonyms')
        # Includes meronyms.
        ss_mem_meronyms = synset_properties(ss, 'member_meronyms')
        ss_part_meronyms = synset_properties(ss, 'part_meronyms')
        ss_sub_meronyms = synset_properties(ss, 'substance_meronyms')
        # Includes similar_tos
        ss_simto = synset_properties(ss, 'similar_tos')
        
        related_senses = list(set(ss_mem_holonyms+ss_part_holonyms+ 
                                  ss_sub_holonyms+ss_mem_meronyms+ 
                                  ss_part_meronyms+ss_sub_meronyms+ ss_simto))
    
        signature = list([j for j in chain(*[synset_properties(i, 'lemma_names') 
                                             for i in related_senses]) 
                          if j not in EN_STOPWORDS])
        
        # Lemmatized context is preferred over stemmed context.
        if lemma == True:
            signature = [lemmatize(i) for i in signature]
        # Matching exact words causes sparsity, so optional matching for stems.
        if stem == True:
            signature = [porter.stem(i) for i in signature]
        # Adds the extended signature to the simple signature of each sense.
        ss_sign[ss] += signature
  
    # Disambiguate the sense in context.
    if context_is_lemmatized:
        context_sentence = context_sentence.split()
    else:
        context_sentence = lemmatize_sentence(context_sentence)
    best_sense = compare_overlaps(context_sentence, ss_sign, \
                                    nbest=nbest, keepscore=keepscore, \
                                    normalizescore=normalizescore)
    return best_sense
Code example #4
def get_alignment_complexity_scores(s0, s1):
    """
    Run Sultan's aligner on two sentences and return a list that, for each word in the first
    sentence, specifies whether it was changed/simplified (1), kept unchanged (2), or could not be
    linked to any word in the second sentence (0).
    :param s0: the first sentence as a list of tokens
    :param s1: the second sentence as a string
    :return:   see above
    """
    s0 = [x.lower() for x in s0]
    s1 = s1.lower()

    # check if the alignment has been performed before
    dict_key = " ".join(s0) + SEPARATOR + s1
    if dict_key in ALIGN_DICT:
        return ALIGN_DICT[dict_key]

    result = np.full(len(s0), UNK)
    ALIGNMENT_STATS["total"] += 1

    try:
        # tokenize and lemmatize the sentences
        s0_tok = tokenize(" ".join(s0))
        s1_tok = tokenize(s1)
        s0_lem = lemmatize(s0_tok)
        s1_lem = lemmatize(s1_tok)
        pairs = align(s0_tok, s1_tok)  # pairs of sentences aligned by Sultan's word aligner
    except:
        ALIGN_DICT[dict_key] = result
        ALIGNMENT_STATS["unsuccessful"] += 1
        return result

    # iterate over aligned pairs and fill the result array
    for i in range(len(pairs[0])):
        w0, w1 = pairs[1][i][0].lower(), pairs[1][i][1].lower()
        if w0 in STOPWORDS or w1 in STOPWORDS:  # such an alignment doesn't matter
            continue
        if w0 == w1 or s0_lem.get(w0, 'w0') == s1_lem.get(w1, 'w1'):
            # the alignment is valid but it only indicates that the word was kept as it is
            id = get_index(s0, w0, i, pairs)
            if id == -1:
                continue
            result[id] = SIMPLE
        else:
            id = get_index(s0, w0, i, pairs)
            if id == -1:
                continue
            result[id] = COMPLEX

    ALIGN_DICT[dict_key] = result
    return result
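The function above relies on several module-level names that are not shown in the snippet. A hedged sketch of how they might be defined, based only on how they are used; the actual values in the project may differ.

import numpy as np

# Codes written into the result array (values follow the docstring above).
UNK, COMPLEX, SIMPLE = 0, 1, 2
# Separator used to build cache keys; assumed to be any string absent from the data.
SEPARATOR = " ||| "
# Cache of previously computed alignments and simple bookkeeping counters.
ALIGN_DICT = {}
ALIGNMENT_STATS = {"total": 0, "unsuccessful": 0}
# Stopword list; e.g. nltk.corpus.stopwords.words('english') in many projects.
STOPWORDS = set()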
Code example #5
File: lesk.py Project: lucasnoah/litmetricscore
def cosine_lesk(
    context_sentence,
    ambiguous_word,
    pos=None,
    lemma=True,
    stem=True,
    hyperhypo=True,
    stop=True,
    context_is_lemmatized=False,
    nbest=False,
):
    """ 
    In line with vector space models, we can use cosine to calculate overlaps
    instead of using raw overlap counts. Essentially, the idea of using 
    signatures (aka 'sense paraphrases') is lesk-like.
    """
    # Ensure that ambiguous word is a lemma.
    ambiguous_word = lemmatize(ambiguous_word)
    # If ambiguous word not in WordNet return None
    if not wn.synsets(ambiguous_word):
        return None
    synsets_signatures = simple_signature(ambiguous_word, pos, lemma, stem, hyperhypo)

    if context_is_lemmatized:
        context_sentence = " ".join(context_sentence.split())
    else:
        context_sentence = " ".join(lemmatize_sentence(context_sentence))

    scores = []
    for ss, signature in synsets_signatures.items():
        # Lowercase and replace "_" with spaces.
        signature = " ".join(map(str, signature)).lower().replace("_", " ")
        # Removes punctuation.
        signature = [i for i in word_tokenize(signature) if i not in string.punctuation]
        # Optional: remove stopwords.
        if stop:
            signature = [i for i in signature if i not in EN_STOPWORDS]
        # Optional: Lemmatize the tokens.
        if lemma == True:
            signature = [lemmatize(i) for i in signature]
        # Optional: stem the tokens.
        if stem:
            signature = [porter.stem(i) for i in signature]
        scores.append((cos_sim(context_sentence, " ".join(signature)), ss))

    if not nbest:
        return sorted(scores, reverse=True)[0][1]
    else:
        return [(j, i) for i, j in sorted(scores, reverse=True)]
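A hedged usage sketch, assuming cosine_lesk is importable as pywsd.lesk.cosine_lesk; with nbest=True every sense is returned, ranked by cosine similarity. The sentence and scores are illustrative.

from pywsd.lesk import cosine_lesk  # assumption: a pywsd-style package is installed

ranked = cosine_lesk("I went to the bank to deposit my money", "bank", nbest=True)
for synset, score in ranked[:3]:
    print(synset, round(score, 3))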
Code example #6
def simple_lesk(context_sentence, ambiguous_word, \
                pos=None, lemma=True, stem=False, hyperhypo=True, \
                stop=True, context_is_lemmatized=False, \
                nbest=False, keepscore=False, normalizescore=False):
    """
    Simple Lesk falls between the original Lesk algorithm (1986) and
    adapted Lesk (Banerjee and Pedersen, 2002): it uses more signature
    words than the former and fewer than the latter.
    """
    # Ensure that ambiguous word is a lemma.
    ambiguous_word = lemmatize(ambiguous_word)
    # If ambiguous word not in WordNet return None
    if not wn.synsets(ambiguous_word):
        return None
    # Get the signatures for each synset.
    ss_sign = simple_signature(ambiguous_word, pos, lemma, stem, hyperhypo)
    # Disambiguate the sense in context.
    if context_is_lemmatized:
        context_sentence = context_sentence.split()
    else:
        context_sentence = lemmatize_sentence(context_sentence)
    best_sense = compare_overlaps(context_sentence, ss_sign, \
                                    nbest=nbest, keepscore=keepscore, \
                                    normalizescore=normalizescore)
    return best_sense
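A hedged usage sketch, assuming simple_lesk is importable as pywsd.lesk.simple_lesk; the definition() call assumes NLTK 3.x synsets, and the example sentence is made up.

from pywsd.lesk import simple_lesk  # assumption: a pywsd-style package is installed

sense = simple_lesk("I went to the bank to deposit my money", "bank", pos="n")
print(sense, "-", sense.definition())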
Code example #7
File: lesk.py Project: shreyg/GitFiles
def simple_lesk(context_sentence, ambiguous_word, \
                pos=None, lemma=True, stem=False, hyperhypo=True, \
                stop=True, context_is_lemmatized=False, \
                nbest=False, keepscore=False, normalizescore=False):
    """
    Simple Lesk falls between the original Lesk algorithm (1986) and
    adapted Lesk (Banerjee and Pedersen, 2002): it uses more signature
    words than the former and fewer than the latter.
    """
    # Ensure that ambiguous word is a lemma.
    ambiguous_word = lemmatize(ambiguous_word) 
    # If ambiguous word not in WordNet return None
    if not wn.synsets(ambiguous_word):
        return None
    # Get the signatures for each synset.
    ss_sign = simple_signature(ambiguous_word, pos, lemma, stem, hyperhypo)
    # Disambiguate the sense in context.
    if context_is_lemmatized:
        context_sentence = context_sentence.split()
    else:
        context_sentence = lemmatize_sentence(context_sentence)
    best_sense = compare_overlaps(context_sentence, ss_sign, \
                                    nbest=nbest, keepscore=keepscore, \
                                    normalizescore=normalizescore)  
    return best_sense
Code example #8
File: model.py Project: Linkor-35/other
    def _tokenize(self, text='The text must be passed to the function as a string!'):
        from utils import lemmatize

        if not self.udpipe_model:
            udpipe_model_path = os.path.join(BASE_DIR, 'model',
                                             'udpipe_syntagrus.model')

            if not os.path.isfile(udpipe_model_path):
                msg = 'UDPipe model not found!'
                logging.critical(msg)
                raise IOError(msg)

            self.udpipe_model = Model.load(udpipe_model_path)

        t = time()
        process_pipeline = Pipeline(self.udpipe_model, 'tokenize',
                                    Pipeline.DEFAULT, Pipeline.DEFAULT,
                                    'conllu')

        result = []
        for line in nltk.sent_tokenize(text):
            # line = unify_sym(line.strip())  # your own text-cleaning function could go here
            output = lemmatize(process_pipeline, text=line)
            result.extend(output)

        self.tagged_counter += 1
        log(f'{self.tagged_counter} of {self.tagged_max} created, for {round(time() - t, 2)}s')

        return result
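A hedged sketch of the imports the method above appears to depend on; the exact module layout of the original project is an assumption.

import os
import logging
import nltk
from time import time
from ufal.udpipe import Model, Pipeline  # UDPipe bindings providing Model.load and the Pipeline class
# BASE_DIR and the log(...) helper are assumed to be defined elsewhere in the project;
# utils.lemmatize is imported lazily inside the method itself.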
Code example #9
File: Answer.py Project: rostro36/Partisan-Responses
    def create_test(self,verb_dict, verb_list):
        self.change_comma()
        triplets = self.create_oieresult()
        return_text=""
        for sentence in triplets:
            if len(sentence)==0:
                self.parsed.append('str(len(self.phrase_corpus))+" -1"')
                continue
            text=re.sub('\[[^\s]*','',sentence[0]['description'])
            text=re.sub('\]','',text).split()
            tags=[False]*len(sentence[0]['tags'])
            for triplet in sentence:
                arg_points=[x in ['I-ARG0','B-ARG0','I-ARG1','B-ARG1'] for x in triplet['tags']]
                abort=False
                for others in sentence:
                    for place in range(len(others['tags'])):
                        if others['tags'][place][-2:]=='-V' and arg_points[place]:
                            abort=True
                            break
                    if abort:
                        break
                if abort:
                    continue
                subject=' '.join([text[x] for x in range(len(text)) if triplet['tags'][x] in ['I-ARG0','B-ARG0']])
                objekt=' '.join([text[x] for x in range(len(text)) if triplet['tags'][x] in ['I-ARG1','B-ARG1']])
                verb = utils.lemmatize(triplet['verb'])
                if verb in auxillary_verbs:
                    continue
                if len(subject)==0:
                    continue
                if len(objekt)==0:
                    continue
                verb = verb.upper()
                if verb not in verb_dict.keys():
                    print(verb)
                    continue  # if the verb is not in verb_dict it cannot be used to create a triplet
                verb_id=verb_dict[verb]
                max_id=len(self.phrase_corpus)
                subject_id,subject=self.deduplicate(subject)
                if subject_id==max_id:
                    self.parsed.append(str(subject_id))
                tags=[str(subject_id) if triplet['tags'][x] in ['I-ARG0','B-ARG0'] else tags[x] for x in range(len(text))]

                max_id=len(self.phrase_corpus)
                objekt_id,objekt=self.deduplicate(objekt)
                if objekt_id==max_id:
                    self.parsed.append(str(objekt_id))
                tags=[str(objekt_id) if triplet['tags'][x] in ['I-ARG1','B-ARG1'] else tags[x] for x in range(len(text))]
                if (subject,objekt,verb) not in self.triplet:
                    self.triplet.append((subject,objekt,verb))
                    self.triplet_id.append((subject_id,verb_id,objekt_id))
                    self.parsed.append("str(len(self.phrase_corpus)+"+str(len(self.triplet))+")")
            self.parsed.append('str(len(self.phrase_corpus))+" -1"')
            text=['<phrase_'+str(tags[x])+'>' if tags[x] else text[x] for x in range(len(text))]
            text.append(None)
            text=[text[x] for x in range(len(text)-1) if (text[x]!=text[x+1] or text[x][0]!='<')]
            return_text=return_text+' '+' '.join(text)
        return self.phrase_corpus,self.triplet_id,return_text[1:],[eval(x, {"self": self}) for x in self.parsed]
Code example #10
File: leskalgorithm.py Project: anweshm4/ReWordTool
def adapted_lesk(context_sentence, ambiguous_word, \
                pos=None, lemma=True, stem=True, hyperhypo=True, \
                stop=True, context_is_lemmatized=False, \
                nbest=False, keepscore=False, normalizescore=False):
    """
    This function is an implementation of the Adapted Lesk algorithm
    described in Banerjee and Pedersen (2002). It makes use of the lexical
    items from semantically related senses within the WordNet
    hierarchies to generate more lexical items for each sense.
    See www.d.umn.edu/~tpederse/Pubs/cicling2002-b.pdf
    """
    # Ensure that ambiguous word is a lemma.
    ambiguous_word = lemmatize(ambiguous_word)
    # Get the signatures for each synset.
    ss_sign = simple_signature(ambiguous_word, pos, lemma, stem, hyperhypo)
    for ss in ss_sign:
        related_senses = list(set(ss.member_holonyms() + ss.member_meronyms() +
                                 ss.part_meronyms() + ss.part_holonyms() +
                                 ss.similar_tos() + ss.substance_holonyms() +
                                 ss.substance_meronyms()))

        try:
            signature = list([j for j in chain(*[i.lemma_names() for i in \
                      related_senses]) if j not in stopwords.words('english')])
        except:
            signature = list([j for j in chain(*[i.lemma_names for i in \
                      related_senses]) if j not in stopwords.words('english')])
        # Lemmatized context is preferred over stemmed context.
        if lemma == True:
            signature = [lemmatize(i) for i in signature]
        # Matching exact words causes sparsity, so optional matching for stems.
        if stem == True:
            signature = [porter.stem(i) for i in signature]
        # Adds the extended signature to the simple signature of each sense.
        ss_sign[ss] += signature

    # Disambiguate the sense in context.
    if context_is_lemmatized:
        context_sentence = context_sentence.split()
    else:
        context_sentence = lemmatize_sentence(context_sentence)
    best_sense = compare_overlaps(context_sentence, ss_sign, \
                                    nbest=nbest, keepscore=keepscore, \
                                    normalizescore=normalizescore)
    return best_sense
Code example #11
def cosine_lesk(context_sentence, ambiguous_word, \
                pos=None, lemma=True, stem=True, hyperhypo=True, \
                stop=True, context_is_lemmatized=False, \
                nbest=False):
    """ 
    In line with vector space models, we can use cosine to calculate overlaps
    instead of using raw overlap counts. Essentially, the idea of using 
    signatures (aka 'sense paraphrases') is lesk-like.
    """
    # Ensure that ambiguous word is a lemma.
    ambiguous_word = lemmatize(ambiguous_word)
    # If ambiguous word not in WordNet return None
    if not wn.synsets(ambiguous_word):
        return None
    synsets_signatures = simple_signature(ambiguous_word, pos, lemma, stem,
                                          hyperhypo)

    if context_is_lemmatized:
        context_sentence = " ".join(context_sentence.split())
    else:
        context_sentence = " ".join(lemmatize_sentence(context_sentence))

    scores = []
    for ss, signature in synsets_signatures.items():
        # Lowercase and replace "_" with spaces.
        signature = " ".join(map(str, signature)).lower().replace("_", " ")
        # Removes punctuation.
        signature = [i for i in word_tokenize(signature) \
                     if i not in string.punctuation]
        # Optional: remove stopwords.
        if stop:
            signature = [i for i in signature if i not in EN_STOPWORDS]
        # Optional: Lemmatize the tokens.
        if lemma == True:
            signature = [lemmatize(i) for i in signature]
        # Optional: stem the tokens.
        if stem:
            signature = [porter.stem(i) for i in signature]
        scores.append((cos_sim(context_sentence, " ".join(signature)), ss))

    if not nbest:
        return sorted(scores, reverse=True)[0][1]
    else:
        return [(j, i) for i, j in sorted(scores, reverse=True)]
Code example #12
def max_similarity(context_sentence,
                   ambiguous_word,
                   option="path",
                   lemma=True,
                   context_is_lemmatized=False,
                   pos=None,
                   best=True):
    """
    Perform WSD by maximizing the sum of maximum similarity between possible 
    synsets of all words in the context sentence and the possible synsets of the 
    ambiguous word (see http://goo.gl/XMq2BI):
    \argmax_{synset(a)} \sum_{i}^{n} \max_{synset(i)} sim(i, a)
    """
    ambiguous_word = lemmatize(ambiguous_word)
    # If ambiguous word not in WordNet return None
    if not wn.synsets(ambiguous_word):
        return None
    if context_is_lemmatized:
        context_sentence = word_tokenize(context_sentence)
    else:
        context_sentence = [
            lemmatize(w) for w in word_tokenize(context_sentence)
        ]
    result = {}
    for i in wn.synsets(ambiguous_word):
        try:
            if pos and pos != str(i.pos()):
                continue
        except:
            if pos and pos != str(i.pos):
                continue
        result[i] = sum(max([sim(i,k,option) for k in wn.synsets(j)]+[0]) \
                        for j in context_sentence)

    if option in ["res", "resnik"]:  # lower score = more similar
        result = sorted([(v, k) for k, v in result.items()])
    else:  # higher score = more similar
        result = sorted([(v, k) for k, v in result.items()], reverse=True)
    if best: return result[0][1]
    return result
Code example #13
def simple_baseline_similarity(s1, s2):
    """
    Find the sequence similarity between two sentences, considering both raw words and lemmas
    """
    # Tokenize by sentences into words in lower case
    tokenized_sentence_1 = nltk.word_tokenize(s1.lower())
    tokenized_sentence_2 = nltk.word_tokenize(s2.lower())

    tagged_sentence_1 = pos_tag(
        tokenized_sentence_1)  # [ (word, POS_TAG), ...]
    tagged_sentence_2 = pos_tag(
        tokenized_sentence_2)  # [ (word, POS_TAG), ...]

    # Filter stopwords on the word itself (tagged_word is a (word, POS_TAG) tuple).
    lemmas_sentence_1 = [
        lemmatize(tagged_word, wnl) for tagged_word in tagged_sentence_1
        if tagged_word[0] not in stop_words
    ]
    lemmas_sentence_2 = [
        lemmatize(tagged_word, wnl) for tagged_word in tagged_sentence_2
        if tagged_word[0] not in stop_words
    ]  # [LEMMA_1, ...]

    word_seq_match = difflib.SequenceMatcher(None, tokenized_sentence_1,
                                             tokenized_sentence_2)
    word_match = word_seq_match.find_longest_match(0,
                                                   len(tokenized_sentence_1),
                                                   0,
                                                   len(tokenized_sentence_2))

    lemm_seq_match = difflib.SequenceMatcher(None, lemmas_sentence_1,
                                             lemmas_sentence_2)
    lemm_match = lemm_seq_match.find_longest_match(0, len(lemmas_sentence_1),
                                                   0, len(lemmas_sentence_2))

    word_sim = word_match.size / (
        max(len(tokenized_sentence_1), len(tokenized_sentence_2)) + 0.001)
    lemm_sim = lemm_match.size / (
        max(len(lemmas_sentence_1), len(lemmas_sentence_2)) + 0.001)

    return word_sim, lemm_sim
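A hedged usage sketch; it assumes the module also defines wnl (e.g. an NLTK WordNetLemmatizer), stop_words, pos_tag, and the lemmatize(tagged_word, wnl) helper used above.

# Illustrative call; the two sentences are made up.
word_sim, lemma_sim = simple_baseline_similarity(
    "The cats are sleeping on the couch.",
    "A cat sleeps on the sofa.")
print(round(word_sim, 2), round(lemma_sim, 2))  # two ratios, each in [0, 1]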
Code example #14
def lemmatize_answers(df):
    """
    Given a pandas dataframe that contains a column called answer,
    adds a lemmatized_answer column with the lemmatized text and returns the dataframe

    :param df: a pandas dataframe
    """

    # lemmas = [lemmatize(speech) for speech in tqdm(df['answer'])]
    tqdm.pandas()
    lemmas = df['answer'].progress_apply(lambda x: lemmatize(x))
    df['lemmatized_answer'] = lemmas

    return df
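A hedged usage sketch with a toy dataframe; it assumes a lemmatize(text) helper that accepts a raw string, as used above, and that tqdm was imported with "from tqdm import tqdm".

import pandas as pd

df = pd.DataFrame({"answer": ["The cats were running fast", "She has eaten already"]})
df = lemmatize_answers(df)
print(df[["answer", "lemmatized_answer"]])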
Code example #15
File: lesk.py Project: peterdm/pywsd
def simple_signature(ambiguous_word, pos=None, lemma=True, stem=False, \
                     hyperhypo=True, stop=True):
    """ 
    Returns a synsets_signatures dictionary that includes signature words of a 
    sense from its:
    (i)   definition
    (ii)  example sentences
    (iii) hypernyms and hyponyms
    """
    synsets_signatures = {}
    for ss in wn.synsets(ambiguous_word):
        # If POS is specified.
        try:
            if pos and str(ss.pos()) != pos:
                continue
        except:
            if pos and str(ss.pos) != pos:
                continue
        
        signature = []
        # Includes definition.
        try: signature+= ss.definition().split()
        except: signature+= ss.definition.split()
        # Includes examples
        try: signature+= list(chain(*[i.split() for i in ss.examples()]))
        except: signature+= list(chain(*[i.split() for i in ss.examples]))
        # Includes lemma_names.
        try: signature+= ss.lemma_names()
        except: signature+= ss.lemma_names
        # Optional: includes lemma_names of hypernyms and hyponyms.
        if hyperhypo == True:
            try: signature+= list(chain(*[i.lemma_names() for i \
                                          in ss.hypernyms()+ss.hyponyms()]))
            except: signature+= list(chain(*[i.lemma_names for i \
                                             in ss.hypernyms()+ss.hyponyms()]))
        # Optional: removes stopwords.
        if stop == True:
            signature = [i for i in signature if i not in stopwords.words('english')]
        # Lemmatized context is preferred over stemmed context
        if lemma == True:
            signature = [lemmatize(i) for i in signature]
        # Matching exact words causes sparsity, so optional matching for stems.
        if stem == True: 
            signature = [porter.stem(i) for i in signature]

        signature = [i.lower() for i in signature]

        synsets_signatures[ss] = signature
    
    return synsets_signatures
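A hedged sketch of inspecting the dictionary returned by the function above; ss.name() assumes NLTK 3.x synsets, and the output is illustrative.

# Each key is a WordNet synset of "bank"; each value is its list of signature words.
for ss, sig in simple_signature("bank", pos="n").items():
    print(ss.name(), sig[:5])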
Code example #16
def original_lesk(context_sentence, ambiguous_word, dictionary=None):
    """
    This function is the implementation of the original Lesk algorithm (1986).
    It requires a dictionary which contains the definition of the different
    sense of each word. See http://dl.acm.org/citation.cfm?id=318728
    """
    ambiguous_word = lemmatize(ambiguous_word)
    if not dictionary:  # If dictionary is not provided, use the WN definition.
        dictionary = {}
        for ss in wn.synsets(ambiguous_word):
            ss_definition = synset_properties(ss, 'definition')
            dictionary[ss] = ss_definition
    best_sense = compare_overlaps_greedy(context_sentence.split(), dictionary)
    return best_sense
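A hedged usage sketch; when no dictionary is passed, the WordNet definitions are used as the glosses. The sentence is illustrative.

sense = original_lesk("I went to the bank to deposit my money", "bank")
print(sense)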
Code example #17
    def clusterSentence(self, sentence):
        """
        Clusters the given sentence into an existing cluster or creates a
        new cluster.
        sentence - sentence to be clustered
        """
        words = utils.tokenize(sentence.lower())
        lems = utils.lemmatize(words)
        terms = utils.filterStopWords(lems)
        tf = dict(Counter(terms))
        self.clusterize(tf, sentence)

        # Every time a new sentence is clusterized, save the latest clusters
        self.saveClusters()
Code example #18
File: lesk.py Project: shreyg/GitFiles
def original_lesk(context_sentence, ambiguous_word, dictionary=None):
    """
    This function is the implementation of the original Lesk algorithm (1986).
    It requires a dictionary which contains the definition of the different
    sense of each word. See http://dl.acm.org/citation.cfm?id=318728
    """
    ambiguous_word = lemmatize(ambiguous_word)
    if not dictionary: # If dictionary is not provided, use the WN definition.
        dictionary = {}
        for ss in wn.synsets(ambiguous_word):
            ss_definition = synset_properties(ss, 'definition')
            dictionary[ss] = ss_definition
    best_sense = compare_overlaps_greedy(context_sentence.split(), dictionary)
    return best_sense    
Code example #19
def lemmas_similarity(s1, s2, filter_stop_words=True):
    """
    Jaccard similarity of the lemmatized sentences
    """
    # Tokenize by sentences into words in lower case
    tokenized_sentence_1 = nltk.word_tokenize(s1.lower())
    tokenized_sentence_2 = nltk.word_tokenize(s2.lower())

    if filter_stop_words:
        tokenized_sentence_1 = [
            token for token in tokenized_sentence_1 if token not in stop_words
        ]
        tokenized_sentence_2 = [
            token for token in tokenized_sentence_2 if token not in stop_words
        ]

    tagged_sentence_1 = pos_tag(
        tokenized_sentence_1)  # [ (word, POS_TAG), ...]
    tagged_sentence_2 = pos_tag(
        tokenized_sentence_2)  # [ (word, POS_TAG), ...]

    lemmas_sentence_1 = [
        lemmatize(tagged_word, wnl) for tagged_word in tagged_sentence_1
    ]
    lemmas_sentence_2 = [
        lemmatize(tagged_word, wnl) for tagged_word in tagged_sentence_2
    ]  # [LEMMA_1, ...]

    # Compute similarity
    if len(lemmas_sentence_1) > 0 and len(lemmas_sentence_2) > 0:
        similarity = 1 - jaccard_distance(set(lemmas_sentence_1),
                                          set(lemmas_sentence_2))
        # Compute label of similarity
        return similarity
    else:
        return 0
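A hedged usage sketch; it assumes pos_tag, wnl, stop_words, and the lemmatize(tagged_word, wnl) helper are defined in the module as used above.

score = lemmas_similarity("The cats are sleeping on the couch.",
                          "A cat sleeps on the sofa.")
print(round(score, 2))  # Jaccard similarity of the two lemma sets, in [0, 1]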
Code example #20
def simple_signature(ambiguous_word, pos=None, lemma=True, stem=False, \
                     hyperhypo=True, stop=True):
    """ 
    Returns a synsets_signatures dictionary that includes signature words of a 
    sense from its:
    (i)   definition
    (ii)  example sentences
    (iii) hypernyms and hyponyms
    """
    synsets_signatures = {}
    for ss in wn.synsets(ambiguous_word):
        try:  # If POS is specified.
            if pos and str(ss.pos()) != pos:
                continue
        except:
            if pos and str(ss.pos) != pos:
                continue
        signature = []
        # Includes definition.
        ss_definition = synset_properties(ss, 'definition')
        signature += ss_definition.split()
        # Includes examples
        ss_examples = synset_properties(ss, 'examples')
        signature += list(chain(*[i.split() for i in ss_examples]))
        # Includes lemma_names.
        ss_lemma_names = synset_properties(ss, 'lemma_names')
        signature += ss_lemma_names

        # Optional: includes lemma_names of hypernyms and hyponyms.
        if hyperhypo == True:
            ss_hyponyms = synset_properties(ss, 'hyponyms')
            ss_hypernyms = synset_properties(ss, 'hypernyms')
            ss_hypohypernyms = ss_hypernyms + ss_hyponyms
            signature += list(
                chain(*[i.lemma_names() for i in ss_hypohypernyms]))

        # Optional: removes stopwords.
        if stop == True:
            signature = [i for i in signature if i not in EN_STOPWORDS]
        # Lemmatized context is preferred over stemmed context.
        if lemma == True:
            signature = [lemmatize(i) for i in signature]
        # Matching exact words may cause sparsity, so optional matching for stems.
        if stem == True:
            signature = [porter.stem(i) for i in signature]
        synsets_signatures[ss] = signature

    return synsets_signatures
Code example #21
File: lesk.py Project: soloice/pywsd
def original_lesk(context_sentence, ambiguous_word, dictionary=None):
    """
    This function is the implementation of the original Lesk algorithm (1986).
    It requires a dictionary which contains the definition of the different
    sense of each word. See http://goo.gl/8TB15wb
    """
    ambiguous_word = lemmatize(ambiguous_word)
    # If dictionary is not provided, use the WN definition.
    if not dictionary:
        dictionary = {}
        for ss in wn.synsets(ambiguous_word):
            try: ss_definition = ss.definition().split()
            except: ss_definition = ss.definition.split()
            dictionary[ss] = ss_definition
    best_sense = compare_overlaps_greedy(context_sentence.split(), dictionary)
    return best_sense    
Code example #22
File: lesk.py Project: shreyg/GitFiles
def simple_signature(ambiguous_word, pos=None, lemma=True, stem=False, \
                     hyperhypo=True, stop=True):
    """ 
    Returns a synsets_signatures dictionary that includes signature words of a 
    sense from its:
    (i)   definition
    (ii)  example sentences
    (iii) hypernyms and hyponyms
    """
    synsets_signatures = {}
    for ss in wn.synsets(ambiguous_word):
        try: # If POS is specified.
            if pos and str(ss.pos()) != pos:
                continue
        except:
            if pos and str(ss.pos) != pos:
                continue
        signature = []
        # Includes definition.
        ss_definition = synset_properties(ss, 'definition')
        signature += ss_definition.split()
        # Includes examples
        ss_examples = synset_properties(ss, 'examples')
        signature+=list(chain(*[i.split() for i in ss_examples]))
        # Includes lemma_names.
        ss_lemma_names = synset_properties(ss, 'lemma_names')
        signature+= ss_lemma_names
        
        # Optional: includes lemma_names of hypernyms and hyponyms.
        if hyperhypo == True:
            ss_hyponyms = synset_properties(ss, 'hyponyms')
            ss_hypernyms = synset_properties(ss, 'hypernyms')
            ss_hypohypernyms = ss_hypernyms+ss_hyponyms
            signature+= list(chain(*[i.lemma_names() for i in ss_hypohypernyms]))
        
        # Optional: removes stopwords.
        if stop == True: 
            signature = [i for i in signature if i not in EN_STOPWORDS]
        # Lemmatized context is preferred over stemmed context.
        if lemma == True: 
            signature = [lemmatize(i) for i in signature]
        # Matching exact words may cause sparsity, so optional matching for stems.
        if stem == True: 
            signature = [porter.stem(i) for i in signature]
        synsets_signatures[ss] = signature
        
    return synsets_signatures
Code example #23
File: classif.py Project: ArnaudBu/text-processing
def cl(s):
    # return rmv_smol_wds(rmv_digits(rmv_stp(cln(s))))
    a = clean(s,
              fix_unicode=True,
              to_ascii=True,
              lower=True,
              no_line_breaks=True,
              no_urls=True,
              no_emails=True,
              no_phone_numbers=True,
              no_numbers=True,
              no_emoji=True,
              replace_with_url=" ",
              replace_with_email=" ",
              replace_with_phone_number=" ",
              replace_with_number=" ",
              lang="en")
    return rmv_smol_wds(lemmatize(rmv_stp(a)))
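A hedged usage sketch; it assumes clean comes from the cleantext package and that rmv_smol_wds, rmv_stp, and lemmatize are the project's own helpers.

raw = "Visit https://example.com or write to foo@bar.com!! Call +1 555 0100 :)"
print(cl(raw))  # lowercased ASCII text with URLs, emails, phone numbers and digits stripped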