Code example #1
import copy
import os
import re

from pywsd.allwords_wsd import disambiguate
from pywsd.lesk import adapted_lesk
from pywsd.utils import lemmatize
from wordfreq import zipf_frequency

# Assumed to be defined at module level (not shown in the original snippet):
#   st        - an NER tagger exposing .tag(tokens), e.g. NLTK's StanfordNERTagger
#   stops     - a collection of stopwords to drop
#   zipf_freq - an iterable of Zipf-frequency thresholds; the zipf= keyword below
#               suggests a locally modified disambiguate()


def wsd(summpath, wsdpath):
    summpathlist = os.listdir(summpath)
    for idp, dirpath in enumerate(summpathlist[0:1]):
        print(idp)
        comppath = summpath + dirpath + '/'
        comppathlist = os.listdir(comppath)
        for idc, compathdir in enumerate(comppathlist):
            fpath = comppath + compathdir
            with open(fpath, 'r', encoding='utf-8') as f:
                text = f.readlines()
            # Strip surrounding whitespace and drop very short lines.
            text = [re.sub(r'^\s+|\s+$', '', x) for x in text if len(x) > 3]
            nertext = []
            sentencewsd = copy.deepcopy(text)
            for ids, sentence in enumerate(text):
                word = sentence.split()
                word = [re.sub(r'[\,]', '', x) for x in word]
                word = [re.sub(r'(?<=\w)[\.]', '', x) for x in word]
                # Keep only tokens that are neither named entities nor stopwords.
                ner = st.tag(word)
                ner = [x[0] for x in ner if x[1] == 'O']
                ner = [x for x in ner if x not in stops]
                nertext.append(' '.join(ner))
            for zipf in zipf_freq:
                textwsd = []
                for ids, sentence in enumerate(nertext):
                    ambiguity = disambiguate(sentence, adapted_lesk, keepLemmas=True, zipf=zipf)
                    for idy, syn in enumerate(ambiguity):
                        if syn[2] is not None:
                            # Rank the synset's lemmas by Zipf frequency, most frequent first.
                            syn_lemma = syn[2].lemma_names()
                            syn_lemma = [[zipf_frequency(x, 'en'), x] for x in syn_lemma]
                            syn_lemma = sorted(syn_lemma, reverse=True)
                            # If no lemma has a known frequency, fall back to the shortest one.
                            if syn_lemma[0][0] == 0:
                                syn_lemma = [[len(x[1]), x[1]] for x in syn_lemma]
                                syn_lemma = sorted(syn_lemma, reverse=False)
                            # Replace the surface word with the chosen lemma when they differ.
                            if lemmatize(syn[0].lower()) != syn_lemma[0][1]:
                                sentencewsd[ids] = re.sub(re.escape(syn[0]), syn_lemma[0][1],
                                                          sentencewsd[ids])
                    textwsd.append(sentencewsd[ids])
                outDirectory = wsdpath + dirpath + '/'
                if not os.path.exists(outDirectory):
                    os.makedirs(outDirectory)
                # Lines were stripped of their newlines above, so add them back on write.
                with open(outDirectory + str(zipf) + '-' + compathdir, 'w',
                          encoding='utf-8') as fout:
                    fout.writelines(line + '\n' for line in textwsd)
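
The snippet above never defines st, stops, or zipf_freq. A minimal sketch of how they could be supplied and how wsd() might be called, assuming NLTK's StanfordNERTagger and stopword list; the model paths, thresholds, and directory names are illustrative placeholders, not taken from the original project:

from nltk.corpus import stopwords
from nltk.tag.stanford import StanfordNERTagger

# Hypothetical setup: model/jar paths and thresholds are placeholders.
st = StanfordNERTagger('english.all.3class.distsim.crf.ser.gz',
                       'stanford-ner.jar', encoding='utf-8')
stops = set(stopwords.words('english'))
zipf_freq = [3.0, 4.0, 5.0]  # Zipf-frequency thresholds to sweep

wsd('summaries/', 'wsd-output/')  # both paths are expected to end with '/'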
Code example #2
File: test_lesk_speed.py  Project: alvations/pywsd
# Copyright (C) 2014-2015 alvations
# URL:
# For license information, see LICENSE.md

from __future__ import print_function

import time
from nltk.corpus import brown

from pywsd.lesk import simple_lesk, original_lesk, cosine_lesk, adapted_lesk
from pywsd.allwords_wsd import disambiguate

print("======== TESTING all-words lesk (`from_cache=True`)===========")
start = time.time()
for sentence in brown.sents()[:10]:
    sentence = " ".join(sentence)
    disambiguate(sentence, simple_lesk, prefersNone=True, keepLemmas=True)
    disambiguate(sentence, original_lesk)
    disambiguate(sentence, adapted_lesk, keepLemmas=True)
print('Disambiguating 10 brown sentences took {} secs'.format(time.time() - start))


print("======== TESTING all-words lesk (`from_cache=False`)===========")
start = time.time()
for sentence in brown.sents()[:10]:
    sentence = " ".join(sentence)
    disambiguate(sentence, simple_lesk, prefersNone=True, keepLemmas=True, from_cache=False)
    disambiguate(sentence, original_lesk, from_cache=False)
    disambiguate(sentence, adapted_lesk, keepLemmas=True, from_cache=False)
print('Disambiguating 10 brown sentences took {} secs'.format(time.time() - start))
Code example #3
from nltk import word_tokenize
from nltk.corpus import wordnet as wn
from nltk.corpus import brown, stopwords


from pywsd.lesk import simple_lesk, original_lesk, cosine_lesk, adapted_lesk
from pywsd.similarity import max_similarity
from pywsd.utils import lemmatize
from pywsd.allwords_wsd import disambiguate

print "======== TESTING all-words lesk ===========\n"
for sentence in brown.sents()[:10]:
    # Retrieves a tokenized text from brown corpus.
    sentence = " ".join(sentence)
    # Annotate the full sentence.
    print disambiguate(sentence, simple_lesk, prefersNone=True, keepLemmas=True)
    print disambiguate(sentence, original_lesk)
    print disambiguate(sentence, adapted_lesk, keepLemmas=True)
    print disambiguate(sentence, cosine_lesk, prefersNone=True)
    print
print

print "======== TESTING all-words path maxsim ===========\n"
print "This is going to take some time, have some coffee...\n"
for sentence in brown.sents()[0:1]:
    # Retrieves a tokenized text from brown corpus.
    sentence = " ".join(sentence)
    # Annotate the full sentence.
    print disambiguate(sentence, max_similarity, similarity_option='path')
    print disambiguate(sentence, max_similarity, similarity_option='wup')
print
Code example #4
from nltk import word_tokenize

from nltk.corpus import brown, stopwords

from pywsd.lesk import simple_lesk, original_lesk, cosine_lesk, adapted_lesk
from pywsd.similarity import max_similarity
from pywsd.utils import lemmatize
from pywsd.allwords_wsd import disambiguate

print("======== TESTING all-words lesk ===========\n")
for sentence in brown.sents()[:10]:
    # Retrieves a tokenized text from brown corpus.
    sentence = " ".join(sentence)
    # Annotate the full sentence.
    print(disambiguate(sentence, simple_lesk, prefersNone=True, keepLemmas=True))
    print(disambiguate(sentence, original_lesk))
    print(disambiguate(sentence, adapted_lesk, keepLemmas=True))
    print(disambiguate(sentence, cosine_lesk, prefersNone=True))
    print()
print()

print("======== TESTING all-words path maxsim ===========\n")
print("This is going to take some time, have some coffee...\n")
for sentence in brown.sents()[0:1]:
    # Retrieves a tokenized text from brown corpus.
    sentence = " ".join(sentence)
    # Annotate the full sentence.
    print(disambiguate(sentence, max_similarity, similarity_option='path'))
    print(disambiguate(sentence, max_similarity, similarity_option='wup'))
print()
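
The scripts above only print the raw result of disambiguate(). As a rough sketch of consuming it programmatically: with keepLemmas=True the function is expected to return (surface word, lemma, synset-or-None) triples, which is also what code example #1 relies on; the sentence below is just an illustration.

from pywsd.allwords_wsd import disambiguate
from pywsd.lesk import adapted_lesk

sent = "The bank approved the loan after reviewing the river bank survey."
# keepLemmas=True is assumed to yield (word, lemma, synset-or-None) triples.
for word, lemma, synset in disambiguate(sent, adapted_lesk, keepLemmas=True):
    if synset is not None:
        print(word, lemma, synset.name(), synset.definition())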
Code example #5
File: consistency_test.py  Project: alvations/pywsd
from string import punctuation

from nltk import word_tokenize, pos_tag
from nltk.corpus import wordnet as wn
from nltk.corpus import brown, stopwords

from pywsd.lesk import simple_lesk, original_lesk, cosine_lesk, adapted_lesk
from pywsd.similarity import max_similarity
from pywsd.utils import lemmatize, penn2morphy
from pywsd.allwords_wsd import disambiguate

"""
This module is to test for consistency between using the dismabiguate() and
individually calling wsd functions.
"""

for sentence in brown.sents()[:100]:
    # Retrieves a tokenized text from brown corpus.
    sentence = " ".join(sentence)
    # Uses POS info when WSD-ing.
    _, poss = zip(*pos_tag(word_tokenize(sentence)))
    tagged_sent = disambiguate(sentence, prefersNone=True, keepLemmas=True)

    for word_lemma_semtag, pos in zip(tagged_sent, poss):
        word, lemma, semtag = word_lemma_semtag
        if semtag is not None:
            # Changes POS to morphy POS
            pos = penn2morphy(pos, returnNone=True)
            # WSD on lemma
            assert simple_lesk(sentence, lemma, pos=pos) == semtag
Code example #6
# For license information, see LICENSE.md

from string import punctuation

from nltk import word_tokenize, pos_tag
from nltk.corpus import wordnet as wn
from nltk.corpus import brown, stopwords

from pywsd.lesk import simple_lesk, original_lesk, cosine_lesk, adapted_lesk
from pywsd.similarity import max_similarity
from pywsd.utils import lemmatize, penn2morphy
from pywsd.allwords_wsd import disambiguate
"""
This module is to test for consistency between using the dismabiguate() and
individually calling wsd functions.
"""

for sentence in brown.sents()[:100]:
    # Retrieves a tokenized text from brown corpus.
    sentence = " ".join(sentence)
    # Uses POS info when WSD-ing.
    _, poss = zip(*pos_tag(word_tokenize(sentence)))
    tagged_sent = disambiguate(sentence, prefersNone=True, keepLemmas=True)

    for word_lemma_semtag, pos in zip(tagged_sent, poss):
        word, lemma, semtag = word_lemma_semtag
        if semtag is not None:
            # Changes POS to morphy POS
            pos = penn2morphy(pos, returnNone=True)
            # WSD on lemma
            assert simple_lesk(sentence, lemma, pos=pos) == semtag
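
The consistency check above depends on penn2morphy() turning Penn Treebank tags into the POS codes that simple_lesk() accepts. A small sketch that prints the conversion, assuming NLTK's default tokenizer and tagger models are installed; the sentence is arbitrary:

from nltk import pos_tag, word_tokenize
from pywsd.utils import penn2morphy

# Print each token with its Penn Treebank tag and its morphy-style POS;
# returnNone=True gives None for tags with no WordNet counterpart.
for word, penn in pos_tag(word_tokenize("The dogs barked loudly")):
    print(word, penn, penn2morphy(penn, returnNone=True))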
Code example #7
# URL:
# For license information, see LICENSE.md

from __future__ import print_function

import time
from nltk.corpus import brown

from pywsd.lesk import simple_lesk, original_lesk, cosine_lesk, adapted_lesk
from pywsd.allwords_wsd import disambiguate

print("======== TESTING all-words lesk (`from_cache=True`)===========")
start = time.time()
for sentence in brown.sents()[:10]:
    sentence = " ".join(sentence)
    disambiguate(sentence, simple_lesk, prefersNone=True, keepLemmas=True)
    disambiguate(sentence, original_lesk)
    disambiguate(sentence, adapted_lesk, keepLemmas=True)
print('Disambiguating 10 brown sentences took {} secs'.format(time.time() - start))

print("======== TESTING all-words lesk (`from_cache=False`)===========")
start = time.time()
for sentence in brown.sents()[:10]:
    sentence = " ".join(sentence)
    disambiguate(sentence,
                 simple_lesk,
                 prefersNone=True,
                 keepLemmas=True,
                 from_cache=False)
    disambiguate(sentence, original_lesk, from_cache=False)