import re
import nltk

def extractWordPieces():
    # Vowels in a single word.
    word = 'supercalifragilisticexpialidocious'
    print(re.findall(r'[aeiou]', word))
    print(len(re.findall(r'[aeiou]', word)))

    # Frequency of vowel sequences of length two or more in the WSJ corpus.
    wsj = sorted(set(nltk.corpus.treebank.words()))
    fd = nltk.FreqDist(vs for word in wsj
                       for vs in re.findall(r'[aeiou]{2,}', word))
    print(fd.items())

    # Keep initial vowel sequences, final vowel sequences, and all
    # consonants; drop word-internal vowels.
    regexp = r'^[AEIOUaeiou]+|[AEIOUaeiou]+$|[^AEIOUaeiou]'

    def compress(word):
        pieces = re.findall(regexp, word)
        return ''.join(pieces)

    english_udhr = nltk.corpus.udhr.words('English-Latin1')
    print(nltk.tokenwrap(compress(w) for w in english_udhr[:75]))

    # Consonant-vowel sequences in the Rotokas lexicon.
    rotokas_words = nltk.corpus.toolbox.words('rotokas.dic')
    cvs = [cv for w in rotokas_words
           for cv in re.findall(r'[ptksvr][aeiou]', w)]
    cfd = nltk.ConditionalFreqDist(cvs)
    cfd.tabulate()

    # Index each CV pair back to the words that contain it.
    cv_word_pairs = [(cv, w) for w in rotokas_words
                     for cv in re.findall(r'[ptksvr][aeiou]', w)]
    cv_index = nltk.Index(cv_word_pairs)
    print(cv_index['su'])
    print(cv_index['po'])
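# A hand-checkable sketch of the CV-pair analysis above, using made-up
# words rather than the Rotokas lexicon (illustrative data, not from the
# corpus):
demo_words = ['kasuari', 'supaka', 'kaakau']
demo_cvs = [cv for w in demo_words for cv in re.findall(r'[ptksvr][aeiou]', w)]
nltk.ConditionalFreqDist(demo_cvs).tabulate()  # rows: consonants, columns: vowels
demo_index = nltk.Index((cv, w) for w in demo_words
                        for cv in re.findall(r'[ptksvr][aeiou]', w))
print(demo_index['su'])  # ['kasuari', 'supaka']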
def Analysis(corpus):
    # Tokenize, randomSentence, FreqDists, and Przm are helpers assumed
    # to be defined elsewhere in this module.
    raw = open(corpus, 'r').read()
    words, sents, text = Tokenize(raw)
    random_sentence = randomSentence(sents)
    fdist, cfd, top_words = FreqDists(words)
    print(nltk.tokenwrap(Przm(w) for w in words[:20]))
    print(fdist)
    cfd.tabulate()
def regex_compress():
    # Keep initial/final vowel sequences and consonants; drop internal vowels.
    regexp = r'^[AEIOUaeiou]+|[AEIOUaeiou]+$|[^AEIOUaeiou]'

    def compress(word):
        pieces = re.findall(regexp, word)
        return ''.join(pieces)

    english_udhr = nltk.corpus.udhr.words('English-Latin1')
    return nltk.tokenwrap(compress(w) for w in english_udhr[:75])
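# What the vowel-dropping regexp actually produces, worked out by hand on
# two sample words (not output from the UDHR run above):
sample = r'^[AEIOUaeiou]+|[AEIOUaeiou]+$|[^AEIOUaeiou]'
print(''.join(re.findall(sample, 'declaration')))  # dclrtn -- internal vowels dropped
print(''.join(re.findall(sample, 'Universal')))    # Unvrsl -- initial vowel kept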
def generate(self, length=100):
    """Generate text from an n-gram model of randomly chosen order."""
    # Re-seed the token stream from a randomly picked word of the source text.
    self.tokens = nltk.word_tokenize(
        self.__words[randint(1, len(self.__words)) - 1])
    # Lidstone smoothing with a random gamma in [0, 1).
    estimator = lambda fdist, bins: nltk.LidstoneProbDist(
        fdist, self.__random.random())
    # estimator = lambda fdist, bins: nltk.LidstoneProbDist(fdist, 0.2)
    # Note: nltk.NgramModel exists only in NLTK 2.x; it was removed in NLTK 3.
    self._trigram_model = nltk.NgramModel(self.__random.randint(3, 15),
                                          self, estimator)
    # self._trigram_model = nltk.NgramModel(3, self, estimator)
    text = self._trigram_model.generate(length)
    return nltk.tokenwrap(text)
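# Since nltk.NgramModel is gone in NLTK 3, a rough modern equivalent uses
# the nltk.lm package -- a minimal sketch, assuming `sents` is a list of
# pre-tokenized sentences (not the original class's API):
from nltk.lm import MLE
from nltk.lm.preprocessing import padded_everygram_pipeline

def train_lm(sents, order=3):
    train, vocab = padded_everygram_pipeline(order, sents)
    lm = MLE(order)
    lm.fit(train, vocab)
    return lm

# lm = train_lm([['a', 'b', 'b'], ['b', 'a', 'a']])
# print(nltk.tokenwrap(lm.generate(20, random_seed=42)))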
nacute = '\u0144'  # LATIN SMALL LETTER N WITH ACUTE (definition missing from this excerpt)
nacute_utf = nacute.encode('utf8')
print(repr(nacute_utf))

# Regular expressions
import re
wordlist = [w for w in nltk.corpus.words.words('en') if w.islower()]
[w for w in wordlist if re.search('ed$', w)]
[w for w in wordlist if re.search('^..j..t..$', w)]
[w for w in wordlist if re.search('^[ghi][mno][jlk][def]$', w)]

regexp = r'^[AEIOUaeiou]+|[AEIOUaeiou]+$|[^AEIOUaeiou]'

def compress(word):
    pieces = re.findall(regexp, word)
    return ''.join(pieces)

english_udhr = nltk.corpus.udhr.words('English-Latin1')
print(nltk.tokenwrap(compress(w) for w in english_udhr[:75]))

# Stemming
def stem(word):
    for suffix in ['ing', 'ly', 'ed', 'ious', 'ies', 'ive', 'es', 's', 'ment']:
        if word.endswith(suffix):
            return word[:-len(suffix)]
    return word

re.findall(r'^.*(ing|ly|ed|ious|ies|ive|es|s|ment)$', 'processing')
re.findall(r'^.*(?:ing|ly|ed|ious|ies|ive|es|s|ment)$', 'processing')
re.findall(r'^(.*)(ing|ly|ed|ious|ies|ive|es|s|ment)$', 'processing')
# Non-greedy
re.findall(r'^(.*?)(ing|ly|ed|ious|ies|ive|es|s|ment)$', 'processes')

def stem(word):
    regexp = r'^(.*?)(ing|ly|ed|ious|ies|ive|es|s|ment)?$'
    stem, suffix = re.findall(regexp, word)[0]
    return stem
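# Greedy vs. non-greedy capture, verified by hand on 'processes':
re.findall(r'^(.*)(ing|ly|ed|ious|ies|ive|es|s|ment)$', 'processes')
# -> [('processe', 's')]   (.*) grabs as much as it can
re.findall(r'^(.*?)(ing|ly|ed|ious|ies|ive|es|s|ment)$', 'processes')
# -> [('process', 'es')]   (.*?) yields the shortest stem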
def compress_vowels():
    # Keep initial vowel sequences, final vowel sequences, and consonants;
    # word-internal vowels are removed.
    regex = r"^[AEIOUaeiou]+|[AEIOUaeiou]+$|[^AEIOUaeiou]"
    english_udhr = nltk.corpus.udhr.words("English-Latin1")
    print(nltk.tokenwrap([compress(regex, w) for w in english_udhr[:75]]))
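# The two-argument compress() used above is not defined in these snippets;
# a minimal sketch consistent with the call site (an assumption, not the
# original helper):
def compress(pattern, word):
    return ''.join(re.findall(pattern, word))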
raw.rfind('End of Project')  # `raw` is assumed loaded earlier (a Project Gutenberg text)

# Regular expressions
import re
wordlist = [w for w in nltk.corpus.words.words('en') if w.islower()]
[w for w in wordlist if re.search('ed$', w) and len(w) == 3]
[w for w in wordlist if re.search('ed$', w) and len(w) == 4]
[w for w in wordlist if re.search('^..j..t..$', w)]
[w for w in wordlist if re.search('^[abdc][efgh][ijkl]$', w)]

regexp = r'^[AEIOUaeiou]+|[AEIOUaeiou]+$|[^AEIOUaeiou]'

def compress(word):
    pieces = re.findall(regexp, word)
    return ''.join(pieces)

english_udhr = nltk.corpus.udhr.words('English-Latin1')
print(nltk.tokenwrap(compress(w) for w in english_udhr[:100]))

# Suffixes and stemming
re.findall(r'^.*(ing|ly|ed|ious|ive|es|s|ment)$', 'processing')
re.findall(r'^.*(?:ing|ly|ed|ious|ive|es|s|ment)$', 'processing')
re.findall(r'^(.*)(ing|ly|ed|ious|ive|es|s|ment)$', 'processes')
re.findall(r'^(.*?)(ing|ly|ed|ious|ive|es|s|ment)?$', 'processes')

def stem(word):
    regexp = r'^(.*?)(ing|ly|ed|ious|ive|es|s|ment)?$'
    stem, suffix = re.findall(regexp, word)[0]
    return stem

stem('processing')

# Searching tokenized text
moby = nltk.Text(nltk.corpus.gutenberg.words('melville-moby_dick.txt'))
moby.findall(r"<a>(<.*>)<man>")
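# Sanity checks for stem(), with outputs worked out by hand:
print(stem('processing'))  # process -- (.*?) stops at the 'ing' suffix
print(stem('processes'))   # process -- 'es' wins over a bare trailing 's'
print(stem('language'))    # language -- no listed suffix, word kept whole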
def testCompress():
    # Relies on a one-argument compress() helper defined elsewhere.
    english_udhr = nltk.corpus.udhr.words('English-Latin1')
    print(nltk.tokenwrap(compress(w) for w in english_udhr[:75]))
def lossy_compression(self):
    # Keep initial/final vowel sequences and consonants; drop internal vowels.
    pattern = '^[AEIOUaeiou]+|[AEIOUaeiou]+$|[^AEIOUaeiou]'
    print(nltk.tokenwrap(self.compress(pattern, w) for w in self.wordlist))