Example no. 1
import re
import nltk

def extractWordPieces():

    word = 'supercalifragilisticexpialidocious'
    print(re.findall(r'[aeiou]', word))       # every vowel occurrence
    print(len(re.findall(r'[aeiou]', word)))  # number of vowels

    # sequences of two or more vowels in the Wall Street Journal sample
    wsj = sorted(set(nltk.corpus.treebank.words()))
    fd = nltk.FreqDist(vs for word in wsj
                       for vs in re.findall(r'[aeiou]{2,}', word))
    print(fd.most_common())

    # keep initial vowel sequences, final vowel sequences, and all consonants
    regexp = r'^[AEIOUaeiou]+|[AEIOUaeiou]+$|[^AEIOUaeiou]'
    def compress(word):
        pieces = re.findall(regexp, word)
        return ''.join(pieces)

    english_udhr = nltk.corpus.udhr.words('English-Latin1')
    print(nltk.tokenwrap(compress(w) for w in english_udhr[:75]))

    # consonant-vowel pairs in the Rotokas lexicon
    rotokas_words = nltk.corpus.toolbox.words('rotokas.dic')
    cvs = [cv for w in rotokas_words for cv in re.findall(r'[ptksvr][aeiou]', w)]
    cfd = nltk.ConditionalFreqDist(cvs)
    cfd.tabulate()


    # map each consonant-vowel pair to the words that contain it
    cv_word_pairs = [(cv, w) for w in rotokas_words
                     for cv in re.findall(r'[ptksvr][aeiou]', w)]
    cv_index = nltk.Index(cv_word_pairs)
    print(cv_index['su'])
    print(cv_index['po'])
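
For a concrete sense of what compress() does, here is a self-contained spot check; the expected output noted in the comment follows directly from the regular expression (leading and trailing vowel runs plus all consonants are kept, interior vowels are dropped):

import re

regexp = r'^[AEIOUaeiou]+|[AEIOUaeiou]+$|[^AEIOUaeiou]'

def compress(word):
    # keep leading/trailing vowel runs and every consonant
    return ''.join(re.findall(regexp, word))

for w in ['Universal', 'Declaration', 'of', 'Human', 'Rights']:
    print(compress(w))   # Unvrsl, Dclrtn, of, Hmn, Rghts (one per line)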
Example no. 2
def Analysis(corpus):
    raw = open(corpus).read()   # universal newlines are the default in Python 3

    words, sents, text = Tokenize(raw)

    random_sentence = randomSentence(sents)

    fdist, cfd, top_words = FreqDists(words)

    print(nltk.tokenwrap(Przm(w) for w in words[:20]))
    print(fdist)
    cfd.tabulate()
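
The helpers Tokenize, randomSentence, FreqDists, and Przm are not shown anywhere in this example. Below is a minimal sketch of what the first three might look like, assuming the standard NLTK tokenizers; Przm is omitted because its intent is unclear.

import random
import nltk

# Hypothetical reconstructions of the missing helpers; the real
# implementations are not part of this example.
def Tokenize(raw):
    words = nltk.word_tokenize(raw)
    sents = nltk.sent_tokenize(raw)
    return words, sents, nltk.Text(words)

def randomSentence(sents):
    return random.choice(sents)

def FreqDists(words):
    fdist = nltk.FreqDist(w.lower() for w in words)
    cfd = nltk.ConditionalFreqDist((len(w), w.lower()) for w in words)
    top_words = fdist.most_common(20)
    return fdist, cfd, top_words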
Example no. 3
def regex_compress():
    # keep initial vowel sequences, final vowel sequences, and all consonants
    regexp = r'^[AEIOUaeiou]+|[AEIOUaeiou]+$|[^AEIOUaeiou]'

    def compress(word):
        pieces = re.findall(regexp, word)
        return ''.join(pieces)

    english_udhr = nltk.corpus.udhr.words('English-Latin1')
    return nltk.tokenwrap(compress(w) for w in english_udhr[:75])
Example no. 4
    def generate(self, length=100):
        """Generate and wrap `length` tokens of random text."""
        # Seed the model with a randomly chosen stored word
        self.tokens = nltk.word_tokenize(
            self.__words[randint(1, len(self.__words)) - 1])

        # Lidstone smoothing with a random gamma in [0, 1)
        estimator = lambda fdist, bins: nltk.LidstoneProbDist(
            fdist, self.__random.random())
        # estimator = lambda fdist, bins: nltk.LidstoneProbDist(fdist, 0.2)

        # n-gram model with a randomly chosen order between 3 and 15
        self._trigram_model = nltk.NgramModel(self.__random.randint(3, 15),
                                              self, estimator)
        # self._trigram_model = nltk.NgramModel(3, self, estimator)
        text = self._trigram_model.generate(length)
        return nltk.tokenwrap(text)
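
Note that nltk.NgramModel exists only in old NLTK 2.x releases; it was removed in NLTK 3. A rough modern counterpart using the nltk.lm package might look like the sketch below; the corpus, order, and seed are placeholders, and plain MLE stands in for the Lidstone estimator.

import nltk
from nltk.lm import MLE
from nltk.lm.preprocessing import padded_everygram_pipeline

# Sketch of an NLTK 3 replacement for the removed nltk.NgramModel.
order = 3
sents = nltk.corpus.gutenberg.sents('austen-emma.txt')[:500]  # small sample
train, vocab = padded_everygram_pipeline(order, sents)
lm = MLE(order)
lm.fit(train, vocab)
# generated tokens may include sentence-padding symbols such as '</s>'
print(nltk.tokenwrap(lm.generate(100, random_seed=42)))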
Example no. 5
nacute = '\u0144'   # n with acute; presumably defined earlier in the session
nacute_utf = nacute.encode('utf8')
print(repr(nacute_utf))

# regular expressions
import re
import nltk
wordlist = [w for w in nltk.corpus.words.words('en') if w.islower()]
[w for w in wordlist if re.search('ed$', w)]
[w for w in wordlist if re.search('^..j..t..$', w)]
[w for w in wordlist if re.search('^[ghi][mno][jlk][def]$', w)]

# keep initial vowel sequences, final vowel sequences, and all consonants
regexp = r'^[AEIOUaeiou]+|[AEIOUaeiou]+$|[^AEIOUaeiou]'
def compress(word):
    pieces = re.findall(regexp, word)
    return ''.join(pieces)
english_udhr = nltk.corpus.udhr.words('English-Latin1')
print(nltk.tokenwrap(compress(w) for w in english_udhr[:75]))
# stemming

def stem(word):
    for suffix in ['ing', 'ly', 'ed', 'ious', 'ies', 'ive', 'es', 's', 'ment']:
        if word.endswith(suffix):
            return word[:-len(suffix)]
    return word
re.findall(r'^.*(ing|ly|ed|ious|ies|ive|es|s|ment)$', 'processing')    # ['ing']
re.findall(r'^.*(?:ing|ly|ed|ious|ies|ive|es|s|ment)$', 'processing')  # ['processing']
re.findall(r'^(.*)(ing|ly|ed|ious|ies|ive|es|s|ment)$', 'processing')  # [('process', 'ing')]
# non-greedy
re.findall(r'^(.*?)(ing|ly|ed|ious|ies|ive|es|s|ment)$', 'processes')  # [('process', 'es')]

def stem(word):
    regexp = r'^(.*?)(ing|ly|ed|ious|ies|ive|es|s|ment)?$'
    stem, suffix = re.findall(regexp, word)[0]
    return stem
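
A few spot checks on the completed stem(); the expected values in the comments follow directly from the regular expression:

print(stem('processing'))   # 'process'
print(stem('processes'))    # 'process' (non-greedy stem, so 'es' is stripped)
print(stem('language'))     # 'language' (no listed suffix matches)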
Example no. 6
def compress_vowels():
    # keep initial vowel sequences, final vowel sequences, and consonants;
    # everything else is removed
    regex = r"^[AEIOUaeiou]+|[AEIOUaeiou]+$|[^AEIOUaeiou]"
    english_udhr = nltk.corpus.udhr.words("English-Latin1")
    print(nltk.tokenwrap([compress(regex, w) for w in english_udhr[:75]]))
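
Both this example and Example no. 9 below call a two-argument compress(regex, word) helper that is not shown; presumably it is something along these lines:

import re

# Hypothetical two-argument variant of compress(), matching the call
# sites in this example and in Example no. 9.
def compress(regex, word):
    return ''.join(re.findall(regex, word))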
Example no. 7
raw.rfind('End of Project')   # locate the Project Gutenberg footer

# regular expressions
import re
import nltk
wordlist = [w for w in nltk.corpus.words.words('en') if w.islower()]
[w for w in wordlist if re.search('ed$', w) and len(w) == 3]
[w for w in wordlist if re.search('ed$', w) and len(w) == 4]
[w for w in wordlist if re.search('^..j..t..$', w)]
[w for w in wordlist if re.search('^[abdc][efgh][ijkl]$', w)]

# keep initial vowel sequences, final vowel sequences, and all consonants
# (the third alternative must be the negated class [^AEIOUaeiou], or the
# "compression" would keep vowels and drop consonants instead)
regexp = r'^[AEIOUaeiou]+|[AEIOUaeiou]+$|[^AEIOUaeiou]'
def compress(word):
    pieces = re.findall(regexp, word)
    return ''.join(pieces)
english_udhr = nltk.corpus.udhr.words('English-Latin1')
print(nltk.tokenwrap(compress(w) for w in english_udhr[:100]))

# suffix, stemming
re.findall(r'^.*(ing|ly|ed|ious|ive|es|s|ment)$', 'processing')      # ['ing']
re.findall(r'^.*(?:ing|ly|ed|ious|ive|es|s|ment)$', 'processing')    # ['processing']
re.findall(r'^(.*)(ing|ly|ed|ious|ive|es|s|ment)$', 'processes')     # [('processe', 's')]
re.findall(r'^(.*?)(ing|ly|ed|ious|ive|es|s|ment)?$', 'processes')   # [('process', 'es')]
def stem(word):
    regexp = r'^(.*?)(ing|ly|ed|ious|ive|es|s|ment)?$'
    stem, suffix = re.findall(regexp, word)[0]
    return stem
stem('processing')   # 'process'

# searching tokenized text
moby = nltk.Text(nltk.corpus.gutenberg.words('melville-moby_dick.txt'))
moby.findall(r"<a>(<.*>)<man>")
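
findall on an nltk.Text uses NLTK's token-regex syntax, in which each <...> matches exactly one token, so the pattern above prints every word that occurs between 'a' and 'man' in Moby Dick. A self-contained toy version, with the printed output noted in the comment:

import nltk

toy = nltk.Text('a strange man met a tall man on the pier'.split())
toy.findall(r"<a>(<.*>)<man>")   # prints: strange; tall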
Example no. 8
def testCompress():
    # relies on a module-level compress() such as the one in Example no. 7
    english_udhr = nltk.corpus.udhr.words('English-Latin1')
    print(nltk.tokenwrap(compress(w) for w in english_udhr[:75]))
Example no. 9
    def lossy_compression(self):
        # keep initial vowel sequences, final vowel sequences, and consonants
        pattern = '^[AEIOUaeiou]+|[AEIOUaeiou]+$|[^AEIOUaeiou]'
        print(nltk.tokenwrap(self.compress(pattern, w) for w in self.wordlist))