import re
import nltk

def extractWordPieces():
    # Vowels in a single word.
    word = 'supercalifragilisticexpialidocious'
    print(re.findall(r'[aeiou]', word))
    print(len(re.findall(r'[aeiou]', word)))

    # Frequency of vowel sequences of length two or more in the WSJ corpus.
    wsj = sorted(set(nltk.corpus.treebank.words()))
    fd = nltk.FreqDist(vs for word in wsj
                       for vs in re.findall(r'[aeiou]{2,}', word))
    print(fd.items())

    # Keep initial vowel sequences, final vowel sequences, and all
    # consonants; drop word-internal vowels.
    regexp = r'^[AEIOUaeiou]+|[AEIOUaeiou]+$|[^AEIOUaeiou]'

    def compress(word):
        pieces = re.findall(regexp, word)
        return ''.join(pieces)

    english_udhr = nltk.corpus.udhr.words('English-Latin1')
    print(nltk.tokenwrap(compress(w) for w in english_udhr[:75]))

    # Consonant-vowel sequences in the Rotokas lexicon.
    rotokas_words = nltk.corpus.toolbox.words('rotokas.dic')
    cvs = [cv for w in rotokas_words
           for cv in re.findall(r'[ptksvr][aeiou]', w)]
    cfd = nltk.ConditionalFreqDist(cvs)
    cfd.tabulate()

    # Index each CV pair back to the words that contain it.
    cv_word_pairs = [(cv, w) for w in rotokas_words
                     for cv in re.findall(r'[ptksvr][aeiou]', w)]
    cv_index = nltk.Index(cv_word_pairs)
    print(cv_index['su'])
    print(cv_index['po'])
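# A hand-checkable sketch of the CV-pair analysis above, using made-up
# words rather than the Rotokas lexicon (illustrative data, not from the
# corpus):
demo_words = ['kasuari', 'supaka', 'kaakau']
demo_cvs = [cv for w in demo_words for cv in re.findall(r'[ptksvr][aeiou]', w)]
nltk.ConditionalFreqDist(demo_cvs).tabulate()  # rows: consonants, columns: vowels
demo_index = nltk.Index((cv, w) for w in demo_words
                        for cv in re.findall(r'[ptksvr][aeiou]', w))
print(demo_index['su'])  # ['kasuari', 'supaka']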
def Analysis(corpus):
    # Tokenize, randomSentence, FreqDists, and Przm are helpers assumed
    # to be defined elsewhere in this module.
    raw = open(corpus, 'r').read()
    words, sents, text = Tokenize(raw)
    random_sentence = randomSentence(sents)
    fdist, cfd, top_words = FreqDists(words)
    print(nltk.tokenwrap(Przm(w) for w in words[:20]))
    print(fdist)
    cfd.tabulate()
def regex_compress():
    # Keep initial/final vowel sequences and consonants; drop internal vowels.
    regexp = r'^[AEIOUaeiou]+|[AEIOUaeiou]+$|[^AEIOUaeiou]'

    def compress(word):
        pieces = re.findall(regexp, word)
        return ''.join(pieces)

    english_udhr = nltk.corpus.udhr.words('English-Latin1')
    return nltk.tokenwrap(compress(w) for w in english_udhr[:75])
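# What the vowel-dropping regexp actually produces, worked out by hand on
# two sample words (not output from the UDHR run above):
sample = r'^[AEIOUaeiou]+|[AEIOUaeiou]+$|[^AEIOUaeiou]'
print(''.join(re.findall(sample, 'declaration')))  # dclrtn -- internal vowels dropped
print(''.join(re.findall(sample, 'Universal')))    # Unvrsl -- initial vowel kept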
def generate(self, length=100):
    """Generate text from an n-gram model of randomly chosen order."""
    # Re-seed the token stream from a randomly picked word of the source text.
    self.tokens = nltk.word_tokenize(
        self.__words[randint(1, len(self.__words)) - 1])
    # Lidstone smoothing with a random gamma in [0, 1).
    estimator = lambda fdist, bins: nltk.LidstoneProbDist(
        fdist, self.__random.random())
    # estimator = lambda fdist, bins: nltk.LidstoneProbDist(fdist, 0.2)
    # Note: nltk.NgramModel exists only in NLTK 2.x; it was removed in NLTK 3.
    self._trigram_model = nltk.NgramModel(self.__random.randint(3, 15),
                                          self, estimator)
    # self._trigram_model = nltk.NgramModel(3, self, estimator)
    text = self._trigram_model.generate(length)
    return nltk.tokenwrap(text)
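# Since nltk.NgramModel is gone in NLTK 3, a rough modern equivalent uses
# the nltk.lm package -- a minimal sketch, assuming `sents` is a list of
# pre-tokenized sentences (not the original class's API):
from nltk.lm import MLE
from nltk.lm.preprocessing import padded_everygram_pipeline

def train_lm(sents, order=3):
    train, vocab = padded_everygram_pipeline(order, sents)
    lm = MLE(order)
    lm.fit(train, vocab)
    return lm

# lm = train_lm([['a', 'b', 'b'], ['b', 'a', 'a']])
# print(nltk.tokenwrap(lm.generate(20, random_seed=42)))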
nacute = '\u0144'  # LATIN SMALL LETTER N WITH ACUTE (definition missing from this excerpt)
nacute_utf = nacute.encode('utf8')
print(repr(nacute_utf))

# Regular expressions
import re
wordlist = [w for w in nltk.corpus.words.words('en') if w.islower()]
[w for w in wordlist if re.search('ed$', w)]
[w for w in wordlist if re.search('^..j..t..$', w)]
[w for w in wordlist if re.search('^[ghi][mno][jlk][def]$', w)]

regexp = r'^[AEIOUaeiou]+|[AEIOUaeiou]+$|[^AEIOUaeiou]'

def compress(word):
    pieces = re.findall(regexp, word)
    return ''.join(pieces)

english_udhr = nltk.corpus.udhr.words('English-Latin1')
print(nltk.tokenwrap(compress(w) for w in english_udhr[:75]))

# Stemming
def stem(word):
    for suffix in ['ing', 'ly', 'ed', 'ious', 'ies', 'ive', 'es', 's', 'ment']:
        if word.endswith(suffix):
            return word[:-len(suffix)]
    return word

re.findall(r'^.*(ing|ly|ed|ious|ies|ive|es|s|ment)$', 'processing')
re.findall(r'^.*(?:ing|ly|ed|ious|ies|ive|es|s|ment)$', 'processing')
re.findall(r'^(.*)(ing|ly|ed|ious|ies|ive|es|s|ment)$', 'processing')
# Non-greedy
re.findall(r'^(.*?)(ing|ly|ed|ious|ies|ive|es|s|ment)$', 'processes')

def stem(word):
    regexp = r'^(.*?)(ing|ly|ed|ious|ies|ive|es|s|ment)?$'
    stem, suffix = re.findall(regexp, word)[0]
    return stem
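# Greedy vs. non-greedy capture, verified by hand on 'processes':
re.findall(r'^(.*)(ing|ly|ed|ious|ies|ive|es|s|ment)$', 'processes')
# -> [('processe', 's')]   (.*) grabs as much as it can
re.findall(r'^(.*?)(ing|ly|ed|ious|ies|ive|es|s|ment)$', 'processes')
# -> [('process', 'es')]   (.*?) yields the shortest stem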
def compress_vowels():
    # Keep initial vowel sequences, final vowel sequences, and consonants;
    # word-internal vowels are removed.
    regex = r"^[AEIOUaeiou]+|[AEIOUaeiou]+$|[^AEIOUaeiou]"
    english_udhr = nltk.corpus.udhr.words("English-Latin1")
    print(nltk.tokenwrap([compress(regex, w) for w in english_udhr[:75]]))
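# The two-argument compress() used above is not defined in these snippets;
# a minimal sketch consistent with the call site (an assumption, not the
# original helper):
def compress(pattern, word):
    return ''.join(re.findall(pattern, word))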
raw.rfind('End of Project')  # `raw` is assumed loaded earlier (a Project Gutenberg text)

# Regular expressions
import re
wordlist = [w for w in nltk.corpus.words.words('en') if w.islower()]
[w for w in wordlist if re.search('ed$', w) and len(w) == 3]
[w for w in wordlist if re.search('ed$', w) and len(w) == 4]
[w for w in wordlist if re.search('^..j..t..$', w)]
[w for w in wordlist if re.search('^[abdc][efgh][ijkl]$', w)]

regexp = r'^[AEIOUaeiou]+|[AEIOUaeiou]+$|[^AEIOUaeiou]'

def compress(word):
    pieces = re.findall(regexp, word)
    return ''.join(pieces)

english_udhr = nltk.corpus.udhr.words('English-Latin1')
print(nltk.tokenwrap(compress(w) for w in english_udhr[:100]))

# Suffixes and stemming
re.findall(r'^.*(ing|ly|ed|ious|ive|es|s|ment)$', 'processing')
re.findall(r'^.*(?:ing|ly|ed|ious|ive|es|s|ment)$', 'processing')
re.findall(r'^(.*)(ing|ly|ed|ious|ive|es|s|ment)$', 'processes')
re.findall(r'^(.*?)(ing|ly|ed|ious|ive|es|s|ment)?$', 'processes')

def stem(word):
    regexp = r'^(.*?)(ing|ly|ed|ious|ive|es|s|ment)?$'
    stem, suffix = re.findall(regexp, word)[0]
    return stem

stem('processing')

# Searching tokenized text
moby = nltk.Text(nltk.corpus.gutenberg.words('melville-moby_dick.txt'))
moby.findall(r"<a>(<.*>)<man>")
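# Sanity checks for stem(), with outputs worked out by hand:
print(stem('processing'))  # process -- (.*?) stops at the 'ing' suffix
print(stem('processes'))   # process -- 'es' wins over a bare trailing 's'
print(stem('language'))    # language -- no listed suffix, word kept whole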
def testCompress():
    # Relies on a one-argument compress() helper defined elsewhere.
    english_udhr = nltk.corpus.udhr.words('English-Latin1')
    print(nltk.tokenwrap(compress(w) for w in english_udhr[:75]))
def lossy_compression(self):
    # Keep initial/final vowel sequences and consonants; drop internal vowels.
    pattern = '^[AEIOUaeiou]+|[AEIOUaeiou]+$|[^AEIOUaeiou]'
    print(nltk.tokenwrap(self.compress(pattern, w) for w in self.wordlist))