Example #1
import nltk

class RegexExamples:
    def __init__(self):
        # ParseFiling is assumed to be defined elsewhere in the project
        self.pf = ParseFiling()
        self.text = self.pf.find_div_with_text()
        print("Finished fetching Google SEC document")
        # Tokenize the document text with NLTK
        self.tokens = nltk.word_tokenize(self.text)
        print("Tokenizing using NLTK")
        # Keep only the all-lowercase word tokens
        self.wordlist = [w for w in self.tokens if w.islower()]
        print("Fetched all lowercase words")
Example #2
import re

import nltk

class RegexExamples:
    def __init__(self):
        # ParseFiling is assumed to be defined elsewhere in the project
        self.pf = ParseFiling()
        self.text = self.pf.find_div_with_text()
        print("Finished fetching Google SEC document")
        # Tokenize the document text with NLTK
        self.tokens = nltk.word_tokenize(self.text)
        print("Tokenizing using NLTK")
        # Keep only the all-lowercase word tokens
        self.wordlist = [w for w in self.tokens if w.islower()]
        print("Fetched all lowercase words")

    def first_example(self):
        # STR is assumed to be a module-level string defined elsewhere;
        # the pattern matches both the "serialise" and "serialize" spellings
        match = re.findall(r'seriali[sz]e', STR, re.M | re.I)
        if match:
            print("Matches are:", match)

    def find_words_ending_with(self, pattern):
        # Anchor the pattern so it only matches at the end of a word
        pattern = pattern + '$'
        words_ending_with_pattern = [w for w in self.wordlist if re.search(pattern, w)]
        return words_ending_with_pattern

    def find_words_with_pattern(self, pattern):
        # Return every word that contains a match for the pattern
        words_with_pattern = [w for w in self.wordlist if re.search(pattern, w)]
        return words_with_pattern

    def compress(self, pattern, word):
        # Keep only the pieces of the word that match the pattern
        pieces = re.findall(pattern, word)
        return ''.join(pieces)

    def lossy_compression(self):
        # Keep word-initial and word-final vowel runs plus all consonants,
        # dropping the vowels inside each word
        pattern = r'^[AEIOUaeiou]+|[AEIOUaeiou]+$|[^AEIOUaeiou]'
        print(nltk.tokenwrap(self.compress(pattern, w) for w in self.wordlist))

    def stem(self, word, pattern=DEFAULT):
        # DEFAULT is assumed to be a module-level regex with two capture
        # groups, (stem, suffix); only the stem is kept
        stem, suffix = re.findall(pattern, word)[0]
        return stem

    def stemmer(self):
        # Stem every token with the default suffix-stripping pattern
        return [self.stem(w) for w in self.tokens]
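
A minimal usage sketch, assuming ParseFiling, STR, and DEFAULT are defined in the surrounding module (none of them appear in the examples above):

examples = RegexExamples()

# Words ending in "ing"
print(examples.find_words_ending_with('ing'))

# Words containing at least one digit
print(examples.find_words_with_pattern(r'\d'))

# Drop word-internal vowels from a single word: prints "dcmnt"
print(examples.compress(r'^[AEIOUaeiou]+|[AEIOUaeiou]+$|[^AEIOUaeiou]', 'document'))

# Stem every token using the DEFAULT pattern; show the first 20 stems
print(examples.stemmer()[:20])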