class NGramAnalyzer(object):
    """
    Breaks a piece of text into n-grams and tallies how often each one
    occurs.

    Iterating an instance yields its n-grams in order: plain lowercased
    tokens when ``N == 1``, tuples of ``N`` lowercased tokens otherwise.
    ``len(instance)`` reports the ``total`` of the frequency histogram.
    """

    # One processor shared by every instance (class attribute), so it is
    # only constructed once, when the class is defined.
    processor = TextProcessor()

    def __init__(self, text, N=1, preprocessed=False):
        """
        text         -- input handed to the processor when tokenizing
        N            -- size of the n-grams to produce (default 1)
        preprocessed -- when true, ``text`` goes through
                        ``processor.deserialize`` instead of
                        ``processor.process``
        """
        self.text = text
        self.N = N
        self.preprocessed = preprocessed
        # Filled in lazily by the ``frequency`` property.
        self._frequency = Histogram()

    @property
    def frequency(self):
        """
        Histogram of n-gram counts, built on first access.

        While the stored histogram is falsy, every n-gram of this
        analyzer is fed to ``increment``; a truthy histogram is returned
        as is.
        """
        if self._frequency:
            return self._frequency

        for ngram in self:
            self._frequency.increment(ngram)
        return self._frequency

    def tokenize(self):
        """
        Yield every token of the text, lowercased.

        The shared processor supplies the sentences; each sentence is
        iterated as (token, postag) pairs and only the token is used.
        """
        if self.preprocessed:
            sentences = self.processor.deserialize(self.text)
        else:
            sentences = self.processor.process(self.text)

        for sentence in sentences:
            for token, _postag in sentence:
                yield token.lower()

    def __iter__(self):
        """
        Yield the n-grams of the text in order of appearance.
        """
        if self.N == 1:
            # Unigrams are yielded as bare tokens, not 1-tuples.
            for token in self.tokenize():
                yield token
            return

        window = []
        for token in self.tokenize():
            # Grow the window until it holds N tokens ...
            if len(window) < self.N:
                window.append(token)
            # ... then emit it and drop the oldest token.
            if len(window) == self.N:
                yield tuple(window)
                window = window[1:]

    def __len__(self):
        """
        Return the ``total`` attribute of the frequency histogram.
        """
        return self.frequency.total
def __init__(self, text, N=1, preprocessed=False):
    """
    Store the given inputs on the instance and attach an empty
    frequency histogram.

    text         -- kept unchanged as ``self.text``
    N            -- kept unchanged as ``self.N`` (default 1)
    preprocessed -- kept unchanged as ``self.preprocessed``
                    (default False)
    """
    # Caller-supplied values are stored exactly as given.
    self.text = text
    self.N = N
    self.preprocessed = preprocessed

    # Every instance gets its own freshly constructed Histogram.
    self._frequency = Histogram()