Example #1
0
class NGramAnalyzer(object):
    """
    Takes a generic bit of text and outputs NGrams along with their
    associated frequencies.

    Iterating over an instance yields the n-grams of the text in order:
    lowercased token strings when ``N == 1`` and ``N``-tuples of lowercased
    tokens for larger ``N``. The window slides over the flat token stream
    produced by ``tokenize``, so n-grams may span sentence boundaries.
    """

    # Static Processor to reduce load times in memory
    processor = TextProcessor()

    def __init__(self, text, N=1, preprocessed=False):
        """
        Store the input and create the histogram; nothing is tokenized or
        counted here.

        text         -- the input handed to the processor by ``tokenize``.
        N            -- order of the n-grams (1 yields bare tokens).
        preprocessed -- when true, ``text`` is passed to the processor's
                        ``deserialize`` instead of ``process``.
        """
        self.N = N
        self.text = text
        self._frequency = Histogram()
        self.preprocessed = preprocessed

    @property
    def frequency(self):
        """
        Histogram of the n-grams, filled by iterating over ``self`` the
        first time it is found empty.
        """
        # An empty histogram doubles as the "not counted yet" marker.
        # NOTE(review): this relies on Histogram being falsy while empty
        # -- confirm against the Histogram implementation.
        if not self._frequency:
            for ngram in self:
                self._frequency.increment(ngram)
        return self._frequency

    def tokenize(self):
        """
        Tokenize using processor.

        Yields every token lowercased, sentence by sentence. Each sentence
        returned by the processor is unpacked as (token, postag) pairs; the
        tag is discarded.
        """
        if not self.preprocessed:
            text = self.processor.process(self.text)
        else:
            text = self.processor.deserialize(self.text)

        for sentence in text:
            for token, _postag in sentence:
                yield token.lower()

    def __iter__(self):
        """
        Yield the n-grams of the text in order of appearance.

        Unigrams are yielded as plain strings, higher orders as tuples of
        ``N`` tokens. An order below 1 yields nothing.
        """
        if self.N < 1:
            # There is no n-gram of non-positive order. Without this guard
            # N == 0 would emit one empty tuple per token, while negative
            # N already emitted nothing.
            return

        if self.N == 1:
            # Special case for Unigrams
            for token in self.tokenize():
                yield token
            return

        # Sliding window over the token stream: it holds at most N - 1
        # tokens at the top of each iteration, so appending is always safe.
        window = []
        for token in self.tokenize():
            window.append(token)
            if len(window) == self.N:
                yield tuple(window)
                # Drop the oldest token to advance the window by one.
                del window[0]

    def __len__(self):
        """
        Return the ``total`` attribute of the frequency histogram
        (presumably the number of n-grams counted -- confirm in Histogram).
        """
        return self.frequency.total
Example #2
0
 def __init__(self, text, N=1, preprocessed=False):
     """
     Record the constructor arguments on the instance and attach a newly
     constructed Histogram as ``_frequency``.

     text         -- stored unchanged as ``self.text``.
     N            -- stored unchanged as ``self.N`` (defaults to 1).
     preprocessed -- stored unchanged as ``self.preprocessed``
                     (defaults to False).
     """
     # Caller-supplied inputs, kept exactly as given.
     self.N, self.text = N, text
     # New histogram object plus the preprocessing flag.
     self._frequency, self.preprocessed = Histogram(), preprocessed