class GramFreq:
    """n-gram frequency index for basic corpus analytics

    Documents are fed in one at a time through index(); the accumulated
    counts live in self.freq, a dict keyed by the space-joined gram. Every
    distinct gram is counted at most once per indexed document, so a value
    is the number of indexed documents that contained the gram. The table
    can be saved with dump() and restored with load().
    """

    def __init__(self, n):
        """set up an empty frequency table

        n is the length of the grams to be computed and indexed
        """
        self.n = n
        self.freq = {}
        self.text_cleaner = TextCleaner()

    def index(self, document):
        """tokenize a document and add its n-grams to the frequency table

        The document is cleaned with the instance's text cleaner, tokenized,
        and stripped of stopwords before the n-grams are built. Each distinct
        gram raises its counter by exactly one, however many times it occurs
        inside this document.
        """
        # clean and tokenize the incoming text; build a real list because
        # filter() is a one-shot lazy iterator on Python 3
        cleaned = self.text_cleaner.clean(document)
        tokens = [tok for tok in gram_tokenize(cleaned)
                  if tok not in stopwords]

        # distinct n-grams of this document; n comes from the constructor
        grams = set(ngrams(tokens, self.n))

        for gram in grams:
            key = ' '.join(gram)
            self.freq[key] = self.freq.get(key, 0) + 1

    def dump(self, filename):
        """write the computed freq dict to disk as a JSON string

        filename is the path of the file to write; an existing file is
        overwritten
        """
        with open(filename, 'w') as outfile:
            json.dump(self.freq, outfile)

    def load(self, filename):
        """replace the freq dict with one previously written by dump()

        filename is the path of the JSON file to read; whatever was in
        self.freq before the call is discarded
        """
        # read with the same json module that dump() writes with
        with open(filename, 'r') as infile:
            self.freq = json.load(infile)
def __init__(self, n):
    """set up an empty frequency table for grams of length n

    n is kept on the instance as the gram length; the counts start out as
    an empty dict and a fresh TextCleaner is created alongside them
    """
    # gram length used when n-grams are computed
    self.n = n
    # counts start out empty
    self.freq = {}
    # cleaner instance kept on the object for later use
    self.text_cleaner = TextCleaner()