def bigram_feats(md):
    """
    arguments:
      md is a util.MovieData object
    returns:
      a Counter (dict subclass) mapping bigram features from the reviews to
      their counts on this util.MovieData object; each bigram is a tuple
      of two adjacent tokens from a single review
    """
    c = Counter()
    for rev in util.MovieData.reviewers:
        if hasattr(md, rev):
            # Read the attribute with getattr so the lookup agrees with the
            # hasattr guard; md.__dict__[rev] raises KeyError when the
            # attribute is not stored directly on the instance.
            text = getattr(md, rev).strip().lower()
            # pass through util.asciify, delete util.punct_patt matches
            # (punctuation), then split on whitespace
            words = util.punct_patt.sub("", util.asciify(text)).split()
            # Filter before pairing: tokens rejected by util.non_numeric are
            # dropped first, so their neighbours become adjacent.
            words = [w for w in words if util.non_numeric(w)]
            # Pairs are formed per review, so no bigram spans two reviews.
            c.update(zip(words, words[1:]))
    return c
def unigram_feats(md):
    """
    arguments:
      md is a util.MovieData object
    returns:
      a Counter (dict subclass) mapping unigram features from the reviews
      to their counts on this util.MovieData object
    """
    c = Counter()
    for rev in util.MovieData.reviewers:
        if hasattr(md, rev):
            # Read the attribute with getattr so the lookup agrees with the
            # hasattr guard; md.__dict__[rev] raises KeyError when the
            # attribute is not stored directly on the instance.
            text = getattr(md, rev).strip().lower()
            # pass through util.asciify, delete util.punct_patt matches
            # (punctuation), then split on whitespace
            words = util.punct_patt.sub("", util.asciify(text)).split()
            # count only the tokens accepted by util.non_numeric
            c.update(w for w in words if util.non_numeric(w))
    return c