コード例 #1
0
def bigram_feats(md):
    """
    Count the bigram features found in the reviews attached to a movie.

    arguments:
      md is an object whose per-reviewer review text is stored as instance
      attributes named after the entries of util.MovieData.reviewers
    returns:
      a Counter mapping each (word, next_word) tuple to the number of times
      that adjacent pair occurs, summed over all reviewers present on md
    """
    pair_counts = Counter()
    for reviewer in util.MovieData.reviewers:
        # reviewers without an attribute on md contribute nothing
        if not hasattr(md, reviewer):
            continue
        # normalise the text: trim, lowercase, asciify, then drop punctuation
        raw_text = md.__dict__[reviewer].strip().lower()
        cleaned = util.punct_patt.sub("", util.asciify(raw_text))
        # keep only the tokens accepted by util.non_numeric
        tokens = [word for word in cleaned.split() if util.non_numeric(word)]
        # pairs are formed AFTER filtering, so two words that were separated
        # by a rejected token become neighbours; pairs never span reviewers
        pair_counts.update(zip(tokens, tokens[1:]))
    return pair_counts
コード例 #2
0
def unigram_feats(md):
    """
    Count the unigram features found in the reviews attached to a movie.

    arguments:
      md is a util.MovieData object
    returns:
      a Counter (a dict subclass) mapping each unigram feature from the
      reviews to the number of times it occurs on this util.MovieData object
    """
    counts = Counter()
    for reviewer in util.MovieData.reviewers:
        # reviewers without an attribute on md contribute nothing
        if not hasattr(md, reviewer):
            continue
        # normalise the text: trim, lowercase, asciify, then drop punctuation
        raw_text = md.__dict__[reviewer].strip().lower()
        cleaned = util.punct_patt.sub("", util.asciify(raw_text))
        # tally only the whitespace-separated tokens util.non_numeric accepts
        counts.update(filter(util.non_numeric, cleaned.split()))
    return counts
コード例 #3
0
def unigram_feats(md):
    """
    Build unigram count features from the reviews of a movie.

    arguments:
      md is a util.MovieData object
    returns:
      a Counter (a dict subclass) mapping each unigram feature from the
      reviews to its count on this util.MovieData object
    """
    feats = Counter()
    present = (name for name in util.MovieData.reviewers if hasattr(md, name))
    for name in present:
        # lowercase and trim the review, asciify it, then remove punctuation
        review = util.asciify(md.__dict__[name].strip().lower())
        words = util.punct_patt.sub("", review).split()
        # count every remaining word that util.non_numeric accepts
        feats.update(word for word in words if util.non_numeric(word))
    return feats