# Imports used by this excerpt (MinerMiscUtils is a project-local helper module)
import logging
import string

import nltk
from nltk.stem.porter import PorterStemmer

import MinerMiscUtils


def addFeaturesPhrases(ctx, outFeaturesMaps):
    logging.getLogger("Features").info("phrases")
    # Use a set for fast membership tests against the filtered word list
    rawFilteredWords = set(word for (word, count) in ctx.mFilteredWords)
    for itrComment, phrases in enumerate(ctx.mCommentPhrases):
        for phrase in phrases:
            prevWord = '$'
            for word, partOfSpeech in phrase:
                if MinerMiscUtils.isAdj(partOfSpeech) or MinerMiscUtils.isNoun(partOfSpeech):
                    # Words in ctx.mCommentPhrases are already stemmed by Context.__init__
                    stemmedWord = word

                    # Try both orderings of the 2-gram against the filtered words
                    phrase1 = prevWord + " " + stemmedWord
                    phrase2 = stemmedWord + " " + prevWord

                    if phrase1 in rawFilteredWords:
                        outFeaturesMaps[itrComment][phrase1] = 1
                    elif phrase2 in rawFilteredWords:
                        outFeaturesMaps[itrComment][phrase2] = 1

                    prevWord = stemmedWord
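
# A minimal usage sketch (assumptions: ctx is a fully built Context and, as the
# indexing above implies, outFeaturesMaps holds one feature dict per comment):
#
#   outFeaturesMaps = [{} for _ in ctx.mCommentPhrases]
#   addFeaturesPhrases(ctx, outFeaturesMaps)
#   # outFeaturesMaps[i] now maps each matched 2-gram in comment i to 1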
    def __init__(self, mRawCsvComments, mRawCsvReviews, filterWordReviewOverlap):
        # Log parameters
        logging.getLogger("Context").info("Creating new context:")
        logging.getLogger("Context").info("filterWordReviewOverlap: " + str(filterWordReviewOverlap))

        # Load stop words
        self.mStopWords = nltk.corpus.stopwords.words("english")

        self.mRawCsvReviews = mRawCsvReviews

        self.mRawCsvComments = mRawCsvComments

        # Parallel list of lower case comments with punctuation removed
        self.mLowerCasePunctRemovedComments = []

        # Parallel list for storing [ (word, part-of-speech ) ] tuple lists for each comment
        self.mPartOfSpeechTokenizedComments = []

        # Parallel list for storing [ stemmed(word) ] lists for each comment
        self.mStemmedTokenizedComments = []

        # Set for storing unique review identifiers
        self.mReviewIds = set()

        # Maps a stemmed word to a set of reviews it belongs to (for filtering kindle, ipod, etc)
        self.mStemmedWordToReviewsMap = {}

        # Create stemmer for stemming words
        stemmer = PorterStemmer()

        # Dictionary for storing word counts of adjectives and nouns
        self.mAdjAndNounWordCountMap = {}

        # Dictionary for storing custom data specific to a classifier
        self.mCustomData = {}

        self.mPartOfSpeechTokenizedCommentsAndReviewId = []

        self.mAuthorFreqPerReview = {}

        self.mAuthorReviewPerComment = []

        self.mCommentPhrases = []

        # Compute the average star rating for each product
        productCount = {}
        self.productAvgStars = {}
        for rawReview in self.mRawCsvReviews:
            product = rawReview["Product"]
            if product not in self.productAvgStars:
                self.productAvgStars[product] = float(rawReview["Star Rating"])
                productCount[product] = 1
            else:
                self.productAvgStars[product] += float(rawReview["Star Rating"])
                productCount[product] += 1

        for key in self.productAvgStars:
            self.productAvgStars[key] /= float(productCount[key])

        self.mReviewAuthorMap = {}
        self.mReviewStarMap = {}
        for rawReview in self.mRawCsvReviews:
            self.mReviewAuthorMap[rawReview["Review_ID"]] = rawReview["Author"]
            self.mReviewStarMap[rawReview["Review_ID"]] = rawReview["Star Rating"]

        # Convert to lower case, remove punctuation, and assign parts of speech, etc...
        for itrComment, rawCsvCommentDict in enumerate(self.mRawCsvComments):
            logging.getLogger("Context").info(
                "Processing (1-gram) comment " + str(itrComment) + " of " + str(len(self.mRawCsvComments))
            )

            # Extract review identifier
            reviewId = rawCsvCommentDict["Review_ID"]

            # Extract author of comment
            author = rawCsvCommentDict["Author"]

            if reviewId not in self.mAuthorFreqPerReview:
                self.mAuthorFreqPerReview[reviewId] = {}
                self.mAuthorFreqPerReview[reviewId][author] = 1
            elif author not in self.mAuthorFreqPerReview[reviewId]:
                self.mAuthorFreqPerReview[reviewId][author] = 1
            else:
                self.mAuthorFreqPerReview[reviewId][author] += 1

            self.mAuthorReviewPerComment.append((reviewId, author))

            # Record the review identifier (the set keeps them unique)
            self.mReviewIds.add(reviewId)

            # Convert comment to lower case
            comment = rawCsvCommentDict["Comment"].lower()

            punctTokenizedComment = nltk.WordPunctTokenizer().tokenize(comment)

            phraseSeparators = [".", "?", "!", ";"]

            phrases = []
            phrase = []
            for word in punctTokenizedComment:
                if word in phraseSeparators:
                    phrase = [phraseWord for phraseWord in phrase if (phraseWord not in self.mStopWords)]
                    phrase = nltk.pos_tag(phrase)
                    phrase = [(stemmer.stem(word), part) for (word, part) in phrase]
                    phrases.append(phrase)
                    phrase = []
                else:
                    phrase.append(word)

            if len(phrase) > 0:
                phrase = [phraseWord for phraseWord in phrase if (phraseWord not in self.mStopWords)]
                phrase = nltk.pos_tag(phrase)
                # Stem the trailing phrase too, so it matches the
                # separator-terminated phrases above (and the stemmed filtered words)
                phrase = [(stemmer.stem(word), part) for (word, part) in phrase]
                phrases.append(phrase)

            self.mCommentPhrases.append(phrases)
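
            # Illustrative example (actual stems/tags depend on nltk's models):
            #   "great battery. bad screen!" ->
            #   [[('great', 'JJ'), ('batteri', 'NN')], [('bad', 'JJ'), ('screen', 'NN')]]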

            # Replace punctuation with white space
            for punct in string.punctuation:
                comment = comment.replace(punct, " ")

            self.mLowerCasePunctRemovedComments.append(comment)

            # Tokenize into list of words
            tokenizedComment = nltk.word_tokenize(comment)

            # Filter out stop words
            tokenizedComment[:] = [word for word in tokenizedComment if (word not in self.mStopWords)]

            posTagComment = nltk.pos_tag(tokenizedComment)
            # Append a list of (word, part of speech) tuples
            self.mPartOfSpeechTokenizedComments.append(posTagComment)

            self.mPartOfSpeechTokenizedCommentsAndReviewId.append((posTagComment, reviewId))

            # Append a list of stemmed words (parallel to the part-of-speech list)
            self.mStemmedTokenizedComments.append(
                [stemmer.stem(word) for word in tokenizedComment])

            # Assert parallel lists are same length
            assert len(self.mPartOfSpeechTokenizedComments[-1]) == len(self.mStemmedTokenizedComments[-1])

            # Determine word counts for nouns and adjectives
            for itr, (word, partOfSpeech) in enumerate(self.mPartOfSpeechTokenizedComments[-1]):
                # Determine if part of speech is noun or adjective
                if MinerMiscUtils.isAdj(partOfSpeech) or MinerMiscUtils.isNoun(partOfSpeech):
                    # Obtain stemmed word
                    stemmedWord = self.mStemmedTokenizedComments[-1][itr]
                    # Increment stemmed word counts
                    if stemmedWord in self.mAdjAndNounWordCountMap:
                        self.mAdjAndNounWordCountMap[stemmedWord] += 1
                        self.mStemmedWordToReviewsMap[stemmedWord].update([reviewId])
                    else:
                        self.mAdjAndNounWordCountMap[stemmedWord] = 1
                        self.mStemmedWordToReviewsMap[stemmedWord] = set([reviewId])
            # end inner for loop : iteration of (word, part of speech) tuples in single comment
        # end outer for loop : iteration over raw csv comment data

        # Assert parallel arrays are same length
        assert len(self.mRawCsvComments) == len(self.mLowerCasePunctRemovedComments)
        assert len(self.mLowerCasePunctRemovedComments) == len(self.mPartOfSpeechTokenizedComments)
        assert len(self.mPartOfSpeechTokenizedComments) == len(self.mStemmedTokenizedComments)

        # Keep only words whose review overlap (fraction of distinct reviews
        # containing the word) exceeds the threshold
        def fGetWordReviewOverlap(stemmedWord):
            return float(len(self.mStemmedWordToReviewsMap[stemmedWord])) / float(len(self.mReviewIds))

        self.mFilteredWords = [
            (word, count)
            for (word, count) in self.mAdjAndNounWordCountMap.items()
            if fGetWordReviewOverlap(word) > filterWordReviewOverlap
        ]
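
        # Worked example (illustrative numbers): with 200 distinct reviews and
        # filterWordReviewOverlap = 0.05, a stem survives only if it appears in
        # more than 200 * 0.05 = 10 distinct reviews.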

        # Use the resulting filtered words as possible components of a phrase
        self.mPossiblePhraseWords = [word for (word, count) in self.mFilteredWords]

        # Count of 2-gram occurrences
        self.mTwoGramsCountMap = {}

        # Count of the number of reviews the 2-grams occur in
        self.mTwoGramsToReviewsMap = {}

        # Extract all 2-grams from the comments
        for itrComment, (tokComment, reviewId) in enumerate(self.mPartOfSpeechTokenizedCommentsAndReviewId):
            logging.getLogger("Context").info(
                "Processing (2-grams) comment " + str(itrComment) + " of " + str(len(self.mRawCsvComments))
            )

            # Keeps track of the previous word scanned
            prevWord = "$"
            for itr, (word, partOfSpeech) in enumerate(tokComment):

                # Determine if part of speech is noun or adjective
                if MinerMiscUtils.isAdj(partOfSpeech) or MinerMiscUtils.isNoun(partOfSpeech):

                    # Obtain stemmed word
                    stemmedWord = stemmer.stem(word)

                    # Increment stemmed 2-gram counts; try both orderings so that
                    # e.g. "batteri life" and "life batteri" count as one 2-gram
                    if stemmedWord in self.mPossiblePhraseWords or prevWord in self.mPossiblePhraseWords:
                        phrase1 = prevWord + " " + stemmedWord
                        phrase2 = stemmedWord + " " + prevWord
                        defaultPhrase = phrase1

                        if phrase2 in self.mTwoGramsCountMap:
                            defaultPhrase = phrase2
                            self.mTwoGramsCountMap[defaultPhrase] += 1
                            self.mTwoGramsToReviewsMap[defaultPhrase].update([reviewId])
                        elif phrase1 in self.mTwoGramsCountMap:
                            self.mTwoGramsCountMap[defaultPhrase] += 1
                            self.mTwoGramsToReviewsMap[defaultPhrase].update([reviewId])
                        else:
                            self.mTwoGramsCountMap[defaultPhrase] = 1
                            # Store the review id itself; set(reviewId) would split
                            # a string id into its individual characters
                            self.mTwoGramsToReviewsMap[defaultPhrase] = set([reviewId])

                    prevWord = stemmedWord

        # Extract all 2-grams that occur frequently enough across reviews to care about and add them to the set of "filtered words"
        # TODO: There should really be a separate collection for 2-grams
        for twoGram in self.mTwoGramsCountMap:
            if float(len(self.mTwoGramsToReviewsMap[twoGram])) / float(len(self.mReviewIds)) > (
                filterWordReviewOverlap * filterWordReviewOverlap
            ):
                self.mFilteredWords.append((twoGram, self.mTwoGramsCountMap[twoGram]))
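
        # The squared threshold relaxes the cutoff for 2-grams: with the same
        # illustrative numbers (200 reviews, filterWordReviewOverlap = 0.05),
        # the 2-gram cutoff is 0.0025, so appearing in a single review suffices.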

        self.printFilteredWords()
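
# A minimal construction sketch (assumptions: this __init__ belongs to the
# Context class named in the log calls above, and the CSV rows are dicts keyed
# by the columns used here: "Review_ID", "Author", "Comment", "Product",
# "Star Rating"):
#
#   import csv
#   with open("comments.csv") as fc, open("reviews.csv") as fr:
#       comments = list(csv.DictReader(fc))
#       reviews = list(csv.DictReader(fr))
#   ctx = Context(comments, reviews, filterWordReviewOverlap=0.05)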