Code Example #1
def SvmGetClassifierInputs(ctx, featuresMaps, outClassifierInputs):
    logging.getLogger("Svm").info("get classifier inputs")
    outClassifierInputs[:] = []

    featuresKeys = set()

    for featureVector in featuresMaps:
        featuresKeys.update(featureVector.keys())
    #featuresKeys = featuresMaps[0].keys() # Assuming at least a single features map exists
    for itrComment, rawCsvCommentDict in enumerate(ctx.mRawCsvComments):
        # @TODO: Classify "Thumbs Down!"
        # Map the binary comment label {0, 1} onto the libsvm target {-1, +1}
        svmType = -1 + 2 * MinerMiscUtils.getCommentLabel(rawCsvCommentDict)
        inputsCollector = [SvmUtilGetStrSign(svmType) + str(svmType)]
        for itrFeature, featureKey in enumerate(featuresKeys):
            if (featureKey in featuresMaps[itrComment]):
                featureValue = -1 + 2 * int(
                    featuresMaps[itrComment][featureKey])
            else:
                featureValue = 0

            inputsCollector.append(" " + str(itrFeature + 1) + ":" +
                                   str(featureValue))

        outClassifierInputs.append("".join(inputsCollector))
    assert (len(outClassifierInputs) == len(ctx.mRawCsvComments))
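
The SvmUtilGetStrSign helper used above is not defined in this section. A minimal sketch, assuming its only job is to supply the explicit plus sign that libsvm-style labels carry (str() already renders the minus):

def SvmUtilGetStrSign(value):
    # Non-negative targets need an explicit "+"; str(-1) already yields "-1"
    return "+" if value >= 0 else ""

Under that assumption each emitted line reads like "+1 1:1 2:-1 3:0": a signed class target followed by 1-based feature indices whose values are +1 or -1 when the feature is present and 0 when it is absent.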
Code Example #2
def CAR_get_comment_labels(ctx):
    logging.getLogger("CAR").info("get comment labels")
    outCommentLabels = []
    for rawCsvCommentDict in ctx.mRawCsvComments:
        outCommentLabels.append(
            MinerMiscUtils.getCommentLabel(rawCsvCommentDict))
    return outCommentLabels
Code Example #3
def NaiveBayesGetClassifierInputs(ctx, featuresMaps, outClassifierInputs,
                                  bTrain):
    logging.getLogger("NaiveBayes").info("get classifier inputs")
    outClassifierInputs[:] = []
    for itrComment, rawCsvCommentDict in enumerate(ctx.mRawCsvComments):
        strLabel = "?"
        if (bTrain):
            strLabel = str(MinerMiscUtils.getCommentLabel(rawCsvCommentDict))
        outClassifierInputs.append((featuresMaps[itrComment], strLabel))
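
If these (featureset, label) pairs are destined for nltk's classifier (an assumption; the training call is not shown in this section), usage would look like:

trainInputs = []
NaiveBayesGetClassifierInputs(ctx, featuresMaps, trainInputs, bTrain=True)
# nltk.NaiveBayesClassifier.train consumes a list of (featureset, label) pairs
classifier = nltk.NaiveBayesClassifier.train(trainInputs)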
Code Example #4
def loadContext(cacheFileName, strPathToRawCsvComments, strPathToRawCsvReviews,
                filterWordReviewOverlap):
    ctx = None
    if cacheFileName is None or not MinerMiscUtils.fileExists(cacheFileName):
        ctx = Context(strPathToRawCsvComments, strPathToRawCsvReviews,
                      filterWordReviewOverlap)
        if cacheFileName is not None:
            # Cache the freshly built context for later runs
            pickle.dump(ctx, open(cacheFileName, "wb"))
    else:
        logging.getLogger("Context").info("Found context cache: " + cacheFileName)
        # Binary mode matches the "wb" used when the cache was written
        ctx = pickle.load(open(cacheFileName, "rb"))
    return ctx
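
A hypothetical call (file names below are placeholders, not from the project): the first run builds the Context from the two CSVs and pickles it; subsequent runs load the cache instead.

ctx = loadContext("contextCache.pkl", "comments.csv", "reviews.csv",
                  filterWordReviewOverlap=0.05)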
Code Example #5
def CAR_conditional_apriori(ctx, featuresMaps, cacheFileName, minSup=0.1, minConf=0.5):
    logging.getLogger("CAR").info("conditional apriori")
    # Run apriori only when no cached result exists
    if not MinerMiscUtils.fileExists(cacheFileName):
        CAR_apriori(ctx, featuresMaps, cacheFileName, minSup, minConf)
    else:
        logging.getLogger("CAR").info("Found cache: " + cacheFileName)

    # Assumes CAR_apriori pickled its result to cacheFileName in binary mode
    FHistFlattenedFeaturesMapPair = pickle.load(open(cacheFileName, "rb"))
    return FHistFlattenedFeaturesMapPair
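
A hypothetical call (the unpacking assumes, as the local variable's name FHistFlattenedFeaturesMapPair suggests, that CAR_apriori pickles a (frequent-itemset histogram, flattened features map) pair; the cache file name is a placeholder):

fHist, flattenedFeaturesMap = CAR_conditional_apriori(
    ctx, featuresMaps, "carAprioriCache.pkl", minSup=0.1, minConf=0.5)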
Code Example #6
def addFeaturesPhrases(ctx, outFeaturesMaps):
    logging.getLogger("Features").info("phrases")
    # Use a set for O(1) membership tests against the filtered words/2-grams
    rawFilteredWords = set(word for (word, count) in ctx.mFilteredWords)
    for itrComment, phrases in enumerate(ctx.mCommentPhrases):
        for phrase in phrases:
            prevWord = '$'
            for word, partOfSpeech in phrase:
                if (MinerMiscUtils.isAdj(partOfSpeech)
                        or MinerMiscUtils.isNoun(partOfSpeech)):
                    # Phrase words were stemmed when the context was built,
                    # so the word can be used as-is
                    stemmedWord = word

                    # Check both orderings of the candidate 2-gram
                    phrase1 = prevWord + " " + stemmedWord
                    phrase2 = stemmedWord + " " + prevWord

                    if phrase1 in rawFilteredWords:
                        outFeaturesMaps[itrComment][phrase1] = 1
                    elif phrase2 in rawFilteredWords:
                        outFeaturesMaps[itrComment][phrase2] = 1

                    prevWord = stemmedWord
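
A minimal, self-contained sketch of the order-insensitive 2-gram matching used above (names and data here are illustrative, not from the project):

def matchTwoGram(prevWord, word, knownPhrases):
    # Try both orderings so "batteri life" and "life batteri" resolve to
    # whichever form survived the filtering step
    for candidate in (prevWord + " " + word, word + " " + prevWord):
        if candidate in knownPhrases:
            return candidate
    return None

assert matchTwoGram("batteri", "life", set(["batteri life"])) == "batteri life"
assert matchTwoGram("life", "batteri", set(["batteri life"])) == "batteri life"
assert matchTwoGram("great", "screen", set(["batteri life"])) is None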
Code Example #7
    def __init__(self, mRawCsvComments, mRawCsvReviews, filterWordReviewOverlap):
        # Log parameters
        logging.getLogger("Context").info("Creating new context:")
        logging.getLogger("Context").info("filterWordReviewOverlap: " + str(filterWordReviewOverlap))

        # Load stop words
        self.mStopWords = nltk.corpus.stopwords.words("english")

        self.mRawCsvReviews = mRawCsvReviews

        self.mRawCsvComments = mRawCsvComments

        # Parallel list of lower case comments with punctuation removed
        self.mLowerCasePunctRemovedComments = []

        # Parallel list for storing [ (word, part-of-speech ) ] tuple lists for each comment
        self.mPartOfSpeechTokenizedComments = []

        # Parallel list for storing [ stemmed(word) ] lists for each comment
        self.mStemmedTokenizedComments = []

        # Set for storing unique review identifiers
        self.mReviewIds = set()

        # Maps a stemmed word to a set of reviews it belongs to (for filtering kindle, ipod, etc)
        self.mStemmedWordToReviewsMap = {}

        # Create stemmer for stemming words
        stemmer = PorterStemmer()

        # Dictionary for storing word counts of adjectives and nouns
        self.mAdjAndNounWordCountMap = {}

        # Dictionary for storing custom data specific to a classifier
        self.mCustomData = {}

        self.mPartOfSpeechTokenizedCommentsAndReviewId = []

        self.mAuthorFreqPerReview = {}

        self.mAuthorReviewPerComment = []

        self.mCommentPhrases = []

        productCount = {}
        self.productAvgStars = {}
        for rawReview in self.mRawCsvReviews:
            if rawReview["Product"] not in self.productAvgStars.keys():
                self.productAvgStars[rawReview["Product"]] = float(rawReview["Star Rating"])
                productCount[rawReview["Product"]] = 1
            else:
                self.productAvgStars[rawReview["Product"]] += float(rawReview["Star Rating"])
                productCount[rawReview["Product"]] += 1

        for key in self.productAvgStars.keys():
            self.productAvgStars[key] = float(self.productAvgStars[key]) / float(productCount[key])

        self.mReviewAuthorMap = {}
        self.mReviewStarMap = {}
        for rawReview in self.mRawCsvReviews:
            self.mReviewAuthorMap[rawReview["Review_ID"]] = rawReview["Author"]
            self.mReviewStarMap[rawReview["Review_ID"]] = rawReview["Star Rating"]

        # Convert to lower case, remove punctuation, and assign parts of speech, etc...
        for itrComment, rawCsvCommentDict in enumerate(self.mRawCsvComments):
            logging.getLogger("Context").info(
                "Processing (1-gram) comment " + str(itrComment) + " of " + str(len(self.mRawCsvComments))
            )

            # Extract review identifier
            reviewId = rawCsvCommentDict["Review_ID"]

            # Extract author of comment
            author = rawCsvCommentDict["Author"]

            if reviewId not in self.mAuthorFreqPerReview.keys():
                self.mAuthorFreqPerReview[reviewId] = {}
                self.mAuthorFreqPerReview[reviewId][author] = 1
            elif author not in self.mAuthorFreqPerReview[reviewId].keys():
                self.mAuthorFreqPerReview[reviewId][author] = 1
            else:
                self.mAuthorFreqPerReview[reviewId][author] += 1

            self.mAuthorReviewPerComment.append((reviewId, author))

            # Append any unique review identifiers
            self.mReviewIds.update([reviewId])

            # Convert comment to lower case
            comment = rawCsvCommentDict["Comment"].lower()

            punctTokenizedComment = nltk.WordPunctTokenizer().tokenize(comment)

            phraseSeparators = [".", "?", "!", ";"]

            phrases = []
            phrase = []
            for word in punctTokenizedComment:
                if word in phraseSeparators:
                    phrase = [phraseWord for phraseWord in phrase if (phraseWord not in self.mStopWords)]
                    phrase = nltk.pos_tag(phrase)
                    phrase = [(stemmer.stem(word), part) for (word, part) in phrase]
                    phrases.append(phrase)
                    phrase = []
                else:
                    phrase.append(word)

            if len(phrase) > 0:
                phrase = [phraseWord for phraseWord in phrase if (phraseWord not in self.mStopWords)]
                phrase = nltk.pos_tag(phrase)
                # Stem the trailing phrase as well, matching the branch above
                phrase = [(stemmer.stem(word), part) for (word, part) in phrase]
                phrases.append(phrase)

            self.mCommentPhrases.append(phrases)

            # Replace punctuation with white space
            for punct in string.punctuation:
                comment = comment.replace(punct, " ")

            self.mLowerCasePunctRemovedComments.append(comment)

            # Tokenize into list of words
            tokenizedComment = nltk.word_tokenize(comment)

            # Filter out stop words
            tokenizedComment[:] = [word for word in tokenizedComment if (word not in self.mStopWords)]

            posTagComment = nltk.pos_tag(tokenizedComment)
            # Append a list of (word, part of speech) tuples
            self.mPartOfSpeechTokenizedComments.append(posTagComment)

            self.mPartOfSpeechTokenizedCommentsAndReviewId.append((posTagComment, reviewId))

            # Append a list of stemmed words
            self.mStemmedTokenizedComments.append([])
            self.mStemmedTokenizedComments[-1][:] = [stemmer.stem(word) for word in tokenizedComment]

            # Assert parallel lists are same length
            assert len(self.mPartOfSpeechTokenizedComments[-1]) == len(self.mStemmedTokenizedComments[-1])

            # Determine word counts for nouns and adjectives
            for itr, (word, partOfSpeech) in enumerate(self.mPartOfSpeechTokenizedComments[-1]):
                # Determine if part of speech is noun or adjective
                if MinerMiscUtils.isAdj(partOfSpeech) or MinerMiscUtils.isNoun(partOfSpeech):
                    # Obtain stemmed word
                    stemmedWord = self.mStemmedTokenizedComments[-1][itr]
                    # Increment stemmed word counts
                    if stemmedWord in self.mAdjAndNounWordCountMap:
                        self.mAdjAndNounWordCountMap[stemmedWord] += 1
                        self.mStemmedWordToReviewsMap[stemmedWord].update([reviewId])
                    else:
                        self.mAdjAndNounWordCountMap[stemmedWord] = 1
                        self.mStemmedWordToReviewsMap[stemmedWord] = set([reviewId])
            # end inner for loop : iteration of (word, part of speech) tuples in single comment
        # end outer for loop : iteration over raw csv comment data

        # Assert parallel arrays are same length
        assert len(self.mRawCsvComments) == len(self.mLowerCasePunctRemovedComments)
        assert len(self.mLowerCasePunctRemovedComments) == len(self.mPartOfSpeechTokenizedComments)
        assert len(self.mPartOfSpeechTokenizedComments) == len(self.mStemmedTokenizedComments)

        # Set of words filtered by word counts: extract only words between threshold count ranges
        fGetWordReviewOverlap = lambda stemmedWord: float(len(self.mStemmedWordToReviewsMap[stemmedWord])) / float(
            len(self.mReviewIds)
        )
        self.mFilteredWords = [
            (word, count)
            for (word, count) in self.mAdjAndNounWordCountMap.iteritems()
            if (fGetWordReviewOverlap(word) > filterWordReviewOverlap)
        ]

        # Use the resulting filtered words as possible components of a phrase
        self.mPossiblePhraseWords = [word for (word, count) in self.mFilteredWords]

        # Count of 2-gram occurrences
        self.mTwoGramsCountMap = {}

        # Count of the number of reviews the 2-grams occur in
        self.mTwoGramsToReviewsMap = {}

        # Extract all 2-grams from the comments
        for itrComment, (tokComment, reviewId) in enumerate(self.mPartOfSpeechTokenizedCommentsAndReviewId):
            logging.getLogger("Context").info(
                "Processing (2-grams) comment " + str(itrComment) + " of " + str(len(self.mRawCsvComments))
            )

            # Keeps track of the previous word scanned
            prevWord = "$"
            for itr, (word, partOfSpeech) in enumerate(tokComment):

                # Determine if part of speech is noun or adjective
                if MinerMiscUtils.isAdj(partOfSpeech) or MinerMiscUtils.isNoun(partOfSpeech):

                    # Obtain stemmed word
                    stemmedWord = stemmer.stem(word)

                    # Increment stemmed 2-gram counts
                    if stemmedWord in self.mPossiblePhraseWords or prevWord in self.mPossiblePhraseWords:
                        phrase1 = prevWord + " " + stemmedWord
                        phrase2 = stemmedWord + " " + prevWord
                        defaultPhrase = phrase1

                        if phrase2 in self.mTwoGramsCountMap:
                            defaultPhrase = phrase2
                            self.mTwoGramsCountMap[defaultPhrase] += 1
                            self.mTwoGramsToReviewsMap[defaultPhrase].update([reviewId])
                        elif phrase1 in self.mTwoGramsCountMap:
                            self.mTwoGramsCountMap[defaultPhrase] += 1
                            self.mTwoGramsToReviewsMap[defaultPhrase].update([reviewId])
                        else:
                            self.mTwoGramsCountMap[defaultPhrase] = 1
                            # set([reviewId]) keeps the review id intact;
                            # set(reviewId) would split the string into characters
                            self.mTwoGramsToReviewsMap[defaultPhrase] = set([reviewId])

                    prevWord = stemmedWord

        # Extract all 2-grams that occur frequently enough across reviews to care about and add them to the set of "filtered words"
        # TODO: There should really be a separate collection for 2-grams
        for twoGram in self.mTwoGramsCountMap.keys():
            if (float(len(self.mTwoGramsToReviewsMap[twoGram]))) / float(len(self.mReviewIds)) > (
                filterWordReviewOverlap * filterWordReviewOverlap
            ):
                self.mFilteredWords.append((twoGram, self.mTwoGramsCountMap[twoGram]))

        self.printFilteredWords()
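
A worked illustration of the review-overlap filter near the end of the constructor (numbers and words are hypothetical): a word's overlap is the fraction of distinct reviews it appears in, and only words whose overlap exceeds filterWordReviewOverlap are kept, which drops product-specific terms like "kindle" that cluster in a few reviews.

stemmedWordToReviews = {"great": set(range(12)), "kindle": set([3, 7])}
reviewIds = set(range(100))
overlap = lambda w: float(len(stemmedWordToReviews[w])) / len(reviewIds)
assert overlap("great") == 0.12   # kept when filterWordReviewOverlap = 0.05
assert overlap("kindle") == 0.02  # dropped at the same threshold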
Code Example #8
def addFeaturesDist(ctx, outFeaturesMaps):
    logging.getLogger("Features").info("Distance")

    # Centroids for each class label
    centroids = [{}, {}]
    centroidsCacheFileName = "centroidsCache.txt"
    if MinerMiscUtils.fileExists(centroidsCacheFileName):
        # Load from cache (binary mode matches the "wb" used when dumping)
        centroids = pickle.load(open(centroidsCacheFileName, "rb"))
    else:
        # Sum up all feature vectors, tracking how many comments carry each label
        labelCounts = [0, 0]
        for itrComment, rawCsvCommentDict in enumerate(ctx.mRawCsvComments):
            label = MinerMiscUtils.getCommentLabel(rawCsvCommentDict)
            labelCounts[label] += 1
            for key, value in outFeaturesMaps[itrComment].iteritems():
                if type(value) is str:
                    logging.getLogger("Features").warning(
                        "Non-numeric feature value: " + key + " = " + value)
                if key in centroids[label]:
                    centroids[label][key] += value
                else:
                    centroids[label][key] = value

                # Make sure the other centroid also carries this key
                for altLabel in range(len(centroids)):
                    if altLabel != label:
                        if key not in centroids[altLabel]:
                            centroids[altLabel][key] = 0.0

        # Average each centroid over the number of comments with its label
        for label, centroid in enumerate(centroids):
            for key, value in centroid.iteritems():
                centroid[key] = value / float(max(1, labelCounts[label]))

        # Cache the centroids to disk
        pickle.dump(centroids, open(centroidsCacheFileName, "wb"))

    # Determine distance from both centroids
    distances = [[], []]
    averageDistance = [0.0, 0.0]
    for featuresMap in outFeaturesMaps:
        for label, centroid in enumerate(centroids):
            totalSqDist = 0.0
            for centroidKey, centroidValue in centroid.iteritems():
                commentValue = 0.0
                if (centroidKey in featuresMap):
                    commentValue = featuresMap[centroidKey]
                sqDist = commentValue - centroidValue
                sqDist *= sqDist
                totalSqDist += sqDist
            totalDist = math.sqrt(totalSqDist)
            distances[label].append(totalDist)
            averageDistance[label] += totalDist

    for label in range(len(averageDistance)):
        averageDistance[label] /= len(outFeaturesMaps)

    # Determine standard deviation
    averageStdDev = [0, 0]
    for label, labelDistances in enumerate(distances):
        for distance in labelDistances:
            sqDistFromMean = distance - averageDistance[label]
            sqDistFromMean *= sqDistFromMean
            averageStdDev[label] += sqDistFromMean

    for label in range(len(averageStdDev)):
        averageStdDev[label] /= len(outFeaturesMaps)
        averageStdDev[label] = math.sqrt(averageStdDev[label])

    # Flag each comment with whether its distance from each class centroid
    # exceeds that label's standard deviation
    for itrComment, featuresMap in enumerate(outFeaturesMaps):
        for label, stdDev in enumerate(averageStdDev):
            featuresMap["Dist--" +
                        str(label)] = distances[label][itrComment] > stdDev
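
A hypothetical outcome for a single comment, showing what the derived features encode (True means the comment lies farther than one standard deviation from that class centroid):

distances = [[2.7], [3.1]]   # distance to centroid 0 and centroid 1
averageStdDev = [2.0, 3.5]
featuresMap = {}
for label, stdDev in enumerate(averageStdDev):
    featuresMap["Dist--" + str(label)] = distances[label][0] > stdDev
assert featuresMap == {"Dist--0": True, "Dist--1": False}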