# Module dependencies (assumed to be available in this package)
import logging
import math
import pickle
import string

import nltk
from nltk.stem.porter import PorterStemmer

import MinerMiscUtils


def SvmGetClassifierInputs(ctx, featuresMaps, outClassifierInputs):
    logging.getLogger("Svm").info("get classifier inputs")
    outClassifierInputs[:] = []

    # Collect the union of feature keys across all comments; sort them so
    # that feature indices are assigned consistently
    featuresKeys = set()
    for featureVector in featuresMaps:
        featuresKeys.update(featureVector.keys())
    featuresKeys = sorted(featuresKeys)

    for itrComment, rawCsvCommentDict in enumerate(ctx.mRawCsvComments):
        # @TODO: Classify "Thumbs Down!"
        # Map the {0, 1} comment label onto {-1, +1}
        svmType = -1 + 2 * MinerMiscUtils.getCommentLabel(rawCsvCommentDict)
        inputsCollector = [SvmUtilGetStrSign(svmType) + str(svmType)]
        for itrFeature, featureKey in enumerate(featuresKeys):
            if featureKey in featuresMaps[itrComment]:
                # Map present boolean features onto {-1, +1}
                featureValue = -1 + 2 * int(featuresMaps[itrComment][featureKey])
            else:
                # Absent features stay 0
                featureValue = 0
            inputsCollector.append(" " + str(itrFeature + 1) + ":" + str(featureValue))
        outClassifierInputs.append("".join(inputsCollector))

    assert len(outClassifierInputs) == len(ctx.mRawCsvComments)
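# Illustrative sketch of the lines produced above (hypothetical feature
# values): each entry follows the svm-light/libsvm convention
# "<sign><label> <index>:<value> ...". A positive comment whose first feature
# is true, second feature is false, and third feature is absent would yield:
#
#   "+1 1:1 2:-1 3:0"
#
# assuming SvmUtilGetStrSign returns "+" for non-negative values.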
def CAR_get_comment_labels(ctx):
    logging.getLogger("CAR").info("get comment labels")
    outCommentLabels = []
    for rawCsvCommentDict in ctx.mRawCsvComments:
        outCommentLabels.append(MinerMiscUtils.getCommentLabel(rawCsvCommentDict))
    return outCommentLabels
def NaiveBayesGetClassifierInputs(ctx, featuresMaps, outClassifierInputs, bTrain):
    logging.getLogger("NaiveBayes").info("get classifier inputs")
    outClassifierInputs[:] = []
    for itrComment, rawCsvCommentDict in enumerate(ctx.mRawCsvComments):
        # Use the real label when building training inputs; "?" marks
        # unlabeled test inputs
        strLabel = "?"
        if bTrain:
            strLabel = str(MinerMiscUtils.getCommentLabel(rawCsvCommentDict))
        outClassifierInputs.append((featuresMaps[itrComment], strLabel))
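# Minimal usage sketch: the (featuresMap, label) pairs built with bTrain=True
# match the labeled-featureset format that nltk.NaiveBayesClassifier.train
# expects ("trainInputs" is a hypothetical name):
#
#   trainInputs = []
#   NaiveBayesGetClassifierInputs(ctx, featuresMaps, trainInputs, True)
#   classifier = nltk.NaiveBayesClassifier.train(trainInputs)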
def loadContext(cacheFileName, strPathToRawCsvComments, strPathToRawCsvReviews,
                filterWordReviewOverlap):
    if cacheFileName is None or not MinerMiscUtils.fileExists(cacheFileName):
        # Build a fresh context and cache it if a cache file name was given
        ctx = Context(strPathToRawCsvComments, strPathToRawCsvReviews,
                      filterWordReviewOverlap)
        if cacheFileName is not None:
            pickle.dump(ctx, open(cacheFileName, "wb"))
    else:
        logging.getLogger("Context").info("Found context cache: " + cacheFileName)
        ctx = pickle.load(open(cacheFileName, "rb"))
    return ctx
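# Minimal usage sketch (file names are hypothetical): the first call builds
# the Context from the raw CSV data and pickles it; subsequent calls with the
# same cache name reload it from disk instead.
#
#   ctx = loadContext("contextCache.pkl", "comments.csv", "reviews.csv", 0.1)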
def CAR_conditional_apriori(ctx, featuresMaps, cacheFileName, minSup=0.1, minConf=0.5):
    logging.getLogger("CAR").info("conditional apriori")
    # Run apriori only when no cached result exists; CAR_apriori is expected
    # to write its result to cacheFileName
    if not MinerMiscUtils.fileExists(cacheFileName):
        CAR_apriori(ctx, featuresMaps, cacheFileName, minSup, minConf)
    else:
        logging.getLogger("CAR").info("Found cache: " + cacheFileName)
    FHistFlattenedFeaturesMapPair = pickle.load(open(cacheFileName, "rb"))
    return FHistFlattenedFeaturesMapPair
def addFeaturesPhrases(ctx, outFeaturesMaps):
    logging.getLogger("Features").info("phrases")
    rawFilteredWords = [word for (word, count) in ctx.mFilteredWords]
    for itrComment, phrases in enumerate(ctx.mCommentPhrases):
        for phrase in phrases:
            # '$' marks the start of a phrase
            prevWord = '$'
            for itrWord, (word, partOfSpeech) in enumerate(phrase):
                if (MinerMiscUtils.isAdj(partOfSpeech)
                        or MinerMiscUtils.isNoun(partOfSpeech)):
                    # Phrase words were already stemmed when the Context was built
                    stemmedWord = word
                    # Check both orderings of the 2-gram against the filtered words
                    phrase1 = prevWord + " " + stemmedWord
                    phrase2 = stemmedWord + " " + prevWord
                    if phrase1 in rawFilteredWords:
                        outFeaturesMaps[itrComment][phrase1] = 1
                    elif phrase2 in rawFilteredWords:
                        outFeaturesMaps[itrComment][phrase2] = 1
                    prevWord = stemmedWord
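# Worked example (hypothetical data): for the stemmed, tagged phrase
# [("great", "JJ"), ("batteri", "NN")], the candidate 2-grams are
# "$ great" / "great $" for the first word and "great batteri" /
# "batteri great" for the second; whichever ordering appears in
# ctx.mFilteredWords is set to 1 in that comment's features map.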
def __init__(self, mRawCsvComments, mRawCsvReviews, filterWordReviewOverlap):
    # Log parameters
    logging.getLogger("Context").info("Creating new context:")
    logging.getLogger("Context").info(
        "filterWordReviewOverlap: " + str(filterWordReviewOverlap))

    # Load stop words
    self.mStopWords = nltk.corpus.stopwords.words("english")

    self.mRawCsvReviews = mRawCsvReviews
    self.mRawCsvComments = mRawCsvComments

    # Parallel list of lower-case comments with punctuation removed
    self.mLowerCasePunctRemovedComments = []
    # Parallel list storing a [(word, part-of-speech)] tuple list per comment
    self.mPartOfSpeechTokenizedComments = []
    # Parallel list storing a [stemmed(word)] list per comment
    self.mStemmedTokenizedComments = []
    # Set of unique review identifiers
    self.mReviewIds = set()
    # Maps a stemmed word to the set of reviews it belongs to
    # (for filtering kindle, ipod, etc.)
    self.mStemmedWordToReviewsMap = {}
    # Stemmer for stemming words
    stemmer = PorterStemmer()
    # Word counts of adjectives and nouns
    self.mAdjAndNounWordCountMap = {}
    # Custom data specific to a classifier
    self.mCustomData = {}
    self.mPartOfSpeechTokenizedCommentsAndReviewId = []
    self.mAuthorFreqPerReview = {}
    self.mAuthorReviewPerComment = []
    self.mCommentPhrases = []

    # Compute the average star rating per product
    productCount = {}
    self.productAvgStars = {}
    for rawReview in self.mRawCsvReviews:
        product = rawReview["Product"]
        if product not in self.productAvgStars:
            self.productAvgStars[product] = float(rawReview["Star Rating"])
            productCount[product] = 1
        else:
            self.productAvgStars[product] += float(rawReview["Star Rating"])
            productCount[product] += 1
    for key in self.productAvgStars:
        self.productAvgStars[key] = (float(self.productAvgStars[key]) /
                                     float(productCount[key]))

    # Map review identifiers to author and star rating
    self.mReviewAuthorMap = {}
    self.mReviewStarMap = {}
    for rawReview in self.mRawCsvReviews:
        self.mReviewAuthorMap[rawReview["Review_ID"]] = rawReview["Author"]
        self.mReviewStarMap[rawReview["Review_ID"]] = rawReview["Star Rating"]

    # Convert to lower case, remove punctuation, and assign parts of speech, etc.
    for itrComment, rawCsvCommentDict in enumerate(self.mRawCsvComments):
        logging.getLogger("Context").info(
            "Processing (1-gram) comment " + str(itrComment) + " of " +
            str(len(self.mRawCsvComments)))

        # Extract the review identifier and the comment author
        reviewId = rawCsvCommentDict["Review_ID"]
        author = rawCsvCommentDict["Author"]

        # Track how often each author comments on each review
        if reviewId not in self.mAuthorFreqPerReview:
            self.mAuthorFreqPerReview[reviewId] = {author: 1}
        elif author not in self.mAuthorFreqPerReview[reviewId]:
            self.mAuthorFreqPerReview[reviewId][author] = 1
        else:
            self.mAuthorFreqPerReview[reviewId][author] += 1
        self.mAuthorReviewPerComment.append((reviewId, author))

        # Record unique review identifiers
        self.mReviewIds.update([reviewId])

        # Convert the comment to lower case
        comment = rawCsvCommentDict["Comment"].lower()

        # Split the comment into phrases at sentence-ending punctuation;
        # stop-word filter, POS tag, and stem each phrase
        punctTokenizedComment = nltk.WordPunctTokenizer().tokenize(comment)
        phraseSeparators = [".", "?", "!", ";"]
        phrases = []
        phrase = []
        for word in punctTokenizedComment:
            if word in phraseSeparators:
                phrase = [phraseWord for phraseWord in phrase
                          if phraseWord not in self.mStopWords]
                phrase = nltk.pos_tag(phrase)
                phrase = [(stemmer.stem(w), part) for (w, part) in phrase]
                phrases.append(phrase)
                phrase = []
            else:
                phrase.append(word)
        if len(phrase) > 0:
            # Handle a trailing phrase with no closing separator; stem it
            # just like the phrases above
            phrase = [phraseWord for phraseWord in phrase
                      if phraseWord not in self.mStopWords]
            phrase = nltk.pos_tag(phrase)
            phrase = [(stemmer.stem(w), part) for (w, part) in phrase]
            phrases.append(phrase)
        self.mCommentPhrases.append(phrases)

        # Replace punctuation with white space
        for punct in string.punctuation:
            comment = comment.replace(punct, " ")
        self.mLowerCasePunctRemovedComments.append(comment)

        # Tokenize into a list of words and filter out stop words
        tokenizedComment = nltk.word_tokenize(comment)
        tokenizedComment[:] = [word for word in tokenizedComment
                               if word not in self.mStopWords]

        # Append a list of (word, part-of-speech) tuples
        posTagComment = nltk.pos_tag(tokenizedComment)
        self.mPartOfSpeechTokenizedComments.append(posTagComment)
        self.mPartOfSpeechTokenizedCommentsAndReviewId.append(
            (posTagComment, reviewId))

        # Append a parallel list of stemmed words
        self.mStemmedTokenizedComments.append(
            [stemmer.stem(word) for word in tokenizedComment])

        # Assert parallel lists are the same length
        assert len(self.mPartOfSpeechTokenizedComments[-1]) == \
            len(self.mStemmedTokenizedComments[-1])

        # Determine word counts for nouns and adjectives
        for itr, (word, partOfSpeech) in enumerate(
                self.mPartOfSpeechTokenizedComments[-1]):
            if (MinerMiscUtils.isAdj(partOfSpeech)
                    or MinerMiscUtils.isNoun(partOfSpeech)):
                # Count the stemmed word and record the review it occurs in
                stemmedWord = self.mStemmedTokenizedComments[-1][itr]
                if stemmedWord in self.mAdjAndNounWordCountMap:
                    self.mAdjAndNounWordCountMap[stemmedWord] += 1
                    self.mStemmedWordToReviewsMap[stemmedWord].update([reviewId])
                else:
                    self.mAdjAndNounWordCountMap[stemmedWord] = 1
                    self.mStemmedWordToReviewsMap[stemmedWord] = set([reviewId])
        # end inner for loop : iteration of (word, part of speech) tuples in single comment
    # end outer for loop : iteration over raw csv comment data

    # Assert parallel arrays are the same length
    assert len(self.mRawCsvComments) == len(self.mLowerCasePunctRemovedComments)
    assert (len(self.mLowerCasePunctRemovedComments) ==
            len(self.mPartOfSpeechTokenizedComments))
    assert (len(self.mPartOfSpeechTokenizedComments) ==
            len(self.mStemmedTokenizedComments))
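    # Worked example of the phrase splitting above (hypothetical comment):
    # "great product. works fine!" is punct-tokenized into
    # ["great", "product", ".", "works", "fine", "!"]; each separator closes
    # a phrase, and after stop-word filtering, POS tagging, and stemming the
    # result is roughly [[("great", "JJ"), ("product", "NN")],
    # [("work", "VBZ"), ("fine", "JJ")]].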
    # Filter words by review overlap: keep only words that occur in a large
    # enough fraction of distinct reviews
    fGetWordReviewOverlap = lambda stemmedWord: (
        float(len(self.mStemmedWordToReviewsMap[stemmedWord])) /
        float(len(self.mReviewIds)))
    self.mFilteredWords = [
        (word, count)
        for (word, count) in self.mAdjAndNounWordCountMap.iteritems()
        if fGetWordReviewOverlap(word) > filterWordReviewOverlap
    ]

    # Use the resulting filtered words as possible components of a phrase
    self.mPossiblePhraseWords = [word for (word, count) in self.mFilteredWords]

    # Count of 2-gram occurrences
    self.mTwoGramsCountMap = {}
    # The set of reviews each 2-gram occurs in
    self.mTwoGramsToReviewsMap = {}

    # Extract all 2-grams from the comments
    for itrComment, (tokComment, reviewId) in enumerate(
            self.mPartOfSpeechTokenizedCommentsAndReviewId):
        logging.getLogger("Context").info(
            "Processing (2-grams) comment " + str(itrComment) + " of " +
            str(len(self.mRawCsvComments)))

        # Keeps track of the previous word scanned; '$' marks the start
        prevWord = "$"
        for itr, (word, partOfSpeech) in enumerate(tokComment):
            if (MinerMiscUtils.isAdj(partOfSpeech)
                    or MinerMiscUtils.isNoun(partOfSpeech)):
                stemmedWord = stemmer.stem(word)
                # Count the 2-gram if either component is a filtered word
                if (stemmedWord in self.mPossiblePhraseWords
                        or prevWord in self.mPossiblePhraseWords):
                    phrase1 = prevWord + " " + stemmedWord
                    phrase2 = stemmedWord + " " + prevWord
                    # Treat both orderings as the same 2-gram
                    defaultPhrase = (phrase2 if phrase2 in self.mTwoGramsCountMap
                                     else phrase1)
                    if defaultPhrase in self.mTwoGramsCountMap:
                        self.mTwoGramsCountMap[defaultPhrase] += 1
                        self.mTwoGramsToReviewsMap[defaultPhrase].update([reviewId])
                    else:
                        self.mTwoGramsCountMap[defaultPhrase] = 1
                        self.mTwoGramsToReviewsMap[defaultPhrase] = set([reviewId])
                prevWord = stemmedWord

    # Add 2-grams that occur in enough reviews to the set of "filtered words";
    # the threshold is squared since 2-grams are much rarer than 1-grams
    # TODO: There should really be a separate collection for 2-grams
    for twoGram in self.mTwoGramsCountMap.keys():
        if (float(len(self.mTwoGramsToReviewsMap[twoGram])) /
                float(len(self.mReviewIds)) >
                filterWordReviewOverlap * filterWordReviewOverlap):
            self.mFilteredWords.append((twoGram, self.mTwoGramsCountMap[twoGram]))

    self.printFilteredWords()
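    # Worked example of the overlap thresholds (hypothetical numbers): with
    # 50 distinct reviews and filterWordReviewOverlap = 0.1, a 1-gram must
    # occur in more than 5 reviews to be kept, while a 2-gram is kept with
    # the squared threshold 0.01, i.e. more than 0.5 reviews, so a single
    # review suffices.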
def addFeaturesDist(ctx, outFeaturesMaps):
    logging.getLogger("Features").info("Distance")

    # Centroids for each class label
    centroids = [{}, {}]
    centroidsCacheFileName = "centroidsCache.txt"
    if MinerMiscUtils.fileExists(centroidsCacheFileName):
        # Load from cache!
        centroids = pickle.load(open(centroidsCacheFileName, "rb"))
    else:
        # Sum up all features vectors
        for itrComment, rawCsvCommentDict in enumerate(ctx.mRawCsvComments):
            label = MinerMiscUtils.getCommentLabel(rawCsvCommentDict)
            for key, value in outFeaturesMaps[itrComment].iteritems():
                if type(value) is str:
                    # Feature values are expected to be numeric
                    print "BREAK = " + key + " = " + value + "\n"
                if key in centroids[label]:
                    centroids[label][key] += value
                else:
                    centroids[label][key] = value
                # Make sure the other centroid also has an entry for this key
                for altLabel in range(len(centroids)):
                    if altLabel != label and key not in centroids[altLabel]:
                        centroids[altLabel][key] = 0.0

        # Average the centroids (float() avoids integer truncation)
        for centroid in centroids:
            for key, value in centroid.iteritems():
                centroid[key] = float(value) / len(outFeaturesMaps)

        # Cache the centroids to disk
        pickle.dump(centroids, open(centroidsCacheFileName, "wb"))

    # Determine each comment's Euclidean distance from both centroids
    distances = [[], []]
    averageDistance = [0.0, 0.0]
    for featuresMap in outFeaturesMaps:
        for label, centroid in enumerate(centroids):
            totalSqDist = 0.0
            for centroidKey, centroidValue in centroid.iteritems():
                commentValue = featuresMap.get(centroidKey, 0.0)
                sqDist = commentValue - centroidValue
                sqDist *= sqDist
                totalSqDist += sqDist
            totalDist = math.sqrt(totalSqDist)
            distances[label].append(totalDist)
            averageDistance[label] += totalDist
    for label in range(len(averageDistance)):
        averageDistance[label] /= len(outFeaturesMaps)

    # Determine the standard deviation of the distances for each label
    averageStdDev = [0.0, 0.0]
    for label, labelDistances in enumerate(distances):
        for distance in labelDistances:
            sqDistFromMean = distance - averageDistance[label]
            sqDistFromMean *= sqDistFromMean
            averageStdDev[label] += sqDistFromMean
    for label in range(len(averageStdDev)):
        averageStdDev[label] /= len(outFeaturesMaps)
        averageStdDev[label] = math.sqrt(averageStdDev[label])

    # Flag each feature vector as nearer or farther than the std dev
    for itrComment, featuresMap in enumerate(outFeaturesMaps):
        for label, stdDev in enumerate(averageStdDev):
            featuresMap["Dist--" + str(label)] = \
                distances[label][itrComment] > stdDev
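# Note on the feature produced above: for a comment's features map c and a
# label centroid m, totalDist is the Euclidean distance
# sqrt(sum_k (c[k] - m[k])^2) over the centroid's keys, and "Dist--<label>"
# records whether that distance exceeds the standard deviation of distances
# observed for that label, yielding a boolean feature.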