def loadContext(cacheFileName, strPathToRawCsvComments, strPathToRawCsvReviews, filterWordReviewOverlap):
    """Load a Context, using a pickle cache when available.

    Builds a fresh Context from the raw CSV paths when no cache file exists
    (or when caching is disabled with cacheFileName=None) and pickles it for
    reuse; otherwise unpickles the cached Context.

    Parameters:
        cacheFileName: path of the pickle cache, or None to disable caching.
        strPathToRawCsvComments: path to the raw comments CSV.
        strPathToRawCsvReviews: path to the raw reviews CSV.
        filterWordReviewOverlap: forwarded to the Context constructor.

    Returns:
        The cached or newly constructed Context instance.
    """
    if cacheFileName is None or not MinerMiscUtils.fileExists(cacheFileName):
        ctx = Context(strPathToRawCsvComments, strPathToRawCsvReviews, filterWordReviewOverlap)
        if cacheFileName is not None:
            # Cache for subsequent runs; 'with' closes the handle deterministically.
            with open(cacheFileName, "wb") as cacheFile:
                pickle.dump(ctx, cacheFile)
    else:
        logging.getLogger("Context").info("Found context cache: " + cacheFileName)
        # BUG FIX: the pickle is written in "wb" mode, so it must be read in
        # "rb" mode -- the original's text-mode open corrupts the stream on
        # Windows and fails outright on Python 3.  Also avoids leaking the handle.
        with open(cacheFileName, "rb") as cacheFile:
            ctx = pickle.load(cacheFile)
    return ctx
def CAR_conditional_apriori(ctx, featuresMaps, cacheFileName, minSup=0.1, minConf=0.5):
    """Return the (FHist, flattened features map) pair, computing it on a cache miss.

    When cacheFileName does not exist, CAR_apriori is run to produce the
    result (it is assumed to write the pickle cache -- TODO confirm); the
    pair is then loaded from the cache file in either case.

    Parameters:
        ctx: mining context.
        featuresMaps: per-comment feature dicts forwarded to CAR_apriori.
        cacheFileName: path of the pickle cache file.
        minSup: minimum support threshold for apriori.
        minConf: minimum confidence threshold for apriori.

    Returns:
        The unpickled FHist / flattened-features-map pair.
    """
    logging.getLogger("CAR").info("conditional apriori")
    # See if cache exists
    if not MinerMiscUtils.fileExists(cacheFileName):
        CAR_apriori(ctx, featuresMaps, cacheFileName, minSup, minConf)
    else:
        logging.getLogger("CAR").info("Found cache: " + cacheFileName)
    # BUG FIX: the original only assigned FHistFlattenedFeaturesMapPair on a
    # cache hit, so a cache miss raised NameError at the return statement.
    # Load unconditionally now that the cache is guaranteed to exist; read in
    # "rb" mode because pickles are binary, and close the handle via 'with'.
    with open(cacheFileName, "rb") as cacheFile:
        FHistFlattenedFeaturesMapPair = pickle.load(cacheFile)
    return FHistFlattenedFeaturesMapPair
def CAR_conditional_apriori(ctx, featuresMaps, cacheFileName, minSup=0.1, minConf=0.5):
    """Return the (FHist, flattened features map) pair, computing it on a cache miss.

    When cacheFileName does not exist, CAR_apriori is run to produce the
    result (it is assumed to write the pickle cache -- TODO confirm); the
    pair is then loaded from the cache file in either case.

    Parameters:
        ctx: mining context.
        featuresMaps: per-comment feature dicts forwarded to CAR_apriori.
        cacheFileName: path of the pickle cache file.
        minSup: minimum support threshold for apriori.
        minConf: minimum confidence threshold for apriori.

    Returns:
        The unpickled FHist / flattened-features-map pair.
    """
    logging.getLogger("CAR").info("conditional apriori")
    # See if cache exists
    if not MinerMiscUtils.fileExists(cacheFileName):
        CAR_apriori(ctx, featuresMaps, cacheFileName, minSup, minConf)
    else:
        logging.getLogger("CAR").info("Found cache: " + cacheFileName)
    # BUG FIX: the original only assigned FHistFlattenedFeaturesMapPair on a
    # cache hit, so a cache miss raised NameError at the return statement.
    # Load unconditionally now that the cache is guaranteed to exist; read in
    # "rb" mode because pickles are binary, and close the handle via 'with'.
    with open(cacheFileName, "rb") as cacheFile:
        FHistFlattenedFeaturesMapPair = pickle.load(cacheFile)
    return FHistFlattenedFeaturesMapPair
def loadContext(cacheFileName, strPathToRawCsvComments, strPathToRawCsvReviews, filterWordReviewOverlap):
    """Load a Context, using a pickle cache when available.

    Builds a fresh Context from the raw CSV paths when no cache file exists
    (or when caching is disabled with cacheFileName=None) and pickles it for
    reuse; otherwise unpickles the cached Context.

    Parameters:
        cacheFileName: path of the pickle cache, or None to disable caching.
        strPathToRawCsvComments: path to the raw comments CSV.
        strPathToRawCsvReviews: path to the raw reviews CSV.
        filterWordReviewOverlap: forwarded to the Context constructor.

    Returns:
        The cached or newly constructed Context instance.
    """
    if cacheFileName is None or not MinerMiscUtils.fileExists(cacheFileName):
        ctx = Context(strPathToRawCsvComments, strPathToRawCsvReviews, filterWordReviewOverlap)
        if cacheFileName is not None:
            # Cache for subsequent runs; 'with' closes the handle deterministically.
            with open(cacheFileName, "wb") as cacheFile:
                pickle.dump(ctx, cacheFile)
    else:
        logging.getLogger("Context").info("Found context cache: " + cacheFileName)
        # BUG FIX: the pickle is written in "wb" mode, so it must be read in
        # "rb" mode -- the original's text-mode open corrupts the stream on
        # Windows and fails outright on Python 3.  Also avoids leaking the handle.
        with open(cacheFileName, "rb") as cacheFile:
            ctx = pickle.load(cacheFile)
    return ctx
def addFeaturesDist( ctx, outFeaturesMaps ):
    """Add boolean "Dist--<label>" features to every comment's feature map.

    Builds (or loads from "centroidsCache.txt") one centroid feature vector
    per class label, computes each comment's Euclidean distance to both
    centroids, and stores under the key "Dist--<label>" whether that
    distance exceeds the label's distance standard deviation.

    Parameters:
        ctx: context object; only ctx.mRawCsvComments is read here.
        outFeaturesMaps: list of per-comment feature dicts, mutated in place.
    """
    logging.getLogger("Features").info( "Distance" )
    # Centroids for each class label
    centroids = [{}, {}]
    centroidsCacheFileName = "centroidsCache.txt"
    if ( MinerMiscUtils.fileExists(centroidsCacheFileName)):
        # Load from cache!
        # NOTE(review): the pickle is written with "wb" below but read here in
        # text mode, and the handle is never closed -- verify on Python 3/Windows.
        centroids = pickle.load( open( centroidsCacheFileName ) )
    else:
        # Sum up all features vectors
        for itrComment, rawCsvCommentDict in enumerate( ctx.mRawCsvComments ):
            label = MinerMiscUtils.getCommentLabel(rawCsvCommentDict)
            for key, value in outFeaturesMaps[ itrComment ].iteritems():
                if ( type( value ) is str ):
                    # Diagnostic print: feature values are expected to be numeric
                    print "BREAK = " + key + " = " + value + "\n"
                if ( key in centroids[label]):
                    centroids[label][key] += value
                else:
                    centroids[label][key] = value
                    # First occurrence of this key: seed it into the other
                    # centroids so all centroid dicts share the same key set
                    for altLabel in range( len(centroids ) ):
                        if ( altLabel != label ):
                            if key not in centroids[ altLabel ]:
                                centroids[ altLabel ][key] = 0.0
        # Average the centroids
        # NOTE(review): divides by the TOTAL comment count, not the per-label
        # count, and integer sums truncate under Python 2 division -- confirm
        # both are intended.
        for centroid in centroids:
            for key, value in centroid.iteritems():
                value /= len( outFeaturesMaps )
                centroid[ key ] = value
        # Cache the centroids to disk
        pickle.dump( centroids, open( centroidsCacheFileName, "wb" ) )
    # Determine distance from both centroids (a feature missing from a
    # comment's map contributes a value of 0.0)
    distances = [ [], [] ]
    averageDistance = [ 0.0, 0.0 ]
    for featuresMap in outFeaturesMaps:
        for label, centroid in enumerate(centroids):
            totalSqDist = 0.0
            for centroidKey, centroidValue in centroid.iteritems():
                commentValue = 0.0
                if ( centroidKey in featuresMap ):
                    commentValue = featuresMap[ centroidKey ]
                sqDist = commentValue - centroidValue
                sqDist *= sqDist
                totalSqDist += sqDist
            totalDist = math.sqrt( totalSqDist )
            distances[label].append( totalDist )
            averageDistance[label] += totalDist
    for label in range( len( averageDistance ) ):
        averageDistance[ label ] /= len( outFeaturesMaps )
    # Determine standard deviation (population) of the per-label distances
    averageStdDev = [ 0, 0 ]
    for label, labelDistances in enumerate( distances ):
        for distance in labelDistances:
            sqDistFromMean = distance - averageDistance[ label ]
            sqDistFromMean *= sqDistFromMean
            averageStdDev[ label ] += sqDistFromMean
    for label in range( len( averageStdDev ) ):
        averageStdDev[ label ] /= len( outFeaturesMaps )
        averageStdDev[ label ] = math.sqrt( averageStdDev[ label ] )
    # Map all feature vectors as being closer or farther from std dev
    for itrComment, featuresMap in enumerate(outFeaturesMaps):
        for label, stdDev in enumerate( averageStdDev ):
            featuresMap[ "Dist--"+str(label) ] = distances[ label ][ itrComment ] > stdDev
def addFeaturesDist(ctx, outFeaturesMaps): logging.getLogger("Features").info("Distance") # Centroids for each class label centroids = [{}, {}] centroidsCacheFileName = "centroidsCache.txt" if (MinerMiscUtils.fileExists(centroidsCacheFileName)): # Load from cache! centroids = pickle.load(open(centroidsCacheFileName)) else: # Sum up all features vectors for itrComment, rawCsvCommentDict in enumerate(ctx.mRawCsvComments): label = MinerMiscUtils.getCommentLabel(rawCsvCommentDict) for key, value in outFeaturesMaps[itrComment].iteritems(): if (type(value) is str): print "BREAK = " + key + " = " + value + "\n" if (key in centroids[label]): centroids[label][key] += value else: centroids[label][key] = value for altLabel in range(len(centroids)): if (altLabel != label): if key not in centroids[altLabel]: centroids[altLabel][key] = 0.0 # Average the centroids for centroid in centroids: for key, value in centroid.iteritems(): value /= len(outFeaturesMaps) centroid[key] = value # Cache the centroids to disk pickle.dump(centroids, open(centroidsCacheFileName, "wb")) # Determine distance from both centroids distances = [[], []] averageDistance = [0.0, 0.0] for featuresMap in outFeaturesMaps: for label, centroid in enumerate(centroids): totalSqDist = 0.0 for centroidKey, centroidValue in centroid.iteritems(): commentValue = 0.0 if (centroidKey in featuresMap): commentValue = featuresMap[centroidKey] sqDist = commentValue - centroidValue sqDist *= sqDist totalSqDist += sqDist totalDist = math.sqrt(totalSqDist) distances[label].append(totalDist) averageDistance[label] += totalDist for label in range(len(averageDistance)): averageDistance[label] /= len(outFeaturesMaps) # Determine standard deviation averageStdDev = [0, 0] for label, labelDistances in enumerate(distances): for distance in labelDistances: sqDistFromMean = distance - averageDistance[label] sqDistFromMean *= sqDistFromMean averageStdDev[label] += sqDistFromMean for label in range(len(averageStdDev)): 
averageStdDev[label] /= len(outFeaturesMaps) averageStdDev[label] = math.sqrt(averageStdDev[label]) # Map all feature vectors as being closer or farther from std dev for itrComment, featuresMap in enumerate(outFeaturesMaps): for label, stdDev in enumerate(averageStdDev): featuresMap["Dist--" + str(label)] = distances[label][itrComment] > stdDev