def loadContext(cacheFileName, strPathToRawCsvComments, strPathToRawCsvReviews, filterWordReviewOverlap):
    """Load a Context from a pickle cache, or build and cache a new one.

    cacheFileName -- path to the pickle cache file, or None to disable caching
    strPathToRawCsvComments -- path to the raw comments CSV, forwarded to Context
    strPathToRawCsvReviews -- path to the raw reviews CSV, forwarded to Context
    filterWordReviewOverlap -- forwarded unchanged to the Context constructor

    Returns the Context instance (freshly built or unpickled from cache).
    """
    if cacheFileName is None or not MinerMiscUtils.fileExists(cacheFileName):
        # No usable cache: build the context from the raw CSV inputs.
        ctx = Context(strPathToRawCsvComments, strPathToRawCsvReviews, filterWordReviewOverlap)
        if cacheFileName is not None:
            # Persist for future runs; "with" guarantees the handle is closed.
            with open(cacheFileName, "wb") as cacheFile:
                pickle.dump(ctx, cacheFile)
    else:
        logging.getLogger("Context").info("Found context cache: " + cacheFileName)
        # Read in binary mode to match the "wb" used when the cache was written.
        with open(cacheFileName, "rb") as cacheFile:
            ctx = pickle.load(cacheFile)
    return ctx
def CAR_conditional_apriori(ctx, featuresMaps, cacheFileName, minSup=0.1, minConf=0.5):
    """Run CAR_apriori only when no cache exists, then load the cached result.

    ctx -- mining context, forwarded to CAR_apriori
    featuresMaps -- per-comment feature dicts, forwarded to CAR_apriori
    cacheFileName -- pickle file CAR_apriori writes and this function reads
    minSup -- minimum support threshold (default 0.1)
    minConf -- minimum confidence threshold (default 0.5)

    Returns the unpickled (FHist, flattened-features-map) pair from the cache.
    """
    logging.getLogger("CAR").info("conditional apriori")
    # Build the cache only when it is missing.
    if not MinerMiscUtils.fileExists(cacheFileName):
        CAR_apriori(ctx, featuresMaps, cacheFileName, minSup, minConf)
    else:
        logging.getLogger("CAR").info("Found cache: " + cacheFileName)

    # Binary mode for pickle data; "with" closes the handle deterministically.
    with open(cacheFileName, "rb") as cacheFile:
        return pickle.load(cacheFile)
# Beispiel #3 (score: 0)
def CAR_conditional_apriori(ctx,
                            featuresMaps,
                            cacheFileName,
                            minSup=0.1,
                            minConf=0.5):
    """Run CAR_apriori only when no cache exists, then load the cached result.

    ctx -- mining context, forwarded to CAR_apriori
    featuresMaps -- per-comment feature dicts, forwarded to CAR_apriori
    cacheFileName -- pickle file CAR_apriori writes and this function reads
    minSup -- minimum support threshold (default 0.1)
    minConf -- minimum confidence threshold (default 0.5)

    Returns the unpickled (FHist, flattened-features-map) pair from the cache.
    """
    logging.getLogger("CAR").info("conditional apriori")
    # Build the cache only when it is missing.
    if not MinerMiscUtils.fileExists(cacheFileName):
        CAR_apriori(ctx, featuresMaps, cacheFileName, minSup, minConf)
    else:
        logging.getLogger("CAR").info("Found cache: " + cacheFileName)

    # Binary mode for pickle data; "with" closes the handle deterministically.
    with open(cacheFileName, "rb") as cacheFile:
        return pickle.load(cacheFile)
# Beispiel #4 (score: 0)
def loadContext(cacheFileName, strPathToRawCsvComments, strPathToRawCsvReviews,
                filterWordReviewOverlap):
    """Load a Context from a pickle cache, or build and cache a new one.

    cacheFileName -- path to the pickle cache file, or None to disable caching
    strPathToRawCsvComments -- path to the raw comments CSV, forwarded to Context
    strPathToRawCsvReviews -- path to the raw reviews CSV, forwarded to Context
    filterWordReviewOverlap -- forwarded unchanged to the Context constructor

    Returns the Context instance (freshly built or unpickled from cache).
    """
    if cacheFileName is None or not MinerMiscUtils.fileExists(cacheFileName):
        # No usable cache: build the context from the raw CSV inputs.
        ctx = Context(strPathToRawCsvComments, strPathToRawCsvReviews,
                      filterWordReviewOverlap)
        if cacheFileName is not None:
            # Persist for future runs; "with" guarantees the handle is closed.
            with open(cacheFileName, "wb") as cacheFile:
                pickle.dump(ctx, cacheFile)
    else:
        logging.getLogger("Context").info("Found context cache: " +
                                          cacheFileName)
        # Read in binary mode to match the "wb" used when the cache was written.
        with open(cacheFileName, "rb") as cacheFile:
            ctx = pickle.load(cacheFile)
    return ctx
def addFeaturesDist( ctx, outFeaturesMaps ):
    logging.getLogger("Features").info( "Distance" )

    # Centroids for each class label
    centroids = [{}, {}]    
    centroidsCacheFileName = "centroidsCache.txt"
    if ( MinerMiscUtils.fileExists(centroidsCacheFileName)):
        # Load from cache!
        centroids = pickle.load( open( centroidsCacheFileName ) )
    else:
        # Sum up all features vectors
        for itrComment, rawCsvCommentDict in enumerate( ctx.mRawCsvComments ):
            label = MinerMiscUtils.getCommentLabel(rawCsvCommentDict)
            for key, value in outFeaturesMaps[ itrComment ].iteritems():
                if ( type( value ) is str ):
                    print "BREAK = " + key + " = " + value + "\n"
                if ( key in centroids[label]):
                    centroids[label][key] += value
                else:
                    centroids[label][key] = value
                
                for altLabel in range( len(centroids ) ):
                    if ( altLabel != label ):
                        if key not in centroids[ altLabel ]:
                            centroids[ altLabel ][key] = 0.0
        
        # Average the centroids
        for centroid in centroids:
            for key, value in centroid.iteritems():
                value /= len( outFeaturesMaps )
                centroid[ key ] = value
        
        # Cache the centroids to disk
        pickle.dump( centroids, open( centroidsCacheFileName, "wb" ) )
        
    # Determine distance from both centroids
    distances = [ [], [] ]
    averageDistance = [ 0.0, 0.0 ]    
    for featuresMap in outFeaturesMaps:
        for label, centroid in enumerate(centroids):
            totalSqDist = 0.0
            for centroidKey, centroidValue in centroid.iteritems():
                commentValue = 0.0
                if ( centroidKey in featuresMap ):
                    commentValue = featuresMap[ centroidKey ]
                sqDist = commentValue - centroidValue
                sqDist *= sqDist
                totalSqDist += sqDist
            totalDist = math.sqrt( totalSqDist )
            distances[label].append( totalDist )
            averageDistance[label] += totalDist 
    
    for label in range( len( averageDistance ) ):
        averageDistance[ label ] /= len( outFeaturesMaps )
    
    # Determine standard deviation
    averageStdDev = [ 0, 0 ]
    for label, labelDistances in enumerate( distances ):
        for distance in labelDistances:
            sqDistFromMean = distance - averageDistance[ label ]
            sqDistFromMean *= sqDistFromMean
            averageStdDev[ label ] += sqDistFromMean
    
    for label in range( len( averageStdDev ) ):
        averageStdDev[ label ] /= len( outFeaturesMaps )
        averageStdDev[ label ]  = math.sqrt( averageStdDev[ label ] )
        
    # Map all feature vectors as being closer or farther from std dev
    for itrComment, featuresMap in enumerate(outFeaturesMaps):
        for label, stdDev in enumerate( averageStdDev ):
            featuresMap[ "Dist--"+str(label) ] = distances[ label ][ itrComment ] > stdDev
# Beispiel #6 (score: 0)
def addFeaturesDist(ctx, outFeaturesMaps):
    logging.getLogger("Features").info("Distance")

    # Centroids for each class label
    centroids = [{}, {}]
    centroidsCacheFileName = "centroidsCache.txt"
    if (MinerMiscUtils.fileExists(centroidsCacheFileName)):
        # Load from cache!
        centroids = pickle.load(open(centroidsCacheFileName))
    else:
        # Sum up all features vectors
        for itrComment, rawCsvCommentDict in enumerate(ctx.mRawCsvComments):
            label = MinerMiscUtils.getCommentLabel(rawCsvCommentDict)
            for key, value in outFeaturesMaps[itrComment].iteritems():
                if (type(value) is str):
                    print "BREAK = " + key + " = " + value + "\n"
                if (key in centroids[label]):
                    centroids[label][key] += value
                else:
                    centroids[label][key] = value

                for altLabel in range(len(centroids)):
                    if (altLabel != label):
                        if key not in centroids[altLabel]:
                            centroids[altLabel][key] = 0.0

        # Average the centroids
        for centroid in centroids:
            for key, value in centroid.iteritems():
                value /= len(outFeaturesMaps)
                centroid[key] = value

        # Cache the centroids to disk
        pickle.dump(centroids, open(centroidsCacheFileName, "wb"))

    # Determine distance from both centroids
    distances = [[], []]
    averageDistance = [0.0, 0.0]
    for featuresMap in outFeaturesMaps:
        for label, centroid in enumerate(centroids):
            totalSqDist = 0.0
            for centroidKey, centroidValue in centroid.iteritems():
                commentValue = 0.0
                if (centroidKey in featuresMap):
                    commentValue = featuresMap[centroidKey]
                sqDist = commentValue - centroidValue
                sqDist *= sqDist
                totalSqDist += sqDist
            totalDist = math.sqrt(totalSqDist)
            distances[label].append(totalDist)
            averageDistance[label] += totalDist

    for label in range(len(averageDistance)):
        averageDistance[label] /= len(outFeaturesMaps)

    # Determine standard deviation
    averageStdDev = [0, 0]
    for label, labelDistances in enumerate(distances):
        for distance in labelDistances:
            sqDistFromMean = distance - averageDistance[label]
            sqDistFromMean *= sqDistFromMean
            averageStdDev[label] += sqDistFromMean

    for label in range(len(averageStdDev)):
        averageStdDev[label] /= len(outFeaturesMaps)
        averageStdDev[label] = math.sqrt(averageStdDev[label])

    # Map all feature vectors as being closer or farther from std dev
    for itrComment, featuresMap in enumerate(outFeaturesMaps):
        for label, stdDev in enumerate(averageStdDev):
            featuresMap["Dist--" +
                        str(label)] = distances[label][itrComment] > stdDev