from pyspark.mllib.stat import KernelDensity


# Classmethod of a KDE plot class; 'PandasMPLPlot' is defined in the surrounding
# plotting module.
def _plot(cls, ax, y, style=None, bw_method=None, ind=None,
          column_num=None, stacking_id=None, **kwds):
    # 'y' is a Spark DataFrame that selects one column.
    # Using RDD is slow so we might have to change it to a Dataset-based
    # implementation once Spark has that implementation.
    sample = y.rdd.map(lambda x: float(x[0]))
    kd = KernelDensity()
    kd.setSample(sample)

    if bw_method is not None:
        # Only validate and override the bandwidth when one is supplied;
        # otherwise Spark's default bandwidth is used.
        assert isinstance(bw_method, (int, float)), \
            "'bw_method' must be set as a scalar number."
        # Match the bandwidth with Spark.
        kd.setBandwidth(float(bw_method))

    y = kd.estimate(list(map(float, ind)))
    lines = PandasMPLPlot._plot(ax, ind, y, style=style, **kwds)
    return lines
def kdensity_job(context, column, bandwidth, points_list):
    """Estimate the kernel density of one CSV column at the given points."""
    from pyspark.mllib.stat import KernelDensity

    sql_ctx = context.sql_ctx
    df = sql_ctx.read.format('com.databricks.spark.csv') \
        .option('header', 'true').option('inferSchema', 'true') \
        .load('/Users/manikandan.nagarajan/Desktop/datasets/demo/forestfires.csv')

    # Extract the requested column as an RDD of sample values.
    rdd_column_data = df.rdd.map(lambda row_data: row_data[column])

    kd = KernelDensity()
    kd.setSample(rdd_column_data)
    kd.setBandwidth(bandwidth)

    # Density estimates at the requested evaluation points.
    densities = kd.estimate(points_list)
    return densities
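# A minimal driver sketch for the job above, assuming a plain SparkSession can
# stand in for the job framework's 'context' object (only '.sql_ctx.read' is
# used), and that the CSV path and spark-csv data source referenced above are
# available.  The SimpleContext wrapper, the 'temp' column name and the
# evaluation points are hypothetical, chosen only for illustration.
from collections import namedtuple

from pyspark.sql import SparkSession

SimpleContext = namedtuple('SimpleContext', ['sql_ctx'])

spark = SparkSession.builder.appName('kdensity-job-demo').getOrCreate()
ctx = SimpleContext(sql_ctx=spark)

densities = kdensity_job(ctx, column='temp', bandwidth=3.0,
                         points_list=[5.0, 15.0, 25.0])
print(densities)
spark.stop()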
from pyspark.mllib.stat import KernelDensity


def compute_kde(sdf, bw_method=None, ind=None):
    # 'sdf' is a Spark DataFrame that selects one column.
    # Using RDD is slow so we might have to change it to a Dataset-based
    # implementation once Spark has that implementation.
    sample = sdf.rdd.map(lambda x: float(x[0]))
    kd = KernelDensity()
    kd.setSample(sample)

    if bw_method is not None:
        # Only validate and override the bandwidth when one is supplied.
        assert isinstance(bw_method, (int, float)), \
            "'bw_method' must be set as a scalar number."
        # Match the bandwidth with Spark.
        kd.setBandwidth(float(bw_method))

    # Density estimates at the requested points 'ind'.
    return kd.estimate(list(map(float, ind)))
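# A minimal usage sketch for compute_kde, assuming an active SparkSession.
# The sample values, column name, bandwidth and evaluation grid below are made
# up purely for illustration.
import numpy as np
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('compute-kde-demo').getOrCreate()

# Single-column DataFrame of sample values.
sdf = spark.createDataFrame([(v,) for v in [1.0, 1.5, 2.0, 2.5, 3.0, 5.0]],
                            ['value'])

# Evaluate the estimated density on an evenly spaced grid over the data range.
grid = np.linspace(0.0, 6.0, 25)
densities = compute_kde(sdf, bw_method=0.5, ind=grid)
print(list(zip(grid, densities)))

spark.stop()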
# coding=utf-8
from pyspark import SparkContext, SparkConf
from pyspark.mllib.stat import KernelDensity

conf = SparkConf().setAppName('Kernel density estimation').setMaster('local[2]')
sc = SparkContext(conf=conf)

# An RDD of sample data.
data = sc.parallelize(
    [1.0, 1.0, 1.0, 2.0, 3.0, 4.0, 5.0, 5.0, 6.0, 7.0, 8.0, 9.0, 9.0])

kd = KernelDensity()
kd.setSample(data)
# setBandwidth() configures the estimator in place rather than returning a new one.
kd.setBandwidth(3.0)

# Density estimates at the given evaluation points.
densities = kd.estimate([-1.0, 2.0, 5.0])
print(densities)

sc.stop()
import logging

import numpy as np
from pyspark.sql.functions import col
from pyspark.statcounter import StatCounter
from pyspark.mllib.stat import KernelDensity

logger = logging.getLogger(__name__)


def get_lr_curves(
    spark, features_df, cluster_ids, kernel_bandwidth, num_pdf_points,
    random_seed=None,
):
    """
    Compute the likelihood ratio curves for clustered clients.

    Work-flow followed in this function is as follows:

    * Access the DataFrame including cluster numbers and features.
    * Load the same similarity function that will be used in the TAAR module.
    * Iterate through each cluster and compute the in-cluster similarity.
    * Iterate through each cluster and compute the out-cluster similarity.
    * Compute the kernel density estimate (KDE) per similarity score.
    * Linearly down-sample both PDFs to |num_pdf_points| points.

    :param spark: the SparkSession object.
    :param features_df: the DataFrame containing the user features (e.g. the
                        ones coming from |get_donors|).
    :param cluster_ids: the list of cluster ids (e.g. the one coming from
                        |get_donors|).
    :param kernel_bandwidth: the kernel bandwidth used to estimate the kernel
                             densities.
    :param num_pdf_points: the number of points to sample for the LR-curves.
    :param random_seed: the provided random seed (fixed in tests).
    :return: a list in the following format
             [(idx, (lr-numerator-for-idx, lr-denominator-for-idx)), (...), ...]
    """
    # Instantiate holder RDDs for the inter- and intra-cluster scores.
    same_cluster_scores_rdd = spark.sparkContext.emptyRDD()
    different_clusters_scores_rdd = spark.sparkContext.emptyRDD()

    random_split_kwargs = {"seed": random_seed} if random_seed is not None else {}

    for cluster_number in cluster_ids:
        # Pick the features for users belonging to the current cluster.
        current_cluster_df = features_df.where(col("prediction") == cluster_number)
        # Pick the features for users belonging to all the other clusters.
        other_clusters_df = features_df.where(col("prediction") != cluster_number)

        logger.debug("Computing scores for cluster",
                     extra={"cluster_id": cluster_number})

        # Compare the similarity score between pairs of clients in the same cluster.
        cluster_half_1, cluster_half_2 = current_cluster_df.rdd.randomSplit(
            [0.5, 0.5], **random_split_kwargs)
        pair_rdd = generate_non_cartesian_pairs(cluster_half_1, cluster_half_2)
        intra_scores_rdd = pair_rdd.map(lambda r: similarity_function(*r))
        same_cluster_scores_rdd = same_cluster_scores_rdd.union(intra_scores_rdd)

        # Compare the similarity score between pairs of clients in different clusters.
        pair_rdd = generate_non_cartesian_pairs(current_cluster_df.rdd,
                                                other_clusters_df.rdd)
        inter_scores_rdd = pair_rdd.map(lambda r: similarity_function(*r))
        different_clusters_scores_rdd = different_clusters_scores_rdd.union(
            inter_scores_rdd)

    # Determine a linearly spaced range over the observed similarity values.
    all_scores_rdd = same_cluster_scores_rdd.union(different_clusters_scores_rdd)
    stats = all_scores_rdd.aggregate(StatCounter(), StatCounter.merge,
                                     StatCounter.mergeStats)
    min_similarity = stats.minValue
    max_similarity = stats.maxValue
    lr_index = np.arange(
        min_similarity,
        max_similarity,
        float(abs(min_similarity - max_similarity)) / num_pdf_points,
    )

    # Kernel density estimate for the inter-cluster comparison scores.
    kd_dc = KernelDensity()
    kd_dc.setSample(different_clusters_scores_rdd)
    kd_dc.setBandwidth(kernel_bandwidth)
    denominator_density = kd_dc.estimate(lr_index)

    # Kernel density estimate for the intra-cluster comparison scores.
    kd_sc = KernelDensity()
    kd_sc.setSample(same_cluster_scores_rdd)
    kd_sc.setBandwidth(kernel_bandwidth)
    numerator_density = kd_sc.estimate(lr_index)

    # Structure this in the correct output format.
    return list(zip(lr_index, list(zip(numerator_density, denominator_density))))
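# 'generate_non_cartesian_pairs' and 'similarity_function' are defined elsewhere
# in the TAAR code base and are not shown in this snippet.  The helper below is
# only a plausible sketch of the pairing step, assuming the intent is to match
# the two RDDs element-by-element through a zipWithIndex/join instead of taking
# the far larger Cartesian product.
def generate_non_cartesian_pairs(first_rdd, second_rdd):
    # Key every element by its position so element i of one RDD can be paired
    # with element i of the other.
    indexed_first = first_rdd.zipWithIndex().map(lambda kv: (kv[1], kv[0]))
    indexed_second = second_rdd.zipWithIndex().map(lambda kv: (kv[1], kv[0]))
    # Joining on the index produces one pair per shared index value, keeping
    # the number of similarity comparisons linear in the RDD size.
    return indexed_first.join(indexed_second).map(lambda kv: kv[1])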
from pyspark.sql import functions as psf
from pyspark.mllib.stat import KernelDensity

# 'hc' (the SQL/Hive context), 'sc', the 'MyTrain' DataFrame and the
# abstol/rnd/minNIR/maxNIR/minCom/maxCom constants are defined earlier in the
# surrounding script.

# Create net of all possible interest rates, rounded to desired abstol
# value 'key0' is a dummy variable to make cartesian product
NIR = hc.range(int(minNIR / abstol), int((maxNIR + abstol) / abstol), 1, 1)
NIR = NIR.withColumn("Interest", psf.round(NIR.id * abstol, rnd)) \
         .withColumn("key", psf.lit("key0")) \
         .select('key', 'Interest')

# Create net of all possible commission percentages, rounded to desired abstol
# value 'key0' is a dummy variable to make cartesian product
Com = hc.range(int(minCom / abstol), int((maxCom + abstol) / abstol), 1, 1)
Com = Com.withColumn("CommissionPct", psf.round(Com.id * abstol, rnd)) \
         .withColumn("key", psf.lit("key0")) \
         .select('key', 'CommissionPct')

########################################################################
### New KDE
kdNIR = KernelDensity()
kdCom = KernelDensity()

# Fit the kernel density estimators on the training data columns.
sampleNIR = MyTrain.select('Interest').rdd.map(lambda x: x[0])
sampleCom = MyTrain.select('CommissionPct').rdd.map(lambda x: x[0])
kdNIR.setSample(sampleNIR)
kdCom.setSample(sampleCom)

# Evaluate the densities on the interest-rate grid and store them as a DataFrame.
NIRs = NIR.select('Interest').rdd.map(lambda x: x[0]).collect()
NIREst = kdNIR.estimate(NIRs).tolist()
kdeNIR = hc.createDataFrame(sc.parallelize(zip(NIRs, NIREst)),
                            ['Interest', 'kdeNIR']).coalesce(1).cache()

# Evaluate the densities on the commission grid and store them as a DataFrame.
Coms = Com.select('CommissionPct').rdd.map(lambda x: x[0]).collect()
ComEst = kdCom.estimate(Coms).tolist()
kdeCom = hc.createDataFrame(sc.parallelize(zip(Coms, ComEst)),
                            ['CommissionPct', 'kdeCom']).coalesce(1).cache()
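# A plausible continuation of the snippet above, assuming the dummy 'key'
# column exists precisely to build the full Interest x CommissionPct grid and
# to attach the estimated densities to every grid point.  The variable names
# below are illustrative, not part of the original script.
full_grid = NIR.join(Com, on='key').select('Interest', 'CommissionPct')
grid_with_kde = (full_grid
                 .join(kdeNIR, on='Interest')
                 .join(kdeCom, on='CommissionPct'))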
def findFeatures(inputFileName, outputFileName):
    inpFile = sc.textFile(inputFileName)
    numRows = inpFile.count()
    print('\nRead ', numRows, ' rows from ', inputFileName, '\n')
    print('Print out a few rows read from file')
    print('\n', inpFile.take(5), '\n')

    # Rectangularize the RDD before vectorizing
    # Filter elements to remove quotes to prevent (quote) embedded commas
    countFields = inpFile.map(lambda s: removeEmbeddedCommas(s)).map(
        lambda s: len(s.split(','))).collect()
    print('number of fields in each row (first few): ', countFields[0:4])

    RectangularizationNeeded = False
    maxCount = 0
    maxCountAt = 0
    for i in range(len(countFields)):
        if (countFields[i] > maxCount):
            maxCount = countFields[i]
            maxCountAt = i
        if (i > 0) and (RectangularizationNeeded == False):
            if (countFields[i] != countFields[i - 1]):
                RectangularizationNeeded = True
    if (RectangularizationNeeded == True):
        print('Identified jagged data set; Rectangularization needed')
    else:
        print('Identified rectangular data set')
    print('Inferring longest row(s) has ', maxCount, ' fields at row ', maxCountAt)

    inpFileRe = inpFile.map(lambda s: removeEmbeddedCommas(s)).map(
        lambda s: s + ',No Data')
    # remove short rows
    shortFile = inpFileRe.filter(lambda row: len(row.split(',')) < maxCount + 1)
    print("Short rows will be filtered out")
    print('\n', shortFile.take(10), '\n')
    # truncate to maxCount+1 columns
    inpFileTr = inpFileRe.filter(lambda row: len(row.split(',')) == maxCount + 1)
    print('\n', inpFileTr.take(5), '\n')

    header = inpFileTr.first()
    hL = header.split(',')
    inpFileNh = inpFileTr.filter(lambda row: row != header)
    print('Removed the First row as Header')
    numRows = inpFileNh.count()
    print('number of rows = ', numRows)

    from pyspark.mllib.linalg import Matrix, Matrices
    from pyspark.mllib.linalg import Vector, Vectors

    # parsedData will be org.apache.spark.rdd.RDD[org.apache.spark.mllib.linalg.Vector]
    parsedData = inpFileNh.map(
        lambda s: Vectors.dense([with0Str(t) for t in s.split(',')]))
    print('\nprint out a few vectors after converting from strings\n')
    print(parsedData.take(5))

    from pyspark.mllib.stat import MultivariateStatisticalSummary, Statistics

    summary = Statistics.colStats(parsedData)
    print('\nprint out summary statistics, for each column\n')
    print('summary.mean')
    print(summary.mean())
    print('summary.variance')
    print(summary.variance())
    print('summary.count')
    print(summary.count())
    print('summary.max')
    print(summary.max())
    print('summary.min')
    print(summary.min())
    print('summary.normL1')
    print(summary.normL1())
    print('summary.normL2')
    print(summary.normL2())
    print('summary.numNonzeros')
    print(summary.numNonzeros())
    print()

    numCols = len(summary.mean())
    typeStrings = [' '] * numCols
    # infer columns where normL1, normL2, mean, variance, max and min are 0 as non-numeric
    print('Inferring column data types:')
    import math
    for j in range(numCols):
        if ((summary.normL1()[j] == 0.0) and (summary.normL2()[j] == 0.0)
                and (summary.mean()[j] == 0.0) and (summary.variance()[j] == 0.0)
                and (summary.max()[j] == 0.0) and (summary.min()[j] == 0.0)):
            typeStrings[j] = 'String'
        else:
            if ((math.trunc(summary.normL1()[j]) == summary.normL1()[j])
                    and (math.trunc(summary.max()[j]) == summary.max()[j])
                    and (math.trunc(summary.min()[j]) == summary.min()[j])):
                typeStrings[j] = 'Int'
            else:
                typeStrings[j] = 'Float'
        print(typeStrings[j], end=',')
    print('\n\n')

    #******************************************************************************
    # take out the 'String' columns before calling Statistics.corr()
    numNumericCols = 0
    for j in range(numCols):
        if (typeStrings[j] != 'String'):
            numNumericCols = numNumericCols + 1

    noStrings = inpFileNh.map(
        lambda s: Vectors.dense(removeStrings(s, numNumericCols)))
    print(noStrings.take(5))

    correlMatrix = Statistics.corr(noStrings, method='pearson')
    print('Computing Correlation Matrix on all columns')
    print('Printing out column names that have correlation coefficient > 0.5 or < -0.5')
    for i in range(numNumericCols):
        for j in range(i):
            if (((correlMatrix[i][j] >= 0.5) or (correlMatrix[i][j] <= -0.5))
                    and (i != j)):
                print(hL[i], hL[j], correlMatrix[i][j])
    #******************************************************************************

    #******************************************************************************
    # create a contingency matrix
    LoLoF = [[0.0 for x in range(numNumericCols)] for y in range(numRows)]
    LoLoF = noStrings.collect()
    pdLinArr = [0.0 for x in range(numNumericCols * numRows)]
    for i in range(numRows):
        for j in range(numNumericCols):
            pdLinArr[i * numNumericCols + j] = abs(LoLoF[i][j])
    mat = Matrices.dense(numRows, numNumericCols, pdLinArr)

    # conduct Pearson's independence test on the input contingency matrix
    print("Computing Pearson's independence test on the input contingency matrix using chi-square test")
    independenceTestResult = Statistics.chiSqTest(mat)
    # summary of the test including the p-value, degrees of freedom
    print('%s\n' % independenceTestResult)
    #*******************************************************************************

    stdDev = [0.0] * numCols
    for j in range(numCols):
        stdDev[j] = math.sqrt(summary.variance()[j])

    #*******************************************************************************
    # test for normal distribution using Kolmogorov-Smirnov test
    colVec = [0.0] * numRows
    #vecRDD = sc.parallelize(colVec)
    #testResult = Statistics.kolmogorovSmirnovTest(vecRDD, 'norm', 0, 1)
    #print(testResult)
    numericMean = [0.0] * numNumericCols
    numericSD = [0.0] * numNumericCols
    k = 0
    for j in range(numCols):
        if ((summary.mean()[j] != 0.0) and (summary.variance()[j] != 0.0)):
            numericMean[k] = summary.mean()[j]
            numericSD[k] = stdDev[j]
            k = k + 1
    print('Checking if column data is normally distributed using Kolmogorov-Smirnov test')
    for j in range(numNumericCols):
        for i in range(numRows):
            # see https://issues.apache.org/jira/browse/SPARK-20802
            # test fails if data is normally distributed
            # kolmogorovSmirnovTest in pyspark.mllib.stat.Statistics throws
            # net.razorvine.pickle.PickleException when input data is normally
            # distributed (no error when data is not normally distributed)
            colVec[i] = float(i)  # LoLoF[i][j]
        vecRDD = sc.parallelize(colVec)
        print(colVec[0], colVec[numRows - 1], numericMean[j], numericSD[j])
        testResult = Statistics.kolmogorovSmirnovTest(vecRDD, 'norm',
                                                      numericMean[j], numericSD[j])
        print(testResult)
    #*******************************************************************************

    #*******************************************************************************
    #
    # estimate kernel densities
    #
    from pyspark.mllib.stat import KernelDensity
    # colVec = [0.0]*numRows
    # vecRDD = sc.parallelize(colVec)
    print('Computing kernel densities on all columns using a Bandwidth of 3.0')
    kd = KernelDensity()
    kd.setSample(vecRDD)
    kd.setBandwidth(3.0)

    sAS = int(math.sqrt(numRows))  # sample array size
    samplePoints = [0.0] * sAS
    #samplePoints = [0.0]*numRows
    for i in range(sAS):
        samplePoints[i] = float(i * sAS)
    #for i in range(numRows):
    #    samplePoints[i] = float(i)
    densities = kd.estimate(samplePoints)
    print('Estimating kernel densities')
    print('Print kernel densities at sample points')
    #print('Print kernel densities > 0.01 at sample points')
    for j in range(numNumericCols):
        # print(hL[j])
        for i in range(numRows):
            # see https://issues.apache.org/jira/browse/SPARK-20803
            # KernelDensity.estimate in pyspark.mllib.stat.KernelDensity throws
            # net.razorvine.pickle.PickleException when input data is normally
            # distributed (no error when data is not normally distributed)
            colVec[i] = float(i)  # LoLoF[i][j]
        vecRDD = sc.parallelize(colVec)
        kd = KernelDensity()
        kd.setSample(vecRDD)
        kd.setBandwidth(3.0)
        # Find density estimates for the given values
        densities = kd.estimate(samplePoints)
        for i in range(sAS):
            print(densities[i], end=',')
        print()
        #for i in range(numRows):
        #    if (densities[i] >= 0.01):
        #        print(i, densities[i], end=',')
    print()
    #*******************************************************************************

    #*******************************************************************************
    #
    # compute Skewness and Kurtosis for each numeric column
    #
    skew = [0.0] * numNumericCols
    kurt = [0.0] * numNumericCols
    term = 0.0
    k = 0
    for j in range(numCols):
        if (typeStrings[j] != 'String'):
            skew[k] = 0.0
            kurt[k] = 0.0
            # extra work: find Ints
            typeStrings[j] = 'Int'
            meanj = summary.mean()[j]
            for i in range(numRows):
                if ((typeStrings[j] == 'Int')
                        and (math.trunc(LoLoF[i][k]) != LoLoF[i][k])):
                    typeStrings[j] = 'Float'
                term = (LoLoF[i][k] - meanj) / stdDev[j]
                skew[k] = skew[k] + (term * term * term)
                kurt[k] = kurt[k] + (term * term * term * term)
            skew[k] = skew[k] / numRows
            kurt[k] = (kurt[k] / numRows) - 3.0
            k = k + 1

    print('Skewness of columns')
    k = 0
    for j in range(numCols):
        if (typeStrings[j] == 'String'):
            print('Text', end=',')
        else:
            print(skew[k], end=',')
            k = k + 1
    print()
    print('Kurtosis of columns')
    k = 0
    for j in range(numCols):
        if (typeStrings[j] == 'String'):
            print('Text', end=',')
        else:
            print(kurt[k], end=',')
            k = k + 1
    print()

    print('Inferring column data types (Text string, Int, Float)')
    # numbers that are Int and non-negative and "large" are likely to be numeric labels -- keep checking this heuristic
    # columns that are outside Kurtosis limits <-1.2, 3.0> may be numeric labels
    print('Attempting to infer if an Int column is a numeric label')
    print("If all Ints in a column are >= 0 and 'large', it may be numLabel")
    print('If all Ints in a column are >= 0 and excess kurtosis is outside [-1.2, 3.0], it may be numLabel')
    k = 0
    for j in range(numCols):
        if (typeStrings[j] == 'String'):
            continue
        # kurt is indexed by the numeric-column position k, not the raw column index j
        if ((typeStrings[j] == 'Int') and (summary.min()[j] >= 0)
                and ((summary.max()[j] > 10000) or (kurt[k] < -1.2) or (kurt[k] > 3.0))):
            print('column ', j, ' (', hL[j], ') may be a numeric label')
            typeStrings[j] = 'NumLabel'
        k = k + 1
    #******************************************************************************

    #******************************************************************************
    #
    # Normalize the dataset by shifting by mean and scaling by stdDev
    #
    normData = [[0.0 for x in range(numNumericCols)] for y in range(numRows)]
    rowMaxs = [0.0] * numRows
    rowMins = [0.0] * numRows
    rowNormL1s = [0.0] * numRows
    rowNormL2s = [0.0] * numRows
    rowNumZeros = [0] * numRows
    means = [0.0] * numCols
    for j in range(numCols):
        means[j] = summary.mean()[j]
    for i in range(numRows):
        rowMaxs[i] = -999999.0
        rowMins[i] = 999999.0
        rowNumZeros[i] = 0
        rowNormL1s[i] = 0.0
        rowNormL2s[i] = 0.0
        k = 0
        for j in range(numCols):
            if ((typeStrings[j] == 'Int') or (typeStrings[j] == 'Float')):
                normData[i][k] = (LoLoF[i][k] - means[j]) / stdDev[j]
                if (normData[i][k] > rowMaxs[i]):
                    rowMaxs[i] = normData[i][k]
                if (normData[i][k] < rowMins[i]):
                    rowMins[i] = normData[i][k]
                if (normData[i][k] == 0.0):
                    rowNumZeros[i] = rowNumZeros[i] + 1
                if (abs(normData[i][k]) < 100.0):
                    rowNormL1s[i] = rowNormL1s[i] + abs(normData[i][k])
                    rowNormL2s[i] = rowNormL2s[i] + normData[i][k] * normData[i][k]
                # print(i, j, k, LoLoF[i][k], means[j], stdDev[j], normData[i][k], rowNormL1s[i], rowNormL2s[i])
                k = k + 1

    input = open(inputFileName, 'r')
    fileHandle = open('/home/bsrsharma/work/python/rowNormL1L2.csv', 'w')
    # Keep up to 6 columns of identifying info
    if (numCols > 1):
        for j in range(min(5, numCols)):
            fileHandle.write(hL[j])
            fileHandle.write(',')
        fileHandle.write('L1-Norm')
        fileHandle.write(",")
        fileHandle.write('L2-Norm\n')
    s = input.readline()  # don't repeat header
    for i in range(numRows):
        # copy input to output
        s = input.readline()
        LoS = s.split(',')
        for j in range(min(5, numCols)):
            fileHandle.write(LoS[j])
            fileHandle.write(',')
        fileHandle.write('%s' % rowNormL1s[i])
        fileHandle.write(',')
        fileHandle.write('%s' % math.sqrt(rowNormL2s[i]))
        fileHandle.write('\n')
    fileHandle.close()
    input.close()
    print('Wrote ', 'rowNormL1L2.csv')

    input = open(inputFileName, 'r')
    fileHandle = open(outputFileName, 'w')
    # output normalized data
    numCols = numCols - 1
    # write header row
    if (numCols > 1):
        for j in range(numCols - 1):
            fileHandle.write(hL[j])
            fileHandle.write(',')
        fileHandle.write(hL[numCols - 1])
        fileHandle.write('\n')
    s = input.readline()  # don't repeat header
    for i in range(numRows):
        # copy input to output
        s = input.readline()
        LoS = s.split(',')
        k = 0
        for j in range(numCols - 1):
            if (typeStrings[j] == 'String'):
                fileHandle.write(LoS[j])
            else:
                fileHandle.write('%s' % normData[i][k])
                k = k + 1
            fileHandle.write(',')
        if (typeStrings[numCols - 1] == 'String'):
            fileHandle.write(LoS[numCols - 1])
        else:
            fileHandle.write('%s' % normData[i][k])
        fileHandle.write('\n')
    fileHandle.close()
    input.close()
    print('Wrote ', outputFileName, '\n')

    #******************************************************************************
    # compute median for each column
    medians = [0.0] * numNumericCols
    aCol = [0.0] * numRows
    for j in range(numNumericCols):
        for i in range(numRows):
            aCol[i] = LoLoF[i][j]
        aCol.sort()
        medians[j] = aCol[numRows // 2]
    print('medians:')
    k = 0
    for j in range(numCols):
        if (typeStrings[j] == 'String'):
            print('Text', end=',')
        else:
            print(medians[k], end=',')
            k = k + 1
    print('\n\n')

    # compute histograms for each column
    numBins = int(math.sqrt(numRows))
    histogram = [0] * (numBins + 1)
    binWidth = 0
    mins = [0.0] * numCols
    maxs = [0.0] * numCols
    print('Computing histograms for numeric columns')
    print('choosing ', numBins, ' bins')
    k = 0
    for j in range(numCols):
        mins[j] = summary.min()[j]
        maxs[j] = summary.max()[j]
        if (typeStrings[j] == 'String'):
            print('column ', j, '( ', hL[j], ' ): Text')
        else:
            binWidth = (maxs[j] - mins[j]) / numBins
            for i in range(numBins):
                histogram[i] = 0
            for i in range(numRows):
                histogram[int((LoLoF[i][k] - mins[j]) / binWidth)] += 1
            print('column ', j, '( ', hL[j], ' ):')
            if (typeStrings[j] == 'NumLabel'):
                print('NumLabel')
            for i in range(numBins):
                print(histogram[i], end=',')
            print()
            k = k + 1
    print('\n\n')

    # compute modes
    modes = [0.0] * numNumericCols
    largestBin = 0
    binIndex = 0
    print('modes:')
    k = 0
    for j in range(numCols):
        if (typeStrings[j] == 'String'):
            print('Text', end=',')
        else:
            largestBin = 0
            binIndex = 0
            for i in range(numBins):
                # pick the bin with most items
                if (histogram[i] > largestBin):
                    largestBin = histogram[i]
                    binIndex = i
            modes[k] = mins[j] + (maxs[j] - mins[j]) * binIndex / numBins
            print(modes[k], end=',')
            k = k + 1
    print('\n\n')

    return 0
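# findFeatures() relies on a module-level SparkContext 'sc' and on three helper
# functions (removeEmbeddedCommas, with0Str, removeStrings) that are not part of
# this excerpt.  The definitions below are only guesses at their behaviour,
# inferred from how they are used above; they are not the original implementations.
from pyspark import SparkContext

sc = SparkContext(appName='findFeatures')


def _is_number(t):
    try:
        float(t)
        return True
    except ValueError:
        return False


def removeEmbeddedCommas(s):
    # Assumed behaviour: strip double quotes and drop commas that appear inside
    # quoted fields, so that a plain split(',') yields one token per column.
    out, in_quotes = [], False
    for ch in s:
        if ch == '"':
            in_quotes = not in_quotes
        elif ch == ',' and in_quotes:
            continue
        else:
            out.append(ch)
    return ''.join(out)


def with0Str(t):
    # Assumed behaviour: parse a token as a float, mapping non-numeric tokens
    # (text fields, the 'No Data' padding) to 0.0.
    return float(t) if _is_number(t) else 0.0


def removeStrings(s, numNumericCols):
    # Assumed behaviour: keep only the numeric tokens of a row, padded or
    # truncated to exactly numNumericCols values.
    values = [float(t) for t in s.split(',') if _is_number(t)]
    return (values + [0.0] * numNumericCols)[:numNumericCols]


# Hypothetical invocation:
# findFeatures('input.csv', 'normalized_output.csv')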
# limitations under the License.
#
from __future__ import print_function

from pyspark import SparkContext
# $example on$
from pyspark.mllib.stat import KernelDensity
# $example off$

if __name__ == "__main__":
    sc = SparkContext(appName="KernelDensityEstimationExample")  # SparkContext

    # $example on$
    # an RDD of sample data
    data = sc.parallelize(
        [1.0, 1.0, 1.0, 2.0, 3.0, 4.0, 5.0, 5.0, 6.0, 7.0, 8.0, 9.0, 9.0])

    # Construct the density estimator with the sample data and a standard deviation
    # for the Gaussian kernels
    kd = KernelDensity()
    kd.setSample(data)
    kd.setBandwidth(3.0)

    # Find density estimates for the given values
    densities = kd.estimate([-1.0, 2.0, 5.0])
    # $example off$

    print(densities)

    sc.stop()