def run4(self):
    from my_fun import parse_interaction, parse_interaction_with_key, summary_by_label

    raw_data = self.raw_data
    vector_data = raw_data.map(parse_interaction)

    # Compute column summary statistics.
    summary = Statistics.colStats(vector_data)
    print "Duration Statistics:"
    print " Mean: {}".format(round(summary.mean()[0], 3))
    print " St. deviation: {}".format(round(sqrt(summary.variance()[0]), 3))
    print " Max value: {}".format(round(summary.max()[0], 3))
    print " Min value: {}".format(round(summary.min()[0], 3))
    print " Total value count: {}".format(summary.count())
    print " Number of non-zero values: {}".format(summary.numNonzeros()[0])

    label_vector_data = raw_data.map(parse_interaction_with_key)
    normal_label_data = label_vector_data.filter(lambda x: x[0] == "normal.")
    normal_summary = Statistics.colStats(normal_label_data.values())
    print "Duration Statistics for label: {}".format("normal")
    print " Mean: {}".format(round(normal_summary.mean()[0], 3))
    print " St. deviation: {}".format(round(sqrt(normal_summary.variance()[0]), 3))
    print " Max value: {}".format(round(normal_summary.max()[0], 3))
    print " Min value: {}".format(round(normal_summary.min()[0], 3))
    print " Total value count: {}".format(normal_summary.count())
    print " Number of non-zero values: {}".format(normal_summary.numNonzeros()[0])

    normal_sum = summary_by_label(raw_data, "normal.")
    print "Duration Statistics for label: {}".format("normal")
    print " Mean: {}".format(round(normal_sum.mean()[0], 3))
    print " St. deviation: {}".format(round(sqrt(normal_sum.variance()[0]), 3))
    print " Max value: {}".format(round(normal_sum.max()[0], 3))
    print " Min value: {}".format(round(normal_sum.min()[0], 3))
    print " Total value count: {}".format(normal_sum.count())
    print " Number of non-zero values: {}".format(normal_sum.numNonzeros()[0])

    label_list = ["back.", "buffer_overflow.", "ftp_write.", "guess_passwd.",
                  "imap.", "ipsweep.", "land.", "loadmodule.", "multihop.",
                  "neptune.", "nmap.", "normal.", "perl.", "phf.", "pod.", "portsweep.",
                  "rootkit.", "satan.", "smurf.", "spy.", "teardrop.", "warezclient.",
                  "warezmaster."]
    stats_by_label = [(label, summary_by_label(raw_data, label)) for label in label_list]
    duration_by_label = [
        (stat[0], np.array([float(stat[1].mean()[0]),
                            float(sqrt(stat[1].variance()[0])),
                            float(stat[1].min()[0]),
                            float(stat[1].max()[0]),
                            int(stat[1].count())]))
        for stat in stats_by_label]

    pd.set_option('display.max_columns', 50)
    stats_by_label_df = pd.DataFrame.from_items(duration_by_label,
                                                columns=["Mean", "Std Dev", "Min", "Max", "Count"],
                                                orient='index')
    print "Duration statistics, by label"
    return stats_by_label_df
def test_col_norms(self):
    data = RandomRDDs.normalVectorRDD(self.sc, 1000, 10, 10)
    summary = Statistics.colStats(data)
    self.assertEqual(10, len(summary.normL1()))
    self.assertEqual(10, len(summary.normL2()))

    data2 = self.sc.parallelize(range(10)).map(lambda x: Vectors.dense(x))
    summary2 = Statistics.colStats(data2)
    self.assertEqual(array([45.0]), summary2.normL1())
    import math
    expectedNormL2 = math.sqrt(sum(map(lambda x: x * x, range(10))))
    self.assertTrue(math.fabs(summary2.normL2()[0] - expectedNormL2) < 1e-14)
def test_col_with_different_rdds(self):
    # numpy
    data = RandomRDDs.normalVectorRDD(self.sc, 1000, 10, 10)
    summary = Statistics.colStats(data)
    self.assertEqual(1000, summary.count())

    # array
    data = self.sc.parallelize([range(10)] * 10)
    summary = Statistics.colStats(data)
    self.assertEqual(10, summary.count())

    # array
    data = self.sc.parallelize([pyarray.array("d", range(10))] * 10)
    summary = Statistics.colStats(data)
    self.assertEqual(10, summary.count())
def do_all(f_path, out_name):
    sc = SparkContext()
    data = sc.textFile(f_path)
    data = data.map(parseKeepD).filter(lambda p: p[0] != None)

    # Scale features
    features = data.map(lambda x: x[0].features)
    summary = Statistics.colStats(features)
    global means
    global varis
    means = summary.mean()
    varis = summary.variance()

    # Scale the points
    data = data.map(lambda y: (conv_label_pt(y[0]), y[1]))

    # Train model
    model = LinearRegressionWithSGD().train(data.map(lambda x: x[0]),
                                            intercept=True, regType='none')

    # Calculate disparity
    disparity = data.map(lambda p: (p[0].label, model.predict(p[0].features), p[1]))

    # Calculate SSR for later
    ssr = disparity.map(lambda x: (x[0] - x[1])**2).sum()

    # Keep N
    N = disparity.count()

    MSE = ssr / float(N)
    se = std_errors(data, MSE, N)

    disparity.saveAsTextFile(out_loc + out_name)

    # Shut down the SparkContext
    sc.stop()
    return model.intercept, model.weights, se, disparity, ssr, N
def generateFeatureClusters(context, geneExp, samples, headers, numClusters):
    # Ignore the first item (the diagnosis header)
    headers = headers[1:]

    # 1) Generate statistic data for each of the genes/entrez ids.
    # Retrieve the mean, variance, max and min of each gene.
    # The entrez id associated with each gene is the row index (matches the headers index).
    cStats = Statistics.colStats(geneExp)
    print(len(cStats.mean()))
    data = np.array(
        [cStats.mean(), cStats.variance(), cStats.max(), cStats.min()]).transpose()

    # Create a stats array with the entrez id as the first column
    dataWithIndex = np.array([[e_id for e_id in headers],
                              cStats.mean(), cStats.variance(),
                              cStats.max(), cStats.min()]).transpose()
    print(dataWithIndex.shape)

    # 2) Create the RDDs that will be used to train KMeans:
    # the stats data without entrez ids ...
    df = context.parallelize(data)
    # ... and the stats data with entrez ids, used to cluster features later
    dfWithIndex = context.parallelize(dataWithIndex)

    # 3) Train KMeans with the statistic data to discover clusters for the genes
    model = KMeans.train(df, numClusters, maxIterations=100, initializationMode="random")

    # 4) Save the model
    model.save(context, './models/clusters')

    # 5) Label each feature with its cluster.
    # For each gene statistic, map it to (prediction, e_id).
    clusterLabeledFeatures = dfWithIndex.map(
        lambda point: (model.predict(point[1:]), point[0]))
    featuresToCluster = dfWithIndex.map(
        lambda point: (point[0], model.predict(point[1:])))

    # 6) Group together the features by their cluster label
    clusteredFeatures = clusterLabeledFeatures.groupByKey()
    # print(clusteredFeatures.count())
    # print(clusteredFeatures.take(2))
    cF = clusteredFeatures.collectAsMap()

    # 7) Transform the sample data to use the clusters
    samplesWithClusters = samples.map(lambda sample: updateSample(sample, cF))

    return samplesWithClusters
def calculateStats(years2stats, op):
    result = dict()
    for year in years2stats:
        stats = sc.parallelize(years2stats[year])
        summary = Statistics.colStats(stats)
        if op == 'mean':
            means = summary.mean()
            valuesList = []
            for singleElement in means:
                valuesList.append(str(singleElement).rstrip())
            result[year] = valuesList
        if op == 'variance':
            variances = summary.variance()
            valuesList = []
            for singleElement in variances:
                valuesList.append(str(singleElement).rstrip())
            result[year] = valuesList
        if op == 'max':
            maxValue = summary.max()
            valuesList = []
            for singleElement in maxValue:
                valuesList.append(str(singleElement).rstrip())
            result[year] = valuesList
        if op == 'min':
            minValue = summary.min()
            valuesList = []
            for singleElement in minValue:
                valuesList.append(str(singleElement).rstrip())
            result[year] = valuesList
    return result
def summarize(dataset):
    print "schema: %s" % dataset.schema().json()
    labels = dataset.map(lambda r: r.label)
    print "label average: %f" % labels.mean()
    features = dataset.map(lambda r: r.features)
    summary = Statistics.colStats(features)
    print "features average: %r" % summary.mean()
def scriptJob(self, limit=None, rowstart=None, rowstop=None):
    start = datetime.datetime.now()
    # Scan the HBase table for the requested rows/columns
    row = self.table.scan(row_start=rowstart, row_stop=rowstop,
                          limit=limit, columns=self.columns)
    print(type(row))
    testRdd = self.sc.parallelize(row)
    values = testRdd.values()
    print(values.count())
    col = bytes(self.columns.encode("utf-8"))
    serilizeRdd = values.map(lambda value: float(value.get(col).decode()))
    # def hash_domain(url):
    #     return hash(urlparse.urlparse(url).netloc)
    mlibRDD = self.sc.parallelize(
        ([Vectors.dense(x)] for x in serilizeRdd.collect()))
    cStats = Statistics.colStats(mlibRDD)
    # print(cStats.mean())
    end = datetime.datetime.now()
    print(end - start)
    return cStats.mean()
def CorrelationFeature(vectors):
    matriz = sc.broadcast(Statistics.corr(vectors, method="pearson"))
    summary = Statistics.colStats(vectors)
    varianza = summary.variance()

    # New heuristic (Diogo's proposal): weight each feature by its variance
    # divided by the sum of its absolute correlations with all features.
    w = {}
    aij = {}
    for i in range(len(matriz.value)):
        w[i] = 0
        aij[i] = 0
        for j in np.nan_to_num(matriz.value[i]):
            k = abs(j)
            aij[i] = aij[i] + k
        w[i] = varianza[i] / aij[i]

    # features sorted by weight, descending
    r = sorted([(value, key) for (key, value) in w.items()], reverse=True)

    index = []
    for i in r:
        index.append(i[1])
    index = index[0:6]  # taking the first 6 features
    return index
def info_paragraphs(df, clm):
    df = df.where(col(clm).isNotNull())
    paragraphs = df.rdd.flatMap(lambda x: getattr(x, clm)).filter(
        lambda p: p != None)
    paragraphs = paragraphs.map(lambda p: np.array(len(p.split())))
    summary = Statistics.colStats(paragraphs)
    return summary
def column_means(data: pyspark.rdd.RDD):
    """
    Compute the vector of column means.

    :param data: an RDD
    :return: returns column means as a vector
    """
    logger.info("Computing data means")
    summary = Statistics.colStats(data)
    return summary.mean()
def column_statistics(data: pyspark.rdd.RDD):
    """
    Compute the vectors of column means and variances of a data frame.

    :param data: an RDD
    :return: returns column means and variances as vectors
    """
    logger.info("Computing data statistics")
    summary = Statistics.colStats(data)
    return summary.mean(), summary.variance()
def average_vector(data):
    from pyspark.sql.functions import col
    vectors = data.select("vectors").where(col("vectors").isNotNull())

    from pyspark.mllib.linalg import Vectors
    vectors_v = vectors.map(lambda line: Vectors.dense(line))

    from pyspark.mllib.stat import Statistics
    summary = Statistics.colStats(vectors_v)
    mean = summary.mean()
    logger.info(mean)
    return mean
def CorrelationFeature(vectors):
    # print 'Calculating Correlation'
    matriz = sc.broadcast(Statistics.corr(vectors, method="pearson"))
    summary = Statistics.colStats(vectors)
    varianza = summary.variance()

    # New heuristic (Diogo's proposal)
    w = {}
    aij = {}
    for i in range(len(matriz.value)):
        w[i] = 0
        aij[i] = 0
        for j in np.nan_to_num(matriz.value[i]):
            k = abs(j)
            aij[i] = aij[i] + k
        w[i] = varianza[i] / aij[i]

    # features sorted by weight, descending
    r = sorted([(value, key) for (key, value) in w.items()], reverse=True)
    # print r

    # print 'calculating feature selection'
    # Old heuristic:
    # w = {}
    # for i in range(len(matriz)):
    #     w[i] = 0
    #     for j in np.nan_to_num(matriz[i]):
    #         k = abs(j)
    #         w[i] = w[i] + k
    # r = sorted([(value, key) for (key, value) in w.items()], reverse=True)

    # vectors = np.matrix(vectors)
    # beforeMatrix = vectors.map(lambda x: np.matrix(x))

    index = []
    for i in r:
        index.append(i[1])
    index = index[0:6]  # taking the first 6 features

    # MatrixReducer(vectors, index)
    return index
def readDataFromES():
    es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
    results_gen = elasticsearch.helpers.scan(
        es,
        index='netflowrepo',
        doc_type='entry',
        query={"query": {"match_all": {}}})
    results = list(results_gen)

    sumOfFlows_list = []
    sumOfBytes_list = []
    uniqDstIPs_list = []
    uniqDstPorts_list = []
    for row in results:
        sumOfFlows_list.append(row['_source']['sumOfFlows'])
        sumOfBytes_list.append(row['_source']['sumOfBytes'])
        uniqDstIPs_list.append(row['_source']['uniqDstIPs'])
        uniqDstPorts_list.append(row['_source']['uniqDstPorts'])

    # Convert data to numpy arrays.
    np_Flows = np.array(sumOfFlows_list)
    np_Bytes = np.array(sumOfBytes_list)
    np_DstIPs = np.array(uniqDstIPs_list)
    np_DstPorts = np.array(uniqDstPorts_list)

    # Convert data into a matrix. Each feature is in a column.
    tmp1 = np.concatenate((np_Flows.reshape((-1, 1)), np_Bytes.reshape((-1, 1))), axis=1)
    tmp2 = np.concatenate((tmp1, np_DstIPs.reshape((-1, 1))), axis=1)
    tmp3 = np.concatenate((tmp2, np_DstPorts.reshape((-1, 1))), axis=1)

    mat = sc.parallelize(tmp3.tolist())

    summary = Statistics.colStats(mat)
    print("count =", summary.count())
    print("mean =", summary.mean())
    print("min =", summary.min())
    print("max =", summary.max())
    print("variance =", summary.variance())

    mean = summary.mean()
    max = summary.max()
    stddev = np.sqrt(summary.variance())
    return (mean, max, stddev)
def cities_stats(city_rdd):
    pop_rdd = city_rdd.map(lambda city: [city[1]])
    statistics = Statistics.colStats(pop_rdd)
    mean = statistics.mean()[0]
    variance = statistics.variance()[0]
    max_pop = statistics.max()[0]
    min_pop = statistics.min()[0]
    res = {
        "mean": mean,
        "variance": variance,
        "deviation": math.sqrt(variance),
        "max": max_pop,
        "min": min_pop
    }
    return res
def recommend2user(self, user_id):
    query = '''select page_id
               from cooladata
               where date_range(last 21 days)
               and user_id = {:d}
               and page_id is not null
               group by page_id;'''.format(user_id)

    def SQLtoURL(query):
        data = query.replace('\n', ' ').replace('\t', ' ').replace('   ', ' ').replace('  ', ' ')
        return data

    def QueryXXXXX(query, file=None):
        session = Session()
        response = session.post(
            data={'tq': query},
            url='https://app.XXXXXX.com/api/v2/projects/115659/cql/',
            headers={'Authorization': 'Token dtQvPVejNcSebX1EkU0AqB2TJRXznIgZiDvDu3HR'})
        return response.content

    table = json.loads(codecs.decode(QueryXXXXX(SQLtoURL(query)), 'utf-8'))['table']
    title_list = [x['c'] for x in table['rows']]
    table_cols = [d['label'] for d in table['cols']]

    def convert_row(row):
        rowlist = [d['v'] for d in row]
        return rowlist

    rd = self.sc.parallelize(title_list).map(convert_row)
    historyTitleData = self.spark.createDataFrame(rd, table_cols)
    historyTitleData = historyTitleData.dropna()

    self.model.createOrReplaceTempView("Database")
    historyTitleData.registerTempTable("historyTable")
    pageVectorHistory = self.spark.sql('''select d.page_id, d.normTopicDist,
                                          case when h.page_id is null then 0 else 1 end as label
                                          from Database as d
                                          left join historyTable as h
                                          on d.page_id = h.page_id''')

    mainRdd = pageVectorHistory[pageVectorHistory['label'] == 1][['normTopicDist']] \
        .rdd.map(lambda x: x['normTopicDist'].toArray())
    mainVec = Statistics.colStats(mainRdd).mean()

    pageRank = pageVectorHistory[pageVectorHistory['label'] == 0].rdd.map(
        lambda row: (row['page_id'], float(np.dot(mainVec, row['normTopicDist'].toArray()))))
    pager = pageRank.toDF()
    pager.createOrReplaceTempView("pager")
    sortPageR = self.sqlctx.sql('''select _1 as page_id, _2 as similarity
                                   from pager
                                   order by similarity desc''')
    return sortPageR.take(10)
def CorrelationFeature(vectors, schema):
    print("Calculating Correlation")
    vectors_rdd = vectors.rdd.map(
        lambda row: Vectors.dense([x for x in row["features"]]))
    matriz = spark.sparkContext.broadcast(
        Statistics.corr(vectors_rdd, method="pearson"))
    summary = Statistics.colStats(vectors_rdd)
    variance = summary.variance()

    ######## Heuristic ########
    w = {}
    aij = {}
    for i in range(len(matriz.value)):
        w[i] = 0
        aij[i] = 0
        for j in np.nan_to_num(matriz.value[i]):
            k = abs(j)
            aij[i] = aij[i] + k
        w[i] = variance[i] / aij[i]

    r = sorted([(value, key) for (key, value) in w.items()], reverse=True)
    index = r[0:6]

    a = []
    for i in index:
        a.append((0, int(i[1])))

    red = MatrixReducer(vectors_rdd, a, schema)
    return red
weights = Vectors.dense([0.2, 0.1, 0.1, 0.1, 0.5, 0.5, 0.7, 0.9, 1.0])

# instantiate an ElementwiseProduct object and initialize it with the weights vector
ep = ElementwiseProduct(weights)

# transform vecrdd using the transform method of the ElementwiseProduct object
# to create an RDD of weighted values;
# print the top line of each RDD to confirm that the transformation was successful
weighted = ep.transform(vecrdd)

print weighted.take(1)
print vecrdd.take(1)

# call the colStats method of the Statistics object on vecrdd and print the
# mean, variance, and number of non-zero values
stats = Statistics.colStats(vecrdd)

print stats.mean()
print stats.variance()
print stats.numNonzeros()

# instantiate a StandardScaler object and set withMean and withStd to 'True'
ss = StandardScaler(withMean=True, withStd=True)

# call the fit method of the StandardScaler object to create a StandardScalerModel
model = ss.fit(vecrdd)

# call the transform method of the StandardScalerModel to center and scale the data
# in the vecrdd RDD
scaled = model.transform(vecrdd)
def summarize(dataset):
    labels = dataset.map(lambda r: r.label)
    print("label average: %f" % labels.mean())
    features = dataset.map(lambda r: r.features)
    summary = Statistics.colStats(features)
    print("features average: %r" % summary.mean())
rddUSD = sc.textFile("dataUSDuprv.csv")
rddUSD.persist()
rddUSD.take(5)

# Delete the first row (header)
header = rddUSD.first()
dataLines = rddUSD.filter(lambda x: x != header)
dataLines.count()
dataLines.first()
dataLines.take(5)

# RDD to dense vectors
vectorsUSD = dataLines.map(transformationDT.transformToNumeric)
vectorsUSD.take(5)

# Perform statistical analysis
statsUSD = Statistics.colStats(vectorsUSD)
statsUSD.mean()
statsUSD.variance()
statsUSD.min()
statsUSD.max()
Statistics.corr(vectorsUSD)

# SPARK SQL
dataframe = pycsv.csvToDataFrame(sqlContext, rddUSD, sep=",")
dataframe.registerTempTable("dataUSDuprv")
dff1 = sqlContext.sql("SELECT closeJPY FROM dataUSDuprv").show()
dataframe.show()

# LabeledPoint
lpUSD = vectorsUSD.map(transformationDT.transformToLabeledPoint)
# Load input data
print("Loading LIBSVM file with UDT from " + input + ".")
df = spark.read.format("libsvm").load(input).cache()
print("Schema from LIBSVM:")
df.printSchema()
print("Loaded training data as a DataFrame with " + str(df.count()) + " records.")

# Show statistical summary of labels.
labelSummary = df.describe("label")
labelSummary.show()

# Convert features column to an RDD of vectors.
features = MLUtils.convertVectorColumnsFromML(df, "features") \
    .select("features").rdd.map(lambda r: r.features)
summary = Statistics.colStats(features)
print("Selected features column with average values:\n" + str(summary.mean()))

# Save the records in a parquet file.
tempdir = tempfile.NamedTemporaryFile(delete=False).name
os.unlink(tempdir)
print("Saving to " + tempdir + " as Parquet file.")
df.write.parquet(tempdir)

# Load the records back.
print("Loading Parquet file with UDT from " + tempdir)
newDF = spark.read.parquet(tempdir)
print("Schema from Parquet:")
newDF.printSchema()

shutil.rmtree(tempdir)
# Load the CSV file into an RDD
sc = SparkContext()
sqlContext = SQLContext(sc)
rddUSD = sc.textFile("../Forex DT/data/1440/USD1440.csv")
rddUSD.cache()

# Remove the first line (header)
header = rddUSD.first()
dataLines = rddUSD.filter(lambda x: x != header)
dataLines.take(5)

usdVectors = dataLines.map(transformationLR.transformToNumeric)

# Perform statistical analysis
usdStats = Statistics.colStats(usdVectors)
usdStats.mean()
usdStats.variance()
usdStats.min()
usdStats.max()
Statistics.corr(usdVectors)

# Transform to a DataFrame for input to machine learning
# Drop columns that are not required (low correlation)
usdLP = usdVectors.map(transformationLR.transformToLabeledPoint)
usdDF = sqlContext.createDataFrame(usdLP, ["label", "features"])
usdDF.select("label", "features").show(10)

# Split into training and testing data
(trainingData, testData) = usdDF.randomSplit([0.7, 0.3])
trainingData.count()
##### Cheating a little #####
# Use pandas to summarize the data and display the correlation matrix
df = pd.read_csv("file:/C:/spark-1.6.0-bin-hadoop2.4/" + nomF + ".csv", sep=";", header=0)
df.describe()

# Correlation matrix
# print(df.corr())

# ### MLlib Statistics

# In[5]:

from pyspark.mllib.stat import Statistics

# Basic statistics
partsNum = parts.map(lambda line: line[0:8])
summary = Statistics.colStats(partsNum)
print(summary.mean())
print(summary.variance())
print(summary.numNonzeros())

Statistics.corr(partsNum, method="pearson")

# # Supervised classification
# ## Naive Bayes

# In[6]:

from pyspark.mllib.classification import NaiveBayes, NaiveBayesModel
import utils_mesure

nomF_svm = "glass_svm"
    return np.array([float(x) for x in clean_line_split])


vector_data = raw_data.map(parse_interaction)

# ## Summary statistics

# Spark's MLlib provides column summary statistics for `RDD[Vector]` through the function [`colStats`](https://spark.apache.org/docs/latest/api/python/pyspark.mllib.html#pyspark.mllib.stat.Statistics.colStats) available in [`Statistics`](https://spark.apache.org/docs/latest/api/python/pyspark.mllib.html#pyspark.mllib.stat.Statistics). The method returns an instance of [`MultivariateStatisticalSummary`](https://spark.apache.org/docs/latest/api/python/pyspark.mllib.html#pyspark.mllib.stat.MultivariateStatisticalSummary), which contains the column-wise *max*, *min*, *mean*, *variance*, and *number of nonzeros*, as well as the *total count*.

# In[4]:

from pyspark.mllib.stat import Statistics
from math import sqrt

# Compute column summary statistics.
summary = Statistics.colStats(vector_data)

print("Duration Statistics:")
print(" Mean: {}".format(round(summary.mean()[0], 3)))
print(" St. deviation: {}".format(round(sqrt(summary.variance()[0]), 3)))
print(" Max value: {}".format(round(summary.max()[0], 3)))
print(" Min value: {}".format(round(summary.min()[0], 3)))
print(" Total value count: {}".format(summary.count()))
print(" Number of non-zero values: {}".format(summary.numNonzeros()[0]))

# ### Summary statistics by label

# The interesting part of summary statistics, in our case, comes from being able to obtain them by the type of network attack, or 'label', in our dataset. By doing so we will be able to better characterise our dataset's dependent variable in terms of the range of values of the independent variables.

# If we want to do such a thing, we can filter our RDD so that it contains labels as keys and vectors as values. For that we just need to adapt our `parse_interaction` function to return a tuple with both elements (a sketch follows below).
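# A minimal sketch of such a `parse_interaction_with_key` helper is shown below. It is an
# assumption based on the KDD Cup '99 layout used above (label in the last field, symbolic
# fields at indexes 1-3), not necessarily the exact function used elsewhere in this notebook.

def parse_interaction_with_key(line):
    line_split = line.split(",")
    # drop the symbolic fields (protocol, service, flag) and the label itself
    # from the value vector; these indexes are assumed from the KDD layout
    symbolic_indexes = [1, 2, 3, 41]
    clean_line_split = [item for i, item in enumerate(line_split)
                        if i not in symbolic_indexes]
    # return (label, numeric feature vector)
    return (line_split[41], np.array([float(x) for x in clean_line_split]))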
                                    maxDepth=5, maxBins=maxBins)

### Evaluate
# Evaluate model on test instances and compute test error
predictions = model.predict(testData.map(lambda x: x.features))
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
testMSE = labelsAndPredictions.map(lambda v_p1: (v_p1[0] - v_p1[1]) * (v_p1[0] - v_p1[1]))\
    .sum() / float(testData.count())
print('Test Mean Squared Error = ' + str(testMSE))
print('Learned regression forest model:')
# print(model.toDebugString())

### Compute R2
SSE = labelsAndPredictions.map(lambda v_p1: (v_p1[0] - v_p1[1]) * (v_p1[0] - v_p1[1])).sum()
summary = Statistics.colStats(testData.map(lambda x: Vectors.dense(x.label)))
meanY = float(summary.mean())
# Alternative for the mean:
# testData.map(lambda x: Vectors.dense(x.label)).mean()
SST = testData.map(lambda y: (y.label - meanY)**2).sum()
n = float(testData.count())
params = 3
Rsqrd = 1 - SSE / SST
RsqrdAdj = 1 - SSE / (n - params) / (SST / (n - 1))
print('R-squared: {0}'.format(Rsqrd))
print('R-squared Adj: {0}'.format(RsqrdAdj))
    d1.signal = scale(d1.signal, axis=1)
    reader.seek(j)
    d2 = reader.read()
    d2.signal = scale(d2.signal, axis=1)
    return i, j, exp(-euclidean(d1.signal[7], d2.signal[7]) ** 2)


if __name__ == '__main__':
    counts = []
    sc = pyspark.SparkContext()
    cchunks = list(chunk_combinations(0, 8192, 512))

    start = time.time()
    fit_rdd = sc.parallelize(chunks(0, 8182, 512)).flatMap(chunk_fits).sortByKey()
    fit_rdd.saveAsTextFile('fit_rdd.txt')
    fit_rdd.cache()

    stats = Statistics.colStats(fit_rdd.values())
    means = stats.mean()
    scales = np.sqrt(stats.variance())
    fit_rdd = fit_rdd.mapValues(lambda x: (x - means) / scales)

    # values = fit_rdd.values()
    # values.cache()
    # km = KMeans().train(values, 3)
    # predictions = km.predict(values)
    # with open('predictions.txt', 'w') as f:
    #     f.write('index,p0,p1,p2,p3,res,category')
    #     for temp, pred in izip(fit_rdd.collect(), predictions.collect()):
    #         key, value = temp
    #         f.write('\n%i,%f,%f,%f,%f,%f,%i' % (key, value[0], value[1], value[2], value[3], value[4], pred))
    #     pass
    # print km.clusterCenters
    # rdd = sc.parallelize(cchunks)
# Create a Spark context and set it to work
with SparkContext(conf=conf) as sc:

    # Read the parsed records. This time we are reading the serialized file,
    # so in each record the fields will already be split
    directory = "hdfs:///user/{0}/data/tvlogs/".format(sc.sparkUser())
    logs = sc.pickleFile("{0}{1}".format(directory, name))

    # Group records by user
    byUser = logs.map(lambda x: (x[0], x[1:])).groupByKey()
    # Compute the time difference between consecutive records of each user
    intervals = byUser.flatMap(lambda (x, y): time_intervals(y))
    # keep it for reuse
    intervals.cache()

    # Extract statistics from those time differences.
    # Note that colStats needs a Vector (or a Python list), since it computes by column;
    # in our case we have a 1-column list.
    summary = Statistics.colStats(intervals)
    with open('interval-stats.txt', 'w') as out:
        for s in ('count', 'mean', 'variance', 'min', 'max', 'numNonzeros'):
            print >>out, ' * {0}: {1}'.format(s, getattr(summary, s)())

    # And also save them to disk. Flatten the list for that
    flat = intervals.map(lambda x: x[0])
    flat.saveAsTextFile("hdfs:///user/{0}/data/tvlogs/intervals.txt".format(sc.sparkUser()))
rddUSD = sc.textFile("dataUSDuprv.csv")
rddUSD.persist()
rddUSD.take(5)

# Delete the first row (header)
header = rddUSD.first()
dataLines = rddUSD.filter(lambda x: x != header)
dataLines.count()
dataLines.first()
dataLines.take(5)

# RDD to dense vectors
vectorsUSD = dataLines.map(transformationDT.transformToNumeric)
vectorsUSD.take(5)

# Perform statistical analysis
statsUSD = Statistics.colStats(vectorsUSD)
statsUSD.mean()
statsUSD.variance()
statsUSD.min()
statsUSD.max()
Statistics.corr(vectorsUSD)

# SPARK SQL
dataframe = pycsv.csvToDataFrame(sqlContext, rddUSD, sep=",")
dataframe.registerTempTable("dataUSDuprv")
dff1 = sqlContext.sql("SELECT closeJPY FROM dataUSDuprv").show()
dataframe.show()

# LabeledPoint
lpUSD = vectorsUSD.map(transformationDT.transformToLabeledPoint)
lpUSD.take(5)
def main(argv):
    verbose = False
    dbpath = '/root/data/AdditionalFiles/'
    tagstring = 'rock'
    usealldata = False
    holdout = 0.1
    model_iterations = 100
    model_step = 1.0
    model_intercept = True
    # possible types: logistic and svm
    model_type = 'logistic'

    try:
        opts, args = getopt.getopt(argv, "hvd:t:am:s:i:o:c",
                                   ["help", "verbose", "datapath=", "tagstring=", "alldata",
                                    "model=", "step=", "iterations=", "holdout=", "intercept"])
    except getopt.GetoptError:
        print 'rockTag.py -d <data path> -t <tag string>'
        sys.exit(2)

    for opt, arg in opts:
        if opt == '-h':
            print('rockTag.py -d <data path> -t <tag string>')
            sys.exit()
        elif opt in ("-v", "--verbose"):
            verbose = True
        elif opt in ("-d", "--datapath"):
            dbpath = arg
        elif opt in ("-t", "--tagstring"):
            tagstring = str(arg).lower()
        elif opt in ("-a", "--alldata"):
            usealldata = True
        elif opt in ("-m", "--model"):
            if str(arg).lower() in ['logistic', 'svm']:
                model_type = str(arg).lower()
            else:
                print('valid models are logistic and svm')
                sys.exit()
        elif opt in ("-s", "--step"):
            model_step = float(arg)
        elif opt in ("-i", "--iterations"):
            model_iterations = int(arg)
        elif opt in ("-o", "--holdout"):
            holdout = float(arg)
            if holdout <= 0 or holdout >= 1:
                print('holdout must be greater than 0 and less than 1')
        elif opt in ("-c", "--intercept"):
            model_intercept = True

    if verbose:
        print('OUTPUT: Some Args')
        print('data path: ' + dbpath)
        print('tag string: ' + tagstring)

    labels, features = getLabelsAndFeatures(dbpath, tagstring=tagstring,
                                            verbose=verbose, usealldata=usealldata)

    # scale features
    summary = Statistics.colStats(features)
    means = summary.mean()
    sds = [vr**0.5 for vr in summary.variance()]
    # std = StandardScaler(True, True).fit(features)
    # features = std.transform(features)
    features = features.map(lambda data: [(v - m) / s for (v, m, s) in zip(data, means, sds)])
    if verbose:
        print('OUTPUT: check rescaled columns')
        smry = Statistics.colStats(features)
        print(smry.mean())
        print(smry.variance())

    # make labeled data
    # labeledData = labels.zip(features).map(lambda (label, data): LabeledPoint(label, data))
    labeledData = labels.zip(features).map(lambda (label, data): LabeledPoint(label, data))
    if verbose:
        print('OUTPUT: Labeled Data')
        print(labeledData.take(3))

    # rebalance samples
    equalSampleData = rebalanceSample(labeledData, verbose=verbose)

    # split data
    trainData, testData = randomSplit(equalSampleData, [1 - holdout, holdout])
    if verbose:
        print('OUTPUT: Train Data')
        trainData.map(lambda p: (p.label, p.features)).take(3)

    # train model
    if model_type == 'logistic':
        model = LogisticRegressionWithSGD.train(trainData, intercept=model_intercept,
                                                iterations=model_iterations, step=model_step)
    elif model_type == 'svm':
        model = SVMWithSGD.train(trainData, intercept=model_intercept,
                                 iterations=model_iterations, step=model_step)

    evalString = evaluateModel(model, testData)
    print(evalString)
sc.setLogLevel('debug')
sc.getConf().getAll()

import urllib.request
url = 'http://kdd.ics.uci.edu/databases/kddcup99/kddcup.data_10_percent.gz'
localfile = '/tmp/kddcup.data_10_percent.gz'
f = urllib.request.urlretrieve(url, localfile)

raw_data = sc.textFile('file:///tmp/kddcup.data_10_percent.gz')
csv = raw_data.map(lambda x: x.split(','))
# duration is the first field of each parsed record
duration = csv.map(lambda x: [int(x[0])])

from pyspark.mllib.stat import Statistics
summary = Statistics.colStats(duration)
summary.mean()[0]
summary.count()

metrics = csv.map(lambda x: [x[0], x[4], x[5]])
metrics.take(2)
Statistics.corr(metrics, method="spearman")
Statistics.corr(metrics, method="pearson")

from pyspark.mllib.linalg import Vectors
visitors_freq = Vectors.dense(0.13, 0.61, 0.8, 0.5, 0.3)
print(Statistics.chiSqTest(visitors_freq))
    values = Vectors.dense([float(attList[0]), float(attList[1])])
    return values

# keep only Cyl, Displacement
autoVectors = dataLines.map(transformToNumeric)
autoVectors.collect()

# perform analysis
from pyspark.mllib.stat import Statistics
autoStats = Statistics.colStats(autoVectors)
autoStats.mean()
autoStats.variance()
autoStats.min()
autoStats.max()
Statistics.corr(autoVectors)

from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)

def transformToLabelPoint(inStr):
    lp = (float(inStr[0]), Vectors.dense(inStr[1]))
    return lp
# Summary stats of data
# source: https://github.com/apache/spark/blob/master/examples/src/main/python/mllib/summary_statistics_example.py
from __future__ import print_function

from pyspark import SparkContext
import numpy as np
from pyspark.mllib.stat import Statistics

if __name__ == "__main__":
    sc = SparkContext(appName="SummaryStatisticsExample")  # SparkContext

    # NOTE: colStats expects numeric vectors; the CSV lines read here would
    # still need to be parsed into float values for this to run as-is.
    bike = sc.textFile("s3://irm238finalproject/input/*-citibike-tripdata.csv")  # an RDD of Vectors

    # Compute column summary statistics.
    summary = Statistics.colStats(bike)
    print(summary.mean())  # a dense vector containing the mean value for each column
    print(summary.variance())  # column-wise variance
    print(summary.numNonzeros())  # number of nonzeros in each column

    sc.stop()
# Load input data
print("Loading LIBSVM file with UDT from " + input + ".")
df = spark.read.format("libsvm").load(input).cache()
print("Schema from LIBSVM:")
df.printSchema()
print("Loaded training data as a DataFrame with " + str(df.count()) + " records.")

# Show statistical summary of labels.
labelSummary = df.describe("label")
labelSummary.show()

# Convert features column to an RDD of vectors.
features = MLUtils.convertVectorColumnsFromML(df, "features") \
    .select("features").rdd.map(lambda r: r.features)
summary = Statistics.colStats(features)
print("Selected features column with average values:\n" + str(summary.mean()))

# Save the records in a parquet file.
tempdir = tempfile.NamedTemporaryFile(delete=False).name
os.unlink(tempdir)
print("Saving to " + tempdir + " as Parquet file.")
df.write.parquet(tempdir)

# Load the records back.
print("Loading Parquet file with UDT from " + tempdir)
newDF = spark.read.parquet(tempdir)
print("Schema from Parquet:")
newDF.printSchema()

try:
def summary_by_label(raw_data, label):
    label_vector_data = raw_data.map(parse_interaction_with_key).filter(
        lambda x: x[0] == label)
    return Statistics.colStats(label_vector_data.values())
model = RandomForest.trainRegressor(trainingData,
                                    categoricalFeaturesInfo=featuresDic,
                                    numTrees=10,
                                    featureSubsetStrategy="auto",
                                    impurity='variance',
                                    maxDepth=5,
                                    maxBins=maxBins)

### Evaluate
# Evaluate model on test instances and compute test error
predictions = model.predict(testData.map(lambda x: x.features))
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
testMSE = labelsAndPredictions.map(lambda v_p1: (v_p1[0] - v_p1[1]) * (v_p1[0] - v_p1[1]))\
    .sum() / float(testData.count())
print('Test Mean Squared Error = ' + str(testMSE))
print('Learned regression forest model:')
# print(model.toDebugString())

### Compute R2
SSE = labelsAndPredictions.map(lambda v_p1: (v_p1[0] - v_p1[1]) * (v_p1[0] - v_p1[1])).sum()
summary = Statistics.colStats(testData.map(lambda x: Vectors.dense(x.label)))
meanY = float(summary.mean())
# Alternative for the mean:
# testData.map(lambda x: Vectors.dense(x.label)).mean()
SST = testData.map(lambda y: (y.label - meanY)**2).sum()
n = float(testData.count())
params = 3
Rsqrd = 1 - SSE / SST
RsqrdAdj = 1 - SSE / (n - params) / (SST / (n - 1))
print('R-squared: {0}'.format(Rsqrd))
print('R-squared Adj: {0}'.format(RsqrdAdj))
""" def parse_interaction(line): #split lines based on the delimeter, and create a list line_split = line.split(" ") #replace NA with zeros line_split = [w.replace('NA', '0') for w in line_split] #line_split = [w.replace ('', '0') for w in line_split] #keep all except year, and non-numeric values symbolic_indexes = [0, 8, 10, 16, 17, 22] clean_line_split = [ item for i, item in enumerate(line_split) if i not in symbolic_indexes ] return np.array([float(x) for x in clean_line_split]) vector_data = raw_data.map(parse_interaction) #start timer at this point startTime = datetime.now() summary = Statistics.colStats(vector_data) print('Time consumed = '), (datetime.now() - startTime) print('Mean of columns\n'), summary.mean() print('Variances of columns\n'), summary.variance() print('Non zero values\n'), summary.numNonzeros() print('Max value\n'), summary.max() print('Min value\n'), summary.min() sc.stop()
# print the data
print(mat.collect())  # this could be a problem on large datasets
print(mat.take(10))   # collect with a limit

# Notes: Transformations are run on the cluster - do not do this:
# rdd.foreach(println)          # will print on each executor; what you want is
# rdd.take(100).foreach(println)
# Transformations: http://spark.apache.org/docs/latest/programming-guide.html#transformations
# Actions
# Some operations require key/values (reduceByKey, sortByKey, etc.)

# Compute column summary statistics.
summary = Statistics.colStats(mat)
print(summary.mean())  # a dense vector containing the mean value for each column
print(summary.variance())  # column-wise variance
print(summary.numNonzeros())  # number of nonzeros in each column

# Correlations
from pyspark.mllib.stat import Statistics

seriesX = sc.parallelize([1.0, 2.0, 3.0, 3.0, 5.0])  # a series
# seriesY must have the same number of partitions and cardinality as seriesX
seriesY = sc.parallelize([11.0, 22.0, 33.0, 33.0, 555.0])

# Compute the correlation using Pearson's method. Enter "spearman" for Spearman's method.
# If a method is not specified, Pearson's method will be used by default.
print("Correlation is: " + str(Statistics.corr(seriesX, seriesY, method="pearson")))
    # Filter out columns not wanted at this stage
    values = Vectors.dense([outcome, age, single, married,
                            divorced, primary, secondary, tertiary,
                            default, balance, loan])
    return values

# Change to a Vector
bankVectors = dataLines.map(transformToNumeric)
bankVectors.collect()[:15]

# Perform statistical analysis
from pyspark.mllib.stat import Statistics
bankStats = Statistics.colStats(bankVectors)
bankStats.mean()
bankStats.variance()
bankStats.min()
bankStats.max()
Statistics.corr(bankVectors)

# Transform to a DataFrame for input to machine learning
# Drop columns that are not required (low correlation)
from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)

def transformToLabeledPoint(inStr):
def colStats(self):
    """Return descriptive stats for all of our columns."""
    return Statistics.colStats(self.data)
    day = int(x[3:5])
    year = int(x[6:10])
    return datetime.date(year, month, day).isocalendar()[1]

violent = ["ASSAULT", "BATTERY", "CRIM SEXUAL ASSAULT", "DOMESTIC VIOLENCE",
           "HOMICIDE", "KIDNAPPING"]

def setFlags(x):
    if x in violent:
        return (0, 1)
    else:
        return (1, 0)

beats = parts.map(lambda p: (p[10], p[2][6:10], getWeek(p[2]), 1, setFlags(p[5])))
beats2 = beats.filter(lambda x: x[1] == "2015").map(
    lambda x: ((x[0], x[2]), (x[3], x[4][0], x[4][1])))
beats3 = beats2.reduceByKey(lambda x, y: (x[0] + y[0], x[1] + y[1], x[2] + y[2]))

standard_vars = beats3.map(lambda row: Vectors.dense((row[0][1], row[1][0], row[1][1], row[1][2])))
summary = Statistics.colStats(standard_vars)
mean_wn = summary.mean()[0]
sd_wn = math.sqrt(summary.variance()[0])
mean_counts = list(summary.mean()[1:4])
sd_counts = list(np.sqrt(summary.variance()[1:4]))

beats_standard = beats3.map(lambda x: (x[0][0],
                                       (x[0][1] - mean_wn) / sd_wn,
                                       (x[1][0] - mean_counts[0]) / sd_counts[0],
                                       (x[1][1] - mean_counts[1]) / sd_counts[1],
                                       (x[1][2] - mean_counts[2]) / sd_counts[2]))
beats_list = beats_standard.map(lambda x: ((x[0]), 1)).keys().distinct().collect()
beats_list = beats_list[0:50]

def parsePoint(tuple):
    values = [float(x) for x in tuple]
    return LabeledPoint(values[0], values[1:])

def deNorm(val, mean, sd):
    return val * sd + mean

maxWeek = (21 - mean_wn) / sd_wn
curWeek = (20 - mean_wn) / sd_wn
def summary_by_label(raw_data, label):
    from pyspark.mllib.stat import Statistics
    label_vector_data = raw_data.map(parse_interaction_with_key).filter(
        lambda x: x[0] == label)
    return Statistics.colStats(label_vector_data.values())
# Written for Spark 2.0 and higher
from pyspark import SparkContext
from pyspark.mllib.stat import Statistics
import random as rand
import numpy as np
from pyspark.mllib.linalg import Vectors, Matrices

def num_rand():
    return np.random.randn(1, 4)

mat = sc.parallelize([num_rand(), num_rand(), num_rand(), num_rand()])

summary = Statistics.colStats(mat)
print(summary.mean())
print(summary.variance())
print(summary.numNonzeros())
print(summary.max())
print(summary.min())
print(summary.count())
print(summary.normL1())
print(summary.normL2())

# correlation
x = sc.parallelize(np.random.randn(4, 1))
y = sc.parallelize(np.random.randn(4, 1))
print("Correlation :", str(Statistics.corr(x, y)))
print (" ") print (" ") print ("matriz de correlacion:") print (" ") print(Statistics.corr(rows, method="pearson") ''' file=sc.textFile("Process_Data/SuperFile/superfile.dat") row = file.map(lambda line:line.split(' ')[1:len(line)]).map(lambda xs: [float(x) for x in xs]) row_list= row.collect() #transforms to list print(row_list) #matrix w, h = 1,38 new_list = [[0 for x in range(w)] for y in range(h)] for i in range(0,len(row_list)): new_list[i][:]=Vectors.dense(row_list[i]) i+=1 rows = sc.parallelize([new_list]) print(rows) summary = Statistics.colStats(rows) print("media:"),(summary.mean()) print("varianza:"),(summary.variance()) print ("max:"),(summary.max()) print ("min:"),(summary.min()) print("non Zeros:"),(summary.numNonzeros())
For dense vectors, MLlib uses either Python lists or the NumPy array type. The
latter is recommended, so you can simply pass NumPy arrays around.

For sparse vectors, users can construct a SparseVector object from MLlib or
pass SciPy scipy.sparse column vectors if SciPy is available in their
environment. The easiest way to create sparse vectors is to use the factory
methods implemented in Vectors (see the sketch after this snippet).
"""

def parse_interaction(line):
    # split lines based on the delimiter, and create a list
    line_split = line.split(",")
    # replace NA with zeros
    line_split = [w.replace('NA', '0') for w in line_split]
    # line_split = [w.replace('', '0') for w in line_split]
    # keep all except year, and non-numeric values
    symbolic_indexes = [0, 8, 10, 16, 17, 22]
    clean_line_split = [item for i, item in enumerate(line_split)
                        if i not in symbolic_indexes]
    return np.array([float(x) for x in clean_line_split])


vector_data = raw_data.map(parse_interaction)

# start timer at this point
startTime = datetime.now()
summary = Statistics.colStats(vector_data)
print('Time consumed = '), (datetime.now() - startTime)

print('Mean of columns\n'), summary.mean()
print('Variances of columns\n'), summary.variance()
print('Non zero values\n'), summary.numNonzeros()
print('Max value\n'), summary.max()
print('Min value\n'), summary.min()
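# The docstring above describes both construction routes without showing them.
# Below is a minimal, self-contained illustration (not part of the original
# script; the values are arbitrary): a dense vector as a NumPy array, a sparse
# vector built with the Vectors factory method, and the equivalent SciPy
# column vector.
import numpy as np
from scipy.sparse import csc_matrix
from pyspark.mllib.linalg import Vectors

dense_np = np.array([1.0, 0.0, 3.0])     # dense: NumPy array (recommended)
dense_list = [1.0, 0.0, 3.0]             # dense: plain Python list
sparse_mllib = Vectors.sparse(3, [0, 2], [1.0, 3.0])  # size 3, non-zeros at indexes 0 and 2
sparse_scipy = csc_matrix((np.array([1.0, 3.0]),      # same data as a SciPy
                           np.array([0, 2]),          # single-column sparse matrix
                           np.array([0, 2])), shape=(3, 1))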
    elif hourofday < 12:
        morn += 1
    elif hourofday < 17:
        aft += 1
    elif hourofday < 22:
        eve += 1
    else:
        night += 1
    return [len(tracklist), morn, aft, eve, night, mcount]

# Compute profile for each user
custdata = tbycust.mapValues(lambda a: compute_stats_byuser(a))

# Compute aggregate stats for an entire track library
aggdata = Statistics.colStats(custdata.map(lambda x: x[1]))

for k, v in custdata.collect():
    unique, morn, aft, eve, night, mobile = v
    tot = morn + aft + eve + night

    # Persist the data, in this case write to file
    with open(outdir + 'live_table.csv', 'ab') as csvfile:
        fwriter = csv.writer(csvfile, delimiter=' ', quotechar='|',
                             quoting=csv.QUOTE_MINIMAL)
        fwriter.writerow([unique, morn, aft, eve, night, mobile])

# Do the same with the summary data
with open(outdir + 'agg_table.csv', 'wb') as csvfile:
    fwriter = csv.writer(csvfile, delimiter=' ', quotechar='|',
                                      , featureSubsetStrategy="auto"
                                      , impurity='variance'
                                      , maxDepth=13
                                      , maxBins=32)

# evaluate the training error
# first make the prediction and create a new "vector" of all the predictions
trainpredictions = model1.predict(trainparsedData.map(lambda x: x.features))
# then column-bind the prediction and actual values into a new RDD
trainlabelsAndPredictions = trainparsedData.map(lambda lp: lp.label).zip(trainpredictions)
# use a map operation to compute the MSE
trainMSE1 = trainlabelsAndPredictions.map(lambda (v, p): (v - p) * (v - p)).sum() / float(trainparsedData.count())

# use the Statistics library to obtain the variance
summary = Statistics.colStats(trainvecData)
variance = summary.variance()[0]

# compute the pseudo R-square
train_Rsqr1 = 1 - trainMSE1 / float(variance)

# evaluate the testing error
# first make the prediction and create a new "vector" of all the predictions
testpredictions = model1.predict(testparsedData.map(lambda x: x.features))
# then column-bind the prediction and actual values into a new RDD
testlabelsAndPredictions = testparsedData.map(lambda lp: lp.label).zip(testpredictions)
# use a map operation to compute the MSE
testMSE1 = testlabelsAndPredictions.map(lambda (v, p): (v - p) * (v - p)).sum() / float(testparsedData.count())

# use the Statistics library to obtain the variance
summary = Statistics.colStats(testvecData)
sc = SparkContext()
sqlContext = SQLContext(sc)
rddUSD = sc.textFile("../Forex DT/data/1440/USD1440.csv")
rddUSD.cache()

# Remove the first line (header)
header = rddUSD.first()
dataLines = rddUSD.filter(lambda x: x != header)
dataLines.take(5)

usdVectors = dataLines.map(transformationLR.transformToNumeric)

# Perform statistical analysis
usdStats = Statistics.colStats(usdVectors)
usdStats.mean()
usdStats.variance()
usdStats.min()
usdStats.max()
Statistics.corr(usdVectors)

# Transform to a DataFrame for input to machine learning
# Drop columns that are not required (low correlation)
usdLP = usdVectors.map(transformationLR.transformToLabeledPoint)
usdDF = sqlContext.createDataFrame(usdLP, ["label", "features"])
usdDF.select("label", "features").show(10)

# Split into training and testing data
(trainingData, testData) = usdDF.randomSplit([0.7, 0.3])
# Get the average
current_gni_final_maped = current_gni_reduced.mapValues(lambda x: x[0] / x[1])
current_gni_final_maped.cache()
current_gni_final_maped.collect()
print("Average per Country collected")
# current_gni_final_maped.foreach(println)

# Make a Vector of the data
autoVector = current_gni_final_maped.map(transformToVector)
autoVector.persist()
autoVector.collect()
print("Vectorized Average")
autoVector.foreach(println)

# Centering and scaling: subtract each column's mean and divide by its std. deviation
autoStats = Statistics.colStats(autoVector)
colMeans = autoStats.mean()  # resulting numpy array
print("Means:")
print(colMeans)
colVariance = autoStats.variance()
print("Variances:")
print(colVariance)
colStdDev = map(lambda x: math.sqrt(x), colVariance)
# colStdDev.collect()
print("StdDev:")
# colStdDev.foreach(println)
print(colStdDev)

# Place the means and std. dev values in a broadcast variable
def compute_mean(tx_data_rdd):
    summary = Statistics.colStats(tx_data_rdd)
    return summary.mean()
numRecords = rideRDD.count()
minDuration = rideRDD.map(lambda x: x[0]).min()
maxDuration = rideRDD.map(lambda x: x[0]).max()
print "Number of records : %d " % numRecords
print "Minimum duration : %d " % minDuration
print "Maximum duration : %d " % maxDuration

# #### 1(b) Use MLlib Statistics

# In[132]:

from pyspark.mllib.stat import Statistics

summary = Statistics.colStats(rideRDD)
print "Duration\tMorning\tAfternoon\tEvening\tWeekday\tMale\tAge\n"
print ("%8.2f\t%8.2f\t%8.2f\t%8.2f\t%8.2f\t%8.2f\t%8.2f\n") % tuple(summary.mean())
print ("%8.2f\t%8.2f\t%8.2f\t%8.2f\t%8.2f\t%8.2f\t%8.2f\n") % tuple(summary.variance())
print ("%8.2f\t%8.2f\t%8.2f\t%8.2f\t%8.2f\t%8.2f\t%8.2f\n") % tuple(summary.numNonzeros())

# #### 1(c) Determine correlation of Age with Duration

# In[3]:

durationRDD = rideRDD.map(lambda x: x[0])  # Extract duration from the RDD
ageRDD = rideRDD.map(lambda x: x[6])       # Extract Age from the RDD
print(Statistics.corr(durationRDD, ageRDD, method="pearson"))  # Pearson correlation of Age vs. Duration
# Summary stats of data
# source: https://github.com/apache/spark/blob/master/examples/src/main/python/mllib/summary_statistics_example.py
from __future__ import print_function

from pyspark import SparkContext
import numpy as np
from pyspark.mllib.stat import Statistics

if __name__ == "__main__":
    sc = SparkContext(appName="SummaryStatisticsExample")  # SparkContext

    taxi = sc.textFile("s3://irm238finalproject/input/yellow*")  # an RDD of text lines
    # NOTE: the split fields would still need to be cast to floats for colStats to run.
    taxic = taxi.filter(lambda line: line[1:10]).map(lambda row: row.split(","))

    # Compute column summary statistics.
    summary = Statistics.colStats(taxic)
    print(summary.mean())  # a dense vector containing the mean value for each column
    print(summary.variance())  # column-wise variance
    print(summary.numNonzeros())  # number of nonzeros in each column

    sc.stop()
def test_col_norms(self):
    data = RandomRDDs.normalVectorRDD(self.sc, 1000, 10, 10)
    summary = Statistics.colStats(data)
    self.assertEqual(10, len(summary.normL1()))
    self.assertEqual(10, len(summary.normL2()))