Example #1
    def run4(self):
        from my_fun import parse_interaction,parse_interaction_with_key,summary_by_label

        raw_data = self.raw_data
        vector_data = raw_data.map(parse_interaction)
        # Compute column summary statistics.
        summary = Statistics.colStats(vector_data)

        print "Duration Statistics:"
        print " Mean: {}".format(round(summary.mean()[0],3))
        print " St. deviation: {}".format(round(sqrt(summary.variance()[0]),3))
        print " Max value: {}".format(round(summary.max()[0],3))
        print " Min value: {}".format(round(summary.min()[0],3))
        print " Total value count: {}".format(summary.count())
        print " Number of non-zero values: {}".format(summary.numNonzeros()[0])

        label_vector_data = raw_data.map(parse_interaction_with_key)
        normal_label_data = label_vector_data.filter(lambda x: x[0]=="normal.")

        normal_summary = Statistics.colStats(normal_label_data.values())

        print "Duration Statistics for label: {}".format("normal")
        print " Mean: {}".format(normal_summary.mean()[0],3)
        print " St. deviation: {}".format(round(sqrt(normal_summary.variance()[0]),3))
        print " Max value: {}".format(round(normal_summary.max()[0],3))
        print " Min value: {}".format(round(normal_summary.min()[0],3))
        print " Total value count: {}".format(normal_summary.count())
        print " Number of non-zero values: {}".format(normal_summary.numNonzeros()[0])

        normal_sum = summary_by_label(raw_data, "normal.")

        print "Duration Statistics for label: {}".format("normal")
        print " Mean: {}".format(normal_sum.mean()[0],3)
        print " St. deviation: {}".format(round(sqrt(normal_sum.variance()[0]),3))
        print " Max value: {}".format(round(normal_sum.max()[0],3))
        print " Min value: {}".format(round(normal_sum.min()[0],3))
        print " Total value count: {}".format(normal_sum.count())
        print " Number of non-zero values: {}".format(normal_sum.numNonzeros()[0])

        label_list = ["back.","buffer_overflow.","ftp_write.","guess_passwd.",
                      "imap.","ipsweep.","land.","loadmodule.","multihop.",
                      "neptune.","nmap.","normal.","perl.","phf.","pod.","portsweep.",
                      "rootkit.","satan.","smurf.","spy.","teardrop.","warezclient.",
                      "warezmaster."]
        stats_by_label = [(label, summary_by_label(raw_data, label)) for label in label_list]

        duration_by_label = [
            (stat[0], np.array([float(stat[1].mean()[0]), float(sqrt(stat[1].variance()[0])), float(stat[1].min()[0]), float(stat[1].max()[0]), int(stat[1].count())]))
            for stat in stats_by_label]

        pd.set_option('display.max_columns', 50)

        stats_by_label_df = pd.DataFrame.from_items(duration_by_label, columns=["Mean", "Std Dev", "Min", "Max", "Count"], orient='index')

        print "Duration statistics, by label"
        print stats_by_label_df
Example #2
    def test_col_norms(self):
        data = RandomRDDs.normalVectorRDD(self.sc, 1000, 10, 10)
        summary = Statistics.colStats(data)
        self.assertEqual(10, len(summary.normL1()))
        self.assertEqual(10, len(summary.normL2()))

        data2 = self.sc.parallelize(range(10)).map(lambda x: Vectors.dense(x))
        summary2 = Statistics.colStats(data2)
        self.assertEqual(array([45.0]), summary2.normL1())
        import math
        expectedNormL2 = math.sqrt(sum(map(lambda x: x*x, range(10))))
        self.assertTrue(math.fabs(summary2.normL2()[0] - expectedNormL2) < 1e-14)
Example #4
 def test_col_with_different_rdds(self):
     # numpy
     data = RandomRDDs.normalVectorRDD(self.sc, 1000, 10, 10)
     summary = Statistics.colStats(data)
     self.assertEqual(1000, summary.count())
     # array
     data = self.sc.parallelize([range(10)] * 10)
     summary = Statistics.colStats(data)
     self.assertEqual(10, summary.count())
     # array
     data = self.sc.parallelize([pyarray.array("d", range(10))] * 10)
     summary = Statistics.colStats(data)
     self.assertEqual(10, summary.count())
Example #6
def do_all(f_path,out_name):
	sc = SparkContext()
	data = sc.textFile(f_path)

	data = data.map(parseKeepD).filter(lambda p: p[0] != None)

	# Scale Features
	features = data.map(lambda x: x[0].features)
	summary = Statistics.colStats(features)
	global means
	global varis
	means = summary.mean()
	varis = summary.variance()

	#scale the points
	data = data.map(lambda y: (conv_label_pt(y[0]),y[1]))

	#train model
	model = LinearRegressionWithSGD().train(data.map(lambda x: x[0]), intercept=True, regType='none')

	#calculate disparity
	disparity = data.map(lambda p: (p[0].label, model.predict(p[0].features), p[1]))  

	#calculate SSR for later
	ssr = disparity.map(lambda x: (x[0] - x[1])**2).sum()

	#keep N
	N = disparity.count()
	#shut down SC
	MSE = ssr/float(N)
	se = std_errors(data,MSE,N)
	disparity.saveAsTextFile(out_loc + out_name)

	sc.stop()
	return model.intercept,model.weights,se,disparity, ssr, N
Example #7
def generateFeatureClusters(context, geneExp, samples, headers, numClusters):

    # Ignore the first item (the diagnosis header)
    headers = headers[1:]
    # 1) Generate statistic data for each of the genes/entrez ids

    # Retrieve the mean, variance, max and min of each gene
    # The entrez id associated with each gene is the row index (matches the headers index)
    cStats = Statistics.colStats(geneExp)
    print(len(cStats.mean()))
    data = np.array(
        [cStats.mean(),
         cStats.variance(),
         cStats.max(),
         cStats.min()]).transpose()
    # Create a stats array with the index as first column
    # e_id for e_id in headers
    dataWithIndex = np.array([[e_id for e_id in headers],
                              cStats.mean(),
                              cStats.variance(),
                              cStats.max(),
                              cStats.min()]).transpose()
    print(dataWithIndex.shape)
    # 2) Create dataframes that will be used to train KMeans

    # Create dataframe for the stats data (with no entrez ids)
    df = context.parallelize(data)
    # create dataframe for the stats data (with entrez ids)
    # Will be used to cluster features later
    dfWithIndex = context.parallelize(dataWithIndex)

    # 3) Train KMeans with statistic data
    # use the stats data to discover clusters for the genes
    model = KMeans.train(df,
                         numClusters,
                         maxIterations=100,
                         initializationMode="random")

    # 4) save model
    model.save(context, './models/clusters')

    # 5) Label each feature with their cluster
    # For each gene statistic, map it to (prediction, e_id)
    clusterLabeledFeatures = dfWithIndex.map(
        lambda point: (model.predict(point[1:]), point[0]))

    featuresToCluster = dfWithIndex.map(
        lambda point: (point[0], model.predict(point[1:])))

    # 6) Group together the features by their cluster label
    clusteredFeatures = clusterLabeledFeatures.groupByKey()
    #print(clusteredFeatures.count())
    #print(clusteredFeatures.take(2))

    cF = clusteredFeatures.collectAsMap()

    # 7) Transform the sample data to use the clusters
    samplesWithClusters = samples.map(lambda sample: updateSample(sample, cF))

    return samplesWithClusters
Example #8
def calculateStats(years2stats, op):
	result = dict()
	for year in years2stats:
		stats = sc.parallelize(years2stats[year])
		summary = Statistics.colStats(stats)
		if op == 'mean':
			means = summary.mean()
			valuesList = []
			for singleElement in means:
				valuesList.append(str(singleElement).rstrip())
			result[year] = valuesList
		if op == 'variance':
			variances = summary.variance()
			valuesList = []
			for singleElement in variances:
				valuesList.append(str(singleElement).rstrip())
			result[year] = valuesList
		if op == 'max':
			maxValue = summary.max()
			valuesList = []
			for singleElement in maxValue:
				valuesList.append(str(singleElement).rstrip())
			result[year] = valuesList
		if op == 'min':
			minValue = summary.min()
			valuesList = []
			for singleElement in minValue:
				valuesList.append(str(singleElement).rstrip())
			result[year] = valuesList
	return result
Example #9
def summarize(dataset):
    print "schema: %s" % dataset.schema().json()
    labels = dataset.map(lambda r: r.label)
    print "label average: %f" % labels.mean()
    features = dataset.map(lambda r: r.features)
    summary = Statistics.colStats(features)
    print "features average: %r" % summary.mean()
Example #11
    def scriptJob(self, limit=None, rowstart=None, rowstop=None):
        start = datetime.datetime.now()
        # create hbase connection

        row = self.table.scan(row_start=rowstart,
                              row_stop=rowstop,
                              limit=limit,
                              columns=self.columns)
        print(type(row))

        testRdd = self.sc.parallelize(row)
        values = testRdd.values()
        print(values.count())

        col = bytes(self.columns.encode("utf-8"))
        serilizeRdd = values.map(lambda value: float(value.get(col).decode()))

        #
        # def hash_domain(url):
        #     return hash(urlparse.urlparse(url).netloc)

        mlibRDD = self.sc.parallelize(
            (([Vectors.dense(x)]) for x in serilizeRdd.collect()))

        cStats = Statistics.colStats(mlibRDD)
        # print(cStats.mean())

        end = datetime.datetime.now()
        print(end - start)
        return cStats.mean()
Example #12
def CorrelationFeature(vectors):

    matriz = sc.broadcast(Statistics.corr(vectors, method="pearson"))

    summary = Statistics.colStats(vectors)

    varianza = summary.variance()

    #########new heuristic diogo proposal
    w = {}
    aij = {}
    for i in range(len(matriz.value)):
        w[i] = 0
        aij[i] = 0
        for j in np.nan_to_num(matriz.value[i]):
            k = abs(j)
            aij[i] = aij[i] + k
        w[i] = varianza[i] / aij[i]

    r = sorted([(value, key) for (key, value) in w.items()],
               reverse=True)  #features sorted

    index = []
    for i in r:
        index.append(i[1])

    index = index[0:6]  # taking the first 6 features

    return index
Example #13
def info_paragraphs(df, clm):
    df = df.where(col(clm).isNotNull())
    paragraphs = df.rdd.flatMap(lambda x: getattr(x, clm)).filter(
        lambda p: p != None)
    paragraphs = paragraphs.map(lambda p: np.array(len(p.split())))
    summary = Statistics.colStats(paragraphs)

    return summary
Example #14
def column_means(data: pyspark.rdd.RDD):
    """
    Compute vectors of column means.
    :param data: an RDD
    :return: returns column means as vector
    """

    logger.info("Computing data means")
    summary = Statistics.colStats(data)
    return summary.mean()
Example #15
def column_statistics(data: pyspark.rdd.RDD):
    """
    Compute vectors of column means and variances of a data frame.
    :param data: an RDD
    :return: returns column means and variances as vectors
    """

    logger.info("Computing data statistics")
    summary = Statistics.colStats(data)
    return summary.mean(), summary.variance()
def average_vector(data):
	from pyspark.sql.functions import col
	vectors = data.select("vectors").where(col("vectors").isNotNull())

	from pyspark.mllib.linalg import Vectors
	vectors_v = vectors.map(lambda line: Vectors.dense(line))

	from pyspark.mllib.stat import Statistics
	summary = Statistics.colStats(vectors_v)
	mean = summary.mean()
	logger.info(mean)
	return mean
Example #17
def CorrelationFeature(vectors):

    #	print 'Calculation Correlation'

    matriz = sc.broadcast(Statistics.corr(vectors, method="pearson"))

    summary = Statistics.colStats(vectors)

    varianza = summary.variance()

    #########new heuristic diogo proposal
    w = {}
    aij = {}
    for i in range(len(matriz.value)):
        w[i] = 0
        aij[i] = 0
        for j in np.nan_to_num(matriz.value[i]):
            k = abs(j)
            aij[i] = aij[i] + k
        w[i] = varianza[i] / aij[i]

    r = sorted([(value, key) for (key, value) in w.items()],
               reverse=True)  #features sorted

    #print r

    #	print 'calculating features selections'

    #Old heuristic
    # # w={}
    # # for i in range(len(matriz)):
    # # 	w[i]=0
    # # 	for j in np.nan_to_num(matriz[i]):
    # # 		k=abs(j)
    # # 		w[i]=w[i]+k

    # r=sorted([(value,key) for (key,value) in w.items()],reverse=True)

    #####""
    #vectors=np.matrix(vectors)
    #beforeMatrix=vectors.map(lambda x: np.matrix(x))

    index = []
    for i in r:
        index.append(i[1])

    index = index[0:6]  # taking the first 6 features

    #MatrixReducer(vectors,index)
    return index
Example #18
def readDataFromES():
    es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
    results_gen = elasticsearch.helpers.scan(
        es,
        index='netflowrepo',
        doc_type='entry',
        query={"query": {
            "match_all": {}
        }})

    results = list(results_gen)

    sumOfFlows_list = []
    sumOfBytes_list = []
    uniqDstIPs_list = []
    uniqDstPorts_list = []

    for row in results:
        sumOfFlows_list.append(row['_source']['sumOfFlows'])
        sumOfBytes_list.append(row['_source']['sumOfBytes'])
        uniqDstIPs_list.append(row['_source']['uniqDstIPs'])
        uniqDstPorts_list.append(row['_source']['uniqDstPorts'])

    # Convert data to numpy arrays.
    np_Flows = np.array(sumOfFlows_list)
    np_Bytes = np.array(sumOfBytes_list)
    np_DstIPs = np.array(uniqDstIPs_list)
    np_DstPorts = np.array(uniqDstPorts_list)

    # Convert data into Matrix. Each feature is in a column.
    tmp1 = np.concatenate((np_Flows.reshape((-1, 1)), np_Bytes.reshape(
        (-1, 1))),
                          axis=1)
    tmp2 = np.concatenate((tmp1, np_DstIPs.reshape((-1, 1))), axis=1)
    tmp3 = np.concatenate((tmp2, np_DstPorts.reshape((-1, 1))), axis=1)
    mat = sc.parallelize(tmp3.tolist())

    summary = Statistics.colStats(mat)

    print("count =", summary.count())
    print("mean =", summary.mean())
    print("min =", summary.min())
    print("max =", summary.max())
    print("variance =", summary.variance())

    mean = summary.mean()
    max = summary.max()
    stddev = np.sqrt(summary.variance())

    return (mean, max, stddev)
Example #19
def cities_stats(city_rdd):
    pop_rdd = city_rdd.map(lambda city: [city[1]])
    statistics = Statistics.colStats(pop_rdd)
    mean = statistics.mean()[0]
    variance = statistics.variance()[0]
    max_pop = statistics.max()[0]
    min_pop = statistics.min()[0]
    res = {
        "mean": mean,
        "variance": variance,
        "deviation": math.sqrt(variance),
        "max": max_pop,
        "min": min_pop
    }
    return res
Example #20
	def recommend2user(self,user_id):
		
		query = '''select page_id from cooladata where date_range(last 21 days) and user_id = {:d} and page_id is not null group by page_id;'''.format(user_id)

		def SQLtoURL(query):
			data = query.replace('\n', ' ').replace('\t',' ').replace('   ',' ').replace('  ',' ')
			return data


		def QueryXXXXX(query, file = None):
			session = Session()
			response = session.post(data = {'tq': query,}, url = 'https://app.XXXXXX.com/api/v2/projects/115659/cql/', headers = {'Authorization': 'Token dtQvPVejNcSebX1EkU0AqB2TJRXznIgZiDvDu3HR'},)
			return response.content
		


		table = json.loads(codecs.decode(QueryXXXXX(SQLtoURL(query)),'utf-8'))['table']
		title_list = [x['c'] for x in table['rows']]
		table_cols = [d['label'] for d in table['cols']]  

		def convert_row(row):
			rowlist = []
			rowlist = [d['v'] for d in row]
			return rowlist

		rd = self.sc.parallelize(title_list).map(convert_row)
		historyTitleData = self.spark.createDataFrame(rd, table_cols)
		historyTitleData = historyTitleData.dropna()
		
		self.model.createOrReplaceTempView("Database")
		historyTitleData.registerTempTable("historyTable")
		
		pageVectorHistory = self.spark.sql('''select d.page_id, d.normTopicDist, case when h.page_id is null then 0 else 1 end as label from Database as d left join historyTable as h on d.page_id = h.page_id''')
		
		mainRdd = pageVectorHistory[pageVectorHistory['label'] == 1][['normTopicDist']].rdd.map(lambda x: x['normTopicDist'].toArray())
		mainVec = Statistics.colStats(mainRdd).mean()

		pageRank = pageVectorHistory[pageVectorHistory['label'] == 0].rdd.map(lambda row: (row['page_id'], float(np.dot(mainVec, row['normTopicDist'].toArray()))))
		pager = pageRank.toDF()
		pager.createOrReplaceTempView("pager")
		sortPageR = self.sqlctx.sql('''select _1 as page_id, _2 as similarity from pager order by similarity desc''')

		return sortPageR.take(10)
Example #21
def CorrelationFeature(vectors, schema):

    print("Calculating Correlation")

    vectors_rdd = vectors.rdd.map(
        lambda row: Vectors.dense([x for x in row["features"]]))

    matriz = spark.sparkContext.broadcast(
        Statistics.corr(vectors_rdd, method="pearson"))

    summary = Statistics.colStats(vectors_rdd)

    variance = summary.variance()

    ######## Heuristic ########

    w = {}
    aij = {}
    for i in range(len(matriz.value)):
        w[i] = 0
        aij[i] = 0
        for j in np.nan_to_num(matriz.value[i]):
            k = abs(j)
            aij[i] = aij[i] + k
        w[i] = variance[i] / aij[i]

    r = sorted([(value, key) for (key, value) in w.items()], reverse=True)

    index = r[0:6]

    a = []

    for i in index:
        a.append((0, int(i[1])))

    red = MatrixReducer(vectors_rdd, a, schema)

    return red
weights = Vectors.dense([0.2, 0.1, 0.1, 0.1, 0.5, 0.5, 0.7, 0.9, 1.0])

# instantiate an ElementWiseProduct object and initialize with the weights vector
ep = ElementwiseProduct(weights)

# transform vecrdd using the transform method of the ElementWiseProduct object
# to create an RDD of weighted values
# print the top line of each RDD to confirm that the transformation was successful
weighted = ep.transform(vecrdd)

print weighted.take(1)
print vecrdd.take(1)

# call the colStats method of the Statistics object on vecrdd and print the
# mean, variance, and number of non-zero values
stats = Statistics.colStats(vecrdd)

print stats.mean()
print stats.variance()
print stats.numNonzeros()

# instantiate a StandardScaler object and set withMean and withStd to 'True'
ss = StandardScaler(withMean=True, withStd=True)

# call the fit method of the StandardScaler object to create a StandardScalerModel
model = ss.fit(vecrdd)

# call the transform method of the StandardScalerModel to center and scale the data
# in vecrdd RDD
scaled = model.transform(vecrdd)
Example #23
def summarize(dataset):
    labels = dataset.map(lambda r: r.label)
    print("label average: %f" % labels.mean())
    features = dataset.map(lambda r: r.features)
    summary = Statistics.colStats(features)
    print("features average: %r" % summary.mean())
Example #24
rddUSD = sc.textFile("dataUSDuprv.csv")
rddUSD.persist()
rddUSD.take(5)
#deleting first row(header)
header=rddUSD.first()
dataLines = rddUSD.filter(lambda x: x != header)
dataLines.count()
dataLines.first()
dataLines.take(5)

#RDD to Dense vector
vectorsUSD = dataLines.map(transformationDT.transformToNumeric)
vectorsUSD.take(5)

#Perform statistical Analysis
statsUSD=Statistics.colStats(vectorsUSD)
statsUSD.mean()
statsUSD.variance()
statsUSD.min()
statsUSD.max()
Statistics.corr(vectorsUSD)

#SPARK SQL
dataframe = pycsv.csvToDataFrame(sqlContext, rddUSD, sep=",")
dataframe.registerTempTable("dataUSDuprv")
dff1=sqlContext.sql("SELECT closeJPY FROM dataUSDuprv").show()
dataframe.show()


#LabeledPoint
lpUSD = vectorsUSD.map(transformationDT.transformToLabeledPoint)
Example #25
    # Load input data
    print("Loading LIBSVM file with UDT from " + input + ".")
    df = spark.read.format("libsvm").load(input).cache()
    print("Schema from LIBSVM:")
    df.printSchema()
    print("Loaded training data as a DataFrame with " +
          str(df.count()) + " records.")

    # Show statistical summary of labels.
    labelSummary = df.describe("label")
    labelSummary.show()

    # Convert features column to an RDD of vectors.
    features = MLUtils.convertVectorColumnsFromML(df, "features") \
        .select("features").rdd.map(lambda r: r.features)
    summary = Statistics.colStats(features)
    print("Selected features column with average values:\n" +
          str(summary.mean()))

    # Save the records in a parquet file.
    tempdir = tempfile.NamedTemporaryFile(delete=False).name
    os.unlink(tempdir)
    print("Saving to " + tempdir + " as Parquet file.")
    df.write.parquet(tempdir)

    # Load the records back.
    print("Loading Parquet file with UDT from " + tempdir)
    newDF = spark.read.parquet(tempdir)
    print("Schema from Parquet:")
    newDF.printSchema()
    shutil.rmtree(tempdir)
Example #26
#Load the CSV file into a RDD
sc = SparkContext()
sqlContext = SQLContext(sc)
rddUSD = sc.textFile("../Forex DT/data/1440/USD1440.csv")
rddUSD.cache()

#Remove the first line
header = rddUSD.first()
dataLines = rddUSD.filter(lambda x: x != header)
dataLines.take(5)

usdVectors = dataLines.map(transformationLR.transformToNumeric)

#Perform statistical Analysis

usdStats = Statistics.colStats(usdVectors)
usdStats.mean()
usdStats.variance()
usdStats.min()
usdStats.max()
Statistics.corr(usdVectors)
#Transform to a Data Frame for input to Machine Learing
#Drop columns that are not required (low correlation)

usdLP = usdVectors.map(transformationLR.transformToLabeledPoint)
usdDF = sqlContext.createDataFrame(usdLP, ["label", "features"])
usdDF.select("label", "features").show(10)

#Split into training and testing data
(trainingData, testData) = usdDF.randomSplit([0.7, 0.3])
trainingData.count()
Example #27
##### Taking a shortcut #####
# Use pandas to summarize the data and display the correlation matrix
df = pd.read_csv("file:/C:/spark-1.6.0-bin-hadoop2.4/"+nomF+".csv", sep = ";",header=0)
df.describe()
# Correlation matrix
# print(df.corr())


# ### Mllib Statistics

# In[5]:

from pyspark.mllib.stat import Statistics
# Basics Statistics
partsNum = parts.map(lambda line: line[0:8])
summary = Statistics.colStats(partsNum)
print(summary.mean())
print(summary.variance())
print(summary.numNonzeros())
Statistics.corr(partsNum, method="pearson")


# # Supervised classification

# ## Naive Bayes

# In[6]:

from pyspark.mllib.classification import NaiveBayes, NaiveBayesModel
import utils_mesure
nomF_svm = "glass_svm"
Example #28
def summarize(dataset):
    labels = dataset.map(lambda r: r.label)
    print("label average: %f" % labels.mean())
    features = dataset.map(lambda r: r.features)
    summary = Statistics.colStats(features)
    print("features average: %r" % summary.mean())
    return np.array([float(x) for x in clean_line_split])


vector_data = raw_data.map(parse_interaction)

# ## Summary statistics

# Spark's MLlib provides column summary statistics for `RDD[Vector]` through the function [`colStats`](https://spark.apache.org/docs/latest/api/python/pyspark.mllib.html#pyspark.mllib.stat.Statistics.colStats) available in [`Statistics`](https://spark.apache.org/docs/latest/api/python/pyspark.mllib.html#pyspark.mllib.stat.Statistics). The method returns an instance of [`MultivariateStatisticalSummary`](https://spark.apache.org/docs/latest/api/python/pyspark.mllib.html#pyspark.mllib.stat.MultivariateStatisticalSummary), which contains the column-wise *max*, *min*, *mean*, *variance*, and *number of nonzeros*, as well as the *total count*.

# In[4]:

from pyspark.mllib.stat import Statistics
from math import sqrt

# Compute column summary statistics.
summary = Statistics.colStats(vector_data)

print("Duration Statistics:")
print(" Mean: {}".format(round(summary.mean()[0], 3)))
print(" St. deviation: {}".format(round(sqrt(summary.variance()[0]), 3)))
print(" Max value: {}".format(round(summary.max()[0], 3)))
print(" Min value: {}".format(round(summary.min()[0], 3)))
print(" Total value count: {}".format(summary.count()))
print(" Number of non-zero values: {}".format(summary.numNonzeros()[0]))

# ### Summary statistics by label

# The interesting part of summary statistics, in our case, comes from being able to obtain them by the type of network attack, or 'label', in our dataset. By doing so we will be able to better characterise our dataset's dependent variable in terms of the independent variables' ranges of values.

# To do that, we can filter our RDD so that it contains labels as keys and vectors as values. For that we just need to adapt our `parse_interaction` function to return a tuple with both elements.
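
# A minimal sketch of that keyed approach (assuming, as in the KDD examples above, comma-separated records whose last field is the label; the symbolic column indexes below are an assumption, not taken from the original notebook):

import numpy as np
from pyspark.mllib.stat import Statistics

def parse_interaction_with_key(line):
    line_split = line.split(",")
    # assumed non-numeric columns: protocol, service, flag (1-3) and the label itself (41)
    symbolic_indexes = [1, 2, 3, 41]
    clean_line_split = [item for i, item in enumerate(line_split) if i not in symbolic_indexes]
    return (line_split[41], np.array([float(x) for x in clean_line_split]))

def summary_by_label(raw_data, label):
    # keep only the rows carrying the requested label and summarise their numeric columns
    label_vector_data = raw_data.map(parse_interaction_with_key).filter(lambda x: x[0] == label)
    return Statistics.colStats(label_vector_data.values())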
Example #30
                                    maxDepth=5,
                                    maxBins=maxBins)
### Evaluate
# Evaluate model on test instances and compute test error
predictions = model.predict(testData.map(lambda x: x.features))
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
testMSE = labelsAndPredictions.map(lambda v_p1: (v_p1[0] - v_p1[1]) * (v_p1[0] - v_p1[1]))\
    .sum() / float(testData.count())
print('Test Mean Squared Error = ' + str(testMSE))
print('Learned regression forest model:')
# print(model.toDebugString())

### Compute R2
SSE = labelsAndPredictions.map(lambda v_p1: (v_p1[0] - v_p1[1]) *
                               (v_p1[0] - v_p1[1])).sum()
summary = Statistics.colStats(testData.map(lambda x: Vectors.dense(x.label)))
meanY = float(summary.mean())

# Alternative for mean
# testData.map(lambda x: Vectors.dense(x.label)).mean()
SST = testData.map(lambda y: (y.label - meanY)**2).sum()

n = float(testData.count())
params = 3

Rsqrd = 1 - SSE / SST
RsqrdAdj = 1 - SSE / (n - params) / (SST / (n - 1))

print('R-squared: {0}'.format(Rsqrd))
print('R-squared Adj: {0}'.format(RsqrdAdj))
Example #31
    d1.signal = scale(d1.signal, axis=1)
    reader.seek(j)
    d2 = reader.read()
    d2.signal = scale(d2.signal, axis=1)
    return i, j, exp(-euclidean(d1.signal[7], d2.signal[7]) ** 2)


if __name__ == '__main__':
    counts = []
    sc = pyspark.SparkContext()
    cchunks = list(chunk_combinations(0, 8192, 512))
    start = time.time()
    fit_rdd = sc.parallelize(chunks(0, 8182, 512)).flatMap(chunk_fits).sortByKey()
    fit_rdd.saveAsTextFile('fit_rdd.txt')
    fit_rdd.cache()
    stats = Statistics.colStats(fit_rdd.values())
    means = stats.mean()
    scales = np.sqrt(stats.variance())
    fit_rdd = fit_rdd.mapValues(lambda x: (x - means) / scales)
    # values = fit_rdd.values()
    # values.cache()
    # km = KMeans().train(values, 3)
    # predictions = km.predict(values)
    # with open('predictions.txt', 'w') as f:
    #     f.write('index,p0,p1,p2,p3,res,category')
    #     for temp, pred in izip(fit_rdd.collect(), predictions.collect()):
    #         key, value = temp
    #         f.write('\n%i,%f,%f,%f,%f,%f,%i' % (key, value[0], value[1], value[2], value[3], value[4], pred))
    #         pass
    # print km.clusterCenters
    # rdd = sc.parallelize(cchunks)
Example #32

# Create a Spark context and set it to work
with SparkContext(conf=conf) as sc:

    # Read the parsed records. This time we are reading the serialized file. 
    # So in each record the fields will already be split
    directory = "hdfs:///user/{0}/data/tvlogs/".format( sc.sparkUser() )
    logs = sc.pickleFile( "{0}{1}".format(directory,name) )

    # Group records by user
    byUser = logs.map( lambda x : (x[0],x[1:]) ).groupByKey()

    # Compute the time difference between consecutive records of each user
    intervals = byUser.flatMap( lambda (x,y) : time_intervals(y) )

    # keep it for reusing
    intervals.cache()
    
    # Extract statistics from those time differences
    # Note that colStats needs a Vector (or a Python list), since it computes by column
    # In our case we have a 1-column list
    summary = Statistics.colStats(intervals)
    with open( 'interval-stats.txt', 'w' ) as out:
        for s in ('count','mean','variance','min','max','numNonzeros'):
            print >>out, ' * {0}: {1}'.format( s, getattr(summary,s)() )

    # And also save them to disk. Flat the list for that
    flat = intervals.map( lambda x: x[0] )
    flat.saveAsTextFile( "hdfs:///user/{0}/data/tvlogs/intervals.txt".format(sc.sparkUser()) )
Example #33
rddUSD = sc.textFile("dataUSDuprv.csv")
rddUSD.persist()
rddUSD.take(5)
#deleting first row(header)
header = rddUSD.first()
dataLines = rddUSD.filter(lambda x: x != header)
dataLines.count()
dataLines.first()
dataLines.take(5)

#RDD to Dense vector
vectorsUSD = dataLines.map(transformationDT.transformToNumeric)
vectorsUSD.take(5)

#Perform statistical Analysis
statsUSD = Statistics.colStats(vectorsUSD)
statsUSD.mean()
statsUSD.variance()
statsUSD.min()
statsUSD.max()
Statistics.corr(vectorsUSD)

#SPARK SQL
dataframe = pycsv.csvToDataFrame(sqlContext, rddUSD, sep=",")
dataframe.registerTempTable("dataUSDuprv")
dff1 = sqlContext.sql("SELECT closeJPY FROM dataUSDuprv").show()
dataframe.show()

#LabeledPoint
lpUSD = vectorsUSD.map(transformationDT.transformToLabeledPoint)
lpUSD.take(5)
def main(argv):

	verbose = False

	dbpath = '/root/data/AdditionalFiles/'
	tagstring = 'rock'
	usealldata = False

	holdout = 0.1
	model_iterations = 100
	model_step = 1.0
	model_intercept = True

	# possible types logistic and svm
	model_type = 'logistic'

	try:
		opts, args = getopt.getopt(argv,"hvd:t:am:s:i:o:c",["help","verbose","datapath=","tagstring=","alldata","model=","step=","iterations=","holdout=","intercept"])
	except getopt.GetoptError:
		print 'rockTag.py -d <data path> -t <tag string>'
		sys.exit(2)
	for opt, arg in opts:
		if opt == '-h':
			print('rockTag.py -d <data path> -t <tag string>')
			sys.exit()
		elif opt in ("-v", "--verbose"):
			verbose = True
		elif opt in ("-d", "--datapath"):
			dbpath = arg
		elif opt in ("-t", "--tagstring"):
			tagstring = str(arg).lower()
		elif opt in ("-a", "--alldata"):
			usealldata = True
		elif opt in ("-m", "--model"):
			if str(arg).lower() in ['logistic','svm']:
				model_type = str(arg).lower()
			else:
				print('valid models are logistic and svm')
				sys.exit()
		elif opt in ("-s", "--step"):
			model_step = float(arg)
		elif opt in ("-i", "--iterations"):
			model_iterations = int(arg)
		elif opt in ("-o", "--holdout"):
			holdout = float(arg)
			if holdout <= 0 or holdout >= 1:
				print('holdout must be greater than 0 and less than 1')
		elif opt in ("-c", "--intercept"):
			model_intercept = True

	if verbose:
		print('OUTPUT: Some Args')
		print('data path: ' + dbpath)
		print('tag string: ' + tagstring)

	labels, features = getLabelsAndFeatures(dbpath, tagstring=tagstring, verbose=verbose, usealldata=usealldata)

	# scale features
	summary = Statistics.colStats(features)
	means = summary.mean()
	sds = [vr**0.5 for vr in summary.variance()]
#	std = StandardScaler(True, True).fit(features)
#	features = std.transform(features)
	
	features = features.map(lambda data: [(v - m)/s for (v, m, s) in zip(data,means,sds)])
	if verbose:
		print('OUTPUT: check resized column')
		smry = Statistics.colStats(features)
		print(smry.mean())
		print(smry.variance())


	# make labeled data
#	labeledData = labels.zip(features).map(lambda (label, data): LabeledPoint(label, data))
	labeledData = labels.zip(features).map(lambda (label, data): LabeledPoint(label, data))
	if verbose:
		print('OUTPUT: Labeled Data')
		print(labeledData.take(3))

	# rebalance samples
	equalSampleData = rebalanceSample(labeledData, verbose=verbose)

	# split data
	trainData, testData = randomSplit(equalSampleData, [1-holdout, holdout])
	if verbose: 
		print('OUTPUT: Train Data')
		trainData.map(lambda p: (p.label, p.features)).take(3)

	# train model
	if model_type == 'logistic':
		model = LogisticRegressionWithSGD.train(trainData, intercept=model_intercept, iterations=model_iterations, step=model_step)
	elif model_type == 'svm':
		model = SVMWithSGD.train(trainData, intercept=model_intercept, iterations=model_iterations, step=model_step)

	evalString = evaluateModel(model, testData)
	print(evalString)
Example #35
sc.setLogLevel('debug')
sc.getConf().getAll()

import urllib.request

url = 'http://kdd.ics.uci.edu/databases/kddcup99/kddcup.data_10_percent.gz'
localfile = '/tmp/kddcup.data_10_percent.gz'
f = urllib.request.urlretrieve(url, localfile)

raw_data = sc.textFile('file:///tmp/kddcup.data_10_percent.gz')
csv = raw_data.map(lambda x: x.split(','))
duration = csv.map(lambda x: [int(x[0])])

from pyspark.mllib.stat import Statistics

summary = Statistics.colStats(duration)
summary.mean()[0]
summary.count()

metrics = csv.map(lambda x: [x[0], x[4], x[5]])
metrics.take(2)

Statistics.corr(metrics, method="spearman")

Statistics.corr(metrics, method="pearson")

from pyspark.mllib.linalg import Vectors

visitors_freq = Vectors.dense(0.13, 0.61, 0.8, 0.5, 0.3)
print(Statistics.chiSqTest(visitors_freq))
Example #36
	values = Vectors.dense([float(attList[0]), float(attList[1])])
	return values
						

#keep only Cyl,Displacement
autoVectors=dataLines.map(transformToNumeric)		
autoVectors.collect()

#perform analysis
from pyspark.mllib.stat import Statistics
from pyspark.sql import SQLContext


						
autoStats = Statistics.colStats(autoVectors)
autoStats.mean()
autoStats.variance()
autoStats.min()
autoStats.max()
Statistics.corr(autoVectors)


from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)

def transformToLabelPoint(inStr):
    lp = (float(inStr[0]), Vectors.dense(inStr[1]))
    return lp

Example #37
#Summary stats of data
# source:https://github.com/apache/spark/blob/master/examples/src/main/python/mllib/summary_statistics_example.py

from __future__ import print_function

from pyspark import SparkContext

import numpy as np

from pyspark.mllib.stat import Statistics


if __name__ == "__main__":
    sc = SparkContext(appName="SummaryStatisticsExample")  # SparkContext

  
    bike = sc.textFile("s3://irm238finalproject/input/*-citibike-tripdata.csv")  # an RDD of Vectors
   
    # Compute column summary statistics.
    summary = Statistics.colStats(bike)
    print(summary.mean())  # a dense vector containing the mean value for each column
    print(summary.variance())  # column-wise variance
    print(summary.numNonzeros())  # number of nonzeros in each column
    

    sc.stop()
def summary_by_label(raw_data, label):
    label_vector_data = raw_data.map(parse_interaction_with_key).filter(
        lambda x: x[0] == label)
    return Statistics.colStats(label_vector_data.values())
Example #40
model = RandomForest.trainRegressor(trainingData, categoricalFeaturesInfo=featuresDic,
                                    numTrees=10, featureSubsetStrategy="auto",
                                    impurity='variance', maxDepth=5, maxBins=maxBins)
### Evaluate
# Evaluate model on test instances and compute test error
predictions = model.predict(testData.map(lambda x: x.features))
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
testMSE = labelsAndPredictions.map(lambda v_p1: (v_p1[0] - v_p1[1]) * (v_p1[0] - v_p1[1]))\
    .sum() / float(testData.count())
print('Test Mean Squared Error = ' + str(testMSE))
print('Learned regression forest model:')
# print(model.toDebugString())

### Compute R2
SSE = labelsAndPredictions.map(lambda v_p1: (v_p1[0] - v_p1[1]) * (v_p1[0] - v_p1[1])).sum()
summary = Statistics.colStats(testData.map(lambda x: Vectors.dense(x.label)))
meanY = float(summary.mean())

# Alternative for mean
# testData.map(lambda x: Vectors.dense(x.label)).mean()
SST = testData.map(lambda y: (y.label-meanY)**2).sum()

n = float(testData.count())
params = 3

Rsqrd = 1 - SSE/SST
RsqrdAdj = 1 - SSE/(n-params)/(SST/(n-1))

print('R-squared: {0}'.format(Rsqrd))
print('R-squared Adj: {0}'.format(RsqrdAdj))
Example #41
"""


def parse_interaction(line):
    #split lines based on the delimiter, and create a list
    line_split = line.split(" ")
    #replace NA with zeros
    line_split = [w.replace('NA', '0') for w in line_split]
    #line_split = [w.replace ('', '0') for w in line_split]
    #keep all except year, and non-numeric values
    symbolic_indexes = [0, 8, 10, 16, 17, 22]
    clean_line_split = [
        item for i, item in enumerate(line_split) if i not in symbolic_indexes
    ]
    return np.array([float(x) for x in clean_line_split])


vector_data = raw_data.map(parse_interaction)

#start timer at this point
startTime = datetime.now()
summary = Statistics.colStats(vector_data)
print('Time consumed =', datetime.now() - startTime)

print('Mean of columns\n', summary.mean())
print('Variances of columns\n', summary.variance())
print('Non zero values\n', summary.numNonzeros())
print('Max value\n', summary.max())
print('Min value\n', summary.min())
sc.stop()
# print the data
print(mat.collect()) # this could be a problem on large datasets
print(mat.take(10)) # collect with the limit

# Notes: Transformations are run on cluster - do not do this
#rdd.foreach(println) # will print on each executor, what you want is
#rdd.take(100).foreach(println)

# Transformations: http://spark.apache.org/docs/latest/programming-guide.html#transformations
# Actions
# Some operations require key/values (reduceByKey, sortByKey, etc)
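
# A small sketch (not part of the example above, reusing the same `sc`) of a
# key/value pipeline, since reduceByKey and sortByKey only work on (key, value) pair RDDs:
pairs = sc.parallelize([("a", 1), ("b", 2), ("a", 3)])
totals = pairs.reduceByKey(lambda x, y: x + y)  # sum the values for each key
print(totals.sortByKey().collect())             # [('a', 4), ('b', 2)]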


# Compute column summary statistics.
summary = Statistics.colStats(mat)
print(summary.mean())  # a dense vector containing the mean value for each column
print(summary.variance())  # column-wise variance
print(summary.numNonzeros())  # number of nonzeros in each column

# Correlations

from pyspark.mllib.stat import Statistics

seriesX = sc.parallelize([1.0, 2.0, 3.0, 3.0, 5.0])  # a series
# seriesY must have the same number of partitions and cardinality as seriesX
seriesY = sc.parallelize([11.0, 22.0, 33.0, 33.0, 555.0])

# Compute the correlation using Pearson's method. Enter "spearman" for Spearman's method.
# If a method is not specified, Pearson's method will be used by default.
print("Correlation is: " + str(Statistics.corr(seriesX, seriesY, method="pearson")))
Example #43
    #Filter out columns not wanted at this stage
    values= Vectors.dense([ outcome, age, single, married, \
                divorced, primary, secondary, tertiary,\
                default, balance, loan \
                     ])
    return values


#Change to a Vector
bankVectors = dataLines.map(transformToNumeric)
bankVectors.collect()[:15]

#Perform statistical Analysis
from pyspark.mllib.stat import Statistics
bankStats = Statistics.colStats(bankVectors)
bankStats.mean()
bankStats.variance()
bankStats.min()
bankStats.max()

Statistics.corr(bankVectors)

#Transform to a Data Frame for input to Machine Learing
#Drop columns that are not required (low correlation)

from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)


def transformToLabeledPoint(inStr):
 def colStats(self):
     """Return descriptive stats for all of our columns."""
     return Statistics.colStats(self.data)
Example #45
    day = int(x[3:5])
    year = int(x[6:10])
    return(datetime.date(year,month,day).isocalendar()[1])

violent = ["ASSAULT","BATTERY","CRIM SEXUAL ASSAULT", "DOMESTIC VIOLENCE", "HOMICIDE", "KIDNAPPING"]
def setFlags(x):
        if x in violent:
                return (0,1)
        else:
                return (1,0)

beats = parts.map(lambda p:(p[10],p[2][6:10],getWeek(p[2]),1,setFlags(p[5])))
beats2 = beats.filter(lambda x:x[1]=="2015").map(lambda x:((x[0],x[2]),(x[3],x[4][0],x[4][1])))
beats3 = beats2.reduceByKey(lambda x,y: (x[0]+y[0],x[1]+y[1],x[2]+y[2]))
standard_vars = beats3.map(lambda row: Vectors.dense((row[0][1],row[1][0],row[1][1],row[1][2])))
summary = Statistics.colStats(standard_vars)
mean_wn = summary.mean()[0]
sd_wn = math.sqrt(summary.variance()[0])
mean_counts = list(summary.mean()[1:4])
sd_counts = list(np.sqrt(summary.variance()[1:4]))
beats_standard = beats3.map(lambda x: (x[0][0],(x[0][1]-mean_wn)/(sd_wn),(x[1][0]-mean_counts[0])/sd_counts[0],(x[1][1]-mean_counts[1])/sd_counts[1], \
 (x[1][2]-mean_counts[2])/sd_counts[2]))
beats_list = beats_standard.map(lambda x: ((x[0]),1)).keys().distinct().collect()
beats_list = beats_list[0:50]
def parsePoint(tuple):
        values = [float(x) for x in tuple]
        return LabeledPoint(values[0], values[1:])
def deNorm(val,mean,sd):
        return(val*sd + mean)
maxWeek = (21 - mean_wn) / sd_wn
curWeek = (20 - mean_wn) / sd_wn
Example #46
def summary_by_label(raw_data, label):
    from pyspark.mllib.stat import Statistics

    label_vector_data = raw_data.map(parse_interaction_with_key).filter(lambda x: x[0]==label)
    return Statistics.colStats(label_vector_data.values())
Example #47
#Written for spark2.0 and higher

from pyspark import SparkContext
from pyspark.mllib.stat import Statistics
import random as rand
import numpy as np
from pyspark.mllib.linalg import Vectors, Matrices


def num_rand():
    return np.random.randn(1, 4)


mat = sc.parallelize([num_rand(), num_rand(), num_rand(), num_rand()])

summary = Statistics.colStats(mat)

print(summary.mean())
print(summary.variance())
print(summary.numNonzeros())
print(summary.max())
print(summary.min())
print(summary.count())
print(summary.normL1())
print(summary.normL2())

#correlation
x = sc.parallelize(np.random.randn(4, 1))
y = sc.parallelize(np.random.randn(4, 1))
print("Correlation :", str(Statistics.corr(x, y)))
Example #48
    print (" ")
    print (" ")
    print ("matriz de correlacion:")
    print (" ")
    print(Statistics.corr(rows, method="pearson")

    '''

    file=sc.textFile("Process_Data/SuperFile/superfile.dat")

    row = file.map(lambda line:line.split(' ')[1:len(line)]).map(lambda xs: [float(x) for x in xs])
    row_list= row.collect() #transforms to list
    print(row_list)

    #matrix
    w, h = 1,38
    new_list = [[0 for x in range(w)] for y in range(h)]

    for i in range(0,len(row_list)):
        new_list[i][:]=Vectors.dense(row_list[i])
        i+=1
    rows = sc.parallelize([new_list])
    print(rows)
    summary = Statistics.colStats(rows)


    print("media:"),(summary.mean())
    print("varianza:"),(summary.variance())
    print ("max:"),(summary.max())
    print ("min:"),(summary.min())
    print("non Zeros:"),(summary.numNonzeros())
Example #49
For dense vectors, MLlib uses either Python lists or the NumPy array type.
The latter is recommended, so you can simply pass NumPy arrays around.
For sparse vectors, users can construct a SparseVector object from MLlib
or pass SciPy scipy.sparse column vectors if SciPy is available in their environment.
The easiest way to create sparse vectors is to use the factory methods implemented in Vectors.
"""

def parse_interaction (line):
	#split lines based on the delimiter, and create a list
	line_split = line.split (",")
	#replace NA with zeros
	line_split = [w.replace ('NA', '0') for w in line_split]
	#line_split = [w.replace ('', '0') for w in line_split]
	#keep all except year, and non-numeric values
	symbolic_indexes = [0, 8, 10,16, 17, 22]
	clean_line_split = [item for i,item in enumerate (line_split) if i not in symbolic_indexes]
	return np.array ([float (x) for x in clean_line_split])

vector_data = raw_data.map (parse_interaction)

#start timer at this point
startTime = datetime.now()
summary = Statistics.colStats(vector_data)
print ('Time consumed = '), (datetime.now() - startTime)

print ('Mean of columns\n'), summary.mean ()
print ('Variances of columns\n'), summary.variance()
print ('Non zero values\n'), summary.numNonzeros()
print ('Max value\n'), summary.max ()
print ('Min value\n'), summary.min ()
        elif (hourofday < 12):
            morn += 1
        elif (hourofday < 17):
            aft += 1
        elif (hourofday < 22):
            eve += 1
        else:
            night += 1
    return [len(tracklist), morn, aft, eve, night, mcount]


# Compute profile for each user
custdata = tbycust.mapValues(lambda a: compute_stats_byuser(a))

# Compute aggregate stats for an entire track library
aggdata = Statistics.colStats(custdata.map(lambda x: x[1]))

for k, v in custdata.collect():
    unique, morn, aft, eve, night, mobile = v
    tot = morn + aft + eve + night

    # Persist the data, in this case write to file
    with open(outdir + 'live_table.csv', 'ab') as csvfile:
        fwriter = csv.writer(csvfile, delimiter=' ', quotechar='|',
                             quoting=csv.QUOTE_MINIMAL)
        fwriter.writerow([unique, morn, aft, eve, night, mobile])


# Do the same with the summary data
with open(outdir + 'agg_table.csv', 'wb') as csvfile:
    fwriter = csv.writer(csvfile, delimiter=' ', quotechar='|',
									, featureSubsetStrategy="auto"
									, impurity='variance'
									, maxDepth=13
									, maxBins=32)


# evaluate the training error
# first make the prediction and create a new "vector" of all the predictions
trainpredictions = model1.predict(trainparsedData.map(lambda x: x.features))
# then you column bind the prediction and actual values into a new RDD
trainlabelsAndPredictions = trainparsedData.map(lambda lp: lp.label).zip(trainpredictions)
# use map operation to compute MSE
trainMSE1 = trainlabelsAndPredictions.map(lambda (v, p): (v - p) * (v - p)).sum() / float(trainparsedData.count())

# use the the Statistics library to obtain the variance
summary = Statistics.colStats(trainvecData)
variance = summary.variance()[0]
# compute the pseudo R-square
train_Rsqr1 = 1 - trainMSE1/float(variance)


# evaluate the testing error
# first make the prediction and create a new "vector" of all the predictions
testpredictions = model1.predict(testparsedData.map(lambda x: x.features))
# then you column bind the prediction and actual values into a new RDD
testlabelsAndPredictions = testparsedData.map(lambda lp: lp.label).zip(testpredictions)
# use map operation to compute MSE
testMSE1 = testlabelsAndPredictions.map(lambda (v, p): (v - p) * (v - p)).sum() / float(testparsedData.count())

# use the the Statistics library to obtain the variance
summary = Statistics.colStats(testvecData)
Example #52
sc=SparkContext()
sqlContext = SQLContext(sc)
rddUSD = sc.textFile("../Forex DT/data/1440/USD1440.csv")
rddUSD.cache()

#Remove the first line
header=rddUSD.first()
dataLines = rddUSD.filter(lambda x: x != header)
dataLines.take(5)


usdVectors = dataLines.map(transformationLR.transformToNumeric)

#Perform statistical Analysis

usdStats=Statistics.colStats(usdVectors)
usdStats.mean()
usdStats.variance()
usdStats.min()
usdStats.max()
Statistics.corr(usdVectors)
#Transform to a Data Frame for input to Machine Learing
#Drop columns that are not required (low correlation)

    
usdLP = usdVectors.map(transformationLR.transformToLabeledPoint)
usdDF = sqlContext.createDataFrame(usdLP, ["label", "features"])
usdDF.select("label", "features").show(10)

#Split into training and testing data
(trainingData, testData) = usdDF.randomSplit([0.7, 0.3])
Example #53
#Get The Average
current_gni_final_maped = current_gni_reduced.mapValues(lambda x: x[0] / x[1])
current_gni_final_maped.cache()
current_gni_final_maped.collect()
print("Average per Country collected")
#current_gni_final_maped.foreach(println)

# Make Vector the data
autoVector = current_gni_final_maped.map(transformToVector)
autoVector.persist()
autoVector.collect()
print("Vectorized Average")
autoVector.foreach(println)

# Centering and scaling: subtract each column's mean from every column and divide by its std. deviation
autoStats = Statistics.colStats(autoVector)
colMeans=autoStats.mean() #resulting numpy array
print("Means:")
print(colMeans)

colVariance=autoStats.variance()
print("Variances:")
print(colVariance)

colStdDev=map(lambda x: math.sqrt(x), colVariance)
#colStdDev.collect()
print("StdDev:")
#colStdDev.foreach(println)
print(colStdDev)

#Place the means and std.dev values in a broadcast variable
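
# A possible continuation of the broadcast/centering step announced above (the
# original snippet ends here; the names below and the usual `sc` SparkContext are assumptions):
from pyspark.mllib.linalg import Vectors

bcMeans = sc.broadcast(colMeans)
bcStdDev = sc.broadcast(list(colStdDev))

def centerAndScale(inVec):
    # (value - column mean) / column std. deviation, element by element
    return Vectors.dense([(x - m) / s for x, m, s in zip(inVec, bcMeans.value, bcStdDev.value)])

scaledVectors = autoVector.map(centerAndScale)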
Example #54
def compute_mean(tx_data_rdd):
    summary = Statistics.colStats(tx_data_rdd)
    return summary.mean()
#Summary stats of data
# source:https://github.com/apache/spark/blob/master/examples/src/main/python/mllib/summary_statistics_example.py

from __future__ import print_function

from pyspark import SparkContext

import numpy as np

from pyspark.mllib.stat import Statistics

if __name__ == "__main__":
    sc = SparkContext(appName="SummaryStatisticsExample")  # SparkContext

    bike = sc.textFile("s3://irm238finalproject/input/*-citibike-tripdata.csv"
                       )  # an RDD of Vectors

    # Compute column summary statistics.
    summary = Statistics.colStats(bike)
    print(summary.mean()
          )  # a dense vector containing the mean value for each column
    print(summary.variance())  # column-wise variance
    print(summary.numNonzeros())  # number of nonzeros in each column

    sc.stop()
numRecords = rideRDD.count()
minDuration = rideRDD.map(lambda x : x[0]).min()
maxDuration = rideRDD.map(lambda x : x[0]).max()

print "Number of records : %d " % numRecords
print "Minimum duration : %d " % minDuration
print "Maximum duration : %d " % maxDuration



# #### 1(b) Use MLLib Statistics 

# In[132]:

from pyspark.mllib.stat import Statistics
summary = Statistics.colStats(rideRDD)
print "Duration\tMorning\tAfternoon\tEvening\tWeekday\tMale\tAge\n"
print("%8.2f\t%8.2f\t%8.2f\t%8.2f\t%8.2f\t%8.2f\t%8.2f\n") % tuple(summary.mean())
print("%8.2f\t%8.2f\t%8.2f\t%8.2f\t%8.2f\t%8.2f\t%8.2f\n") % tuple(summary.variance())
print("%8.2f\t%8.2f\t%8.2f\t%8.2f\t%8.2f\t%8.2f\t%8.2f\n") % tuple(summary.numNonzeros())


# #### 1(c) Determine correlation of Age with Duration

# In[3]:

durationRDD = rideRDD.map(lambda x : x[0]) # Extract duration from the RDD
ageRDD = rideRDD.map(lambda x : x[6]) # Extract Age from the RDD
print(Statistics.corr(durationRDD, ageRDD, method="pearson")) # Print the Pearson correlation of Age vs. Duration

Example #57
#Summary stats of data
# source:https://github.com/apache/spark/blob/master/examples/src/main/python/mllib/summary_statistics_example.py

from __future__ import print_function

from pyspark import SparkContext

import numpy as np

from pyspark.mllib.stat import Statistics


if __name__ == "__main__":
    sc = SparkContext(appName="SummaryStatisticsExample")  # SparkContext
  
    taxi = sc.textFile("s3://irm238finalproject/input/yellow*")  # an RDD of Vectors

    taxic = taxi.filter(lambda line: line[1:10]).map(lambda row: row.split(","))

    # Compute column summary statistics.
    summary = Statistics.colStats(taxic)
    print(summary.mean())  # a dense vector containing the mean value for each column
    print(summary.variance())  # column-wise variance
    print(summary.numNonzeros())  # number of nonzeros in each column
    

    sc.stop()