def main(): print("Generating address statistics") sc, sql_sc = hf.initiateSpark() dft, dfb, dfc, dfvi, dfvo, dfvj = hf.loadDataframesFromParquet( sql_sc, conf.pathresearch) dfb.unpersist() hf.generateAllStats(sc, dfvi, dfvo, dfc, dft, dfvj, conf.pathresearch)
def main(): print("Generating initial statistics") sc, sql_sc = hf.initiateSpark() dft, dfb, dfc, dfvi, dfvo, dfvjx = hf.loadDataframesFromParquet( sql_sc, conf.pathresearch) print "Performing initial analysis" results = 'Initial Analysis : \n' # count blocks print "Obtaining Block counts" blocknum = dfb.count() results += 'Blocks: ' + str(blocknum) + '\n' # count transactions total = dft.count() print "Obtaining transaction statistics" results += 'Transactions: ' + str(total) + '\n' # types of txs types = [(r['t_type'], r['count']) for r in dft.groupBy('t_type').count().collect()] results += "Transactions by type \nType \tAmount \tPercentage \n" for type in types: results += type[0] + ' \t' + str( type[1]) + '\t' + ('%.2f' % (float(type[1]) / float(total) * 100)) + '\n' # write results to disk print "Writing results to file %s" % "/root/research/basic_analysis.txt" resultsfile = open(conf.pathresearch + "initial_analysis.txt", "w") resultsfile.writelines(results) resultsfile.close() print(results) print "Complete"
def main(): print("Generated parquest files from csv") path = conf.pathresearch sc, sql_sc = hf.initiateSpark() t = path + 'public.transaction.csv' b = path + 'public.block.csv' cg = path + 'public.coingen.csv' vi = path + 'public.vin.csv' vo = path + 'public.vout.csv' vj = path + 'public.vjoinsplit.csv' sparkClassPath = os.getenv('SPARK_CLASSPATH', '/opt/spark/postgresql-42.2.2.jar') sc._conf.set('spark.jars', 'file:%s' % sparkClassPath) sc._conf.set('spark.executor.extraClassPath', sparkClassPath) sc._conf.set('spark.driver.extraClassPath', sparkClassPath) print("Reading Data from CSV files") t, b, c, vi, vo, vj = hf.getRdds(sc=sc, file_tx=t, file_blocks=b, file_coingen=cg, file_vin=vi, file_vout=vo, file_vjoinsplit=vj) print("Converting to dataframes") dft, dfb, dfc, dfvi, dfvo, dfvj = hf.convertToDataframes( sql_sc, t, b, c, vi, vo, vj, conf.blockheight) print("Saving data to disk") hf.saveDataframeToParquet(path, dft, dfb, dfc, dfvi, dfvo, dfvj) print("created parquet files")
def main():
    sc, sql_sc = hf.initiateSpark()
    dft, dfb, dfc, dfvi, dfvo, dfvj = hf.loadDataframesFromParquet(
        sql_sc, conf.pathresearch)
    txs = generateSmallTXlist(dft, dfvi, dfvo, dfvj)
    clusters, graph, distinctnodeandtxs = generateClusters(txs)
    addressStats = convertAddressStats(conf.pathresearch)
    clustersWithStats = clusterStats(txs, clusters, graph, addressStats,
                                     distinctnodeandtxs, True, conf.pathresearch)
    writeToDisk(clusters, graph, clustersWithStats, conf.pathresearch)
    print("Clustering script complete")
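# Sketch (assumption): generateClusters() is defined elsewhere in this script
# and is not shown here. A common way to cluster addresses is the multi-input
# heuristic: addresses spent together as inputs of one transaction are joined,
# and clusters are the connected components of the resulting graph. networkx
# and the (txid, [input addresses]) input shape are illustrative assumptions,
# not the script's actual data layout.
import networkx as nx


def clusterByCoSpending(tx_inputs):
    """tx_inputs: iterable of (txid, [input addresses]) pairs."""
    graph = nx.Graph()
    for txid, addresses in tx_inputs:
        graph.add_nodes_from(addresses)
        # link every input address of the transaction to the first one
        for addr in addresses[1:]:
            graph.add_edge(addresses[0], addr)
    clusters = [set(component) for component in nx.connected_components(graph)]
    return clusters, graph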
def main(): print("Generated parquest files from csv") path = '/root/research/' sc, sql_sc = hf.initiateSpark() sparkClassPath = os.getenv('SPARK_CLASSPATH', '/opt/spark/postgresql-42.2.2.jar') sc._conf.set('spark.jars', 'file:%s' % sparkClassPath) sc._conf.set('spark.executor.extraClassPath', sparkClassPath) sc._conf.set('spark.driver.extraClassPath', sparkClassPath) url = 'postgresql://zcashpostgres:5432/zcashdb?user=postgres' properties = {"driver": 'com.postgresql.jdbc.Driver'} dft, dfb, dfc, dfvi, dfvo, dfvj = hf.readDataframeFromPostgres(sql_sc, url, properties, conf.blockheight) hf.saveDataframeToParquet(path, dft, dfb, dfc, dfvi, dfvo, dfvj) print("created parquet files")
def main(): print("Connecting to Zcash node") sc, sql_sc = hf.initiateSpark() path = conf.pathresearch checkZcashCLI() blocklimit = conf.blockheight rpc = rpcConnection() latestBlock = int(rpc.getblockcount()) if blocklimit > latestBlock: blocklimit = latestBlock dft, dfb, dfc, dfvi, dfvo, dfvj = fillDataFrames(sc, sql_sc, blocklimit) hf.saveDataframeToParquet(path, dft, dfb, dfc, dfvi, dfvo, dfvj)
from __future__ import print_function

import os
import sys
from collections import Counter

import matplotlib as mpl
mpl.use('Agg')

from pyspark.sql.functions import array_contains, col, size

import config as conf
import helpfulFunctions as hf
from founders import trivialFounders
from plotGraphs import *
from pools import pools

sc, sql_sc = hf.initiateSpark()


def dumpToCsv(filename, data):
    with open(conf.pathresearch + filename + ".csv", "w") as fw:
        fw.write("addresses\n")
        for element in data:
            fw.write("{}\n".format(element))


def createDataframes():
    dirx = conf.pathresearch
    dfvout = sql_sc.read.parquet(dirx + "vout.parquet")
    dfc = sql_sc.read.parquet(dirx + "coingen.parquet")
    dft = sql_sc.read.parquet(dirx + "transaction.parquet")
    dfb = sql_sc.read.parquet(dirx + "block.parquet")
    dfvj = sql_sc.read.parquet(dirx + "vjoinsplit.parquet")
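# Example usage (illustrative): dumpToCsv writes one value per line under a
# single "addresses" header, so the founders address list imported above could
# be dumped to <pathresearch>/founders.csv like this. The wrapper name and the
# output file name are illustrative, not part of the original script.
def dumpFoundersExample():
    dumpToCsv("founders", trivialFounders)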