import argparse
import os

from pyspark import SparkConf, SparkContext

# Total number of columns across all input files; grown as files are read.
number_of_columns = 0


def main():
    parser = argparse.ArgumentParser(description="Find inclusion dependencies")
    parser.add_argument('--path', type=str)
    parser.add_argument('--cores', type=str)
    args = parser.parse_args()

    # Core/memory settings must be applied before the context is created;
    # calling getConf().set(...) on an already-running SparkContext has no effect.
    conf = (SparkConf()
            .setAppName("DDM")
            .set("spark.executor.cores", args.cores)
            .set("spark.driver.cores", args.cores)
            .set("spark.worker.cores", args.cores)
            .set("spark.deploy.defaultCores", args.cores)
            .set("spark.driver.memory", "15g"))
    sc = SparkContext(conf=conf)

    global number_of_columns
    data = []
    file_headers = []
    for file in os.listdir(args.path):
        if file.endswith(".csv"):
            # Strip the outer quotes and split rows on the ";" separator
            # between quoted fields.
            rdd = sc.textFile(os.path.join(args.path, file)) \
                    .map(lambda line: line[1:-1].split("\";\""))
            file_data = rdd.collect()
            file_header = file_data[0]
            del file_data[0]
            # Tag each row with the global index of this file's first column.
            file_data = [(number_of_columns, x) for x in file_data]
            data += file_data
            file_headers += file_header
            number_of_columns += len(file_header)

    rdd = sc.parallelize(data)
    # Pair every cell value with the global index of the column it occurs in.
    values_as_key = rdd.flatMap(
        lambda el: list(zip(el[1], range(el[0], el[0] + len(el[1])))))
    # For each distinct value, collect the set of columns containing it,
    # then deduplicate identical column sets.
    unique_values = values_as_key.groupByKey().mapValues(set)
    unique_values = unique_values.map(lambda x: (tuple(x[1]), 0)) \
                                 .reduceByKey(sum_func)
    # Turn each column set into a candidate matrix and intersect them all:
    # an inclusion candidate survives only if no value group refutes it.
    matrix_per_key = unique_values.map(lambda x: make_candidate_matrix(x[0]))
    result_matrix = matrix_per_key.reduce(matrix_and)

    assert len(result_matrix) == number_of_columns
    output = []
    for i in range(number_of_columns):
        assert len(result_matrix[i]) == number_of_columns
        output.append([])
    for i in range(len(result_matrix)):
        for j in range(len(result_matrix[i])):
            if i != j and result_matrix[i][j]:
                output[j].append(file_headers[i])
    for i in range(len(output)):
        row = output[i]
        if row:
            print(str(file_headers[i]) + " < " +
                  ", ".join(str(r) for r in row))
    sc.stop()
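# main() relies on three helpers that are not defined in this section:
# sum_func, make_candidate_matrix, and matrix_and. The sketches below are
# ASSUMED implementations, inferred only from how main() calls them; the
# actual definitions live elsewhere in the repository and may differ. Note
# that make_candidate_matrix reads the global number_of_columns, which works
# here because PySpark pickles the closure after main() has finished reading
# the input files.


def sum_func(a, b):
    # Reduce function used only to deduplicate identical column sets; the
    # summed value is never read, only the surviving keys matter.
    return a + b


def make_candidate_matrix(columns):
    # columns: global indices of all columns sharing at least one value.
    # M[i][j] is True while "column i is included in column j" remains
    # possible: if i contains the shared value, j must contain it too.
    present = set(columns)
    return [[(i not in present) or (j in present)
             for j in range(number_of_columns)]
            for i in range(number_of_columns)]


def matrix_and(a, b):
    # Element-wise conjunction of two candidate matrices: a candidate
    # inclusion survives only if every value group permits it.
    return [[x and y for x, y in zip(row_a, row_b)]
            for row_a, row_b in zip(a, b)]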
from pyspark import SparkConf, SparkContext


class ContextWrapper:
    _context = None
    _sql_context = None
    _hive_context = None
    _session = None
    _streaming_context = None
    _publisher = None

    def __init__(self):
        pass

    def set_context(self, java_gateway):
        # Rebuild a Python-side SparkContext around the JVM-side context
        # exposed through the py4j entry point.
        spark_context_wrapper = java_gateway.entry_point.sparkContextWrapper()
        j_spark_conf = spark_context_wrapper.sparkConf()
        p_spark_conf = SparkConf(_jvm=java_gateway.jvm, _jconf=j_spark_conf)
        j_spark_context = spark_context_wrapper.javaContext()
        self._context = SparkContext(jsc=j_spark_context,
                                     gateway=java_gateway,
                                     conf=p_spark_conf)

    def set_sql_context(self, java_gateway):
        from pyspark.sql import SQLContext
        spark_context_wrapper = java_gateway.entry_point.sparkContextWrapper()
        self._sql_context = SQLContext(self._context,
                                       spark_context_wrapper.sqlContext())

    def set_hive_context(self, java_gateway):
        from pyspark.sql import HiveContext
        spark_context_wrapper = java_gateway.entry_point.sparkContextWrapper()
        self._hive_context = HiveContext(self._context,
                                         spark_context_wrapper.hiveContext())

    def set_session(self, java_gateway):
        from pyspark.sql import SparkSession
        self._session = SparkSession.builder.config(
            conf=self._context.getConf()).getOrCreate()

    def set_hive_session(self, java_gateway):
        from pyspark.sql import SparkSession
        self._session = SparkSession.builder.config(
            conf=self._context.getConf()).enableHiveSupport().getOrCreate()

    def set_streaming_context(self, java_gateway):
        from pyspark.streaming import StreamingContext
        self._streaming_context = StreamingContext(
            self._context,
            java_gateway.entry_point.sparkStreamingWrapper()
            .getDurationSeconds())
        # Hand the Python-side JSSC back to the JVM wrapper so both sides
        # share the same streaming context.
        java_gateway.entry_point.sparkStreamingWrapper().setStreamingContext(
            self._streaming_context._jssc)

    def init_publisher(self, java_gateway):
        spark_context_wrapper = java_gateway.entry_point.sparkContextWrapper()
        wrapper = java_gateway.entry_point.globalPublisherWrapper()
        conf = spark_context_wrapper.setupConfiguration()
        self._publisher = wrapper.create(conf)

    @property
    def context(self):
        return self._context

    @property
    def sql_context(self):
        return self._sql_context

    @property
    def hive_context(self):
        return self._hive_context

    @property
    def session(self):
        return self._session

    @property
    def streaming_context(self):
        return self._streaming_context

    @property
    def publisher(self):
        return self._publisher
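# Hypothetical wiring sketch (not part of the original file): the wrapper
# assumes a JVM process whose py4j entry point implements
# sparkContextWrapper(), sparkStreamingWrapper() and globalPublisherWrapper().
# Connecting from Python might look like this; 25333 is py4j's default port,
# and the call order matters because every setter reuses the context built
# by set_context().
from py4j.java_gateway import JavaGateway, GatewayParameters

gateway = JavaGateway(
    gateway_parameters=GatewayParameters(port=25333, auto_convert=True))
wrapper = ContextWrapper()
wrapper.set_context(gateway)      # must come first
wrapper.set_session(gateway)
wrapper.session.range(10).show()  # the wrapped session behaves like any other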
# Second variant of ContextWrapper, presumably targeting the PySpark 2.x API:
# here set_sql_context passes the sparkSession/jsqlContext keyword arguments
# that SQLContext accepts from 2.0 onwards.
from pyspark import SparkConf, SparkContext


class ContextWrapper(object):

    def __init__(self):
        pass

    def set_context(self, java_gateway):
        spark_context_wrapper = java_gateway.entry_point.sparkContextWrapper()
        j_spark_conf = spark_context_wrapper.sparkConf()
        p_spark_conf = SparkConf(_jvm=java_gateway.jvm, _jconf=j_spark_conf)
        j_spark_context = spark_context_wrapper.javaContext()
        self._context = SparkContext(jsc=j_spark_context,
                                     gateway=java_gateway,
                                     conf=p_spark_conf)

    def set_sql_context(self, java_gateway):
        from pyspark.sql import SQLContext
        spark_context_wrapper = java_gateway.entry_point.sparkContextWrapper()
        self._sql_context = SQLContext(
            self._context,
            sparkSession=spark_context_wrapper.sparkSession(False),
            jsqlContext=spark_context_wrapper.sqlContext())

    def set_hive_context(self, java_gateway):
        from pyspark.sql import HiveContext
        spark_context_wrapper = java_gateway.entry_point.sparkContextWrapper()
        self._hive_context = HiveContext(self._context,
                                         spark_context_wrapper.hiveContext())

    def set_session(self, java_gateway):
        from pyspark.sql import SparkSession
        self._session = SparkSession.builder.config(
            conf=self._context.getConf()).getOrCreate()

    def set_hive_session(self, java_gateway):
        from pyspark.sql import SparkSession
        self._session = SparkSession.builder.config(
            conf=self._context.getConf()).enableHiveSupport().getOrCreate()

    def set_streaming_context(self, java_gateway):
        from pyspark.streaming import StreamingContext
        self._streaming_context = StreamingContext(
            self._context,
            java_gateway.entry_point.sparkStreamingWrapper()
            .getDurationSeconds())

    @property
    def context(self):
        return self._context

    @property
    def sql_context(self):
        return self._sql_context

    @property
    def hive_context(self):
        return self._hive_context

    @property
    def session(self):
        return self._session

    @property
    def streaming_context(self):
        return self._streaming_context
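# The two variants differ only in how set_sql_context builds the SQLContext:
# positionally (the PySpark 1.x signature) versus with the
# sparkSession/jsqlContext keywords introduced in 2.0. A hypothetical dispatch
# by installed PySpark version is sketched below; the module names
# wrappers_spark1/wrappers_spark2 are INVENTED for illustration, and the
# repository may select between variants differently (e.g. at packaging time).
from pyspark.version import __version__ as pyspark_version

if int(pyspark_version.split(".")[0]) >= 2:
    from wrappers_spark2 import ContextWrapper  # assumed module name
else:
    from wrappers_spark1 import ContextWrapper  # assumed module name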