Code example #1
import argparse
import os

from pyspark import SparkConf, SparkContext

# Module-level so that the `global` statement in main() can update it.
number_of_columns = 0


def main():
    parser = argparse.ArgumentParser(description="Find dependency inclusions")
    parser.add_argument('--path', type=str)
    parser.add_argument('--cores', type=str)
    args = parser.parse_args()

    # Configure Spark up front; calling set() on sc.getConf() after the
    # SparkContext exists has no effect.
    conf = (SparkConf()
            .setAppName("DDM")
            .set("spark.executor.cores", args.cores)
            .set("spark.driver.cores", args.cores)
            .set("spark.worker.cores", args.cores)
            .set("spark.deploy.defaultCores", args.cores)
            .set("spark.driver.memory", "15g"))
    sc = SparkContext(conf=conf)
    global number_of_columns
    data = []
    file_headers = []
    # Read every CSV file in the input directory. Each line is assumed to be
    # fully quoted and semicolon-separated, so stripping the outer quotes and
    # splitting on '";"' yields the cell values.
    for file in os.listdir(args.path):
        if file.endswith(".csv"):
            rdd = sc.textFile(os.path.join(args.path, file)).map(
                lambda line: line[1:-1].split("\";\""))

            # Collect on the driver to split off the header row and to tag
            # each data row with this file's starting column offset.
            file_data = rdd.collect()
            file_header = file_data[0]
            del file_data[0]
            file_data = [(number_of_columns, x) for x in file_data]
            data += file_data
            file_headers += file_header
            number_of_columns += len(file_header)

    rdd = sc.parallelize(data)
    # Pair every cell value with its global column index, then compute, for
    # each distinct value, the set of columns it occurs in.
    values_as_key = rdd.flatMap(
        lambda el: list(zip(el[1], range(el[0], el[0] + len(el[1])))))
    unique_values = values_as_key.groupByKey().mapValues(set)
    # Deduplicate identical column sets, build one candidate matrix per set,
    # and fold all matrices into a single result with matrix_and.
    unique_values = unique_values.map(lambda x: (tuple(x[1]), 0)).reduceByKey(sum_func)
    matrix_per_key = unique_values.map(lambda x: make_candidate_matrix(x[0]))
    result_matrix = matrix_per_key.reduce(matrix_and)

    assert len(result_matrix) == number_of_columns

    # For every column j, collect the headers of all other columns i for
    # which result_matrix[i][j] is truthy.
    output = []
    for i in range(number_of_columns):
        assert len(result_matrix[i]) == number_of_columns
        output.append([])

    for i in range(len(result_matrix)):
        for j in range(len(result_matrix[i])):
            if i != j and result_matrix[i][j]:
                output[j].append(file_headers[i])

    # Print one line per column that has at least one candidate.
    for i, row in enumerate(output):
        if row:
            print(str(file_headers[i]) + " < " + ", ".join(str(col) for col in row))

    sc.stop()
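
The example relies on three helpers that are not shown (sum_func, make_candidate_matrix, matrix_and) and never invokes main(). Below is a minimal sketch of how they might be filled in; the inclusion test inside make_candidate_matrix is an assumption inferred from how the matrices are combined and read out above, not code from the original source.

def sum_func(a, b):
    # Only used to collapse duplicate column sets in reduceByKey; the
    # accumulated count is never read afterwards.
    return a + b


def make_candidate_matrix(columns):
    # Hypothetical: one candidate matrix per distinct column set. Entry
    # [i][j] stays True unless column i contains the shared value while
    # column j does not, i.e. unless this value rules out "i is included
    # in j". Relies on the module-level number_of_columns set in main().
    cols = set(columns)
    return [[(i not in cols) or (j in cols)
             for j in range(number_of_columns)]
            for i in range(number_of_columns)]


def matrix_and(m1, m2):
    # Element-wise AND of two candidate matrices.
    return [[a and b for a, b in zip(row_a, row_b)]
            for row_a, row_b in zip(m1, m2)]


if __name__ == "__main__":
    main()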
Code example #2
from pyspark import SparkConf, SparkContext


class ContextWrapper:
    """Wraps Spark contexts that are created on the JVM side and exposed to
    Python through a Py4J gateway entry point."""

    _context = None
    _sql_context = None
    _hive_context = None
    _session = None
    _streaming_context = None
    _publisher = None

    def __init__(self):
        pass

    def set_context(self, java_gateway):
        spark_context_wrapper = java_gateway.entry_point.sparkContextWrapper()
        j_spark_conf = spark_context_wrapper.sparkConf()
        p_spark_conf = SparkConf(_jvm=java_gateway.jvm, _jconf=j_spark_conf)
        j_spark_context = spark_context_wrapper.javaContext()
        self._context = SparkContext(jsc=j_spark_context,
                                     gateway=java_gateway,
                                     conf=p_spark_conf)

    def set_sql_context(self, java_gateway):
        from pyspark.sql import SQLContext
        spark_context_wrapper = java_gateway.entry_point.sparkContextWrapper()
        self._sql_context = SQLContext(self._context,
                                       spark_context_wrapper.sqlContext())

    def set_hive_context(self, java_gateway):
        from pyspark.sql import HiveContext
        spark_context_wrapper = java_gateway.entry_point.sparkContextWrapper()
        self._hive_context = HiveContext(self._context,
                                         spark_context_wrapper.hiveContext())

    def set_session(self, java_gateway):
        from pyspark.sql import SparkSession
        self._session = SparkSession.builder.config(
            conf=self._context.getConf()).getOrCreate()

    def set_hive_session(self, java_gateway):
        from pyspark.sql import SparkSession
        self._session = SparkSession.builder.config(
            conf=self._context.getConf()).enableHiveSupport().getOrCreate()

    def set_streaming_context(self, java_gateway):
        from pyspark.streaming import StreamingContext
        self._streaming_context = StreamingContext(
            self._context,
            java_gateway.entry_point.sparkStreamingWrapper().
            getDurationSeconds())
        # Register the underlying Java streaming context with the JVM-side
        # wrapper so both sides operate on the same instance.
        java_gateway.entry_point.sparkStreamingWrapper().setStreamingContext(
            self._streaming_context._jssc)

    def init_publisher(self, java_gateway):
        spark_context_wrapper = java_gateway.entry_point.sparkContextWrapper()
        wrapper = java_gateway.entry_point.globalPublisherWrapper()
        conf = spark_context_wrapper.setupConfiguration()
        self._publisher = wrapper.create(conf)

    @property
    def context(self):
        return self._context

    @property
    def sql_context(self):
        return self._sql_context

    @property
    def hive_context(self):
        return self._hive_context

    @property
    def session(self):
        return self._session

    @property
    def streaming_context(self):
        return self._streaming_context

    @property
    def publisher(self):
        return self._publisher
Code example #3
from pyspark import SparkConf, SparkContext


class ContextWrapper(object):
    """Variant of the wrapper above using the Spark 2.x SQLContext signature."""

    def __init__(self):
        pass

    def set_context(self, java_gateway):
        spark_context_wrapper = java_gateway.entry_point.sparkContextWrapper()
        j_spark_conf = spark_context_wrapper.sparkConf()
        p_spark_conf = SparkConf(_jvm=java_gateway.jvm, _jconf=j_spark_conf)
        j_spark_context = spark_context_wrapper.javaContext()
        self._context = SparkContext(jsc=j_spark_context,
                                     gateway=java_gateway,
                                     conf=p_spark_conf)

    def set_sql_context(self, java_gateway):
        from pyspark.sql import SQLContext
        spark_context_wrapper = java_gateway.entry_point.sparkContextWrapper()
        self._sql_context = SQLContext(
            self._context,
            sparkSession=spark_context_wrapper.sparkSession(False),
            jsqlContext=spark_context_wrapper.sqlContext())

    def set_hive_context(self, java_gateway):
        from pyspark.sql import HiveContext
        spark_context_wrapper = java_gateway.entry_point.sparkContextWrapper()
        self._hive_context = HiveContext(self._context,
                                         spark_context_wrapper.hiveContext())

    def set_session(self, java_gateway):
        from pyspark.sql import SparkSession
        self._session = SparkSession.builder.config(
            conf=self._context.getConf()).getOrCreate()

    def set_hive_session(self, java_gateway):
        from pyspark.sql import SparkSession
        self._session = SparkSession.builder.config(
            conf=self._context.getConf()).enableHiveSupport().getOrCreate()

    def set_streaming_context(self, java_gateway):
        from pyspark.streaming import StreamingContext
        self._streaming_context = StreamingContext(
            self._context,
            java_gateway.entry_point.sparkStreamingWrapper().
            getDurationSeconds())

    @property
    def context(self):
        return self._context

    @property
    def sql_context(self):
        return self._sql_context

    @property
    def hive_context(self):
        return self._hive_context

    @property
    def session(self):
        return self._session

    @property
    def streaming_context(self):
        return self._streaming_context
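
Neither wrapper shows how it is driven from the Python side. Below is a minimal usage sketch, assuming a Py4J gateway is already listening and that its entry point exposes the sparkContextWrapper() and sparkStreamingWrapper() objects used above; the port is the Py4J default and purely illustrative.

from py4j.java_gateway import GatewayParameters, JavaGateway

# Connect to the JVM-side gateway; the port is hypothetical.
gateway = JavaGateway(gateway_parameters=GatewayParameters(port=25333))

wrapper = ContextWrapper()
wrapper.set_context(gateway)            # wrap the existing JVM SparkContext
wrapper.set_session(gateway)            # build a SparkSession on top of it
wrapper.set_streaming_context(gateway)  # streaming context with the JVM-configured batch duration

# The wrapped objects are then available through the read-only properties.
df = wrapper.session.range(10)
print(df.count())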